From d5128df26ec7b5ea2724c02da0c06b5b3099eba4 Mon Sep 17 00:00:00 2001 From: LIAO HENG Date: Thu, 25 Jun 2026 00:22:23 +0800 Subject: [PATCH 01/20] =?UTF-8?q?Add:=20fully=5Fdistributed=5Fwithin=5Fcor?= =?UTF-8?q?e=20runtime=20=E2=80=94=20SPMD=20on-core=20orchestration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new runtime where orchestration, scheduling, and execution all run in SPMD fashion on the AICore workers themselves, with AICPU reduced to a thin init/handoff/teardown stub (no scheduler). Each core builds, owns, and executes its own tasks. Design pillars (see docs/fully_distributed_within_core.md): - Task ownership via a claim race over two global cursors (cube_cursor for AIC-anchored, vector_cursor for AIV-only). - owner = builder = executor, with core-type matching. - Full per-core duplicate TensorMap for dependency discovery. - Per-core private task ring + one global completion-flag ring driving a run-ahead, pull-based execution loop. - block.won anchor/follower co-ownership for multi-core (MIX / 2V) tasks. - Bounded GM output-heap ring reclaimed by a global completion frontier. Includes: - docs/fully_distributed_within_core.md (authoritative design, in Chinese). - src/{a2a3,a5}/runtime/fully_distributed_within_core/ runtime skeleton + dist_engine (a2a3 validated on a2a3sim via benchmark_bgemm + mix_coown). - examples/a2a3/fully_distributed_within_core/ migrated examples. - tests/st/a2a3/fully_distributed_within_core/ migrated ST tests. 
Co-authored-by: Cursor --- docs/fully_distributed_within_core.md | 822 +++++ .../kernels/aiv/kernel_consumer.cpp | 59 + .../kernels/aiv/kernel_notify_wait.cpp | 31 + .../kernels/aiv/kernel_producer_notify.cpp | 79 + .../async_notify_orchestration.cpp | 65 + .../test_async_notify_demo.py | 177 + .../kernels/aic/kernel_gemm_tile.cpp | 150 + .../kernels/aiv/kernel_tile_add.cpp | 107 + .../kernels/orchestration/bgemm_orch.cpp | 120 + .../benchmark_bgemm/test_benchmark_bgemm.py | 119 + .../kernels/aiv/kernel_consumer.cpp | 36 + .../kernels/aiv/kernel_notify_wait.cpp | 32 + .../kernels/aiv/kernel_producer.cpp | 63 + .../orchestration/deferred_notify_orch.cpp | 68 + .../test_deferred_notify_demo.py | 189 ++ .../mix_coown/kernels/aic/kernel_mm.cpp | 148 + .../mix_coown/kernels/aiv/kernel_add_v0.cpp | 101 + .../mix_coown/kernels/aiv/kernel_add_v1.cpp | 101 + .../mix_coown/kernels/aiv/kernel_sum.cpp | 108 + .../kernels/orchestration/mix_coown_orch.cpp | 103 + .../mix_coown/test_mix_coown.py | 112 + .../kernels/aic/aic_pv_matmul.cpp | 114 + .../kernels/aic/aic_qk_matmul.cpp | 115 + .../kernels/aiv/aiv_online_update.cpp | 256 ++ .../kernels/aiv/aiv_softmax_prepare.cpp | 156 + .../orchestration/paged_attention_orch.cpp | 292 ++ .../paged_attention/test_paged_attention.py | 197 ++ .../kernels/aic/aic_pv_matmul.cpp | 114 + .../kernels/aic/aic_qk_matmul.cpp | 115 + .../kernels/aiv/aiv_online_update.cpp | 256 ++ .../kernels/aiv/aiv_softmax_prepare.cpp | 156 + .../orchestration/paged_attention_orch.cpp | 311 ++ .../test_paged_attention.py | 201 ++ .../test_paged_attention_ringbuffer.py | 115 + .../kernels/aic/aic_pv_matmul.cpp | 168 + .../kernels/aic/aic_qk_matmul.cpp | 156 + .../kernels/aiv/aiv_online_update.cpp | 255 ++ .../kernels/aiv/aiv_softmax_prepare.cpp | 292 ++ .../orchestration/paged_attention_orch.cpp | 352 ++ .../test_paged_attention_unroll.py | 132 + .../kernels/aic/down_proj.cpp | 328 ++ .../kernels/aic/gate_proj.cpp | 331 ++ 
.../qwen3_14b_decode/kernels/aic/kv_proj.cpp | 597 ++++ .../qwen3_14b_decode/kernels/aic/out_proj.cpp | 269 ++ .../qwen3_14b_decode/kernels/aic/q_proj.cpp | 357 ++ .../kernels/aic/qk_matmul.cpp | 307 ++ .../kernels/aic/sv_matmul.cpp | 291 ++ .../qwen3_14b_decode/kernels/aic/up_proj.cpp | 331 ++ .../kernels/aiv/attention_writeback.cpp | 129 + .../kernels/aiv/copy_hidden.cpp | 141 + .../qwen3_14b_decode/kernels/aiv/copy_out.cpp | 140 + .../kernels/aiv/down_proj_residual.cpp | 172 + .../kernels/aiv/online_softmax.cpp | 550 +++ .../kernels/aiv/out_proj_residual.cpp | 183 + .../kernels/aiv/post_rmsnorm.cpp | 324 ++ .../qwen3_14b_decode/kernels/aiv/q_pad.cpp | 126 + .../qwen3_14b_decode/kernels/aiv/qk_norm.cpp | 456 +++ .../qwen3_14b_decode/kernels/aiv/rmsnorm.cpp | 383 +++ .../kernels/aiv/rope_kv_cache.cpp | 593 ++++ .../qwen3_14b_decode/kernels/aiv/silu.cpp | 219 ++ .../qwen3_14b_decode/kernels/aiv/softmax.cpp | 313 ++ .../kernels/orchestration/qwen3_decode.cpp | 455 +++ .../qwen3_14b_decode/test_qwen3_14b_decode.py | 250 ++ .../kernels/aiv/kernel_add.cpp | 90 + .../kernels/aiv/kernel_noop.cpp | 33 + .../orchestration/scalar_data_orch.cpp | 265 ++ .../scalar_data_test/test_scalar_data.py | 83 + .../kernels/aiv/kernel_consumer.cpp | 64 + .../kernels/aiv/kernel_sdma_tget_async.cpp | 71 + .../sdma_async_completion_orch.cpp | 52 + .../test_sdma_async_completion_demo.py | 209 ++ .../vector_example/kernels/aiv/kernel_add.cpp | 90 + .../kernels/aiv/kernel_add_scalar.cpp | 89 + .../vector_example/kernels/aiv/kernel_mul.cpp | 90 + .../orchestration/example_orchestration.cpp | 113 + .../vector_example/test_vector_example.py | 72 + .../aicore/aicore_executor.cpp | 107 + .../aicpu/aicpu_executor.cpp | 873 +++++ .../build_config.py | 32 + .../common/intrinsic.h | 199 ++ .../common/pto_runtime_status.h | 52 + .../docs/MULTI_RING.md | 330 ++ .../docs/RUNTIME_LOGIC.md | 39 + .../docs/SCALAR_DATA_ACCESS.md | 137 + .../docs/SUBMIT_BY_CLUSTER.md | 222 ++ .../docs/device_log_profiling.md 
| 166 + .../docs/profiling_levels.md | 480 +++ .../host/dep_gen_replay.cpp | 784 +++++ .../host/dep_gen_replay.h | 106 + .../host/runtime_compile_info.cpp | 27 + .../host/runtime_maker.cpp | 692 ++++ .../orchestration/common.cpp | 197 ++ .../orchestration/pto_arg_with_deps.h | 140 + .../orchestration/pto_orchestration_api.h | 385 +++ .../runtime/aicore_completion_mailbox.h | 189 ++ .../runtime/aicore_completion_mailbox_types.h | 67 + .../backend/sdma/sdma_completion_kernel.h | 143 + .../backend/sdma/sdma_completion_scheduler.h | 66 + .../runtime/common.h | 39 + .../runtime/dist_engine.cpp | 1195 +++++++ .../runtime/dist_engine.h | 55 + .../runtime/pto2_dispatch_payload.h | 97 + .../runtime/pto_async_kernel_api.h | 157 + .../runtime/pto_async_wait.h | 303 ++ .../runtime/pto_completion_token.h | 45 + .../runtime/pto_constants.h | 19 + .../runtime/pto_dep_compute.h | 155 + .../runtime/pto_orchestrator.cpp | 972 ++++++ .../runtime/pto_orchestrator.h | 209 ++ .../runtime/pto_ring_buffer.cpp | 168 + .../runtime/pto_ring_buffer.h | 693 ++++ .../runtime/pto_runtime2.cpp | 287 ++ .../runtime/pto_runtime2.h | 290 ++ .../runtime/pto_runtime2_types.h | 524 +++ .../runtime/pto_shared_memory.h | 270 ++ .../runtime/pto_submit_types.h | 161 + .../runtime/pto_tensormap.h | 723 ++++ .../runtime/pto_types.h | 614 ++++ .../runtime/runtime.h | 356 ++ .../runtime/scheduler/pto_scheduler.cpp | 109 + .../runtime/scheduler/pto_scheduler.h | 1485 +++++++++ .../runtime/scheduler/scheduler_cold_path.cpp | 1093 ++++++ .../scheduler/scheduler_completion.cpp | 614 ++++ .../runtime/scheduler/scheduler_context.h | 423 +++ .../runtime/scheduler/scheduler_dispatch.cpp | 1501 +++++++++ .../runtime/scheduler/scheduler_types.h | 468 +++ .../runtime/shared/pto_runtime2_init.cpp | 466 +++ .../runtime/shared/pto_shared_memory.cpp | 268 ++ .../runtime/shared/pto_tensormap.cpp | 261 ++ .../runtime/shared/runtime.cpp | 169 + .../runtime/tensor_create_info.h | 147 + .../aicore/aicore_executor.cpp | 195 ++ 
.../aicpu/aicpu_executor.cpp | 848 +++++ .../build_config.py | 32 + .../common/intrinsic.h | 202 ++ .../common/pto_runtime_status.h | 52 + .../docs/MULTI_RING.md | 330 ++ .../docs/RUNTIME_LOGIC.md | 39 + .../docs/SCALAR_DATA_ACCESS.md | 137 + .../docs/SUBMIT_BY_CLUSTER.md | 222 ++ .../docs/device_log_profiling.md | 166 + .../docs/profiling_levels.md | 450 +++ .../host/dep_gen_replay.cpp | 784 +++++ .../host/dep_gen_replay.h | 106 + .../host/runtime_compile_info.cpp | 27 + .../host/runtime_maker.cpp | 691 ++++ .../orchestration/common.cpp | 197 ++ .../orchestration/pto_arg_with_deps.h | 138 + .../orchestration/pto_orchestration_api.h | 386 +++ .../runtime/aicore_completion_mailbox.h | 189 ++ .../runtime/aicore_completion_mailbox_types.h | 67 + .../backend/sdma/sdma_completion_kernel.h | 146 + .../backend/sdma/sdma_completion_scheduler.h | 69 + .../runtime/common.h | 39 + .../runtime/pto2_dispatch_payload.h | 93 + .../runtime/pto_async_kernel_api.h | 157 + .../runtime/pto_async_wait.h | 303 ++ .../runtime/pto_completion_token.h | 45 + .../runtime/pto_constants.h | 19 + .../runtime/pto_dep_compute.h | 155 + .../runtime/pto_orchestrator.cpp | 977 ++++++ .../runtime/pto_orchestrator.h | 206 ++ .../runtime/pto_ring_buffer.cpp | 168 + .../runtime/pto_ring_buffer.h | 694 ++++ .../runtime/pto_runtime2.cpp | 287 ++ .../runtime/pto_runtime2.h | 291 ++ .../runtime/pto_runtime2_types.h | 420 +++ .../runtime/pto_shared_memory.h | 270 ++ .../runtime/pto_submit_types.h | 161 + .../runtime/pto_tensormap.h | 723 ++++ .../runtime/pto_types.h | 602 ++++ .../runtime/runtime.h | 379 +++ .../runtime/scheduler/pto_scheduler.cpp | 109 + .../runtime/scheduler/pto_scheduler.h | 1267 +++++++ .../runtime/scheduler/scheduler_cold_path.cpp | 1096 ++++++ .../scheduler/scheduler_completion.cpp | 514 +++ .../runtime/scheduler/scheduler_context.h | 387 +++ .../runtime/scheduler/scheduler_dispatch.cpp | 1020 ++++++ .../runtime/scheduler/scheduler_types.h | 464 +++ 
.../runtime/shared/pto_runtime2_init.cpp | 457 +++ .../runtime/shared/pto_shared_memory.cpp | 268 ++ .../runtime/shared/pto_tensormap.cpp | 261 ++ .../runtime/shared/runtime.cpp | 174 + .../runtime/tensor_create_info.h | 147 + .../kernels/aic/kernel_matmul.cpp | 133 + .../kernels/aiv/kernel_add.cpp | 93 + .../orchestration/alternating_orch.cpp | 126 + .../test_alternating_matmul_add.py | 132 + .../kernels/aic/aic_pv_matmul.cpp | 137 + .../kernels/aic/aic_qk_matmul.cpp | 144 + .../kernels/aiv/aiv_online_update.cpp | 230 ++ .../kernels/aiv/aiv_softmax_prepare.cpp | 200 ++ .../orchestration/paged_attention_orch.cpp | 215 ++ .../test_batch_paged_attention.py | 213 ++ .../orchestration/chain_barrier_orch.cpp | 94 + .../dfx/dep_gen/test_dep_gen.py | 259 ++ .../dfx/dep_gen/test_dep_gen_chain.py | 215 ++ .../dfx/l2_swimlane/__init__.py | 8 + .../dfx/l2_swimlane/_swimlane_validate.py | 240 ++ .../orchestration/chained_mix_orch.cpp | 126 + .../dfx/l2_swimlane/test_l2_swimlane.py | 99 + .../dfx/l2_swimlane/test_l2_swimlane_mixed.py | 142 + .../dfx/pmu/test_pmu.py | 111 + .../dfx/scope_stats/test_scope_stats.py | 134 + .../orchestration/partial_dump_orch.cpp | 91 + .../dfx/tensor_dump/test_tensor_dump.py | 189 ++ .../kernels/aic/kernel_copy_first.cpp | 57 + .../kernels/aic/kernel_write_const.cpp | 51 + .../kernels/orchestration/dummy_task_orch.cpp | 146 + .../dummy_task/test_dummy_task.py | 120 + .../dynamic_register/test_dynamic_register.py | 445 +++ .../aic/kernel_write_const_visible.cpp | 55 + .../orchestration/fanin_lookup_perf_orch.cpp | 93 + .../test_fanin_lookup_perf.py | 87 + .../kernels/aic/kernel_matmul.cpp | 133 + .../mixed_example/kernels/aiv/kernel_add.cpp | 96 + .../kernels/aiv/kernel_add_standalone.cpp | 81 + .../mixed_example/kernels/aiv/kernel_mul.cpp | 97 + .../kernels/aiv/kernel_mul_standalone.cpp | 81 + .../kernels/orchestration/mixed_orch.cpp | 163 + .../mixed_example/test_mixed_example.py | 159 + .../test_multi_round_paged_attention.py | 155 + 
.../orch_so_cache/test_orch_so_cache.py | 120 + .../kernels/aic/aic_pv_matmul.cpp | 169 + .../kernels/aic/aic_qk_matmul.cpp | 157 + .../kernels/aiv/aiv_online_update.cpp | 256 ++ .../kernels/aiv/aiv_softmax_prepare.cpp | 293 ++ .../orchestration/paged_attention_orch.cpp | 379 +++ .../test_paged_attention_unroll.py | 132 + .../kernels/aic/aic_pv_matmul.cpp | 171 + .../kernels/aic/aic_qk_matmul.cpp | 135 + .../kernels/aiv/aiv_online_update.cpp | 262 ++ .../kernels/aiv/aiv_softmax_prepare.cpp | 277 ++ .../orchestration/paged_attention_orch.cpp | 186 ++ .../test_paged_attention_unroll_4dims.py | 144 + .../prepared_callable/conftest.py | 58 + .../test_prepared_callable.py | 302 ++ .../kernels/aic/kernel_spmd_read.cpp | 64 + .../kernels/aiv/kernel_spmd_read.cpp | 69 + .../kernels/orchestration/spmd_basic_orch.cpp | 56 + .../spmd_basic/test_spmd_basic.py | 98 + .../kernels/aic/kernel_write.cpp | 61 + .../kernels/aiv/kernel_write.cpp | 63 + .../spmd_batch_dispatch_oob_orch.cpp | 64 + .../test_spmd_batch_dispatch_oob.py | 82 + .../kernels/aiv/kernel_spmd_write.cpp | 65 + .../spmd_multiblock_aiv_orch.cpp | 71 + .../test_spmd_multiblock_aiv.py | 72 + .../kernels/aic/kernel_spmd_mix.cpp | 63 + .../kernels/aiv/kernel_spmd_mix.cpp | 66 + .../spmd_multiblock_mix_orch.cpp | 80 + .../test_spmd_multiblock_mix.py | 77 + .../kernels/mix/paged_attention_parallel.cpp | 851 +++++ .../spmd_paged_attention_orch.cpp | 159 + .../test_spmd_paged_attention.py | 129 + .../kernels/aic/paged_attention_highperf.cpp | 195 ++ .../kernels/kernel/pa_entry.cce | 172 + .../kernels/kernel/pa_kernel.cce | 2946 +++++++++++++++++ .../paged_attention_highperf_orch.cpp | 74 + .../kernels/pa_tiling.py | 484 +++ .../kernels/tiling/pa_tiling_struct.h | 97 + .../test_spmd_paged_attention_highperf.py | 377 +++ .../orchestration/spmd_starvation_orch.cpp | 95 + .../spmd_starvation/test_spmd_starvation.py | 108 + .../orchestration/spmd_sync_start_orch.cpp | 77 + .../spmd_sync_start/test_spmd_sync_start.py | 82 + 
.../spmd_sync_start_aiv_orch.cpp | 71 + .../test_spmd_sync_start_aiv.py | 66 + .../spmd_sync_start_edge_orch.cpp | 77 + .../test_spmd_sync_start_edge.py | 82 + .../spmd_sync_start_stress_orch.cpp | 104 + .../test_spmd_sync_start_stress.py | 119 + .../test_l3_dependency.py | 106 + .../test_l3_group.py | 120 + 274 files changed, 68537 insertions(+) create mode 100644 docs/fully_distributed_within_core.md create mode 100644 examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_consumer.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_notify_wait.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_producer_notify.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/async_notify_demo/test_async_notify_demo.py create mode 100644 examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/benchmark_bgemm/test_benchmark_bgemm.py create mode 100644 examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_consumer.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_notify_wait.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_producer.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/orchestration/deferred_notify_orch.cpp create mode 100644 
examples/a2a3/fully_distributed_within_core/deferred_notify_demo/test_deferred_notify_demo.py create mode 100644 examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aic/kernel_mm.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v0.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v1.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_sum.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/mix_coown/kernels/orchestration/mix_coown_orch.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/mix_coown/test_mix_coown.py create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention/kernels/orchestration/paged_attention_orch.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention/test_paged_attention.py create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_pv_matmul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_qk_matmul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_online_update.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 
examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/orchestration/paged_attention_orch.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/test_paged_attention.py create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_pv_matmul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_qk_matmul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_online_update.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/orchestration/paged_attention_orch.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/test_paged_attention_unroll.py create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/down_proj.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/gate_proj.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/kv_proj.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/out_proj.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/q_proj.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/qk_matmul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/sv_matmul.cpp create mode 100644 
examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/up_proj.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/attention_writeback.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_hidden.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_out.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/down_proj_residual.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/online_softmax.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/out_proj_residual.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/post_rmsnorm.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/q_pad.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/qk_norm.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rmsnorm.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rope_kv_cache.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/silu.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/softmax.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/orchestration/qwen3_decode.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/test_qwen3_14b_decode.py create mode 100644 examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_add.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_noop.cpp create mode 100644 
examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/scalar_data_test/test_scalar_data.py create mode 100644 examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_consumer.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_sdma_tget_async.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/orchestration/sdma_async_completion_orch.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/test_sdma_async_completion_demo.py create mode 100644 examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add_scalar.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_mul.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/vector_example/kernels/orchestration/example_orchestration.cpp create mode 100644 examples/a2a3/fully_distributed_within_core/vector_example/test_vector_example.py create mode 100644 src/a2a3/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/build_config.py create mode 100644 src/a2a3/runtime/fully_distributed_within_core/common/intrinsic.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/common/pto_runtime_status.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/docs/MULTI_RING.md create mode 100644 src/a2a3/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md create mode 100644 src/a2a3/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md create mode 
100644 src/a2a3/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md create mode 100644 src/a2a3/runtime/fully_distributed_within_core/docs/device_log_profiling.md create mode 100644 src/a2a3/runtime/fully_distributed_within_core/docs/profiling_levels.md create mode 100644 src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/host/runtime_maker.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/orchestration/common.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/common.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_wait.h create mode 100644 
src/a2a3/runtime/fully_distributed_within_core/runtime/pto_completion_token.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_constants.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_submit_types.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_tensormap.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/pto_types.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/runtime.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp create mode 
100644 src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp create mode 100644 src/a2a3/runtime/fully_distributed_within_core/runtime/tensor_create_info.h create mode 100644 src/a5/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/build_config.py create mode 100644 src/a5/runtime/fully_distributed_within_core/common/intrinsic.h create mode 100644 src/a5/runtime/fully_distributed_within_core/common/pto_runtime_status.h create mode 100644 src/a5/runtime/fully_distributed_within_core/docs/MULTI_RING.md create mode 100644 src/a5/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md create mode 100644 src/a5/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md create mode 100644 src/a5/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md create mode 100644 src/a5/runtime/fully_distributed_within_core/docs/device_log_profiling.md create mode 100644 src/a5/runtime/fully_distributed_within_core/docs/profiling_levels.md create mode 100644 src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.h create mode 100644 src/a5/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/host/runtime_maker.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/orchestration/common.cpp 
create mode 100644 src/a5/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h create mode 100644 src/a5/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/common.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_async_wait.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_completion_token.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_constants.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h create 
mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_submit_types.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_tensormap.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/pto_types.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/runtime.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp create mode 100644 src/a5/runtime/fully_distributed_within_core/runtime/tensor_create_info.h create mode 100644 tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aic/kernel_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aiv/kernel_add.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp create mode 100644 
tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/test_alternating_matmul_add.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_online_update.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/test_batch_paged_attention.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/kernels/orchestration/chain_barrier_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen_chain.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/__init__.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/_swimlane_validate.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/kernels/orchestration/chained_mix_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/test_l2_swimlane_mixed.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/pmu/test_pmu.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/scope_stats/test_scope_stats.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/kernels/orchestration/partial_dump_orch.cpp create mode 100644 
tests/st/a2a3/fully_distributed_within_core/dfx/tensor_dump/test_tensor_dump.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_copy_first.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/aic/kernel_write_const.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/dummy_task/kernels/orchestration/dummy_task_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/dummy_task/test_dummy_task.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/dynamic_register/test_dynamic_register.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/aic/kernel_write_const_visible.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/kernels/orchestration/fanin_lookup_perf_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/fanin_lookup_perf/test_fanin_lookup_perf.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aic/kernel_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_add_standalone.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/aiv/kernel_mul_standalone.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/mixed_example/kernels/orchestration/mixed_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/mixed_example/test_mixed_example.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/multi_round_paged_attention/test_multi_round_paged_attention.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/orch_so_cache/test_orch_so_cache.py create mode 100644 
tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_pv_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aic/aic_qk_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_online_update.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll/test_paged_attention_unroll.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_pv_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aic/aic_qk_matmul.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_online_update.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/aiv/aiv_softmax_prepare.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/kernels/orchestration/paged_attention_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/paged_attention_unroll_4dims/test_paged_attention_unroll_4dims.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/prepared_callable/conftest.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/prepared_callable/test_prepared_callable.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aic/kernel_spmd_read.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/aiv/kernel_spmd_read.cpp create mode 100644 
tests/st/a2a3/fully_distributed_within_core/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_basic/test_spmd_basic.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aic/kernel_write.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/aiv/kernel_write.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/kernels/orchestration/spmd_batch_dispatch_oob_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_batch_dispatch_oob/test_spmd_batch_dispatch_oob.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_multiblock_mix/test_spmd_multiblock_mix.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/mix/paged_attention_parallel.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/kernels/orchestration/spmd_paged_attention_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention/test_spmd_paged_attention.py create mode 100644 
tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/aic/paged_attention_highperf.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_entry.cce create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/kernel/pa_kernel.cce create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/orchestration/paged_attention_highperf_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/pa_tiling.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/kernels/tiling/pa_tiling_struct.h create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_paged_attention_highperf/test_spmd_paged_attention_highperf.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_starvation/test_spmd_starvation.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_sync_start/test_spmd_sync_start.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_edge/test_spmd_sync_start_edge.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp create mode 100644 
tests/st/a2a3/fully_distributed_within_core/spmd_sync_start_stress/test_spmd_sync_start_stress.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/test_l3_dependency.py create mode 100644 tests/st/a2a3/fully_distributed_within_core/test_l3_group.py diff --git a/docs/fully_distributed_within_core.md b/docs/fully_distributed_within_core.md new file mode 100644 index 000000000..759d9372a --- /dev/null +++ b/docs/fully_distributed_within_core.md @@ -0,0 +1,822 @@ +# AICore 上的全分布式 Runtime + +本文档定义 **simpler** 的一种运行模式:编排(orchestration)、调度(scheduling) +与执行(execution)全部以 SPMD 方式运行**在 AICore 自身**之上,**AICPU 完全不参与**。 +不存在独立的调度器:每个核自行构建、拥有并执行自己的任务。 + +这是一份自洽的设计。第一部分描述系统如何工作(核的行为 + 伪代码);第二部分列举 +各数据结构及其共享特性(全局共享 / 每核私有 / 每核复制)。 + +本设计所替代的、当前以 AICPU 为中心的模型,参见 +[chip-level-arch.md](chip-level-arch.md) 与 [scheduler.md](scheduler.md)。编排编写 +API(`rt_submit_aic_task` / `rt_submit_aiv_task`,`pto_orchestration_api.h`)参见 +`src/{arch}/runtime/` 下的 `tensormap_and_ringbuffer` runtime。 + +--- + +# 第一部分 — 系统设计 + +## 1. 概述 + +- 编排函数**被加载并同时运行在每一个参与的 AICore 上**(SPMD)。所有核执行完全相同 + 的编排程序。 +- 每个核同时是**编排器 + 调度器 + worker**。经典的“调度器↔worker”握手(任务门铃、 + ready 队列、完成邮箱、依赖连线线程)被**彻底取消**。 +- 面向编排的 API 保持不变。通用原语是 `rt_submit_task(MixedKernels, args)`; + `rt_submit_aic_task` / `rt_submit_aiv_task` 只是它的轻量便捷封装(**不存在** + `rt_submit_mixed_task`——MIX 任务就是一个填了多个 kernel 槽的 `MixedKernels`)。 + 在这些 API 背后,runtime 决定所有权、在本地构建任务,随后由同一个核执行它。 +- AICPU 不在编排与调度的关键路径上。 + +本设计建立在以下四个支柱之上(下文逐一展开): + +1. 任务所有权的**抢占竞争(claim race)**(§2)。 +2. **owner = builder = executor**,并配合核类型匹配(§3)。 +3. 用于依赖发现的**每核全量复制 TensorMap**(§4)。 +4. **每核私有任务环 + 一个全局完成标志环**,驱动一个采用拉取式依赖解析的 + run-ahead 执行循环(§5–§6)。 + +## 2. 
任务所有权 —— 抢占竞争(Claim Race) + +所有核走**完全相同**的、确定性的 submit 序列。任务身份就是它在该序列中的位置:第 N 次 +`rt_submit_*` 调用在每个核上都是**任务 id `N`**,与最终由谁执行无关。 + +所有权由以下两个量驱动: + +| 计数器 | 作用域 | 含义 | +| ------ | ------ | ---- | +| `claim_cursor[T]`(`cube_cursor`、`vector_cursor`) | **全局、原子** | 类型 `T` 已被认领任务 id 的高水位线。共**两个** cursor(cube = AIC-anchored,vector = AIV-only),二者都索引同一个共享 id 空间(§3.1) | +| `local_current_task_index` | **每核** | 本核走 submit 序列时当前到达的任务 id | + +每次 `rt_submit_*`,匹配 anchor 类型的核执行如下逻辑(设 `T` 为此任务类型——若 AIC-anchored +则为 cube,若 AIV-only 则为 vector): + +```text +local_current_task_index++ # 到达下一个 submit 点 = 任务 id N +if local_current_task_index > claim_cursor[T]: # 我是否领先于 T 的高水位线? + # 本核是 T 类型中走得最靠前的 → 它 WIN,拥有任务 N。 + claim_cursor[T] = local_current_task_index # 发布(原子) + own = true +else: + # 已有一个 T 类型的核更早认领了此 id(它跑在前面)。 + own = false +``` + +胜者是该任务 id 的唯一 owner。所有权决定的是*谁来构建与执行*;它**不会**改变任务 id—— +该 id 是处处使用的确定性 submit 序号(完成标志环的索引、以及每个核的 producer 引用)。 +对于多核任务,胜者是 *anchor*;与它配对的同 block 核共同拥有其余子任务(§3.1)。 + +为什么需要两个 cursor(以及为什么单一共享 cursor 是错的)在 §3.1 解释:两个 cursor 扫过 +同一 id 空间,各自只认领自己类型的 id,并**跨过**另一类型的 id,因此落后类型尚未认领的 +id 只是在等待它自己的 cursor —— 它们绝不会被跳过。 + +> 确切原子原语(`atomic_fetch_max`,无则 CAS 回路)与内存序在 §11.1 定为规范; +> 语义上每个任务 id 恰好有一个 anchor 胜出。 + +## 3. 
owner = builder = executor;核类型匹配 + +**抢到任务的提交者就是它的 owner。** owner 同时负责任务的**创建**(构建 +descriptor/payload、记录 fan-in producer id)与**执行**(调用 incore 函数)。一个核只会 +认领它自己能执行的类型的任务。 + +任务由 `MixedKernels` 描述,最多携带三个子任务槽: + +```cpp +struct MixedKernels { + int32_t aic_kernel_id { INVALID_KERNEL_ID }; // AIC 子任务 + int32_t aiv0_kernel_id { INVALID_KERNEL_ID }; // AIV 子任务 0 + int32_t aiv1_kernel_id { INVALID_KERNEL_ID }; // AIV 子任务 1 +}; +``` + +`active_mask` = 哪些槽有效,它恰好记录了一个 MIX 任务的 AIV 数量——**1C+1V** +(`aic` + `aiv0`)还是 **1C+2V**(`aic` + `aiv0` + `aiv1`)。这一区分对所有权很关键: +1C+1V 任务只绑定 AIV0_c,让 AIV1_c 保持空闲(§3.1)。因此任务是以下之一:AIC-only、 +AIV-only(1 个或 2 个 AIV 子任务)、或 **MIX**(AIC + 1 个或 2 个 AIV 子任务)。 + +| 任务形态 | 子任务槽 | owner | +| -------- | -------- | ----- | +| **AIC-only** | `aic` | 任意一个 AIC 核 | +| **AIV-only (1V)** | `aiv0` | **任意一个 AIV 核(AIV0 或 AIV1)** | +| **AIV-only (2V)** | `aiv0`、`aiv1` | 同一 block 的两个 AIV 核 | +| **MIX (1C+1V)** | `aic`、`aiv0` | 一个 AIC + 同 block 一个 AIV(共同 owner) | +| **MIX (1C+2V)** | `aic`、`aiv0`、`aiv1` | 一个 AIC + 同 block 两个 AIV(共同 owner) | + +单槽封装(`rt_submit_aic_task` → 填 `aic`,`rt_submit_aiv_task` → 填 `aiv0`)是常见路径; +多槽任务直接走 `rt_submit_task(MixedKernels, …)`。 + +**单核 vs 多核——竞争资格按“类型”而非“固定槽角色”。** 竞争一个任务的资格由任务**类型** +(cube / vector)决定,而非某个具体的 `aiv0`/`aiv1` 角色: + +- **单核任务(1C、1V)**:没有配对、没有 anchor/follower。任意一个**匹配类型**的核通过 §2 的 + claim race 认领,胜者独自构建并执行那唯一的子任务。特别地,**1V(AIV-only 单核)由所有 AIV 核 + 竞争——AIV0 与 AIV1 同等参与**;胜者执行 `aiv0_kernel_id`,与它在 block 中是 AIV0 还是 AIV1 + 无关(两者都是 vector 核,可执行任意 AIV kernel)。 +- **多核任务(2V、MIX)**:需要同一物理 block 的多个核共同拥有,走 §3.1 的固定配对(anchor 胜出 + 后把其余子任务推送给同 block 伙伴)。 + +换言之,`aiv0`/`aiv1` 的“固定角色”**只**在多核任务里用来把子任务映射到 block 内具体的核;对单核 +任务它不构成竞争限制。 + +### 3.1 通过固定物理配对实现多核任务的共同所有权 + +本节**只针对多核任务**(任意 MIX 任务,以及 2V 的 AIV-only 情况)——它们含多于一个有效子任务 +槽,必须被多个核同时拥有。单核任务(1C、1V)不走本节机制:由任意匹配类型的核(1V 即任意 AIV 核 +AIV0/AIV1)通过 §2 的 claim race 直接认领、独自执行,无 anchor/follower。本节规定多核任务的 +共同 owner 如何被选出、如何达成一致——这是模型中最难的部分。 + +**配对被 FIXED(固定)到硬件 block。** 核被组织成硬件 block(cluster);在本平台上一个 
+block = **1 AIC + 2 AIV**(AIV0、AIV1)。这个 block 是永久的共同所有权单位:AIC_c 与 +AIV0_c、AIV1_c 静态配对。不存在动态配对选举。子任务槽到 block 内角色是固定映射: + +| 子任务槽 | 由谁执行(block `c` 内) | +| -------- | ------------------------ | +| `aic_kernel_id` | AIC_c | +| `aiv0_kernel_id` | AIV0_c | +| `aiv1_kernel_id` | AIV1_c | + +**Anchor + 同 block 跟随规则。** 一个多核任务只被**认领一次**,由一个 *anchor* 核认领; +其 block 的其余核跟随: + +1. **谁竞争(anchor 类型)**:竞争按任务**类型** `T` 进行——含 AIC 子任务的任务(所有 MIX) + 是 **cube 类型,只有 AIC 核竞争**;纯 AIV 的 2V 是 **vector 类型,由所有 AIV 核(AIV0/AIV1) + 竞争**。胜出者即该任务的 **anchor**,它执行**自己物理角色**对应的那个槽(AIC 胜者执行 `aic`; + 2V 由某个 AIV 胜出则执行它自己角色的 `aiv0`/`aiv1`),其余激活槽推送给同 block 伙伴。 + **MIX 的 vector co-owner 绝不靠自己竞争得来**——它*完全*由“哪个 AIC 胜出”决定,即由胜者 + 所在的 block 决定(一个 AIV 核绝不会因为先到达就赢得某 MIX 的 vector 子任务)。 +2. 抢占竞争(§2)**仅在 anchor 类型之间**进行,竞争对象是 `cursor[T]`。胜出的 anchor 核 + 所在的 **block** 成为拥有该任务的 block。anchor 在胜出时**一次性解析整个任务的 fan-in** + producer id(从它在 `N` 处的 TensorMap 副本读取,各核内容相同——§4),把*自己*那个槽的 + 子任务构建进自己的私有环,并把该任务**其余激活槽**的子任务记录**推送(deposit)**进一张 + **以任务 id 为键的 block-local 投递表** —— `block.won[N]` —— 内容为 + `{active_mask = M, 各激活槽 kernel id, args, 已解析的 fan-in producer id, 剩余子任务计数 + = popcount(M)}`。 +3. 
同 block 的 follower 核**既不竞争、也不在自己的编排走位上对该任务做“等待 anchor 决定” + 的判断**——它**永不因 anchor 而阻塞**。follower 的所有权完全靠 anchor 的**推送**到达: + follower **异步地从 `block.won` 抽取(drain)**属于自己槽的子任务投递,在私有环有空槽时 + 把它构建进环。follower 在自己的编排走位中遇到 MIX 任务时,只做 §4 的无条件 TensorMap + 更新,然后继续前进,**不**对该 MIX 任务做任何所有权决定、**不**等待它的 anchor。 + +**为什么是 anchor 推送,而不是 follower 自己走位 + 等待。** 两个 cursor 独立推进(§2),所以 +cube 与 vector 的进度可能任意错位。若让 follower 在自己的走位上“走到 N 再判断我的 block 是否 +赢了 N”,当它的 anchor 落后(`cube_cursor < vector_cursor`)时,follower 就无法区分“anchor +还没决定 N”与“anchor 输了 N(别的 block 赢了)”,只能**阻塞等待** anchor 推进到 N——这会把 +vector 的吞吐死死耦合到 cube 的吞吐上,是不可接受的。**改为 anchor 推送即彻底消除这种 per-task +阻塞**: + +- **cube 落后时**:`block.won` 里还没有给这个 AIV 的 MIX 投递 → AIV **不等待**,继续竞争并执行 + 它自己的 AIV-only 任务(以及抽取已到的其他投递)。零停顿。 +- **cube 领先时**:投递在 `block.won` 中累积 → AIV 有空槽就抽取构建。若 AIV 落后到填满 + `block.won`,则 anchor **暂缓认领新的多核任务**(反压;见 §6 中 anchor 转去执行 Phase B 而 + 非自旋),方向正确:不让 cube 无限超前。 + +`block.won` 以任务 id 为键(而非单一会被覆盖的槽),既承载每任务的剩余子任务计数,也允许同一 +block 多个并发多核任务的投递互不串扰。由于配对是静态的,投递的目标 follower 由 anchor 所在 +block 唯一确定,无需任何跨 block 协商。 + +> 唯一残留的等待发生在**收尾**:若某 block 的 anchor 严重落后,它的 follower 在做完自己其余 +> 全部工作、私有环清空后,可能要在终止前空转,等 anchor 把最后的多核子任务推送过来(§7)。 +> 这是固定配对的固有代价——多核子任务的归属由 anchor 的认领决定;它不是 per-task 的串行阻塞, +> 而只是尾部的一次空转,且在 cube 密集(cube 领先)的常见场景下根本不出现。 + +**按形态的行为(设胜出 anchor 在 block `c`):** + +| 任务形态(`active_mask`) | 谁竞争 | Anchor(胜者) | 被推送子任务的 follower | 同 block 未被绑定(保持空闲) | +| ------------------------- | ------ | -------------- | ----------------------- | ----------------------------- | +| **1C + 2V**(多核) | 所有 AIC | AIC_c | AIV0_c、AIV1_c | — | +| **1C + 1V**(多核) | 所有 AIC | AIC_c | AIV0_c | **AIV1_c** | +| **2V**(多核,AIV-only) | 所有 AIV(AIV0/AIV1) | 胜出的那个 AIV_c | 同 block 的另一个 AIV_c | AIC_c | +| **1C**(单核,AIC-only) | 所有 AIC | 胜者独自执行,无配对 | — | (不涉及 block 配对) | +| **1V**(单核,AIV-only) | **所有 AIV(AIV0/AIV1)** | 胜者独自执行,无配对 | — | (不涉及 block 配对) | + +多核任务(前三行)的 follower 身份都由 anchor 所在 block 唯一确定——不存在跨 block 协商。单核 +任务(后两行)没有 anchor/follower,胜者是哪个核就由哪个核独自执行;**1V 由 AIV0 与 
AIV1 同等 +竞争**。 + +**未被绑定的 block 伙伴不是闲着——它对其他任务保持空闲可用。** 当一个 block 赢得一个不激活 +某 block 伙伴槽位的任务时,那个核就**不被该任务占用**,且**绝不能**因它而阻塞或等待。它继续 +运行自己的编排,继续竞争并拥有其类型的其他任务。具体地: + +- 一个 **1C+1V** 任务只绑定 AIC_c + AIV0_c。**AIV1_c 是空闲的**,可继续竞争、认领并执行其他 + AIV 任务(它自己竞争到的任意 1V/2V AIV-only 任务,或本 block 后续某个 1C+2V 任务的 AIV1 槽)。 +- 一个 **1C(AIC-only)** 任务只绑定一个 AIC 核;AIV 核**都**对 AIV 工作保持空闲。 +- 一个 **1V(AIV-only)** 任务是单核:由**任意一个 AIV 核(AIV0 或 AIV1)**竞争得到并独自执行, + 其余 AIV 核与 AIC 核保持空闲。它不绑定任何固定角色。 + +这是模型的自然结论:每个核都走相同的确定性 submit 序列,并逐任务判断自己的槽是否激活。在某个 +自己的槽未激活的 submit 点,该核就是不绑定该任务(但它仍执行 §4 的无条件 TensorMap 更新), +然后继续——去认领它下一个有资格的任务。每个任务记录的 `active_mask`(1C+1V vs 1C+2V 等) +就是告诉每个 block 伙伴自己是被绑定还是空闲的依据。 + +**多核任务只有一个完成标志。** 即使有多个共同 owner,一个任务也恰好只有一个全局 +`task_completed_flag[N]`。每个共同 owner 执行自己的子任务后,递减 `block.won[N]` 中那个用 +`popcount(active_mask)` 初始化的**per-task 剩余计数器**。(该计数器存在以 id 为键的记录里, +而非单一 block 字段,因此同一 block 的多个并发 MIX 任务不会互相串扰。)把计数器递减到零的那个 +共同 owner(最后完成的子任务)执行唯一一次全局写 `task_completed_flag[N] = true`。因此无论 +任务有多少个子任务,消费者都只看到一个原子的完成信号。每个共同 owner 在自己的子任务完成后 +立即释放自己的私有环槽位。 + +**Claim 流一致性 —— 同一任务 id 空间上的两个全局 cursor。** + +只有**一个**任务 id 空间——确定性 submit 序列(第 N 次 submit = id `N`),处处用于完成标志 +环与 producer 引用。 + +所有权由**两个全局 claim cursor** 决定,二者都由所有核共享,且都索引进*同一个* id 空间: + +- `cube_cursor` —— 已认领的 **cube(AIC-anchored)** 任务 id 的高水位线(AIC-only 与所有 + MIX 任务)。 +- `vector_cursor` —— 已认领的 **vector(AIV-only)** 任务 id 的高水位线。 + +一个到达类型 `T` 的任务 `N` 的核,当且仅当 `N > cursor[T]` 时赢得它;赢得后把 `cursor[T]` +推进到 `N`。一个核只会推进它自己类型的 cursor;它**跨过**另一类型的 id 而不去碰它。 + +两个 cursor 在共享 id 空间上**独立**推进,因此任意时刻其中一个可能领先于另一个。**推进一个 +cursor 不会认领它跨过的另一类型的 id。** 因此在领先 cursor 与落后 cursor 之间的 id 区间里 +可能存在**尚未认领的空洞**——这些是*落后*类型的、还没有任何核到达的 id。这是正确的,不是 bug: +一个空洞只表示“暂时还没认领”;当一个该类型的核到达它时,落后类型的 cursor 会把它填上。 + +```text +任务 id: 0 1 2 3 4 5 6 +类型: C V C C V V C + ^cube_cursor=3 (cube 任务 0,2,3 已认领) + ^vector_cursor=1 (vector 任务 1 已认领) +空洞: id 4 和 5 是位于 cube_cursor 之下的 vector 任务——仍 UNCLAIMED, + 等待 vector_cursor 推进到它们。没有 orphaning。 +``` + +在单一类型内部不存在空洞:每个核按 id 递增顺序遇到该类型的任务,而 
cursor(一个单调高水位线) +总是被设为刚刚认领的那个 id——因此该类型中所有 ≤ 其 cursor 的 id 都已被某个核拥有。(计数器的 +确切表示属于实现细节——§11。) + +**取舍。** 固定配对消除了一切跨 block 协商,并把唯一的共享协调状态保持在 **block-local** +(1 AIC + 2 AIV 共享一小块区域),而非全局 per-task。代价是多核任务没有跨 block 的负载均衡; +动态配对方案是未来的改进(§11)。 + +### 3.2 为什么 vector 不竞争 MIX(以及“不会缺失 co-owner”的论证) + +> 这一节直接回答一个常见疑问:既然 vector 不参与 MIX 的竞争,会不会出现“cube 认领了某个 MIX +> 任务,却没有任何 vector 核作为它的 co-owner”?答案是**不会**。并解释为什么不采用“让 vector +> 也竞争 MIX”或“先到先得、由后到的同 block cube 反向认领”的替代方案。 + +**结论一:vector 核不参与 MIX 的竞争。** MIX 永远 cube-anchored(§3.1)。vector 核遇到一个 +MIX 任务时走的是 follower 路径:它**不**碰 `vector_cursor`,也**不**在自己的走位上判断本 block 的 AIC 是否赢了——只做 §4 的无条件 TensorMap 更新就前进; +归属完全靠同 block 的 AIC 胜出后把子任务推送进 `block.won[N]`、再由它异步抽取而到达(§3.1)。它“先到达” MIX 任务这件事不授予它任何东西。 + +**结论二:永远不会缺失 vector co-owner。** 原因有三条,缺一不可: + +1. MIX 任务是 cube 任务,**只**会推进 `cube_cursor`。`vector_cursor` 永远不认领 MIX 任务—— + 即便 `vector_cursor` 追上甚至越过 `cube_cursor`,它也只是在认领它路过的 *AIV-only* 任务, + 绝不会“占用”任何 MIX 任务。所以不存在“被 vector_cursor 抢走却没有 vector 执行者”的 MIX 任务。 +2. 当某个 AIC 核 `AIC_x` 赢得 MIX 任务 `N` 时,它的 vector co-owner 由**固定物理配对**确定: + 就是同 block 的 `AIV0_x`(若 1C+2V 还有 `AIV1_x`)。这个身份在胜负确定的瞬间就被钉死, + 不需要任何额外竞争或选举。 +3.
当 `AIC_x` 赢得 `N` 时,它把 `AIV0_x`(及 1C+2V 的 `AIV1_x`)的子任务**推送**进 + `block.won[N]`(§3.1);`AIV0_x` 异步抽取并执行。**co-owner 的存在是被保证的。** + +**那么 `vector_cursor` 追上 `cube_cursor` 时究竟会发生什么?会不会变成 blocking wait?** +不会。注意 MIX 归属靠 **anchor 推送**而非 follower 走位判断(§3.1),所以: + +- **cube 落后(`cube_cursor < vector_cursor`)时**:AIC 还没认领 `N`,因此 `block.won` 里还没有 + 给 AIV 的投递。AIV **不阻塞、不空等**——它继续竞争并执行自己的 AIV-only 任务,同时抽取已到的 + 其他投递。它在自己的走位上遇到 MIX 任务时只做 TensorMap 更新就走,**不**对该任务做归属判断、 + **不**等待它的 cube 伙伴。 +- 等 AIC 日后认领到 `N`,投递才出现在 `block.won`,AIV 再抽取执行。 + +换言之,不存在“AIV 走到 MIX 任务就 blocking wait 到 cube 追上来”的情况——这正是把旧设计的 +`wait_until(block.anchor_progress >= N)` 去掉、改为推送的原因。唯一残留的等待是**尾部空转** +(§3.1、§7):若某 block 的 AIC 严重落后,AIV 做完其余全部工作后会在终止前等 AIC 推送最后的 +多核子任务。这不是 per-task 串行阻塞,且 cube 领先(常见)时根本不出现。 + +**为什么不让 vector 也竞争 MIX(方案 A)。** 因为 MIX 的 AIC 与 AIV 子任务必须在**同一物理 +block 内协同执行**(共享 local memory / 相互配合,这正是固定配对的意义),所以所有权的单位 +是 **block**,不是单个核。若允许 vector 核也去 anchor 一个 MIX 任务,会立刻破坏 §2 的 cursor +不变式: + +- 若让 vector 核去推进 `cube_cursor` 来认领 MIX,它就会把位于旧 `cube_cursor` 与 `N` 之间的 + 那些 **cube-only 任务 orphan 掉**(跳过且无人认领)——这正是双 cursor 设计要避免的问题。 +- 若让 vector 核在 `vector_cursor` 上 anchor MIX,而某个 cube 核同时在 `cube_cursor` 上 anchor + 同一个 MIX `N`,那么同一任务会被两个 cursor 各认领一次 → **两个不同的 block 都认为自己拥有 + `N`**(跨 block 撕裂 / 双重认领)。错误。 + +因此结论是:**每一类任务必须只有一个 anchor 类**。MIX 选 cube 作为唯一的 anchor 类,保证 +claim 是单写者、无 orphan、无跨 block 双重认领。 + +**为什么“先到先得 + 后到的 cube 反向认领”(方案 B)也不采用。** 这个想法只能作为 **block +内部**的“探测优化”(block 内谁先到达 `N` 谁就代表本 block 发布认领),而**不能**跨 block—— +跨 block 的正确性仍然要求一条单一的 claim 流,且该流必须是 cube 的(否则就 orphan 掉 cube-only +任务,同方案 A)。也就是说,即便 block 内允许 vector 先“代发布”,真正权威的 anchor 流仍是 cube +的 `cube_cursor`。其收益只是偶尔省去 follower 的一次等待,却显著增加了 block 内两条 cursor +交叉认领的复杂度与正确性论证负担。因此当前**不采用**,仅在 §11 作为未来可选优化列出。 + +> 一句话总结:vector 不竞争 MIX 是**有意为之**的正确选择。co-owner 由固定配对保证存在;让 +> vector 参与只会重新引入 orphan 或跨 block 双重认领。需要权衡的不是“会不会缺 co-owner”,而是 +> cube 落后时 follower 的等待——这属于负载均衡/性能问题,留待动态配对方案(§11)解决。 + +## 4. 
依赖发现 —— 每核全量复制 TensorMap + +依赖与今天完全一样,从 tensor 的读/写重叠推导,途径是一个把 tensor 区域映射到其 +**producer 任务 id** 的 **TensorMap**。本 runtime 的决定是: + +> **TensorMap 是每核全量 DUPLICATE(复制)—— 每个核持有一份完整、相同的副本。它绝不被 +> 分区,也绝不做成私有/部分。** + +**为什么部分 map 是错的。** producer 条目只在处理某任务的 `OUTPUT`/`INOUT` tensor 时创建。 +若一个核只为它*拥有*的任务插入,它的 map 就会缺失所有由别的核拥有的任务产出的 tensor;本核 +上的某个消费者去查这样一个 tensor 会查不到——依赖发现会悄无声息地失效。 + +**所要求的 submit 行为(胜者 AND 败者都做)。** 为保持副本完整,submit 路径被拆分:TensorMap +维护是**无条件**的,只有 build+execute 才受所有权门控。每次 `rt_submit_*`,*每个*核都做: + +1. **查**每个 `INPUT` / `INOUT` tensor → 解析出本任务的 fan-in producer 任务 id。 +2. **插**每个 `OUTPUT` **以及 `INOUT`** tensor → 以**本任务 id**作为 producer 登记。`INOUT` + 两侧都算——它消费旧版本(第 1 步)并产出新版本(第 2 步)。 + +**胜者**额外构建并执行该任务;**败者**在 TensorMap 更新后停止并前进。 + +因为 submit 流与任务 id 在各核之间是确定且相同的,每个核重建出**相同**的 TensorMap。各核仅在 +**进度**上不同:跑得更靠前的核有更多条目,但每个条目都与其他核在同一逻辑位置产出的一致—— +**内容相同,进度不同**。 + +**取舍。** 每个核都要付出完整的 TensorMap 插入/查询开销与内存,即使是它永远不会执行的任务。 +作为回报,解析 producer **零跨核通信**:消费者的 fan-in producer id 在本地副本里就能拿到,在 +构建时存入任务的私有环槽位,执行时再对全局完成标志环轮询。 + +## 5. 
任务存储 —— 私有环 + 全局完成标志 + +AICPU 模型的全局任务环被移除。两个结构替代它们: + +- **每核私有任务环** —— 每个核拥有一个小环,存放它已认领的任务,保存每个任务的 + descriptor + payload + 本地状态(kernel id、args、fan-in producer id)。其他核都不读它; + 无锁。容量: + + ```cpp + #define PRIVATE_TASK_SLOT_NUM 8 + ``` + +- **全局 `task_completed_flag` 环** —— *唯一*全局共享的 per-task 状态:每个任务 id 一个 + 一次性置位的布尔,标记完成。各核轮询它以检查某个 fan-in producer 是否已完成。 + +这使依赖解析成为**拉取(pull)**模型(消费者轮询 producer 标志),而非**推送(push)**模型 +(producer 遍历 fanout 列表)。**没有 fanout 列表、没有 fanin/fanout 引用计数、没有依赖列表 +池、也没有完成邮箱。** + +### 5.1 私有任务环与 `block.won` 是两个分开的 ring + +私有任务环与 `block.won`(§3.1、§8.1)**是两个独立的结构,职责不同,不可混为一谈**: + +| | **私有任务环** | **`block.won[N]`** | +| ---- | ---- | ---- | +| 归属 | **每核私有**(每个 worker 各一个) | **block-共享**(1 AIC + 2 AIV 共一份) | +| 作用 | **执行队列**:存放本核已拥有、要*亲自执行*的(子)任务 | anchor → follower 的**投递/交接箱**:暂存多核任务中 anchor 没亲自构建的其余激活槽子任务 | +| 谁读写 | 仅本核读写,单一 owner、无锁 | anchor 插入(release)、follower 抽取(acquire)、`remaining` 原子递减 | +| 谁会用到 | 所有任务(含单核 1C/1V) | **仅多核任务(2V / MIX)**;单核任务根本不碰它 | +| 容量含义 | 默认 8:封顶“单核可超前多少” | 默认 8:封顶“anchor 相对 follower 可超前多少”,满则触发反压(§11.2) | + +**真正的执行永远只发生在各核自己的私有任务环里。** `block.won` 不是执行环,只是把多核子任务从 +anchor **搬运**到 follower 私有环的中转站。两者如何配合: + +``` +anchor 赢下多核任务 N: + ├─ 自己物理角色那一槽 ──→ 写进【anchor 自己的私有任务环】(亲自执行) + └─ 其余激活槽 ──→ 写进【block.won[N]】(投递给伙伴) + +follower 异步抽取: + 从【block.won[N]】取出属于自己槽的项 ──→ 写进【follower 自己的私有任务环】(再亲自执行) + +子任务一旦进入某核私有环,其执行、置完成标志、block.won[N].remaining 递减都照常进行; +remaining 归零时释放该 block.won 条目。 +``` + +单核任务(1C / 1V)的胜者直接把唯一子任务写进自己的私有环执行,**没有配对、没有投递、不写 +`block.won`**。 + +## 6. 
核执行循环(Run-Ahead) + +每个核运行下面的循环。编排**向前跑(run ahead)**,认领并构建(子)任务,直到私有环填满, +然后通过执行就绪任务来腾空。填满环就是反压信号。该循环从单个物理核 `self` 的视角写出,它在 +所在 block 中的角色是 `{AIC, AIV0, AIV1}` 之一。竞争按**任务类型**进行(vector 任务由 AIV0/AIV1 +同等竞争);单核任务胜者独自执行,多核任务胜者作 anchor 并把其余子任务推送给同 block 伙伴 +(§3、§3.1)。 + +```text +# 全局(所有核共享),一个共享任务 id 空间(§2、§3.1): +# cube_cursor : 已认领的 AIC-anchored 任务 id 高水位线 +# vector_cursor : 已认领的 AIV-only 任务 id 高水位线 +# 每核: +# self.role ∈ {AIC, AIV0, AIV1} +# my_type(self) = cube (若 self 是 AIC) / vector (若 self 是 AIV0 或 AIV1) +# local_current_task_index : 本核已到达的任务 id + +loop: + # --- 阶段 A:在编排中向前跑 --- + while 私有环有空槽 AND 编排未结束: + 推进编排到下一个 submit 点 # 任务 id N + local_current_task_index = N + M = task.active_mask # 记录 1C+1V vs 1C+2V 等 + + # (1) TensorMap 维护是无条件的(胜者、败者、follower 都做)—— §4: + # - 查 INPUT/INOUT tensor → fan-in producer 任务 id + # - 插 OUTPUT + INOUT tensor → 以本任务 id 作为 producer + update_tensormap(task) + + # (2) 确定本任务的类型与 cursor(§2、§3): + # 竞争资格按“类型”,不按固定槽角色:cube 任务由 AIC 竞争; + # vector 任务由所有 AIV 核(AIV0 与 AIV1)竞争。 + T = (cube if M.has(aic) else vector) # 有 AIC → cube;否则 vector(含 1V 与 2V) + cursor[T] = (cube_cursor if T==cube else vector_cursor) + + if my_type(self) == T: + # 我是该类型的合格竞争者(vector 任务时 AIV0/AIV1 都在此参与)。 + if popcount(M) > 1 AND block.won 已满: # 多核反压:先于认领检查(§11.3) + break # 让位给有空闲的 block / 稍后重试;退出 Phase A 去 Phase B + # 单原子推进:返回旧值;旧值 < N 即我赢。恰一胜者且无跳过见 §11.1。 + old = atomic_fetch_max(cursor[T], N) # N = local_current_task_index + if old < N: # WIN:我是 owner/anchor + fanin_ids = resolve_fanin(task) # 一次性解析整任务 fan-in(本地 TensorMap) + if popcount(M) == 1: + # 单核(1C 或 1V):我独自执行那唯一的子任务, + # 与我是 AIV0 还是 AIV1 无关,无配对、无推送。 + 把该唯一子任务构建进一个空闲私有环槽 + else: + # 多核(2V / MIX):我是 anchor。构建我自己物理角色对应的槽, + # 把其余激活槽推送给同 block 伙伴(以 id 为键,互不串扰)。§3.1 + 把我自己角色的槽对应的子任务构建进一个空闲私有环槽 + block.won[N] = { active_mask:M, kernels, args, fanin_ids, + remaining: popcount(M) } # block-shared(§3.1) + # else(old >= N):已有一个 T 类型的核认领了 N(它跑在前面)→ 跳过 + # else: 类型不匹配(例如 AIC 核遇到 1V 任务)→ 只做了 TensorMap,跳过 + + # --- 抽取 anchor 
推送给我的多核子任务(异步,非阻塞)--- + # 同 block 的 anchor 胜出某多核任务后,会把它没亲自构建的其余激活槽放进 block.won。 + # 本核按自己的物理角色(AIV0→aiv0 / AIV1→aiv1 …)抽取属于自己的那个槽。 + while 私有环有空槽 AND block.won 有“我角色对应槽”尚未被本核构建的待处理项: + 从 block.won 取出该子任务,构建进一个空闲私有环槽 # fan-in 已由 anchor 解析好 + # 注:取空就停;没有投递时不等待,本核继续做自己的活 + + # --- 阶段 B:从私有环中腾空就绪的(子)任务 --- + freed = 0 + for each 私有环中已占用的槽: + if 所有 fan-in producer 的 task_completed_flag == true: # 依赖已满足(pull) + execute(slot) # 调用我的 incore 函数 + # 完成:多核任务只有一个全局标志,由其共同 owner 中 + # 最后完成的子任务置位(§3.1)。 + if slot.is_multicore: + if atomic_dec(block.won[slot.task_id].remaining) == 0: + task_completed_flag[slot.task_id] = true # 最后一个子任务胜出 + free block.won[slot.task_id] # 回收以 id 为键的记录 + else: + task_completed_flag[slot.task_id] = true # 单核:直接置位 + free(slot) # 释放我自己的槽;无 fanout 计数 + freed++ + + if 编排已结束 AND 私有环为空 AND(若本核是 follower)§7 的收尾条件已满足: + break # 本核完成;须先于 freed == 0 判断,否则从未拥有任务的核会永远 continue + + if freed == 0: + # 环已满且无就绪任务:持续重扫,直到某个 producer(在别的核上) + # 置位了我们正在等待的标志,然后执行它。 + continue # 重扫阶段 B + # 至少腾出一个槽 → 回到阶段 A 去认领更多任务 +``` + +性质: + +- **MIX = anchor 推送 + follower 异步抽取(§3.1)。** AIC 核为 MIX 任务 anchor,胜出后把其余 + 激活槽的子任务推送进以 id 为键的 block 投递表 `block.won[N]`;block 的 AIV 核绝不为它竞争、 + **也绝不阻塞等待**——它只异步从 `block.won` 抽取属于自己槽的投递并构建。cube 落后时 AIV 没有 + 待抽取的投递,便继续做自己的 AIV-only 工作(零停顿);cube 领先时投递累积、AIV 有空槽就抽取, + 若 AIV 落后到填满 `block.won`,anchor 暂缓认领新多核任务(反压,转去 Phase B)。槽未激活的 + block 伙伴(例如 **1C+1V 上的 AIV1**)从不收到投递,照常去认领其他工作。 +- **每任务一个标志,由最后一个子任务置位。** 单核任务直接置 `task_completed_flag`;多核任务 + 递减一个 block-local 计数器(= `popcount(active_mask)`),由最后完成的子任务置位。消费者 + 始终看到一个原子完成信号。 +- **反压** = 私有环填满(`PRIVATE_TASK_SLOT_NUM` 个槽)。 +- **即时回收槽**:每个共同 owner 在*自己*的子任务完成时释放*自己*的槽。没有全局环尾推进, + 没有跨核的槽复位协调,因为环是私有的。 +- **前向进展**:环满且无就绪任务时重扫(自旋),直到另一个核的完成标志解锁某个任务;一旦腾出 + 一个槽,该核就回到编排去竞争新任务。 + +## 7.
终止 + +一个核在其编排不再产生任务**且**私有环为空(所有拥有的任务都已执行)时结束。对 follower +(AIV)还有一条额外条件:它必须等到**其 block 的 anchor 编排也结束**且 `block.won` 中再无 +针对它的待抽取投递——否则可能有尚未推送的多核子任务漏执行。这就是 §3.1 提到的**尾部空转**: +当某 block 的 anchor 严重落后时,它的 follower 做完自身其余全部工作后,会在终止前空转等待 +anchor 推送最后的多核子任务。这不是 per-task 串行阻塞,只发生在收尾,且 cube 领先时不出现。 + +所有核都结束时达到全局完成;最终的图输出位置被发布以供 host 拷回(见 §8 的 +`graph_output_ptr`)。一个全局“所有核完成”屏障替代了旧的单一 `orchestrator_done` 标志。 + +--- + +# 第二部分 — 数据结构与共享特性 + +## 8. 共享模型 + +每个结构被归为以下之一: + +| 类别 | 含义 | +| ---- | ---- | +| **全局共享** | 唯一权威实例;多个核读/写;需要显式访问机制 | +| **block-共享** | 仅在一个固定 block(1 AIC + 2 AIV)的核之间共享;用于 MIX 共同所有权(§3.1) | +| **每核私有** | 由单个核拥有;无跨核可见性 | +| **每核复制** | 每核复制一份;内容相同、各自独立重建(或只读副本) | + +### 8.1 新引入的结构 + +| 结构 | 类别 | 作用 | 访问机制 | +| ---- | ---- | ---- | -------- | +| `cursor[T]`:`cube_cursor` / `vector_cursor` | **全局共享** | 每个类型的 claim 高水位线;到达 `N` 时 `old < N` 即胜出并拥有该任务(§2、§3.1) | 单条 `atomic_fetch_max(cursor[T], N)`(无则 CAS 回路),acq-rel;无跳过性证明见 §11.1 | +| `task_completed_flag` 连续完成前沿 `F` / 回收前沿 `R` | **全局共享** | `F` = 全已完成前缀;`R = F − H` 决定堆/标志环回收(§9.5、§11.3、§11.4) | `F` 协作式 CAS 推进;`R` 派生;单调 | +| `local_current_task_index` | **每核私有** | 编排进度游标;每次 submit `++` | 普通标量 | +| **私有任务环**(`PRIVATE_TASK_SLOT_NUM = 8`) | **每核私有** | 保存已拥有的(子)任务:descriptor + payload + 本地状态 + fan-in producer id | 无(单一 owner,无锁) | +| `task_completed_flag` 环 | **全局共享** | 每任务 id 一个一次性置位布尔;唯一共享的 per-task 状态 | 最后一个(子)任务 owner 做 release 存储;消费者做 acquire 加载(轮询) | +| **`block.won[N]` —— 以 id 为键的子任务投递表** | **block-共享** | anchor → follower 的**推送**通道,以任务 id 为键:`{active_mask M, 各激活槽 kernels/args, 已解析 fan-in, 剩余计数}`。anchor 胜出时把其余激活槽子任务投递进来;follower **异步抽取**属于自己槽的项(不阻塞、不按走位等待)。承载每任务剩余计数,互不串扰(§3.1)。填满时 anchor 暂缓认领新多核任务(反压) | anchor 插入(release);follower 抽取(acquire);`remaining` 原子递减;最后一个子任务完成时释放条目 | + +### 8.2 TensorMap + +| 结构 | 类别 | 作用 | 访问机制 | +| ---- | ---- | ---- | -------- | +| `PTO2TensorMap` / `PTO2TensorMapEntry` | **每核复制(全量)** | tensor 区域 → producer 任务 id;在每个核上相同地构建(§4) | 无跨核锁;通过重放确定性 submit 流重建。有效性由 `task_completed_flag` 环开窗 
| + +### 8.3 全局共享,超出 per-task 状态之外 + +| 结构 | 类别 | 作用 | 访问机制 | +| ---- | ---- | ---- | -------- | +| GM 输出堆(打包的输出缓冲) | **全局共享(物理)** | 任务输出/中间结果的后备存储,可被任意核作为下游输入读取 | 一块全局物理区域;分配记账(堆顶、scope arena 基址)是**每核复制、确定性**的(§9),写入由 owner 完成。完整策略见 §9 | +| `heap_top` / scope arena 基址栈 | **每核复制(确定性,非全局)** | 在确定性 submit 重放中无条件推进,使任务 N 的输出地址成为 id 的纯函数(§9) | 无原子、无跨核通信;与 TensorMap 同理(§4) | +| `heap_reclaim_frontier`(全局回收水位线) | **全局共享** | 全局最旧“仍可能被读”的任务 id;据此在 id 顺序上回收堆(§9) | 由完成标志环 + 各核进度最小值推导;单调 | +| `func_id_to_addr_`(kernel id → GM 地址) | **全局共享,只读** | 把 `kernel_id` 解析为要调用的 incore 函数 | init 时一次性设置,之后只读 | +| `graph_output_ptr` / `graph_output_size` | **全局共享** | 供 host 拷回的最终输出位置 | 产出核做原子发布 | +| 全局错误字(原 `orch_error_code`) | **全局共享** | 任意核的致命错误 → 所有核 + host | 原子;首个写者胜出 | +| “所有核完成”屏障(原 `orchestrator_done`) | **全局共享** | 全局终止检测(§7) | 原子计数器 / 屏障 | + +### 8.4 每核私有的编排状态 + +| 结构 | 类别 | 作用 | 访问机制 | +| ---- | ---- | ---- | -------- | +| Scope 栈(`scope_stack_top` + 各层 arena 基址) | **每核复制(确定性)** | `PTO2_SCOPE` 生命周期跟踪;同时界定 GM 输出堆的 arena 栈(§9)。各核结构相同、进度不同 | 无锁;由确定性重放重建。注意:原 `scope_tasks[]`/`scope_begins[]` 用于 fanout 引用记账,新模型已不需要(§9、§10) | +| Fan-in producer-id 列表(每个环槽一份) | **每核私有** | 构建时解析出的 producer 任务 id,执行时轮询 | 无 | +| 本地致命标志 | **每核私有** | 快路径致命错误;升级到全局错误字 | 本地标志 + 原子发布 | +| 核数常量(`total_cluster_count`、`total_aiv_count`) | **每核复制(只读)** | 资格 / 合理性检查 | init 时一次性设置 | + +## 9. 
动态内存管理(全局输出堆) + +任务的输出/中间缓冲分配在一块 GM 堆上。由于**一个核产出的 output 可能被另一个核作为输入读取**, +这块堆必须是**全局可寻址**的。本节给出分布式 runtime 下的内存管理策略与数据结构,并说明它相对 +当前 AICPU 模型的“stack of ring + scope”实现需要如何更新。 + +### 9.1 当前(AICPU 集中式)模型回顾 + +- **统一分配器 `PTO2TaskAllocator`**:把**任务槽环**与**堆环(heap ring)**合并分配。单一 + orchestrator 单线程推进,用普通 store 写 `heap_top`(bump),无需 CAS。 +- **回收**:调度器把“最旧已 CONSUMED 任务”推进 `last_task_alive`;分配器据该任务的 + `packed_buffer_end` 反推 `heap_tail`,环形回收(分配从 `top` bump,到尾部则在 `tail` 足够时 + 绕回,缓冲不跨越绕回边界)。 +- **stack of ring**:按 scope 深度复制成 `PTO2_MAX_RING_DEPTH`(=4) 套 {TaskRing, HeapRing, + DepPool},使内层 scope 可独立于外层回收。 +- **scope(`PTO2_SCOPE`)**:用 `scope_tasks[]`/`scope_begins[]` 记录本 scope 的任务;每个任务 + 持有一个 +1 的 fanout 引用,`scope_end` 才释放——从而保证输出缓冲的生命周期 =(真实消费者 + 全部完成)**且**(scope_end)。`TaskOutputTensors` 的引用只在其 `PTO2_SCOPE` 内有效。 + +### 9.2 哪些前提失效、需要更新 + +新模型(§2–§7)取消了集中 orchestrator 与 scheduler,因此上面多数机制的前提不再成立: + +| 旧机制 | 在新模型中的处置 | +| ------ | ---------------- | +| 单 orchestrator 普通-store bump | **失效**:现在每个核都为自己拥有的任务分配输出。多写者下 `heap_top` 不能再用普通 store。 | +| `last_task_alive`/CONSUMED 驱动回收 | **失效**:无 scheduler、无 CONSUMED 状态。回收改由全局完成前沿(§9.5)驱动。 | +| 每 scope 深度的 TaskRing / DepPool / FaninPool | **移除**(§10):任务槽改为每核私有环(§5),无依赖列表。 | +| fanout 引用 + scope_end 释放 | **失效**:无 fanout/refcount。生命周期改由“窗口/前沿 + scope arena 折叠”界定(§9.4、§9.5)。 | +| “stack of ring” | **收敛**为“**每核私有任务环**(§5) + **scope arena 栈**(§9.4)”,后者只管 GM 输出堆。 | + +结论:**stack-ring 需要更新**——任务环部分整体移除,堆部分保留但分配方式与回收方式都要改; +**scope 需要保留但语义简化**(不再做 fanout 引用记账,改为 arena 栈 + 确定性重放)。 + +### 9.3 分配:确定性、每核复制的布局(无原子、无通信) + +核心思想与 §4 的“每核全量复制 TensorMap”一致:**因为 submit 序列与每个任务的输出大小在各核上 +完全确定且相同,输出缓冲的布局也可以被每个核确定性地复算。** + +- 每个核在确定性 submit 重放中,对**每一个**任务(无论自己是否拥有——胜者、败者、follower 一视同仁) + **无条件**推进一份**每核复制**的堆顶 `heap_top`。任务 `N` 的输出偏移 = 其所在 arena 基址 + + 该 arena 内 `N` 之前所有任务输出大小的前缀和。 +- 因此 `addr(N)` 是 submit 序列(及确定性大小)的**纯函数**:每个核为任务 `N` 算出**完全相同**的 + 地址。owner 负责写数据;任何核都能**不经通信**算出任意任务的输出地址。 + +这取代了旧的“单 orchestrator bump”(多核下不可行),也**优于全局原子 bump**:原子 `fetch_add` 
+会让地址依赖跨核的 bump 顺序而**非确定**,消费者便无法自行算出 producer 地址,必须额外发布地址 + +读地址,引入跨核通信。确定性复制方案两者皆免。 + +> **TensorMap 与地址的关系。** TensorMap 把 tensor 区域映射到 producer 任务 id(§4)。消费者拿到 +> producer id 后,用上面同一套确定性布局即可算出其输出地址(或在 TensorMap 条目里直接缓存这个 +> 确定性地址,因为它在每个核上都相同)。无需 producer 主动发布地址。 + +### 9.4 Scope = 确定性复制的 arena 栈 + +`PTO2_SCOPE` 在新模型里仍然是确定性编排程序的一部分(每个核执行相同的嵌套结构),因此 scope 栈 +是**每核复制且各核相同**的(与 TensorMap 同理)。它现在的职责是界定 GM 输出堆的 **arena 栈**: + +- **scope begin**:把当前 `heap_top` 记为新 arena 的基址,压栈(这是旧“stack of ring”里 + per-depth 独立回收的分布式对应物)。 +- scope 内任务:在该 arena 内确定性 bump 分配(§9.3)。 +- **scope end**:把堆顶折叠回该 arena 基址,**一次性回收**该 scope 内所有“不外逃”的输出(LIFO + 栈式回收,干净且 O(1))。**外逃输出**(被该 scope 之外的任务消费的 tensor)必须分配在/提升到 + **父 arena**,以便在折叠后存活。 +- 对**长 scope**(任务很多、不能等到 scope_end 才回收),在 arena 内部用 §9.5 的窗口/前沿机制做 + 环形回收,先行回收已不再被读的缓冲。 + +`TaskOutputTensors` 的**单 scope 有效**规则保持不变:它返回的引用指向 owner 私有环槽中的 tensor +存储,不得逃出其 `PTO2_SCOPE`;跨 scope 的数据流一律通过 TensorMap 按 id 查 producer + 上述确定性 +地址完成,而非通过 `TaskOutputTensors` 句柄。 + +### 9.5 回收:窗口/前沿,取代 `last_task_alive`/CONSUMED + +由于布局在 id 顺序上确定地 bump,回收也自然按 id 顺序进行(任务 `N` 的缓冲位于 `N+1` 之前)。 +难点在于判断“`N` 的缓冲何时不再被读”。新模型用**全局完成前沿**而非 fanout 精确计数: + +- 维护一个**全局回收水位线** `heap_reclaim_frontier`,由 `task_completed_flag` 环加上**各核进度 + 最小值**(最慢的核/最旧未完成任务)推导。它表示“所有 id ≤ 该值的任务都已完成且其消费者也已完成”。 +- 给定**有界依赖跨度** `H`(保证任务 `N` 的所有消费者 id ≤ `N + H`),当全局完成前沿越过 `F` 时, + 所有 id ≤ `F − H` 的输出可安全回收——把堆尾推进,腾出位置给后续(确定性布局中绕回到该位置的) + 更晚任务。 +- 这与 §11 的 “`task_completed_flag` 环开窗”使用**同一个窗口**:该窗口同时裁剪复制的 TensorMap + 与 GM 堆。 +- **scope_end** 对“不外逃”输出提供额外的、更早的粗粒度回收边界(§9.4)。 +- **反压**:堆(或当前 arena)满时,想为新拥有任务分配的核**暂缓认领**并自旋等待前沿推进——与 + 私有环填满的反压(§6)同一性质,方向一致(不让快核无限超前于回收)。 + +> **正确性要点。** 一个缓冲只有在其**全部消费者执行完毕**后才能回收。窗口法用有界跨度 `H` + +> 全局完成前沿保证这一点;若某图的依赖跨度可能超过 `H`,必须把 `H`/堆容量调大,否则属配置错误 +> (类比旧模型的 heap/window 死锁诊断)。精确的“按 tensor 最后消费者”回收(利用 TensorMap 中 +> 同一区域被新 producer 覆盖这一确定性事件)是更省内存的改进方向,列入 §11。 + +### 9.6 数据结构小结 + +| 结构 | 类别 | 作用 | +| ---- | ---- | ---- | +| GM 输出堆(物理区域) | **全局共享(物理)** | 唯一一块全局可寻址的输出后备存储 | +| 
`heap_top` | **每核复制(确定性)** | 确定性 bump 堆顶;每核相同,无原子 | +| scope arena 基址栈 + `scope_stack_top` | **每核复制(确定性)** | scope→arena 映射;scope_end 折叠回收 | +| `heap_reclaim_frontier` | **全局共享** | 回收水位线,由完成前沿推导 | +| `graph_output_ptr` / `graph_output_size` | **全局共享** | 最终图输出位置,供 host 拷回 | + +被移除:`PTO2TaskAllocator` 的任务环部分、`last_task_alive`/`heap_tail`(基于 CONSUMED)、per-depth +`DepListPool`/`FaninPool`、`scope_tasks[]`/`scope_begins[]` 的 fanout 记账(§10)。 + +## 10. 被移除的结构(相对 AICPU 的 `tensormap_and_ringbuffer`) + +统一的 worker-scheduler 模型删除了整个子系统: + +| 被移除 | 为什么消失 | +| ------ | ---------- | +| `PTO2SchedulerState`、`RingSchedState` | 无调度器实体——每个核调度自己的环 | +| `PTO2ReadyQueue`、`dummy_ready_queue`、`early_dispatch_queue` | owner 执行自己的就绪任务;无分派队列 | +| `PTO2SpscQueue` + `WiringState` | 无独立连线权威;无 fanout 可连 | +| `fanout_lock`、`fanout_head`、`PTO2DepListPool`、`PTO2FaninPool` 溢出 | 无 fanout 列表——依赖经标志环拉取 | +| `fanin_refcount`、`fanout_refcount`、`completed_subtasks` | 被完成标志轮询替代 | +| `Handshake` 门铃、`Runtime::workers[]`、`AICoreCompletionMailbox` | 无调度器→worker 分派握手 | +| SM 中的全局 `PTO2TaskDescriptor` / `PTO2TaskPayload` / `PTO2TaskSlotState` 环 | 被每核私有任务环替代 | +| `current_task_index`(环头)/ `last_task_alive`(环尾)流控 | 被 claim 计数器 + 每核环空槽替代 | +| `task_state`(PENDING/COMPLETED/CONSUMED)、每线程 `sched_error_*` | 被单一全局 `task_completed_flag` 与单一错误字替代 | +| `PTO2TaskAllocator` 的**任务环**部分、`heap_tail`(基于 CONSUMED 反推) | 堆分配改为每核复制的确定性 bump;回收改为全局完成前沿(§9) | +| per-depth “stack of ring” 的 TaskRing | 收敛为每核私有环(§5)+ scope arena 栈(§9);堆 arena 仍按 scope 分层 | +| `scope_tasks[]` / `scope_begins[]` 的 fanout 引用记账 | scope 不再持有 +1 fanout 引用;生命周期由窗口/前沿 + arena 折叠界定(§9) | + +编排 API 表面(`PTO2RuntimeOps`、`rt_submit_*`)**保留**;只有 `submit_task` 背后的实现改变 +(认领 → 无条件 TensorMap 更新 → 有条件的私有环构建 → 稍后执行)。 + +## 11. 
实现规范(原开放问题的决议) + +本节把先前列为开放的问题逐一定为具体方案。先约定全局常量: + +| 常量 | 含义 | 默认 | +| ---- | ---- | ---- | +| `W` | 全局窗口(`task_completed_flag` 环、复制 TensorMap、GM 堆共用),2 的幂 | ≥ `Δ + H` | +| `Δ` | 任一核相对全局完成前沿可向前跑的最大 id 跨度(由反压封顶) | 由 `PRIVATE_TASK_SLOT_NUM`、堆容量决定 | +| `H` | 依赖跨度上界:任一 producer 的最后消费者 id ≤ producer id + `H` | 按图配置 | +| `F` | 全局连续完成前沿:使所有 id ≤ `F` 的任务都已完成的最大前缀 | 运行期推进 | +| `R` | 回收前沿 `= F − H`:id ≤ `R` 的输出可安全回收 | 由 `F` 推导 | +| `BLOCK_WON_SLOTS` | 每 block 的 `block.won` 投递环容量 | `PRIVATE_TASK_SLOT_NUM`(=8) | + +### 11.1 Claim 原子性 + 两条流的无跳过(原“Claim 原子性”“每 anchor 类型 claim 计数器”) + +**原语:单条 `atomic_fetch_max`。** 一个类型为 `T` 的核到达任务 `N` 时执行 +`old = atomic_fetch_max(cursor[T], N)`(`cursor[T]` 为 GM 上一个 64 位字),**`old < N` 即胜出**, +否则 `N` 已被认领。单原子、无循环。若硬件无 `fetch_max`,等价 CAS 回路: +`do { c = load(cursor[T]); if (N <= c) return LOST; } while (!CAS(cursor[T], c, N)); return WON;` +内存序取 **acq-rel**(release 发布胜利,acquire 观察既有认领)。所有权判定只依赖 cursor 本身; +真正的产出数据另由完成标志同步(§11.5)。 + +**恰一胜者且无跳过(取代“claim 计数器”)。** 每个 `T` 核按 id 递增顺序遇到 `T` 任务,`cursor[T]` +只会取到真实的 `T` 任务 id 值。在任何核尝试第 `k` 个 `T` 任务 `t_k` 之前,它必先尝试过 `t_{k-1}` +(于是其时 `cursor[T] ≥ t_{k-1}`);而 `cursor[T]` 的相邻取值之间没有别的 `T` id,故它只能从 +`t_{k-1}` 跃到 `t_k`——**不跳过任何 `T` id,且每个恰被一个核置位(fetch_max 的单调性保证)**。 +`cube_cursor` 与 `vector_cursor` 各自对自己的子序列单调推进、互不干扰,全局任务 id 仍是单一确定 +序列。两个 cursor 的存在与必要性见 §2、§3.1。 + +### 11.2 `block.won` 容量与反压(原“`block.won` 投递表大小与偏移”) + +- **容量**:每 block 一个小定长环,`BLOCK_WON_SLOTS`(默认 = `PRIVATE_TASK_SLOT_NUM` = 8)个条目, + 每条目 = 一个多核任务推送给本 block 的子任务集 + 剩余计数。界限依据:anchor 的超前量本就被其 + 自身私有环(8)封顶,每赢一个多核任务至多占 anchor 1 个环槽 + 1 个 `block.won` 条目,故 8 足够 + (可更小)。 +- **反压(已落入 §6 伪代码)**:anchor 在**认领之前**检查 `block.won` 是否有空位;满则**不认领** + (不执行 `fetch_max`),退出 Phase A 去 Phase B 执行就绪任务(从而让 follower 抽取、腾空 + `block.won`)。被让出的多核任务由**另一个有空闲的 block 的 anchor 认领**(天然负载均衡)或本核 + 稍后重试。 +- **无死锁**:根任务无依赖恒就绪;执行持续腾空私有环与 `block.won`;DAG 无环 → 前向进展恒成立。 + 唯一残留是 §7 的尾部空转。 + +### 11.3 完成标志环大小与回绕(原“`task_completed_flag` 环大小与回绕”) + +- `task_completed_flag` 是 `W` 
个一次性置位布尔的环,`flag(N)` 位于 `N & (W−1)`。 +- **`W` 取 2 的幂且 ≥ `Δ + H`**:`Δ` 是最快核相对完成前沿的最大超前(由私有环 + 堆反压封顶), + `H` 是依赖跨度上界(§11.4)。同一个 `W` 同时给复制 TensorMap 与 GM 堆开窗。 +- **回绕/ABA**:当回收前沿 `R`(§11.4)越过 `N` 时,把 `flag(N)` 复位为 false,槽位让给 `N+W`。 + 不变式:消费者只在构建了依赖 `N` 的任务**之后**(即走位已过 `N`)才轮询 `flag(N)`,而 `W ≥ Δ+H` + 保证 `N` 的标志仍被需要时 `N+W` 尚未被认领 → 不会别名。**更稳健的可选做法**:在槽内连同 true 写入 + producer 的 `N`(消费者校验 `slot.id == N`),用代/epoch 戳彻底杜绝 ABA,与 `W` 大小无关。 + +### 11.4 GM 堆细化:`H`、容量、前沿推导、外逃输出(原“GM 输出堆的细化”) + +- **`H`(依赖跨度上界)**:配置上界。运行期校验:若某消费者的 producer id < (当前 − `H`),或某分配 + 将覆盖尚不可回收的区域,即判为容量/配置错误(类比旧模型的 heap-deadlock 诊断)→ 调大 `H`/堆。 +- **堆/arena 容量** ≥ 工作集 = 窗口 `(R, top]` 内各任务输出大小之和;超出则报诊断。 +- **`F`(连续完成前沿)**:全局原子、单调。**协作式推进**——任一核置位 `flag(N)` 后, + `while flag(F+1) == true: CAS(F, F, F+1)`。无锁、任意核可推进、开销摊薄。 +- **`R = F − H`(回收前沿)**:全局派生量。某 arena 的 `heap_tail` = 任务 `R` 在该 arena 内的确定性 + 偏移;因布局确定,每个核都算出相同的 `heap_tail`。核要在确定性偏移 `X` 上分配任务 `M` 时,须等 + `X` 处上一占用者的任务 id ≤ `R`(即回收已到位)——这就是堆侧反压。 +- **外逃输出(promotion 的处置)**:**默认不做运行期提升**。堆按单一全局确定性 bump + 前沿回收 + (§9.5),它对任意依赖(含跨 scope)都正确,无需前向信息。**scope-arena 折叠**(scope_end 处 + LIFO 即时回收)只作为**可选优化**,仅施加于**静态可证/标注为“无外逃”**的 scope;含外逃输出的 + scope 退回前沿回收。如此既无需在产出时预知外逃,也保证正确。 +- **“按 tensor 最后消费者”的精确回收**:**降级为可选优化,正确性不依赖它**。精确的最后消费者需要 + 前向信息/两遍扫描/引用计数(已移除),故以 `H`-窗口为已定的主用机制;精确回收作为省内存改进 + 留作未来工作(不阻塞)。 + +### 11.5 跨核标志可见性(原“跨核标志可见性”) + +- **producer 次序**:写输出到 GM → 把输出区域 writeback/flush 到所有核读取的一致性点(GM/L2)→ + **release-store** `flag(N) = true`。 +- **consumer 次序**:**acquire-load** `flag(N)`;见 true 后(acquire 栅栏)再读 producer 的输出区域; + 非一致缓存平台上对该区域做 invalidate 或旁路缓存读。 +- **一致缓存平台**:标志字上的 release/acquire 即足够。**非一致平台**:在标志发布/观察前后,对**数据 + 区域**显式 writeback(producer)/ invalidate(consumer)。 +- `cursor[T]`、`F`、`R` 等原子量统一取 acq-rel(§11.1)。 + +### 11.6 异步 / SDMA kernel(原“异步/SDMA kernel”) + +- **句柄记在私有环槽里,不是 `block.won`。** 异步算子是 owner 在执行自己**私有任务环**中的某个 + (子)任务时发起的,故异步句柄/事件记入**该私有环槽**,槽因任务尚未真正完成而**暂不释放**。 + 异步本身与 `block.won` 没有直接关系——它只是把“完成动作”从*发起时刻*推迟到 *DMA 真正完成时刻*。 +- Phase B 
在检查依赖就绪之外,**额外轮询在飞私有环槽的句柄**;异步完成时,按 §11.5 的次序 + (先 flush)执行该(子)任务的**完成动作**,再释放槽。完成动作具体是什么取决于任务种类 + (与异步无关,沿用 §6 的完成逻辑): + - **单核任务(1C/1V)**:直接置 `flag(N)`。 + - **多核任务(MIX/2V)的子任务**:`atomic_dec(block.won[N].remaining)`,由把 `remaining` 减到 0 + 的那个子任务最后置 `flag(N)`。**仅在此情形下,被推迟的完成动作才触及 `block.won`**——即“在 + mixed/2V 子任务内部发起异步 DMA”时。 +- 消费者侧不变:仍只轮询标志,而标志只在算子(及其所属多核任务的全部子任务)**真正完成后**才被置。 +- **反压**:在飞异步算子数量被私有环容量天然封顶。 + +**这一步轮询由谁做:每个核自己做,不专设 AICPU。** + +- **决策**:在飞句柄由**发起该算子的 owner 核**在自己的 Phase B 中轮询,**不**引入一个专职轮询的 + AICPU。理由: + 1. **不违背全局目标**——本设计的根本目的就是把编排/调度从 AICPU 移除、SPMD 分布到 AI 核;专设 + AICPU 轮询器等于请回集中式部件,并制造单点。 + 2. **保持单一 owner、无锁不变式**——置 `flag(N)`、释放私有环槽、递减 `block.won[N].remaining` + 都是 owner 的本地动作(owner = builder = executor = completer)。让 AICPU 代劳就要写别人“单一 + owner、无锁”的私有环与 block-共享计数,反而需要加锁/协调。 + 3. **边际成本近零**——Phase B 本就逐槽遍历私有环查依赖就绪,顺带读一次在飞槽的句柄状态仅多一次 + 状态读;在飞数被私有环容量(`PRIVATE_TASK_SLOT_NUM`)封顶。 + 4. **异步算子本就并行**——SDMA 跑在 DMA 引擎上,核在此期间继续编排/执行其它任务,只在 Phase B + 间隙轮询,不占算力。 +- **可选硬件辅助(不改变上述归属)**:若异步引擎能在完成时**自行写一个内存位**或**发事件**,则 + - 让引擎按 §11.5 的次序直接置 `flag(N)`:消费者照常轮询标志,**无核需要为“发布完成”而忙等**; + owner 只需在下次访问该槽时**惰性**释放槽并递减 `remaining`(届时已见标志置位)。 + - 或:尾部空转的 owner(§3.1/§7,已无其它就绪工作)**等待该完成事件**而非忙轮询。 + + 两种辅助都仍由 owner 收尾,不引入集中式 AICPU 轮询器。 + +### 11.7 仍然开放 + +- **MIX 配对 —— 动态替代方案:** §3.1 规定*固定* block 配对(AIC_c + AIV0_c + AIV1_c)。 + **平台依据:在 A5 平台上,block 由硬件把 1 个 AIC + 2 个 AIV 固定绑定**,因此面向 A5(及当前 + 目标核)开发时,**采用固定配对、不做动态 co-owner 匹配是合理且既定的选择**——它与硬件 block + 边界天然对齐,省去跨 block 的认领协调与正确性论证负担(§3.2)。 + 动态配对方案(跨 block 均衡 MIX 工作;亦即 §3.2 讨论并暂不采用的“block 内先到先得代发布”等 + 思路的归宿)**仅在未来核解除该硬件绑定时**才需要,届时再行设计,**本节不予裁定**。 + +## 12. 
相关文档 + +| 文档 | 关联性 | +| ---- | ------ | +| [chip-level-arch.md](chip-level-arch.md) | 当前 L2 host / AICPU / AICore 划分(本设计所替代的模型) | +| [scheduler.md](scheduler.md) | 当前 AICPU 侧调度器(此处移除) | +| [orchestrator.md](orchestrator.md) | Host/L3 Orchestrator DAG 构建器(不同层;仅命名重叠) | +| [simt-launch.md](simt-launch.md) | 设备上的 SPMD / 多 block 启动 | +| [tensormap_and_ringbuffer RUNTIME_LOGIC.md](../src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md) | 此处移除/修改结构的权威来源 | diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_consumer.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_consumer.cpp new file mode 100644 index 000000000..55d69ce34 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_consumer.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include +#include "pto/common/pto_tile.hpp" + +#include "tensor.h" + +using namespace pto; + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *result_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ int32_t *notify_counter = reinterpret_cast<__gm__ int32_t *>(args[3]); + + __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset; + __gm__ float *result = reinterpret_cast<__gm__ float *>(result_tensor->buffer.addr) + result_tensor->start_offset; + + constexpr int kRows = 128; + constexpr int kCols = 128; + using DynShapeDim5 = Shape<1, 1, 1, kRows, kCols>; + using DynStridDim5 = Stride<1, 1, 1, kCols, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src_tile(kRows, kCols); + TileData dst_tile(kRows, kCols); + TASSIGN(src_tile, 0x0); + TASSIGN(dst_tile, 0x10000); + + GlobalData src_global(src); + GlobalData dst_global(result); + TLOAD(src_tile, src_global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADDS(dst_tile, src_tile, static_cast(*notify_counter)); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dst_global, dst_tile); + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_notify_wait.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_notify_wait.cpp new file mode 100644 index 000000000..bc8f1cd86 --- /dev/null +++ 
b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_notify_wait.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include +#include "pto_async_kernel_api.h" + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + uint64_t notify_counter_addr = static_cast(args[1]); + uint32_t expected_value = static_cast(args[2]); + AsyncCtx async_ctx = get_async_ctx(args); + save_expected_notification_counter( + async_ctx, reinterpret_cast(notify_counter_addr), expected_value + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_producer_notify.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_producer_notify.cpp new file mode 100644 index 000000000..1cd3fb7ec --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/aiv/kernel_producer_notify.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include + +#include "platform_comm/comm_context.h" +#include "pto_async_kernel_api.h" +#include "tensor.h" + +using namespace pto; + +template +static inline __aicore__ __gm__ T *comm_remote_ptr(__gm__ CommContext *ctx, __gm__ T *local_ptr, int peer_rank) { + uint64_t local_base = ctx->windowsIn[ctx->rankId]; + uint64_t offset = reinterpret_cast(local_ptr) - local_base; + return reinterpret_cast<__gm__ T *>(ctx->windowsIn[peer_rank] + offset); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *in_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ int32_t *local_counter = reinterpret_cast<__gm__ int32_t *>(args[2]); + __gm__ CommContext *comm_ctx = reinterpret_cast<__gm__ CommContext *>(args[3]); + + __gm__ float *in_data = reinterpret_cast<__gm__ float *>(in_tensor->buffer.addr) + in_tensor->start_offset; + __gm__ float *out_data = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + int my_rank = static_cast(comm_ctx->rankId); + int peer_rank = 1 - my_rank; + + 
constexpr int kRows = 128; + constexpr int kCols = 128; + using DynShapeDim5 = Shape<1, 1, 1, kRows, kCols>; + using DynStridDim5 = Stride<1, 1, 1, kCols, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData in_tile(kRows, kCols); + TileData out_tile(kRows, kCols); + TASSIGN(in_tile, 0x0); + TASSIGN(out_tile, 0x10000); + + GlobalData in_global(in_data); + GlobalData out_global(out_data); + TLOAD(in_tile, in_global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(out_tile, in_tile, in_tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(out_global, out_tile); + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + + if (my_rank == 1) { + for (volatile int i = 0; i < 2000000; ++i) {} + } + + __gm__ int32_t *remote_counter = comm_remote_ptr(comm_ctx, local_counter, peer_rank); + send_notification(remote_counter, 1, pto::comm::NotifyOp::AtomicAdd); + pipe_barrier(PIPE_ALL); +} diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp new file mode 100644 index 000000000..59e1cc23c --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/kernels/orchestration/async_notify_orchestration.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include "platform_comm/comm_context.h" +#include "pto_orchestration_api.h" + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +async_notify_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 5}; +} + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + return async_notify_orchestration_config(orch_args); +} + +__attribute__((visibility("default"))) void async_notify_orchestration(const L2TaskArgs &orch_args) { + if (orch_args.tensor_count() + orch_args.scalar_count() != 5) { + LOG_ERROR("async_notify_demo: expected 5 args"); + return; + } + + const Tensor &input = orch_args.tensor(0).ref(); + const Tensor &output = orch_args.tensor(1).ref(); + const Tensor &result = orch_args.tensor(2).ref(); + const Tensor &notify_counter = orch_args.tensor(3).ref(); + auto *comm_ctx = reinterpret_cast(static_cast(orch_args.scalar(0))); + + L0TaskArgs params_producer; + params_producer.add_input(input); + params_producer.add_output(output); + params_producer.add_scalar(notify_counter.buffer.addr); + params_producer.add_scalar(reinterpret_cast(comm_ctx)); + rt_submit_aiv_task(0, params_producer); + + uint32_t notify_token_shape[1] = {1}; + TensorCreateInfo notify_token_info(notify_token_shape, 1, DataType::INT32); + L0TaskArgs params_notify; + params_notify.add_output(notify_token_info); + params_notify.add_scalar(notify_counter.buffer.addr); + 
params_notify.add_scalar(static_cast(1)); + TaskOutputTensors notify_outputs = rt_submit_aiv_task(2, params_notify); + Tensor notify_token = notify_outputs.get_ref(0); + + L0TaskArgs params_consumer; + params_consumer.add_input(notify_token); + params_consumer.add_input(output); + params_consumer.add_output(result); + params_consumer.add_scalar(notify_counter.buffer.addr); + rt_submit_aiv_task(1, params_consumer); +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/async_notify_demo/test_async_notify_demo.py b/examples/a2a3/fully_distributed_within_core/async_notify_demo/test_async_notify_demo.py new file mode 100644 index 000000000..df462249a --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/async_notify_demo/test_async_notify_demo.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Notification counter + deferred completion smoke test for onboard a2a3.""" + +from __future__ import annotations + +import argparse +import os + +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipCallable, + CommBufferSpec, + CoreCallable, + DataType, + TaskArgs, + Tensor, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.elf_parser import extract_text_section +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) +N = 128 * 128 + + +def parse_device_range(spec: str) -> list[int]: + if "," in spec: + return [int(x) for x in spec.split(",") if x] + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + return list(range(lo, hi + 1)) + return [int(spec)] + + +def build_chip_callable(platform: str, pto_isa_commit: str | None, clone_protocol: str) -> ChipCallable: + kc = KernelCompiler(platform=platform) + runtime = "fully_distributed_within_core" + pto_isa_root = ensure_pto_isa_root(commit=pto_isa_commit, clone_protocol=clone_protocol) + include_dirs = kc.get_orchestration_include_dirs(runtime) + extra_includes = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + children = [] + for func_id, rel in [ + (0, "kernels/aiv/kernel_producer_notify.cpp"), + (1, "kernels/aiv/kernel_consumer.cpp"), + (2, "kernels/aiv/kernel_notify_wait.cpp"), + ]: + kernel = kc.compile_incore( + source_path=os.path.join(HERE, rel), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=extra_includes, + ) + if not platform.endswith("sim"): + kernel = extract_text_section(kernel) + children.append( + ( + func_id, + CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN], 
+ binary=kernel, + ), + ) + ) + + orch = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/async_notify_orchestration.cpp"), + extra_include_dirs=[str(kc.project_root / "src" / "common")], + ) + return ChipCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN], + func_name="async_notify_orchestration", + binary=orch, + children=children, + ) + + +def run( + platform: str = "a2a3", + device_ids: list[int] | None = None, + pto_isa_commit: str | None = None, +) -> int: + if device_ids is None: + device_ids = [0, 1] + nranks = len(device_ids) + if nranks != 2: + raise ValueError(f"async_notify_demo needs exactly 2 devices, got {device_ids}") + + inp = [ + torch.tensor([float(i % 251) / 10.0 for i in range(N)], dtype=torch.float32).share_memory_() + for _ in range(nranks) + ] + out = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)] + result = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)] + + chip_callable = build_chip_callable(platform, pto_isa_commit, "https") + worker = Worker( + level=3, + platform=platform, + runtime="fully_distributed_within_core", + device_ids=device_ids, + num_sub_workers=0, + ) + chip_handle = worker.register(chip_callable) + try: + worker.init() + + def orch_fn(orch, _args, cfg): + with orch.allocate_domain( + name="default", + workers=list(range(nranks)), + window_size=4 * 1024, + buffers=[CommBufferSpec(name="notify_counter", dtype="int32", count=1, nbytes=4)], + ) as handle: + for rank in range(nranks): + domain = handle[rank] + args = TaskArgs() + args.add_tensor(make_tensor_arg(inp[rank]), TensorArgType.INPUT) + args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING) + args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING) + args.add_tensor( + Tensor.make( + data=domain.buffer_ptrs["notify_counter"], + shapes=(1,), + dtype=DataType.INT32, + 
child_memory=True, + ), + TensorArgType.INPUT, + ) + args.add_scalar(domain.device_ctx) + orch.submit_next_level(chip_handle, args, cfg, worker=rank) + + worker.run(orch_fn, args=None, config=CallConfig()) + + ok = True + for rank in range(nranks): + expected_out = inp[rank] * 2.0 + expected_result = expected_out + 1.0 + max_out = float(torch.max(torch.abs(out[rank] - expected_out))) + max_result = float(torch.max(torch.abs(result[rank] - expected_result))) + print(f"[async_notify_demo] rank {rank}: max_out={max_out:.3e} max_result={max_result:.3e}") + ok = ok and max_out <= 1e-3 and max_result <= 1e-3 + return 0 if ok else 1 + finally: + worker.close() + + +def test_async_notify_demo() -> None: + assert run("a2a3", [0, 1]) == 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--platform", default="a2a3") + parser.add_argument("-d", "--device", default="0-1") + parser.add_argument("--pto-isa-commit", default=None) + args = parser.parse_args() + return run(args.platform, parse_device_range(args.device), args.pto_isa_commit) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp new file mode 100644 index 000000000..1f331d6e0 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Tile-based Matrix Multiplication Kernel (Cube Core) + * + * Computes: output = input_a @ input_b (tile_size x tile_size tile matmul) + * Uses TMATMUL instruction + * + * Tile size is determined by golden.py configuration and passed through + * tensor shapes from orchestration. + * + * Args (Tensor*): + * args[0] = input_a (INPUT) + * args[1] = input_b (INPUT) + * args[2] = output (OUTPUT) + * args[3] = config (INPUT) - int64_t[4]: [tile_size, grid_k, num_groups, incore_loop] + */ + +#include +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) { + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * num_2; +} + +template +static __aicore__ void gemm_tile_impl(__gm__ float *input_a, __gm__ float *input_b, __gm__ float *output) { + constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); + constexpr int M = CeilAlign(TILE, 16); + constexpr int K = CeilAlign(TILE, blockAlign); + constexpr int N = CeilAlign(TILE, blockAlign); + + using GlobalDataA = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataB = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataC = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + + GlobalDataA src0Global(input_a); + GlobalDataB src1Global(input_b); + 
GlobalDataC dstGlobal(output); + + using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + TLOAD(aMatTile, src0Global); + TLOAD(bMatTile, src1Global); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(dstGlobal, cTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[3]); + + __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr); + uint64_t tile_size = static_cast(cfg[0]); + uint64_t tile_elems = tile_size * tile_size; + int num_tiles = static_cast(cfg[3]); + + __gm__ float *base_a = reinterpret_cast<__gm__ float *>(input_a->buffer.addr) + input_a->start_offset; + __gm__ float *base_b = reinterpret_cast<__gm__ float *>(input_b->buffer.addr) + input_b->start_offset; + __gm__ float *base_c = reinterpret_cast<__gm__ float *>(output->buffer.addr) + output->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float *a_ptr = base_a + (tile_idx * tile_elems); + __gm__ float *b_ptr = base_b + (tile_idx * tile_elems); + __gm__ float *c_ptr = base_c + (tile_idx * tile_elems); + + switch (tile_size) 
{ + case 16: + gemm_tile_impl<16>(a_ptr, b_ptr, c_ptr); + break; + case 32: + gemm_tile_impl<32>(a_ptr, b_ptr, c_ptr); + break; + case 64: + gemm_tile_impl<64>(a_ptr, b_ptr, c_ptr); + break; + case 128: + gemm_tile_impl<128>(a_ptr, b_ptr, c_ptr); + break; + default: + break; + } + } +} diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp new file mode 100644 index 000000000..c80e88244 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Tile-based Element-wise Addition Kernel (Vector Core) - INOUT Pattern + * + * Computes: C_tile = C_tile + P (tile_size x tile_size tile accumulation) + * Uses TADD instruction + * + * Tile size is determined by the test configuration and passed through + * the config tensor (args[2], element 0) from orchestration.
+ * + * Args (Tensor*): + * args[0] = C_tile (INOUT: read + write accumulator) + * args[1] = P (INPUT: matmul result to accumulate) + * args[2] = config (INPUT) - int64_t[4]: [tile_size, grid_k, num_groups, incore_loop] + */ + +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void tile_add_impl(__gm__ float *c_ptr, __gm__ float *p_ptr) { + using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; + using DynStridDim5 = Stride<1, 1, 1, TILE, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData cTile(TILE, TILE); + TileData pTile(TILE, TILE); + TileData outTile(TILE, TILE); + TASSIGN(cTile, 0x0); + TASSIGN(pTile, 0x10000); + TASSIGN(outTile, 0x20000); + + GlobalData cGlobal(c_ptr); + GlobalData pGlobal(p_ptr); + GlobalData outGlobal(c_ptr); // write back to same C location + + TLOAD(cTile, cGlobal); + TLOAD(pTile, pGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(outTile, cTile, pTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(outGlobal, outTile); + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *c_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[2]); + + __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr); + uint64_t tile_size = static_cast(cfg[0]); + uint64_t tile_elems = tile_size * tile_size; + int num_tiles = static_cast(cfg[3]); + + __gm__ float *base_c = reinterpret_cast<__gm__ float *>(c_tensor->buffer.addr) + c_tensor->start_offset; + __gm__ float *base_p = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset; + + 
for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float *c_ptr = base_c + (tile_idx * tile_elems); + __gm__ float *p_ptr = base_p + (tile_idx * tile_elems); + + switch (tile_size) { + case 16: + tile_add_impl<16>(c_ptr, p_ptr); + break; + case 32: + tile_add_impl<32>(c_ptr, p_ptr); + break; + case 64: + tile_add_impl<64>(c_ptr, p_ptr); + break; + case 128: + tile_add_impl<128>(c_ptr, p_ptr); + break; + default: + break; + } + } +} diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp new file mode 100644 index 000000000..dcfc11340 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * BGEMM Orchestration Function (fully_distributed_within_core Runtime) + * + * Builds the task graph for tiled matrix multiplication: C = A @ B + * + * Configuration read from the config tensor, arg 3 (set in test_benchmark_bgemm.py): + * - tile_size: tile dimension (tile_size x tile_size per tile) + * - grid_k: number of K-dimension partitions + * - num_groups: number of independent groups (= matmul_add_task_num / grid_k) + * - incore_loop: number of tiles per group + * + * Memory layout (tile-first, flattened): + * A: [num_groups, grid_k, incore_loop, tile_size, tile_size] + * B: [num_groups, grid_k, incore_loop, tile_size, tile_size] + * C: [incore_loop * num_groups, tile_size, tile_size] + * + * Arg layout: [A, B, C, config] + */ + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_GEMM_TILE 0 +#define FUNC_TILE_ADD 1 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 4, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + // Tensor args + const Tensor &ext_A = orch_args.tensor(0).ref(); + const Tensor &ext_B = orch_args.tensor(1).ref(); + const Tensor &ext_C = orch_args.tensor(2).ref(); + const Tensor &ext_config = orch_args.tensor(3).ref(); + + // Read config from tensor data: [tile_size, grid_k, num_groups, incore_loop] + int64_t *host_config = orch_args.tensor(3).ref().data_as(); + int tile_size = static_cast(host_config[0]); + int grid_k = static_cast(host_config[1]); + int num_groups = static_cast(host_config[2]); + int incore_loop = static_cast(host_config[3]); + uint64_t tile_elems = static_cast(tile_size) * tile_size; + + int grid_m = 1; + int grid_n = 1;
+ + LOG_INFO_V0( + "[bgemm_orch] tile_size: %d, grid_m: %d, grid_n: %d, grid_k: %d, num_groups: %d, incore_loop: %d", tile_size, + grid_m, grid_n, grid_k, num_groups, incore_loop + ); + + uint32_t tile_shapes[1] = {static_cast(tile_elems)}; + uint64_t group_tile_elems = static_cast(incore_loop) * tile_elems; + uint32_t group_shapes[1] = {static_cast(group_tile_elems)}; + TensorCreateInfo group_ci(group_shapes, 1, DataType::FLOAT32); + + int total_gemm = 0; + int total_add = 0; + + // A/B layout: [num_groups, grid_k, incore_loop, tile_size, tile_size] + // C layout: [incore_loop * num_groups, tile_size, tile_size] + for (int group_idx = 0; group_idx < num_groups; group_idx++) { + PTO2_SCOPE_GUARD(); + + uint32_t c_elem_offset = static_cast(static_cast(group_idx) * group_tile_elems); + uint32_t c_view_offsets[1] = {c_elem_offset}; + Tensor C_view = ext_C.view(group_shapes, c_view_offsets); + + for (int k_idx = 0; k_idx < grid_k; k_idx++) { + // In layout [num_groups, grid_k, incore_loop, tile_size, tile_size], + // offset = (group_idx * grid_k + k_idx) * incore_loop * tile_elems + uint64_t ab_offset = + (static_cast(group_idx) * grid_k + static_cast(k_idx)) * group_tile_elems; + + uint32_t a_view_offsets[1] = {static_cast(ab_offset)}; + Tensor A_view = ext_A.view(group_shapes, a_view_offsets); + uint32_t b_view_offsets[1] = {static_cast(ab_offset)}; + Tensor B_view = ext_B.view(group_shapes, b_view_offsets); + L0TaskArgs params_gemm; + params_gemm.add_input(A_view); + params_gemm.add_input(B_view); + params_gemm.add_output(group_ci); + params_gemm.add_input(ext_config); + TaskOutputTensors gemm_outs = rt_submit_aic_task(FUNC_GEMM_TILE, params_gemm); + total_gemm++; + + L0TaskArgs params_add; + params_add.add_inout(C_view); + params_add.add_input(gemm_outs.get_ref(0)); + params_add.add_input(ext_config); + rt_submit_aiv_task(FUNC_TILE_ADD, params_add); + total_add++; + } + } + + LOG_INFO_V0( + "[bgemm_orch] Submitted %d gemm tasks and %d add tasks (%d total)", 
total_gemm, total_add, + total_gemm + total_add + ); +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/test_benchmark_bgemm.py b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/test_benchmark_bgemm.py new file mode 100644 index 000000000..ac1200326 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/benchmark_bgemm/test_benchmark_bgemm.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Benchmark BGEMM: runtime-configurable tiled matmul C = sum(k) A[k] @ B[k].""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestBenchmarkBgemm(SceneTestCase): + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/bgemm_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT, D.IN], + }, + "incores": [ + { + "func_id": 0, + "name": "GEMM", + "source": "kernels/aic/kernel_gemm_tile.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "name": "ADD", + "source": "kernels/aiv/kernel_tile_add.cpp", + "core_type": "aiv", + "signature": [D.INOUT, D.IN], + }, + ], + } + + CASES = [ + { + "name": "Case0", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"matmul_add_task_num": 500, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2}, + }, + { + "name": "Case1", + "manual": True, + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2}, + }, + { + "name": "Case2", + "manual": True, + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"matmul_add_task_num": 256, "incore_data_size": 128, "incore_loop": 4, "grid_k": 2}, + }, + { + "name": "Case3", + "manual": True, + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 16, "grid_k": 2}, + }, + { + "name": "Case4", + "manual": True, + "platforms": ["a2a3sim", "a2a3"], + 
"config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 4, "grid_k": 4}, + }, + { + "name": "Bgemm64", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {"matmul_add_task_num": 32, "incore_data_size": 64, "incore_loop": 1, "grid_k": 4}, + }, + ] + + def generate_args(self, params): + tile_size = params["incore_data_size"] + incore_loop = params["incore_loop"] + grid_k = params["grid_k"] + num_groups = params["matmul_add_task_num"] // grid_k + A = torch.randn(num_groups, grid_k, incore_loop, tile_size, tile_size, dtype=torch.float32) * 0.01 + B = torch.randn(num_groups, grid_k, incore_loop, tile_size, tile_size, dtype=torch.float32) * 0.01 + C = torch.zeros(incore_loop * num_groups, tile_size, tile_size, dtype=torch.float32) + config = torch.tensor([tile_size, grid_k, num_groups, incore_loop], dtype=torch.int64) + return TaskArgsBuilder( + Tensor("A", A.flatten()), Tensor("B", B.flatten()), Tensor("C", C.flatten()), Tensor("config", config) + ) + + def compute_golden(self, args, params): + tile_size = params["incore_data_size"] + incore_loop = params["incore_loop"] + grid_k = params["grid_k"] + num_groups = params["matmul_add_task_num"] // grid_k + A = args.A.reshape(num_groups, grid_k, incore_loop, tile_size, tile_size) + B = args.B.reshape(num_groups, grid_k, incore_loop, tile_size, tile_size) + C = args.C.reshape(incore_loop * num_groups, tile_size, tile_size) + C[:] = 0.0 + for group in range(num_groups): + for k_idx in range(grid_k): + for i in range(incore_loop): + C[group * incore_loop + i] += torch.matmul(A[group, k_idx, i], B[group, k_idx, i]) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_consumer.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_consumer.cpp new file mode 
100644 index 000000000..b860b7223 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_consumer.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include "tensor.h" + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *mailbox_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *result_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *mailbox = + reinterpret_cast<__gm__ float *>(mailbox_tensor->buffer.addr) + mailbox_tensor->start_offset; + __gm__ float *result = reinterpret_cast<__gm__ float *>(result_tensor->buffer.addr) + result_tensor->start_offset; + + uint32_t n = static_cast(result_tensor->shapes[0]); + for (uint32_t i = 0; i < n; ++i) { + result[i] = mailbox[i]; + } +} diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_notify_wait.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_notify_wait.cpp new file mode 100644 index 000000000..2a4d5cbf2 --- /dev/null +++ 
b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_notify_wait.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include "pto_async_kernel_api.h" + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + uint64_t counter_addr = static_cast(args[1]); + uint32_t expected_value = static_cast(args[2]); + AsyncCtx async_ctx = get_async_ctx(args); + save_expected_notification_counter( + async_ctx, reinterpret_cast(counter_addr), expected_value + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_producer.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_producer.cpp new file mode 100644 index 000000000..f846b313f --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/aiv/kernel_producer.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include "platform_comm/comm_context.h" +#include "pto_async_kernel_api.h" +#include "tensor.h" + +template +static inline __aicore__ __gm__ T *comm_remote_ptr(__gm__ CommContext *ctx, __gm__ T *local_ptr, int peer_rank) { + uint64_t local_base = ctx->windowsIn[ctx->rankId]; + uint64_t offset = reinterpret_cast(local_ptr) - local_base; + return reinterpret_cast<__gm__ T *>(ctx->windowsIn[peer_rank] + offset); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *partial_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *mailbox_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ int32_t *local_counter = reinterpret_cast<__gm__ int32_t *>(args[3]); + __gm__ CommContext *ctx = reinterpret_cast<__gm__ CommContext *>(args[4]); + + __gm__ float *partial = + reinterpret_cast<__gm__ float *>(partial_tensor->buffer.addr) + partial_tensor->start_offset; + __gm__ float *mailbox = + reinterpret_cast<__gm__ float *>(mailbox_tensor->buffer.addr) + mailbox_tensor->start_offset; + + int peer_rank = (static_cast(ctx->rankId) + 1) % static_cast(ctx->rankNum); + __gm__ float *peer_mailbox = comm_remote_ptr(ctx, mailbox, peer_rank); + uint32_t n = static_cast(partial_tensor->shapes[0]); + for (uint32_t i = 0; i < n; ++i) { + peer_mailbox[i] = 
partial[i]; + } +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + dcci((__gm__ int32_t *)peer_mailbox, ENTIRE_DATA_CACHE, CACHELINE_OUT); +#if defined(__CPU_SIM) + dsb(0); +#else + dsb(DSB_DDR); +#endif + pipe_barrier(PIPE_ALL); +#endif + + __gm__ int32_t *peer_counter = comm_remote_ptr(ctx, local_counter, peer_rank); + send_notification(peer_counter, 1, pto::comm::NotifyOp::AtomicAdd); +} diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/orchestration/deferred_notify_orch.cpp b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/orchestration/deferred_notify_orch.cpp new file mode 100644 index 000000000..7a5af06c8 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/kernels/orchestration/deferred_notify_orch.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include "platform_comm/comm_context.h" +#include "pto_orchestration_api.h" + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +deferred_notify_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 5}; +} + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + return deferred_notify_orchestration_config(orch_args); +} + +__attribute__((visibility("default"))) void deferred_notify_orchestration(const L2TaskArgs &orch_args) { + if (orch_args.tensor_count() + orch_args.scalar_count() != 5) { + LOG_ERROR("deferred_notify_demo: expected 5 args"); + return; + } + + const Tensor &partial = orch_args.tensor(0).ref(); + const Tensor &mailbox = orch_args.tensor(1).ref(); + const Tensor &result = orch_args.tensor(2).ref(); + const Tensor &notify_counter = orch_args.tensor(3).ref(); + auto *comm_ctx = reinterpret_cast(static_cast(orch_args.scalar(0))); + + uint32_t shapes[1] = {128 * 128}; + TensorCreateInfo producer_output_info(shapes, 1, DataType::FLOAT32); + L0TaskArgs params_producer; + params_producer.add_input(partial); + params_producer.add_inout(mailbox); + params_producer.add_output(producer_output_info); + params_producer.add_scalar(notify_counter.buffer.addr); + params_producer.add_scalar(reinterpret_cast(comm_ctx)); + rt_submit_aiv_task(0, params_producer); + + uint32_t notify_token_shape[1] = {1}; + TensorCreateInfo notify_token_info(notify_token_shape, 1, DataType::INT32); + L0TaskArgs params_notify; + params_notify.add_output(notify_token_info); + params_notify.add_scalar(notify_counter.buffer.addr); + params_notify.add_scalar(static_cast(1)); + TaskOutputTensors notify_outputs = rt_submit_aiv_task(2, params_notify); + Tensor notify_token =
notify_outputs.get_ref(0); + + L0TaskArgs params_consumer; + params_consumer.add_input(notify_token); + params_consumer.add_input(mailbox); + params_consumer.add_output(result); + params_consumer.add_scalar(notify_counter.buffer.addr); + rt_submit_aiv_task(1, params_consumer); +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/test_deferred_notify_demo.py b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/test_deferred_notify_demo.py new file mode 100644 index 000000000..873871776 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/deferred_notify_demo/test_deferred_notify_demo.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""L2 deferred completion + two-chip comm smoke test for a2a3sim.""" + +from __future__ import annotations + +import argparse +import os + +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipCallable, + CommBufferSpec, + CoreCallable, + DataType, + TaskArgs, + Tensor, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.elf_parser import extract_text_section +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) +N = 128 * 128 +DTYPE_NBYTES = 4 + + +def parse_device_range(spec: str) -> list[int]: + if "," in spec: + return [int(x) for x in spec.split(",") if x] + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + return list(range(lo, hi + 1)) + return [int(spec)] + + +def build_chip_callable(platform: str, pto_isa_commit: str | None, clone_protocol: str) -> ChipCallable: + kc = KernelCompiler(platform=platform) + runtime = "fully_distributed_within_core" + pto_isa_root = ensure_pto_isa_root(commit=pto_isa_commit, clone_protocol=clone_protocol) + include_dirs = kc.get_orchestration_include_dirs(runtime) + extra_includes = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + children = [] + for func_id, rel in [ + (0, "kernels/aiv/kernel_producer.cpp"), + (1, "kernels/aiv/kernel_consumer.cpp"), + (2, "kernels/aiv/kernel_notify_wait.cpp"), + ]: + kernel = kc.compile_incore( + source_path=os.path.join(HERE, rel), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=extra_includes, + ) + if not platform.endswith("sim"): + kernel = extract_text_section(kernel) + children.append( + ( + func_id, + CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.INOUT, ArgDirection.OUT, 
ArgDirection.IN], + binary=kernel, + ), + ) + ) + + orch = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/deferred_notify_orch.cpp"), + extra_include_dirs=[str(kc.project_root / "src" / "common")], + ) + return ChipCallable.build( + signature=[ArgDirection.IN, ArgDirection.INOUT, ArgDirection.OUT, ArgDirection.IN], + func_name="deferred_notify_orchestration", + binary=orch, + children=children, + ) + + +def run( + platform: str = "a2a3sim", + device_ids: list[int] | None = None, + pto_isa_commit: str | None = None, +) -> int: + if device_ids is None: + device_ids = [0, 1] + nranks = len(device_ids) + if nranks != 2: + raise ValueError(f"deferred_notify_demo needs exactly 2 devices, got {device_ids}") + + mailbox_nbytes = N * DTYPE_NBYTES + counter_nbytes = 4 + window_size = max(mailbox_nbytes + counter_nbytes, 4 * 1024) + + partial = [torch.full((N,), float(rank + 1), dtype=torch.float32).share_memory_() for rank in range(nranks)] + result = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)] + + chip_callable = build_chip_callable(platform, pto_isa_commit, "https") + worker = Worker( + level=3, + platform=platform, + runtime="fully_distributed_within_core", + device_ids=device_ids, + num_sub_workers=0, + ) + chip_handle = worker.register(chip_callable) + try: + worker.init() + + def orch_fn(orch, _args, cfg): + # `notify_counter` must start at 0; allocate_domain zero-initializes + # the whole window, so no explicit host seed is needed. 
+ with orch.allocate_domain( + name="default", + workers=list(range(nranks)), + window_size=window_size, + buffers=[ + CommBufferSpec(name="mailbox", dtype="float32", count=N, nbytes=mailbox_nbytes), + CommBufferSpec(name="notify_counter", dtype="int32", count=1, nbytes=counter_nbytes), + ], + ) as handle: + for rank in range(nranks): + domain = handle[rank] + args = TaskArgs() + args.add_tensor(make_tensor_arg(partial[rank]), TensorArgType.INPUT) + args.add_tensor( + Tensor.make( + data=domain.buffer_ptrs["mailbox"], + shapes=(N,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INOUT, + ) + args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING) + args.add_tensor( + Tensor.make( + data=domain.buffer_ptrs["notify_counter"], + shapes=(1,), + dtype=DataType.INT32, + child_memory=True, + ), + TensorArgType.INPUT, + ) + args.add_scalar(domain.device_ctx) + orch.submit_next_level(chip_handle, args, cfg, worker=rank) + + worker.run(orch_fn, args=None, config=CallConfig()) + + ok = True + for rank in range(nranks): + expected = partial[(rank + 1) % nranks] + max_diff = float(torch.max(torch.abs(result[rank] - expected))) + print(f"[deferred_notify_demo] rank {rank}: max_diff={max_diff:.3e}") + ok = ok and max_diff <= 1e-6 + return 0 if ok else 1 + finally: + worker.close() + + +def test_deferred_notify_demo() -> None: + assert run("a2a3sim", [0, 1]) == 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--platform", default="a2a3sim") + parser.add_argument("-d", "--device", default="0-1") + parser.add_argument("--pto-isa-commit", default=None) + args = parser.parse_args() + return run(args.platform, parse_device_range(args.device), args.pto_isa_commit) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aic/kernel_mm.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aic/kernel_mm.cpp new file 
mode 100644 index 000000000..123c44f65 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aic/kernel_mm.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MIX co-ownership test — AIC (cube) subtask: Cmm = A @ B (single tile). + * + * This is the AIC lane of a 1C+2V MIX task. 
All three lanes share one argument + * list; each lane writes ITS OWN designated output by fixed index: + * + * args[0] = A (INPUT) + * args[1] = B (INPUT) + * args[2] = Cmm (INOUT, external) <- this AIC lane writes here + * args[3] = V0 (OUTPUT, heap) <- AIV0 lane + * args[4] = V1 (OUTPUT, heap) <- AIV1 lane + * args[5] = config (INPUT) int64_t[4]: [tile_size, grid_k, num_groups, num_tiles] + */ + +#include +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) { + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * num_2; +} + +template +static __aicore__ void mm_tile_impl(__gm__ float *input_a, __gm__ float *input_b, __gm__ float *output) { + constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); + constexpr int M = CeilAlign(TILE, 16); + constexpr int K = CeilAlign(TILE, blockAlign); + constexpr int N = CeilAlign(TILE, blockAlign); + + using GlobalDataA = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataB = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataC = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + + GlobalDataA src0Global(input_a); + GlobalDataB src1Global(input_b); + GlobalDataC dstGlobal(output); + + using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + TLOAD(aMatTile, src0Global); + TLOAD(bMatTile, src1Global); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + 
wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(dstGlobal, cTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *cmm = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[5]); + + __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr); + uint64_t tile_size = static_cast(cfg[0]); + uint64_t tile_elems = tile_size * tile_size; + int num_tiles = static_cast(cfg[3]); + + __gm__ float *base_a = reinterpret_cast<__gm__ float *>(input_a->buffer.addr) + input_a->start_offset; + __gm__ float *base_b = reinterpret_cast<__gm__ float *>(input_b->buffer.addr) + input_b->start_offset; + __gm__ float *base_c = reinterpret_cast<__gm__ float *>(cmm->buffer.addr) + cmm->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float *a_ptr = base_a + (tile_idx * tile_elems); + __gm__ float *b_ptr = base_b + (tile_idx * tile_elems); + __gm__ float *c_ptr = base_c + (tile_idx * tile_elems); + + switch (tile_size) { + case 16: + mm_tile_impl<16>(a_ptr, b_ptr, c_ptr); + break; + case 32: + mm_tile_impl<32>(a_ptr, b_ptr, c_ptr); + break; + case 64: + mm_tile_impl<64>(a_ptr, b_ptr, c_ptr); + break; + case 128: + mm_tile_impl<128>(a_ptr, b_ptr, c_ptr); + break; + default: + break; + } + } +} diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v0.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v0.cpp new file mode 100644 index 000000000..f27d6dd6c --- 
/dev/null +++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v0.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MIX co-ownership test — AIV0 subtask: V0 = A + B (single tile, element-wise). + * + * AIV0 lane of a 1C+2V MIX task. Shared argument list (see kernel_mm.cpp); + * this lane writes the V0 output at args[3]. 
+ */ + +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void add_tile_impl(__gm__ float *a_ptr, __gm__ float *b_ptr, __gm__ float *dst_ptr) { + using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; + using DynStridDim5 = Stride<1, 1, 1, TILE, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData aTile(TILE, TILE); + TileData bTile(TILE, TILE); + TileData outTile(TILE, TILE); + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x10000); + TASSIGN(outTile, 0x20000); + + GlobalData aGlobal(a_ptr); + GlobalData bGlobal(b_ptr); + GlobalData outGlobal(dst_ptr); + + TLOAD(aTile, aGlobal); + TLOAD(bTile, bGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(outTile, aTile, bTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(outGlobal, outTile); + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *a_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *b_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); // V0 + __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[5]); + + __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr); + uint64_t tile_size = static_cast(cfg[0]); + uint64_t tile_elems = tile_size * tile_size; + int num_tiles = static_cast(cfg[3]); + + __gm__ float *base_a = reinterpret_cast<__gm__ float *>(a_tensor->buffer.addr) + a_tensor->start_offset; + __gm__ float *base_b = reinterpret_cast<__gm__ float *>(b_tensor->buffer.addr) + b_tensor->start_offset; + __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < 
num_tiles; tile_idx++) { + __gm__ float *a = base_a + (tile_idx * tile_elems); + __gm__ float *b = base_b + (tile_idx * tile_elems); + __gm__ float *o = base_out + (tile_idx * tile_elems); + switch (tile_size) { + case 16: + add_tile_impl<16>(a, b, o); + break; + case 32: + add_tile_impl<32>(a, b, o); + break; + case 64: + add_tile_impl<64>(a, b, o); + break; + case 128: + add_tile_impl<128>(a, b, o); + break; + default: + break; + } + } +} diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v1.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v1.cpp new file mode 100644 index 000000000..8c794f477 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_add_v1.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MIX co-ownership test — AIV1 subtask: V1 = A + B (single tile, element-wise). + * + * AIV1 lane of a 1C+2V MIX task. Shared argument list (see kernel_mm.cpp); + * this lane writes the V1 output at args[4]. 
+ */ + +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void add_tile_impl(__gm__ float *a_ptr, __gm__ float *b_ptr, __gm__ float *dst_ptr) { + using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; + using DynStridDim5 = Stride<1, 1, 1, TILE, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData aTile(TILE, TILE); + TileData bTile(TILE, TILE); + TileData outTile(TILE, TILE); + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x10000); + TASSIGN(outTile, 0x20000); + + GlobalData aGlobal(a_ptr); + GlobalData bGlobal(b_ptr); + GlobalData outGlobal(dst_ptr); + + TLOAD(aTile, aGlobal); + TLOAD(bTile, bGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(outTile, aTile, bTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(outGlobal, outTile); + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *a_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *b_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]); // V1 + __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[5]); + + __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr); + uint64_t tile_size = static_cast(cfg[0]); + uint64_t tile_elems = tile_size * tile_size; + int num_tiles = static_cast(cfg[3]); + + __gm__ float *base_a = reinterpret_cast<__gm__ float *>(a_tensor->buffer.addr) + a_tensor->start_offset; + __gm__ float *base_b = reinterpret_cast<__gm__ float *>(b_tensor->buffer.addr) + b_tensor->start_offset; + __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < 
num_tiles; tile_idx++) { + __gm__ float *a = base_a + (tile_idx * tile_elems); + __gm__ float *b = base_b + (tile_idx * tile_elems); + __gm__ float *o = base_out + (tile_idx * tile_elems); + switch (tile_size) { + case 16: + add_tile_impl<16>(a, b, o); + break; + case 32: + add_tile_impl<32>(a, b, o); + break; + case 64: + add_tile_impl<64>(a, b, o); + break; + case 128: + add_tile_impl<128>(a, b, o); + break; + default: + break; + } + } +} diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_sum.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_sum.cpp new file mode 100644 index 000000000..59bc2b6f2 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/aiv/kernel_sum.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MIX co-ownership test — consumer (AIV): Vfinal = V0 + V1. + * + * Reads the two heap outputs produced by the MIX task's AIV0/AIV1 lanes and + * writes the external Vfinal. Its fan-in is the single MIX task id, so it can + * only run once the joint completion flag is set (i.e. after BOTH co-owned + * AIV subtasks finished), validating the block.won remaining-counter logic. 
+ * + * args[0] = V0 (INPUT, heap) + * args[1] = V1 (INPUT, heap) + * args[2] = Vfinal (INOUT, external) + * args[3] = config (INPUT) int64_t[4]: [tile_size, grid_k, num_groups, num_tiles] + */ + +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void sum_tile_impl(__gm__ float *v0_ptr, __gm__ float *v1_ptr, __gm__ float *dst_ptr) { + using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>; + using DynStridDim5 = Stride<1, 1, 1, TILE, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData v0Tile(TILE, TILE); + TileData v1Tile(TILE, TILE); + TileData outTile(TILE, TILE); + TASSIGN(v0Tile, 0x0); + TASSIGN(v1Tile, 0x10000); + TASSIGN(outTile, 0x20000); + + GlobalData v0Global(v0_ptr); + GlobalData v1Global(v1_ptr); + GlobalData outGlobal(dst_ptr); + + TLOAD(v0Tile, v0Global); + TLOAD(v1Tile, v1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(outTile, v0Tile, v1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(outGlobal, outTile); + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *config = reinterpret_cast<__gm__ Tensor *>(args[3]); + + __gm__ int64_t *cfg = reinterpret_cast<__gm__ int64_t *>(config->buffer.addr); + uint64_t tile_size = static_cast(cfg[0]); + uint64_t tile_elems = tile_size * tile_size; + int num_tiles = static_cast(cfg[3]); + + __gm__ float *base_v0 = reinterpret_cast<__gm__ float *>(v0_tensor->buffer.addr) + v0_tensor->start_offset; + __gm__ float *base_v1 = reinterpret_cast<__gm__ float 
*>(v1_tensor->buffer.addr) + v1_tensor->start_offset; + __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float *v0 = base_v0 + (tile_idx * tile_elems); + __gm__ float *v1 = base_v1 + (tile_idx * tile_elems); + __gm__ float *o = base_out + (tile_idx * tile_elems); + switch (tile_size) { + case 16: + sum_tile_impl<16>(v0, v1, o); + break; + case 32: + sum_tile_impl<32>(v0, v1, o); + break; + case 64: + sum_tile_impl<64>(v0, v1, o); + break; + case 128: + sum_tile_impl<128>(v0, v1, o); + break; + default: + break; + } + } +} diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/orchestration/mix_coown_orch.cpp b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/orchestration/mix_coown_orch.cpp new file mode 100644 index 000000000..4d5b741e2 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/mix_coown/kernels/orchestration/mix_coown_orch.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * MIX co-ownership orchestration (fully_distributed_within_core). 
+ * + * For each group g it submits a 1C+2V MIX task and a dependent consumer, + * exercising the block.won anchor->follower deposit/drain path (§3.1): + * + * MIX[g] (1C+2V): Cmm[g] = A[g] @ B[g] (AIC lane, external out) + * V0 = A[g] + B[g] (AIV0 lane, heap out) + * V1 = A[g] + B[g] (AIV1 lane, heap out) + * consumer[g] (1V): Vfinal[g] = V0 + V1 (depends on the single MIX + * completion flag) + * + * Golden: Cmm[g] = A[g]@B[g]; Vfinal[g] = 2*(A[g]+B[g]). + * + * Arg layout (external): [A, B, Cmm, Vfinal, config] + */ + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_MM 0 +#define FUNC_ADD_V0 1 +#define FUNC_ADD_V1 2 +#define FUNC_SUM 3 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 5, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + const Tensor &ext_A = orch_args.tensor(0).ref(); + const Tensor &ext_B = orch_args.tensor(1).ref(); + const Tensor &ext_Cmm = orch_args.tensor(2).ref(); + const Tensor &ext_Vfinal = orch_args.tensor(3).ref(); + const Tensor &ext_config = orch_args.tensor(4).ref(); + + int64_t *host_config = orch_args.tensor(4).ref().data_as(); + int tile_size = static_cast(host_config[0]); + int num_groups = static_cast(host_config[2]); + int num_tiles = static_cast(host_config[3]); + uint64_t tile_elems = static_cast(tile_size) * tile_size; + uint64_t group_elems = static_cast(num_tiles) * tile_elems; + + LOG_INFO_V0( + "[mix_coown_orch] tile_size=%d num_groups=%d num_tiles=%d", tile_size, num_groups, num_tiles + ); + + uint32_t group_shapes[1] = {static_cast(group_elems)}; + TensorCreateInfo heap_ci(group_shapes, 1, DataType::FLOAT32); + + for (int g = 0; g < num_groups; g++) { + PTO2_SCOPE_GUARD(); + + uint32_t off[1] = {static_cast(static_cast(g) * 
group_elems)}; + Tensor A_view = ext_A.view(group_shapes, off); + Tensor B_view = ext_B.view(group_shapes, off); + Tensor Cmm_view = ext_Cmm.view(group_shapes, off); + Tensor Vfinal_view = ext_Vfinal.view(group_shapes, off); + + // 1C + 2V MIX task. Shared arg list; each lane writes its own output. + L0TaskArgs mix; + mix.add_input(A_view); // 0 + mix.add_input(B_view); // 1 + mix.add_inout(Cmm_view); // 2 (AIC writes Cmm) + mix.add_output(heap_ci); // 3 V0 (AIV0 writes) + mix.add_output(heap_ci); // 4 V1 (AIV1 writes) + mix.add_input(ext_config); // 5 + MixedKernels mk; + mk.aic_kernel_id = FUNC_MM; + mk.aiv0_kernel_id = FUNC_ADD_V0; + mk.aiv1_kernel_id = FUNC_ADD_V1; + TaskOutputTensors outs = rt_submit_task(mk, mix); + + // Consumer (1V): Vfinal = V0 + V1 — depends on the single MIX flag. + L0TaskArgs cons; + cons.add_input(outs.get_ref(0)); // 0 V0 + cons.add_input(outs.get_ref(1)); // 1 V1 + cons.add_inout(Vfinal_view); // 2 Vfinal + cons.add_input(ext_config); // 3 + rt_submit_aiv_task(FUNC_SUM, cons); + } + + LOG_INFO_V0("[mix_coown_orch] submitted %d MIX + %d consumer tasks", num_groups, num_groups); +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/mix_coown/test_mix_coown.py b/examples/a2a3/fully_distributed_within_core/mix_coown/test_mix_coown.py new file mode 100644 index 000000000..fadc4723c --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/mix_coown/test_mix_coown.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""MIX co-ownership test for fully_distributed_within_core. + +Each group submits a 1C+2V MIX task (Cmm=A@B on AIC, V0=A+B on AIV0, V1=A+B on +AIV1) plus a consumer (Vfinal=V0+V1). This exercises the block.won anchor-> +follower deposit/drain path and the single joint completion flag. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestMixCoown(SceneTestCase): + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/mix_coown_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT, D.OUT, D.IN], + }, + "incores": [ + { + "func_id": 0, + "name": "MM", + "source": "kernels/aic/kernel_mm.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.INOUT, D.OUT, D.OUT, D.IN], + }, + { + "func_id": 1, + "name": "ADD_V0", + "source": "kernels/aiv/kernel_add_v0.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.INOUT, D.OUT, D.OUT, D.IN], + }, + { + "func_id": 2, + "name": "ADD_V1", + "source": "kernels/aiv/kernel_add_v1.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.INOUT, D.OUT, D.OUT, D.IN], + }, + { + "func_id": 3, + "name": "SUM", + "source": "kernels/aiv/kernel_sum.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.INOUT, D.IN], + }, + ], + } + + CASES = [ + { + "name": "Mix12", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": 
{"num_groups": 12, "tile_size": 64}, + }, + { + "name": "Mix24", + "manual": True, + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"num_groups": 48, "tile_size": 64}, + }, + ] + + def generate_args(self, params): + n = params["num_groups"] + t = params["tile_size"] + A = torch.randn(n, t, t, dtype=torch.float32) * 0.01 + B = torch.randn(n, t, t, dtype=torch.float32) * 0.01 + Cmm = torch.zeros(n, t, t, dtype=torch.float32) + Vfinal = torch.zeros(n, t, t, dtype=torch.float32) + # config: [tile_size, grid_k(unused), num_groups, num_tiles_per_group] + config = torch.tensor([t, 1, n, 1], dtype=torch.int64) + return TaskArgsBuilder( + Tensor("A", A.flatten()), + Tensor("B", B.flatten()), + Tensor("Cmm", Cmm.flatten()), + Tensor("Vfinal", Vfinal.flatten()), + Tensor("config", config), + ) + + def compute_golden(self, args, params): + n = params["num_groups"] + t = params["tile_size"] + A = args.A.reshape(n, t, t) + B = args.B.reshape(n, t, t) + Cmm = args.Cmm.reshape(n, t, t) + Vfinal = args.Vfinal.reshape(n, t, t) + for g in range(n): + Cmm[g] = torch.matmul(A[g], B[g]) + Vfinal[g] = 2.0 * (A[g] + B[g]) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 000000000..0220a6bbb --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. +// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) { + __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); + __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr); + __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); + + // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, Stride>; + using GlobalB = GlobalTensor, Stride>; + using GlobalOut = GlobalTensor, Stride>; + + GlobalA pijGlobal(pij_addr + pij->start_offset); + GlobalB vjGlobal(vj_addr + vj->start_offset); + GlobalOut oiGlobal(oi_addr + oi->start_offset); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + 
TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 with separate events for pipeline overlap + TLOAD(aMatTile, pijGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done + TLOAD(bMatTile, vjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done + + // Move A to L0A as soon as A load completes (B may still be loading) + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); + uint64_t q_tile_size = static_cast(pij->shapes[0]); + + if (q_tile_size == 16 && pij->shapes[1] <= 16) { + pv_matmul_impl<16, 16, 16>(pij, vj, oi_new); + } else if (q_tile_size == 16) { + pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); + } else { + pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 000000000..efd423bd6 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) PyPTO 
Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Tile configurations via runtime dispatch (besides a small 16x16x16 case used when qi->shapes[1] <= 16): +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) { + __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr); + __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr); + __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); + + // qi (M, K) bf16 in ND (row-major) layout + using GlobalA = GlobalTensor, Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, Stride, Layout::DN>; + using GlobalOut = GlobalTensor, Stride>; + + GlobalA qiGlobal(qi_addr + qi->start_offset); + GlobalB kjGlobal(kj_addr + kj->start_offset); + GlobalOut sijGlobal(sij_addr + sij->start_offset); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load A and B to L1 with separate events for pipeline overlap + TLOAD(aMatTile, qiGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done + TLOAD(bMatTile, kjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done + + // Move A to L0A as soon as A load completes (B may still be loading) + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + 
wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Matmul + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]); + uint64_t q_tile_size = static_cast(qi->shapes[0]); + + if (q_tile_size == 16 && qi->shapes[1] <= 16) { + qk_matmul_impl<16, 16, 16>(qi, kj, sij); + } else if (q_tile_size == 16) { + qk_matmul_impl<16, 128, 128>(qi, kj, sij); + } else { + qk_matmul_impl<64, 128, 64>(qi, kj, sij); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 000000000..ded4dcad8 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors +// +// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): +// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. +// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). +// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. +// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl( + __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li, + __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst +) { + __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); + __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); + __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); + __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); + __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); + __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); + __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); + + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = 
GlobalTensor, Stride<1, 1, 1, N, 1>>; + + // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading + using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // Scalar ND: for storing mi_new and li_new back to GM + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + using GlobalScalarND = + GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); + GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); + GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); + + // DN globals for loading scalars as ColMajor + GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); + GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); + GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + + // ND globals for storing scalar results + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarDN = Tile; + + // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE + using TileScalarRow = Tile; + + // ND tile for storing back to GM + using TileScalarND = + Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar DN tiles loaded from GM (ColMajor) + TileScalarDN mijDN, lijDN, miDN, liDN; + + // Temporary DN tiles for results + TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijDN, 2 * kDataBytes); + TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); + TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); + 
TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Store mi = mij, li = lij, oi = oi_new + // Alias ND tiles to the same UB as DN tiles for storing as ND format + TileScalarND mijND, lijND; + TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN + TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + TLOAD(miDN, miGlobalDN); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic + TileScalarRow miRow, mijRow, liRow, lijRow; + TRESHAPE(miRow, miDN); + TRESHAPE(mijRow, mijDN); + TRESHAPE(liRow, liDN); + TRESHAPE(lijRow, lijDN); + + // Scalar 
arithmetic in RowMajor (1, M) layout + TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; + TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); + + TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) + pipe_barrier(PIPE_V); + TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new + pipe_barrier(PIPE_V); + TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) + pipe_barrier(PIPE_V); + TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new + pipe_barrier(PIPE_V); + TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) + pipe_barrier(PIPE_V); + TMUL(tmpRow, alphaRow, liRow); // alpha * li + pipe_barrier(PIPE_V); + TMUL(liNewRow, betaRow, lijRow); // beta * lij + pipe_barrier(PIPE_V); + TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij + + // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL + TRESHAPE(alphaDN, alphaRow); + TRESHAPE(betaDN, betaRow); + + // Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + pipe_barrier(PIPE_V); + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + // Store mi_new and li_new to GM (ND format) + // Alias ND tiles to the same UB locations as miNewRow and liNewRow + TileScalarND miNewND, liNewND; + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); + + if (is_last) { + // Normalize and output: dst = oi / li_new + TRESHAPE(liNewDN, liNewRow); + pipe_barrier(PIPE_V); + TROWEXPANDDIV(oiTile, oiTile, liNewDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); 
// persist li_new + TSTORE(dstGlobal, oiTile); + } else { + // Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new + TSTORE(oiGlobal, oiTile); + } + } + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]); + __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + uint64_t q_tile_size = static_cast(mij->shapes[0]); + + if (q_tile_size == 16 && oi_new->shapes[1] <= 16) { + online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 000000000..8f0c41775 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Operates on (M, N) tile where M=q_tile_size, N=block_size: +// Case1: sij is (16, 128) +// Case2: sij is (64, 64) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 +// so that invalid key positions contribute zero attention weight. +// +// Computes: +// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl( + __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij +) { + uint64_t valid_len = static_cast(sij->shapes[1]); + __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); + __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); + __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); + __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using 
GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); + GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); + GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); + GlobalScalarDN lijGlobal(lij_addr + lij->start_offset); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_bf16 pijBf16Tile; + + // All sij tiles share UB address 0x0 (in-place masking) + TASSIGN(sijTile, 0x0); + TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + // printf("sij addr incore %x\n", sij->buffer.addr); + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, + // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. 
+ TFILLPAD_INPLACE(sijPadTile, sijDynTile); + pipe_barrier(PIPE_V); + + TMULS(sijTile, sijTile, scale_value); + pipe_barrier(PIPE_V); + TROWMAX(maxTile, sijTile, tmpTile); + pipe_barrier(PIPE_V); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + pipe_barrier(PIPE_V); + TEXP(pijTile, pijTile); + // Truncate pij to bf16 first + pipe_barrier(PIPE_V); + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); // pij bf16 ready, can store early + + // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel + pipe_barrier(PIPE_V); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + pipe_barrier(PIPE_V); + TROWSUM(sumTile, pijTile, tmpTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); // sum ready + + // Store pij (overlaps with TCVT + TROWSUM above) + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(pijGlobal, pijBf16Tile); + + // Store max and sum + TSTORE(mijGlobal, maxTile); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(lijGlobal, sumTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast(args[4]); + float scale_value = scale_conv.f; + uint64_t q_tile_size = static_cast(sij->shapes[0]); + + if (q_tile_size == 16 && pij->shapes[1] <= 16) { + softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij); + } else if (q_tile_size == 16) { + softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); + } else { + softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/orchestration/paged_attention_orch.cpp 
b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 000000000..2ed86cdf2 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Paged Attention Orchestration Function + * + * Tile sizes are read from tensor metadata at run time (q_tile = min(num_heads, 128)). + * Each KV block submits QK-matmul, softmax-prepare, PV-matmul and online-update; the layout below is the 16x16 case.
+ * + * Memory Layout: + * Query: (batch, 16, 16) - one 16x16 tile per batch + * Key: (total_blocks, 16, 16) - stored as K^T for direct matmul + * Value: (total_blocks, 16, 16) - direct format + */ + +#include +#include +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz + +inline double cycles_to_us(uint64_t cycles) { + return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; +} + +inline uint64_t get_sys_cnt_aicpu() { +#if defined(__aarch64__) + uint64_t ticks; + asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); + return ticks; +#elif defined(__x86_64__) + return 0; +#else + return 0; +#endif +} + +#ifdef ENABLE_PROFILING +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) +#define PROF_INC(counter, n) (counter) += (n) +#else +#define CYCLE_COUNT_START() (void)0 +#define CYCLE_COUNT_LAP(acc) (void)0 +#define PROF_INC(counter, n) (void)0 +#endif + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { +#ifdef ENABLE_PROFILING + uint64_t prof_param_extract = 0; + uint64_t prof_ext_tensor = 0; + uint64_t prof_scope = 0; + uint64_t prof_make_tensor = 0; + uint64_t prof_tensor_view = 0; + uint64_t prof_param_setup = 0; + uint64_t prof_submit_task = 0; + int prof_submit_count = 0; + int prof_make_count = 0; + int prof_view_count = 0; +#endif + + CYCLE_COUNT_START(); + + // Read dimensions from tensor metadata + uint64_t batch = 
orch_args.tensor(0).ref().shapes[0]; + uint64_t num_heads = orch_args.tensor(0).ref().shapes[1]; + uint64_t head_dim = orch_args.tensor(0).ref().shapes[2]; + DataType data_type = orch_args.tensor(0).ref().dtype; + + uint64_t block_size = orch_args.tensor(1).ref().shapes[1]; + uint64_t block_num = orch_args.tensor(3).ref().shapes[1]; + + uint64_t scale_value = orch_args.scalar(0); + + uint64_t q_head_num = num_heads; + uint64_t q_tile = std::min(num_heads, static_cast(128)); + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + CYCLE_COUNT_LAP(prof_param_extract); + + LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch); + + // Reshape tensors for kernel consumption (2D flattened) + void *query_ptr = orch_args.tensor(0).ref().data_as(); + void *kc_ptr = orch_args.tensor(1).ref().data_as(); + void *vc_ptr = orch_args.tensor(2).ref().data_as(); + void *out_ptr = orch_args.tensor(5).ref().data_as(); + + uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0]; + + uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + uint32_t key_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t value_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type); + Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); + CYCLE_COUNT_LAP(prof_ext_tensor); + + uint32_t bt_shapes[2] = {static_cast(batch), static_cast(block_num)}; + Tensor block_table = + make_tensor_external(orch_args.tensor(3).ref().data_as(), bt_shapes, 2, DataType::INT32, false); + uint32_t cl_shapes[1] = {static_cast(batch)}; + Tensor 
context_lens = + make_tensor_external(orch_args.tensor(4).ref().data_as(), cl_shapes, 1, DataType::INT32, false); + + // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size + uint32_t tile2d_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; + uint32_t scalar_shapes[1] = {static_cast(q_tile)}; + uint32_t sij_shapes[2] = {static_cast(q_tile), static_cast(block_size)}; + TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32); + TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32); + TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32); + TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type); + + PROF_INC(prof_make_count, 4); + CYCLE_COUNT_LAP(prof_make_tensor); + + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { + uint32_t cl_idx[1] = {static_cast(b_idx)}; + uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + PTO2_SCOPE() { + CYCLE_COUNT_LAP(prof_scope); + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + + uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; + Tensor qi = query.view(tile2d_shapes, qi_offsets); + uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; + Tensor out_view = out.view(tile2d_shapes, out_view_offsets); + PROF_INC(prof_view_count, 2); + CYCLE_COUNT_LAP(prof_tensor_view); + + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci); + const Tensor &oi = alloc_outs.get_ref(0); + const Tensor &li_update = alloc_outs.get_ref(1); + const Tensor &mi_update = alloc_outs.get_ref(2); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + PTO2_SCOPE_GUARD(); + + uint32_t bt_idx[2] = {static_cast(b_idx), static_cast(bn)}; + uint64_t cur_block_idx = static_cast(get_tensor_data(block_table, 2, 
bt_idx)); + uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); + CYCLE_COUNT_LAP(prof_param_extract); + + uint32_t kv_shapes[2] = {static_cast(block_size), static_cast(head_dim)}; + uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets); + Tensor vj = value_cache.view(kv_shapes, kv_offsets); + PROF_INC(prof_view_count, 2); + CYCLE_COUNT_LAP(prof_tensor_view); + + L0TaskArgs params_qk; + params_qk.add_input(qi); + params_qk.add_input(kj); + params_qk.add_output(sij_ci); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk); + const Tensor &sij = qk_outs.get_ref(0); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + uint32_t sij_valid_shapes[2] = {static_cast(q_tile), static_cast(valid_len)}; + uint32_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + PROF_INC(prof_view_count, 1); + CYCLE_COUNT_LAP(prof_tensor_view); + + L0TaskArgs params_sf; + params_sf.add_input(sij_valid); + params_sf.add_output(pij_f16_ci); + params_sf.add_output(scalar_ci); + params_sf.add_output(scalar_ci); + params_sf.add_scalar(scale_value); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf); + const Tensor &pij_f16 = sf_outs.get_ref(0); + const Tensor &mi = sf_outs.get_ref(1); + const Tensor &li = sf_outs.get_ref(2); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + L0TaskArgs params_pv; + params_pv.add_input(pij_f16); + params_pv.add_input(vj); + params_pv.add_output(tile2d_ci); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv); + const Tensor &oi_tmp = pv_outs.get_ref(0); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t is_first = (bn == 0) ? 
1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0; + CYCLE_COUNT_LAP(prof_param_extract); + + L0TaskArgs params_up; + params_up.add_input(mi); + params_up.add_input(li); + params_up.add_input(oi_tmp); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_inout(out_view); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + CYCLE_COUNT_LAP(prof_param_setup); + rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + } + } + CYCLE_COUNT_LAP(prof_scope); + } + } + +#ifdef ENABLE_PROFILING + uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + + prof_submit_task + prof_scope; + LOG_INFO_V9( + "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count, + prof_make_count, prof_view_count, cycles_to_us(total) + ); + if (total > 0) { + LOG_INFO_V9( + " param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract), + prof_param_extract * 100.0 / total + ); + LOG_INFO_V9( + " ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total + ); + LOG_INFO_V9( + " create_info(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor), + prof_make_tensor * 100.0 / total, + prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0 + ); + LOG_INFO_V9( + " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view), + prof_tensor_view * 100.0 / total, + prof_view_count > 0 ? 
cycles_to_us(prof_tensor_view) / prof_view_count : 0.0 + ); + LOG_INFO_V9( + " param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total + ); + LOG_INFO_V9(" scope : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total); + LOG_INFO_V9( + " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task), + prof_submit_task * 100.0 / total, + prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0 + ); + } +#endif +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention/test_paged_attention.py b/examples/a2a3/fully_distributed_within_core/paged_attention/test_paged_attention.py new file mode 100644 index 000000000..8405a0d3d --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention/test_paged_attention.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Paged attention: online softmax with AIC/AIV subgraph splitting (bfloat16).""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestPagedAttention(SceneTestCase): + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "name": "QK", + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "name": "SF", + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 2, + "name": "PV", + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 3, + "name": "UP", + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 
64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case3", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseSmall1", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 9}, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseSmall2", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseVarSeq2", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 2, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "context_lens_list": [33, 17], + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseVarSeq4", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 4, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, + "context_lens_list": [33, 64, 128, 15], + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + result = _pa_generate_inputs(params) + specs = [] + for name, value in result: + if isinstance(value, torch.Tensor): + specs.append(Tensor(name, value)) + else: + 
specs.append(Scalar(name, value)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 000000000..0220a6bbb --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. 
+// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) { + __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); + __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr); + __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); + + // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32 + using GlobalA = GlobalTensor, Stride>; + using GlobalB = GlobalTensor, Stride>; + using GlobalOut = GlobalTensor, Stride>; + + GlobalA pijGlobal(pij_addr + pij->start_offset); + GlobalB vjGlobal(vj_addr + vj->start_offset); + GlobalOut oiGlobal(oi_addr + oi->start_offset); + + // L1 Mat tiles: standard ND pattern for both A and B + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load pij and vj to L1 with separate events for pipeline overlap + TLOAD(aMatTile, pijGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done + TLOAD(bMatTile, vjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done + + // Move A to L0A as soon as A load completes (B may still be loading) + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + 
wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Single matmul: (M,K) x (K,N) -> (M,N) + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); + uint64_t q_tile_size = static_cast(pij->shapes[0]); + + if (q_tile_size == 16 && pij->shapes[1] <= 16) { + pv_matmul_impl<16, 16, 16>(pij, vj, oi_new); + } else if (q_tile_size == 16) { + pv_matmul_impl<16, 128, 128>(pij, vj, oi_new); + } else { + pv_matmul_impl<64, 64, 128>(pij, vj, oi_new); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 000000000..efd423bd6 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// kj is stored as (N, K) = (block_size, head_dim) in row-major memory. +// This is equivalent to (K, N) in column-major (DN) layout. +// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern. + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) { + __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr); + __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr); + __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); + + // qi (M, K) bf16 in ND (row-major) layout + using GlobalA = GlobalTensor, Stride>; + // kj stored as (N, K) row-major = (K, N) column-major -> DN layout + using GlobalB = GlobalTensor, Stride, Layout::DN>; + using GlobalOut = GlobalTensor, Stride>; + + GlobalA qiGlobal(qi_addr + qi->start_offset); + GlobalB kjGlobal(kj_addr + kj->start_offset); + GlobalOut sijGlobal(sij_addr + sij->start_offset); + + // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor) + using TileMatA = Tile; + using TileMatB = Tile; + + // L0 tiles + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Load 
A and B to L1 with separate events for pipeline overlap + TLOAD(aMatTile, qiGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // A load done + TLOAD(bMatTile, kjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // B load done + + // Move A to L0A as soon as A load completes (B may still be loading) + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(aTile, aMatTile); + // Move B to L0B after B load completes + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + // Matmul + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]); + uint64_t q_tile_size = static_cast(qi->shapes[0]); + + if (q_tile_size == 16 && qi->shapes[1] <= 16) { + qk_matmul_impl<16, 16, 16>(qi, kj, sij); + } else if (q_tile_size == 16) { + qk_matmul_impl<16, 128, 128>(qi, kj, sij); + } else { + qk_matmul_impl<64, 128, 64>(qi, kj, sij); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 000000000..ded4dcad8 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors +// +// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): +// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. +// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). +// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. +// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl( + __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li, + __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst +) { + __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); + __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); + __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); + __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); + __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); + __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); + __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); + + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; + + // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading + using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // Scalar ND: for storing mi_new and li_new back to GM + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + using GlobalScalarND = + GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); + GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); + GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); + + // DN globals for loading scalars as ColMajor + GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); + GlobalScalarDN 
lijGlobalDN(lij_ptr + lij->start_offset); + GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); + GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + + // ND globals for storing scalar results + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarDN = Tile; + + // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE + using TileScalarRow = Tile; + + // ND tile for storing back to GM + using TileScalarND = + Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar DN tiles loaded from GM (ColMajor) + TileScalarDN mijDN, lijDN, miDN, liDN; + + // Temporary DN tiles for results + TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijDN, 2 * kDataBytes); + TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); + TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); + TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Store mi = mij, li = lij, oi = oi_new + // Alias ND tiles to the same UB as DN tiles for storing as ND format + TileScalarND mijND, lijND; + TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN + TASSIGN(lijND, 2 * kDataBytes + 
kScalarDNBytes); // alias same UB as lijDN + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Load all inputs + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + TLOAD(miDN, miGlobalDN); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic + TileScalarRow miRow, mijRow, liRow, lijRow; + TRESHAPE(miRow, miDN); + TRESHAPE(mijRow, mijDN); + TRESHAPE(liRow, liDN); + TRESHAPE(lijRow, lijDN); + + // Scalar arithmetic in RowMajor (1, M) layout + TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; + TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); + + TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) + pipe_barrier(PIPE_V); + TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new + pipe_barrier(PIPE_V); + TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) + pipe_barrier(PIPE_V); + TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new + pipe_barrier(PIPE_V); + TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) 
+ pipe_barrier(PIPE_V); + TMUL(tmpRow, alphaRow, liRow); // alpha * li + pipe_barrier(PIPE_V); + TMUL(liNewRow, betaRow, lijRow); // beta * lij + pipe_barrier(PIPE_V); + TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij + + // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL + TRESHAPE(alphaDN, alphaRow); + TRESHAPE(betaDN, betaRow); + + // Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + pipe_barrier(PIPE_V); + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + // Store mi_new and li_new to GM (ND format) + // Alias ND tiles to the same UB locations as miNewRow and liNewRow + TileScalarND miNewND, liNewND; + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); + + if (is_last) { + // Normalize and output: dst = oi / li_new + TRESHAPE(liNewDN, liNewRow); + pipe_barrier(PIPE_V); + TROWEXPANDDIV(oiTile, oiTile, liNewDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new + TSTORE(dstGlobal, oiTile); + } else { + // Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new + TSTORE(oiGlobal, oiTile); + } + } + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ Tensor *oi = reinterpret_cast<__gm__ 
Tensor *>(args[5]); + __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + uint64_t q_tile_size = static_cast(mij->shapes[0]); + + if (q_tile_size == 16 && oi_new->shapes[1] <= 16) { + online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 000000000..8f0c41775 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Softmax Preparation Kernel (AIV) with partial block masking +// +// Operates on (M, N) tile where M=q_tile_size, N=block_size: +// Case1: sij is (16, 128) +// Case2: sij is (64, 64) +// +// For partial blocks (valid_len < N), positions [valid_len, N) in sij are +// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0 +// so that invalid key positions contribute zero attention weight. +// +// Computes: +// sij_masked = TFILLPAD(sij, valid_len, pad=-inf) +// sij_scale = sij_masked * scale +// mij = row_max(sij_scale) -> (M, 1) +// pij = exp(sij_scale - mij) -> (M, N) +// lij = row_sum(pij) -> (M, 1) + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_impl( + __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij +) { + uint64_t valid_len = static_cast(sij->shapes[1]); + __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr); + __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr); + __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); + __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; + + GlobalDataMxN sijGlobal(sij_addr + sij->start_offset); + GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset); + GlobalScalarDN mijGlobal(mij_addr + mij->start_offset); + GlobalScalarDN lijGlobal(lij_addr + 
lij->start_offset); + + // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary + using TileSijDyn = Tile; + // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijDyn sijDynTile(static_cast(valid_len)); + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_bf16 pijBf16Tile; + + // All sij tiles share UB address 0x0 (in-place masking) + TASSIGN(sijTile, 0x0); + TASSIGN(sijDynTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks + // printf("sij addr incore %x\n", sij->buffer.addr); + TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary, + // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N. 
+ TFILLPAD_INPLACE(sijPadTile, sijDynTile); + pipe_barrier(PIPE_V); + + TMULS(sijTile, sijTile, scale_value); + pipe_barrier(PIPE_V); + TROWMAX(maxTile, sijTile, tmpTile); + pipe_barrier(PIPE_V); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + pipe_barrier(PIPE_V); + TEXP(pijTile, pijTile); + // Truncate pij to bf16 first + pipe_barrier(PIPE_V); + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); // pij bf16 ready, can store early + + // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel + pipe_barrier(PIPE_V); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + pipe_barrier(PIPE_V); + TROWSUM(sumTile, pijTile, tmpTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); // sum ready + + // Store pij (overlaps with TCVT + TROWSUM above) + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(pijGlobal, pijBf16Tile); + + // Store max and sum + TSTORE(mijGlobal, maxTile); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(lijGlobal, sumTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast(args[4]); + float scale_value = scale_conv.f; + uint64_t q_tile_size = static_cast(sij->shapes[0]); + + if (q_tile_size == 16 && pij->shapes[1] <= 16) { + softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij); + } else if (q_tile_size == 16) { + softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij); + } else { + softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/orchestration/paged_attention_orch.cpp 
b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 000000000..4ddab0a70 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Paged Attention Orchestration Function - manual-scope variant + * + * Matches the small-case paged_attention orchestration shape while replacing + * the automatic same-scope dependency wiring with explicit task-to-task deps + * inside PTO2_SCOPE(PTO2ScopeMode::MANUAL). 
+ */ + +#include +#include +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz + +inline double cycles_to_us(uint64_t cycles) { + return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; +} + +inline uint64_t get_sys_cnt_aicpu() { +#if defined(__aarch64__) + uint64_t ticks; + asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); + return ticks; +#elif defined(__x86_64__) + return 0; +#else + return 0; +#endif +} + +#ifdef ENABLE_PROFILING +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) +#define PROF_INC(counter, n) (counter) += (n) +#else +#define CYCLE_COUNT_START() (void)0 +#define CYCLE_COUNT_LAP(acc) (void)0 +#define PROF_INC(counter, n) (void)0 +#endif + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { +#ifdef ENABLE_PROFILING + uint64_t prof_param_extract = 0; + uint64_t prof_ext_tensor = 0; + uint64_t prof_scope = 0; + uint64_t prof_make_tensor = 0; + uint64_t prof_tensor_view = 0; + uint64_t prof_param_setup = 0; + uint64_t prof_submit_task = 0; + int prof_submit_count = 0; + int prof_make_count = 0; + int prof_view_count = 0; +#endif + + CYCLE_COUNT_START(); + + // Read dimensions from tensor metadata + uint64_t batch = orch_args.tensor(0).ref().shapes[0]; + uint64_t num_heads = orch_args.tensor(0).ref().shapes[1]; + uint64_t head_dim = orch_args.tensor(0).ref().shapes[2]; + DataType data_type = orch_args.tensor(0).ref().dtype; + + uint64_t 
block_size = orch_args.tensor(1).ref().shapes[1]; + uint64_t block_num = orch_args.tensor(3).ref().shapes[1]; + + uint64_t scale_value = orch_args.scalar(0); + + uint64_t q_head_num = num_heads; + uint64_t q_tile = std::min(num_heads, static_cast(128)); + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + CYCLE_COUNT_LAP(prof_param_extract); + + LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch); + + // Reshape tensors for kernel consumption (2D flattened) + void *query_ptr = orch_args.tensor(0).ref().data_as(); + void *kc_ptr = orch_args.tensor(1).ref().data_as(); + void *vc_ptr = orch_args.tensor(2).ref().data_as(); + void *out_ptr = orch_args.tensor(5).ref().data_as(); + + uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0]; + + uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + uint32_t key_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t value_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type); + Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); + CYCLE_COUNT_LAP(prof_ext_tensor); + + uint32_t bt_shapes[2] = {static_cast(batch), static_cast(block_num)}; + Tensor block_table = + make_tensor_external(orch_args.tensor(3).ref().data_as(), bt_shapes, 2, DataType::INT32, false); + uint32_t cl_shapes[1] = {static_cast(batch)}; + Tensor context_lens = + make_tensor_external(orch_args.tensor(4).ref().data_as(), cl_shapes, 1, DataType::INT32, false); + + // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size + uint32_t tile2d_shapes[2] = 
{static_cast(q_tile), static_cast(head_dim)}; + uint32_t scalar_shapes[1] = {static_cast(q_tile)}; + uint32_t sij_shapes[2] = {static_cast(q_tile), static_cast(block_size)}; + TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32); + TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32); + TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32); + TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type); + + PROF_INC(prof_make_count, 4); + CYCLE_COUNT_LAP(prof_make_tensor); + + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { + uint32_t cl_idx[1] = {static_cast(b_idx)}; + uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + CYCLE_COUNT_LAP(prof_scope); + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + + uint32_t qi_offsets[2] = {static_cast(cur_offset), 0}; + Tensor qi = query.view(tile2d_shapes, qi_offsets); + uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; + Tensor out_view = out.view(tile2d_shapes, out_view_offsets); + PROF_INC(prof_view_count, 2); + CYCLE_COUNT_LAP(prof_tensor_view); + + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci); + const Tensor &oi = alloc_outs.get_ref(0); + const Tensor &li_update = alloc_outs.get_ref(1); + const Tensor &mi_update = alloc_outs.get_ref(2); + PTO2TaskId alloc_task = alloc_outs.task_id(); + PTO2TaskId prev_update_task = PTO2TaskId::invalid(); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + for (uint64_t bn = 0; bn < bn_this_batch; bn++) { + uint32_t bt_idx[2] = {static_cast(b_idx), static_cast(bn)}; + uint64_t cur_block_idx = static_cast(get_tensor_data(block_table, 2, bt_idx)); + uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size); + CYCLE_COUNT_LAP(prof_param_extract); + + uint32_t 
kv_shapes[2] = {static_cast(block_size), static_cast(head_dim)}; + uint32_t kv_offsets[2] = {static_cast(cur_block_idx * block_size), 0}; + Tensor kj = key_cache.view(kv_shapes, kv_offsets); + Tensor vj = value_cache.view(kv_shapes, kv_offsets); + PROF_INC(prof_view_count, 2); + CYCLE_COUNT_LAP(prof_tensor_view); + + L0TaskArgs params_qk; + params_qk.add_input(qi); + params_qk.add_input(kj); + params_qk.add_output(sij_ci); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk); + const Tensor &sij = qk_outs.get_ref(0); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + uint32_t sij_valid_shapes[2] = {static_cast(q_tile), static_cast(valid_len)}; + uint32_t sij_valid_offsets[2] = {0, 0}; + Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets); + PROF_INC(prof_view_count, 1); + CYCLE_COUNT_LAP(prof_tensor_view); + + // --- Primitive dep API (Arg + set_dependencies) --- + // Caller owns the deps buffer; Arg stores (ptr, count). + // Suited for codegen and for cases with a fixed dep set. 
+ L0TaskArgs params_sf; + params_sf.add_input(sij_valid); + params_sf.add_output(pij_f16_ci); + params_sf.add_output(scalar_ci); + params_sf.add_output(scalar_ci); + PTO2TaskId sf_deps[] = {qk_outs.task_id()}; + params_sf.set_dependencies(sf_deps, 1); + params_sf.add_scalar(scale_value); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf); + const Tensor &pij_f16 = sf_outs.get_ref(0); + const Tensor &mi = sf_outs.get_ref(1); + const Tensor &li = sf_outs.get_ref(2); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + L0TaskArgs params_pv; + params_pv.add_input(pij_f16); + params_pv.add_input(vj); + params_pv.add_output(tile2d_ci); + PTO2TaskId pv_deps[] = {sf_outs.task_id()}; + params_pv.set_dependencies(pv_deps, 1); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv); + const Tensor &oi_tmp = pv_outs.get_ref(0); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0; + CYCLE_COUNT_LAP(prof_param_extract); + + // --- Convenience dep API (L0TaskArgsWithDeps + add_dep) --- + // Wrapper owns a stack-sized deps buffer and accepts + // incremental add_dep() calls; the submit overload binds + // them to the underlying Arg via set_dependencies(...). + // Suited for hand-written orch where the dep set is + // assembled conditionally across branches. + L0TaskArgsWithDeps<> params_up; + params_up.add_input(mi); + params_up.add_input(li); + params_up.add_input(oi_tmp); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_inout(out_view); + // UP reads SF's mi/li, but SF -> PV -> UP already orders it; only the PV edge is explicit. 
+ params_up.add_dep(pv_outs.task_id()); + if (prev_update_task.is_valid()) { + params_up.add_dep(prev_update_task); + } + // alloc completes inline; this dep only keeps the scratch buffers alive until the last consumer. + if (is_last) { + params_up.add_dep(alloc_task); + } + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors up_outs = rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up); + prev_update_task = up_outs.task_id(); + PROF_INC(prof_submit_count, 1); + CYCLE_COUNT_LAP(prof_submit_task); + } + } + CYCLE_COUNT_LAP(prof_scope); + } + } + +#ifdef ENABLE_PROFILING + uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + + prof_submit_task + prof_scope; + LOG_INFO_V9( + "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count, + prof_make_count, prof_view_count, cycles_to_us(total) + ); + if (total > 0) { + LOG_INFO_V9( + " param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract), + prof_param_extract * 100.0 / total + ); + LOG_INFO_V9( + " ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total + ); + LOG_INFO_V9( + " create_info(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor), + prof_make_tensor * 100.0 / total, + prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0 + ); + LOG_INFO_V9( + " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view), + prof_tensor_view * 100.0 / total, + prof_view_count > 0 ? 
cycles_to_us(prof_tensor_view) / prof_view_count : 0.0 + ); + LOG_INFO_V9( + " param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total + ); + LOG_INFO_V9(" scope : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total); + LOG_INFO_V9( + " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task), + prof_submit_task * 100.0 / total, + prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0 + ); + } +#endif +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/test_paged_attention.py b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/test_paged_attention.py new file mode 100644 index 000000000..971c714b6 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_manual_scope/test_paged_attention.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Paged attention manual-scope scene test for the A2A3 fully_distributed_within_core runtime.""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestPagedAttentionManualScope(SceneTestCase): + """Scene test for the manual-scope paged-attention orchestration and its QK/SF/PV/UP kernels.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "name": "QK", + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "name": "SF", + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 2, + "name": "PV", + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 3, + "name": "UP", + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a2a3"], + # Long-context cases submit >16384 in-flight tasks into a single + # MANUAL scope; the default per-ring task window (16384) can fill + # before the oldest task retires and wedge the orchestrator + # (FLOW_CONTROL_DEADLOCK / code 3). Double the window for headroom. + # NOTE(review): confirm this rationale still applies under the fully_distributed_within_core runtime.
+ "config": {"aicpu_thread_num": 4, "block_dim": 24, "runtime_env": {"ring_task_window": 32768}}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case3", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseSmall1", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 9}, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseSmall2", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseVarSeq2", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 2, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "context_lens_list": [33, 17], + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseVarSeq4", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24},
+ "manual": True, + "params": { + "batch": 4, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, + "context_lens_list": [33, 64, 128, 15], + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + """Wrap each (name, value) pair from the shared input generator as a Tensor (torch.Tensor values) or a Scalar (anything else).""" + result = _pa_generate_inputs(params) + specs = [] + for name, value in result: + if isinstance(value, torch.Tensor): + specs.append(Tensor(name, value)) + else: + specs.append(Scalar(name, value)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + """Run the shared golden on a name-to-value dict of the Tensor specs, then copy every entry back into the same-named attribute of ``args``.""" + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py b/examples/a2a3/fully_distributed_within_core/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py new file mode 100644 index 000000000..88f3de4d3 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License.
+# ----------------------------------------------------------------------------------------------------------- +"""Paged attention with small ring buffer sizes — stress test for ring rotation/reclamation. + +Ring sizing is set per case through ``config.runtime_env`` (ring_task_window / +ring_heap / ring_dep_pool) rather than the process-global PTO2_RING_* environment +variables. The case also exercises INOUT tensors, bfloat16, and AIC+AIV mixed execution. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden # noqa: PLC0415 +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs # noqa: PLC0415 + +# Kernel and orchestration sources are taken from the ST test tree through this relative path. +# NOTE(review): the path targets the tensormap_and_ringbuffer tree although this test is registered with +# runtime="fully_distributed_within_core" — confirm the reuse is intentional. +PA_KERNELS = "../../../../tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels" + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestPagedAttentionRingbuffer(SceneTestCase): + """Paged attention with small ring buffer sizes for stress testing.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": f"{PA_KERNELS}/orchestration/paged_attention_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{PA_KERNELS}/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{PA_KERNELS}/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{PA_KERNELS}/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": f"{PA_KERNELS}/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "ringbuffer_stress", +
"platforms": ["a2a3"], + # ring_heap is bytes per ring. Non power-of-2 sizes are accepted, + # but 4 MiB keeps the small-ring stress intent compact. + "config": { + "aicpu_thread_num": 4, + "block_dim": 24, + "runtime_env": { + "ring_task_window": 64, + "ring_heap": 4 * 1024 * 1024, + "ring_dep_pool": 256, + }, + }, + "params": { + "batch": 32, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 4096, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + """Wrap each (name, value) pair from the shared input generator as a Tensor (torch.Tensor values) or a Scalar (anything else).""" + inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + """Run the shared golden on a name-to-value dict of the Tensor specs, then copy every entry back into the same-named attribute of ``args``.""" + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 000000000..8befa5c51 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +// SplitK PV Matmul Kernel: Accumulated P @ V across n_blocks +// +// Processes n_blocks blocks using SplitK accumulation pattern: +// Block 0: TMATMUL(C, A, B) — initialize accumulator +// Block i: TMATMUL_ACC(C, C, A, B) — accumulate into same C +// +// Per-block pij addresses: contiguous slices of pij_buf (n_blocks * M * K) +// Per-block vj addresses: value_cache base + block_indices lookup +// Single output: oi_new (M, N) fp32 = sum of P_i @ V_i across all blocks +// +// Optimizations: +// - Double-buffered L1 tiles (ping/pong for A and B via MTE2) +// - Double-buffered L0 tiles (ping/pong for L0A and L0B via MTE1) +// - TLOAD(next) overlaps with TMATMUL(current) via MTE2/M-pipe parallelism +// - Canonical 3-stage pipeline: TLOAD(MTE2) → TMOV(MTE1) → TMATMUL(M) +// - Reverse-dependency events ensure buffer safety across iterations +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// pij is bfloat16 (from softmax_prepare TCVT). +// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout. 
+ +#include +// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto) +#include + +#include "tensor.h" + +// NOLINTNEXTLINE(build/namespaces) +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +// pv_matmul_n_impl: for each i in [0, n_blocks), multiplies the pij slab at pij_base + i*M*K by the +// value-cache block at val_base + bt[bt_offset + i]*K*N, accumulating into one accumulator tile that is +// stored to oi_base once after the loop. +template +static __aicore__ void pv_matmul_n_impl( + __gm__ bfloat16_t *pij_base, __gm__ bfloat16_t *val_base, __gm__ float *oi_base, uint64_t n_blocks, + __gm__ int32_t *bt, uint64_t bt_offset +) { + using GlobalA = GlobalTensor, Stride>; + using GlobalB = GlobalTensor, Stride>; + using GlobalOut = GlobalTensor, Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + // L1 memory layout: double-buffered A and B tiles (tightly packed) + constexpr int kATileBytes = M * K * static_cast(sizeof(bfloat16_t)); + constexpr int kBTileBytes = K * N * static_cast(sizeof(bfloat16_t)); + + TileMatA aMatTile[2]; + TileMatB bMatTile[2]; + TASSIGN(aMatTile[0], 0x0); + TASSIGN(aMatTile[1], kATileBytes); + TASSIGN(bMatTile[0], 2 * kATileBytes); + TASSIGN(bMatTile[1], 2 * kATileBytes + kBTileBytes); + + // L0 memory layout: double-buffered L0A and L0B, single accumulator L0C + LeftTile aTile[2]; + RightTile bTile[2]; + AccTile cTile; + TASSIGN(aTile[0], 0x0); + TASSIGN(aTile[1], kATileBytes); + TASSIGN(bTile[0], 0x0); + TASSIGN(bTile[1], kBTileBytes); + TASSIGN(cTile, 0x0); + + GlobalOut oiGlobal(oi_base); + + // Seed reverse-dependency flags: all ping/pong buffers initially free + // PIPE_MTE1 → PIPE_MTE2: L1 buffer [0/1] safe for TLOAD to overwrite + // PIPE_M → PIPE_MTE1: L0 buffer [0/1] safe for TMOV to overwrite + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + for (uint64_t
i = 0; i < n_blocks; i++) { + int cur = static_cast(i % 2); + GlobalA pijGlobal(pij_base + i * M * K); + GlobalB vjGlobal(val_base + bt[bt_offset + i] * K * N); + + // Stage 1: TLOAD (MTE2: GM → L1[cur]) + // Wait for MTE1 to release L1[cur] (reverse dep from previous iteration) + wait_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur)); + TLOAD(aMatTile[cur], pijGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // forward: A in L1 ready + TLOAD(bMatTile[cur], vjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // forward: B in L1 ready + + // Stage 2: TMOV (MTE1: L1[cur] → L0[cur]) + // Wait for M-pipe to release L0[cur] (reverse dep from previous iteration) + wait_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur)); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); // forward: wait A loaded + TMOV(aTile[cur], aMatTile[cur]); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); // forward: wait B loaded + TMOV(bTile[cur], bMatTile[cur]); + set_flag(PIPE_MTE1, PIPE_MTE2, static_cast<::event_t>(cur)); // reverse: release L1[cur] + + // Stage 3: TMATMUL (M-pipe: L0A[cur] × L0B[cur] → L0C) + set_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur)); // forward: L0[cur] ready + wait_flag(PIPE_MTE1, PIPE_M, static_cast<::event_t>(cur)); + if (i == 0) { + TMATMUL(cTile, aTile[cur], bTile[cur]); + } else { + TMATMUL_ACC(cTile, cTile, aTile[cur], bTile[cur]); + } + set_flag(PIPE_M, PIPE_MTE1, static_cast<::event_t>(cur)); // reverse: release L0[cur] + } + + // Drain outstanding reverse-dependency flags + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + + // NOTE(review): with n_blocks == 0 the loop never writes cTile, but the TSTORE below still runs — + // confirm callers always pass n_blocks >= 1. + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(oiGlobal, cTile); + + set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); + wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); +} + +// kernel_entry: reads args[0..3] as Tensor pointers (pij_buf, value_cache, block_table_t, oi_new) and +// args[4], args[5] as n_blocks and bt_offset, then picks the tile configuration from pij_buf->shapes[0]. +// NOTE(review): val_base and bt are taken from buffer.addr without start_offset, unlike pij_base/oi_base — +// presumably those tensors are always whole buffers; confirm. +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *pij_buf =
reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *value_cache = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[3]); + uint64_t n_blocks = static_cast(args[4]); + uint64_t bt_offset = static_cast(args[5]); + + __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset; + __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr); + __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr) + oi_new->start_offset; + __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr); + + uint64_t q_tile_size = static_cast(pij_buf->shapes[0]); + + // Any q_tile_size other than 16 takes the 64-row configuration in the else branch. + if (q_tile_size == 16) { + pv_matmul_n_impl<16, 128, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset); + } else { + pv_matmul_n_impl<64, 64, 128>(pij_base, val_base, oi_base, n_blocks, bt, bt_offset); + } +} +// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-avoid-c-arrays,modernize-use-auto) diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 000000000..13ef8e06b --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +// Multi-block QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N) for each block +// +// Processes n_blocks blocks in a single kernel invocation. +// Per-block kj addresses computed from key_cache base + block_indices lookup. +// qi is shared across all blocks (same query head against different key blocks). +// +// Output layout: n_blocks contiguous (M, N) tiles stacked vertically. +// Block i occupies sij[i*M : (i+1)*M, 0:N]. +// +// Optimizations: +// - qi TLOAD hoisted before the loop (constant across all iterations) +// - Double-buffered L1 B tiles: prefetch next kj during current TMATMUL+TSTORE +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// Template: M=q_tile, K=head_dim, N=block_size + +#include +// NOLINTBEGIN(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto) +#include + +#include "tensor.h" + +// NOLINTNEXTLINE(build/namespaces) +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +// qk_matmul_n_impl: loads qi once, then for each i in [0, n_blocks) multiplies it by the key block at +// key_base + bt[bt_offset + i]*N*K and stores the result tile at sij_base + i*M*N. +template +static __aicore__ void qk_matmul_n_impl( + __gm__ bfloat16_t *qi_base, __gm__ bfloat16_t *key_base, __gm__ float *sij_base, uint64_t n_blocks, + __gm__ int32_t *bt, uint64_t bt_offset +) { + using GlobalA = GlobalTensor, Stride>; + using GlobalB = GlobalTensor, Stride, Layout::DN>; + using GlobalOut = GlobalTensor, Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; +
+ using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + // Double-buffered L1 B tiles for kj prefetching + constexpr int kBBytes = K * N * static_cast(sizeof(bfloat16_t)); + TileMatA aMatTile; + TileMatB bMatTile_A; + TileMatB bMatTile_B; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile_A, 0x20000); + TASSIGN(bMatTile_B, 0x20000 + kBBytes); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + // Hoist qi TLOAD before the loop (qi is constant across all blocks) + GlobalA qiGlobal(qi_base); + TLOAD(aMatTile, qiGlobal); + + // Pre-load first kj into buffer A + // NOTE(review): bt[bt_offset] is read here before the loop tests n_blocks, so n_blocks == 0 still + // triggers a block-table lookup and a TLOAD — confirm callers always pass n_blocks >= 1. + GlobalB kjGlobal_0(key_base + bt[bt_offset + 0] * N * K); + TLOAD(bMatTile_A, kjGlobal_0); + + for (uint64_t i = 0; i < n_blocks; i++) { + GlobalOut sijGlobal(sij_base + i * M * N); + + // Wait for current kj TLOAD to complete + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + // TMOV qi L1→L0A and kj L1→L0B from current buffer + TMOV(aTile, aMatTile); + if (i % 2 == 0) { + TMOV(bTile, bMatTile_A); + } else { + TMOV(bTile, bMatTile_B); + } + + // Prefetch next kj into alternate L1 buffer (overlaps with MTE1→M→FIX) + if (i + 1 < n_blocks) { + GlobalB kjGlobal_next(key_base + bt[bt_offset + i + 1] * N * K); + if (i % 2 == 0) { + TLOAD(bMatTile_B, kjGlobal_next); + } else { + TLOAD(bMatTile_A, kjGlobal_next); + } + } + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); + + if (i + 1 < n_blocks) { + // Drain all pipes before next iteration: + // - FIX/MTE3: ensures TSTORE data path (L0C→UB→GM) fully completes + // - MTE2: prefetch TLOAD likely already done (ran during TMATMUL+TSTORE) + // The prefetch TLOAD overlaps with compute, so barrier cost is minimal.
+ pipe_barrier(PIPE_ALL); + } + } + set_flag(PIPE_FIX, PIPE_S, EVENT_ID7); + wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7); +} + +// kernel_entry: reads args[0..3] as Tensor pointers (qi, key_cache, block_table_t, sij_buf) and +// args[4], args[5] as n_blocks and bt_offset, then picks the tile configuration from qi->shapes[0]. +// NOTE(review): key_base and bt are taken from buffer.addr without start_offset, unlike qi_base/sij_base — +// presumably those tensors are always whole buffers; confirm. +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *key_cache = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[3]); + uint64_t n_blocks = static_cast(args[4]); + uint64_t bt_offset = static_cast(args[5]); + + __gm__ bfloat16_t *qi_base = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr) + qi->start_offset; + __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr); + __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset; + __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr); + + uint64_t q_tile_size = static_cast(qi->shapes[0]); + + // Any q_tile_size other than 16 takes the 64-row configuration in the else branch. + if (q_tile_size == 16) { + qk_matmul_n_impl<16, 128, 128>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset); + } else { + qk_matmul_n_impl<64, 128, 64>(qi_base, key_base, sij_base, n_blocks, bt, bt_offset); + } +} +// NOLINTEND(clang-diagnostic-error,bugprone-reserved-identifier,bugprone-easily-swappable-parameters,modernize-use-auto) diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 000000000..b5d71b544 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Online Softmax Update + Normalize Kernel (AIV) +// +// Operates on full tiles where M=q_tile_size, N=head_dim (128): +// Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors +// Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors +// +// Scalar layout strategy using TRESHAPE (zero-copy UB reshape): +// Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV. +// For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M). +// After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops. +// This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original. 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_impl( + __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li, + __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst +) { + __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr); + __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr); + __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr); + __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr); + __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr); + __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr); + __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr); + + // Aligned rows for ColMajor DN tiles (32-byte alignment) + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + // --- GlobalTensor types --- + + // Data (M, N) RowMajor + using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; + + // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading + using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; + + // Scalar ND: for storing mi_new and li_new back to GM + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + using GlobalScalarND = + GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; + + // --- GlobalTensor instances --- + + GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset); + GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset); + GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset); + + // DN globals for loading scalars as ColMajor + GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset); + GlobalScalarDN lijGlobalDN(lij_ptr + 
lij->start_offset); + GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset); + GlobalScalarDN liGlobalDN(li_ptr + li->start_offset); + + // ND globals for storing scalar results + GlobalScalarND miGlobalND(mi_ptr + mi->start_offset); + GlobalScalarND liGlobalND(li_ptr + li->start_offset); + + // --- Tile types --- + + using TileDataMxN = Tile; + using TileScalarDN = Tile; + + // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE + using TileScalarRow = Tile; + + // ND tile for storing back to GM + using TileScalarND = + Tile; + + // --- UB memory layout --- + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Data tiles + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + // Scalar DN tiles loaded from GM (ColMajor) + TileScalarDN mijDN, lijDN, miDN, liDN; + + // Temporary DN tiles for results + TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijDN, 2 * kDataBytes); + TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes); + TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes); + TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes); + TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes); + + if (is_first) { + // --- First block: copy inputs to accumulators --- + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // Store mi = mij, li = lij, oi = oi_new + // Alias ND tiles to same UB as DN tiles for ND-format store + TileScalarND mijND, lijND; + TASSIGN(mijND, 2 * kDataBytes); // alias same UB as mijDN + TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes); // alias same UB as lijDN 
+ + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); // mi = mij + TSTORE(liGlobalND, lijND); // li = lij + TSTORE(oiGlobal, oiNewTile); // oi = oi_new + + if (is_last) { + // Single block: normalize dst = oi_new / lij + // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + // --- Subsequent blocks: accumulate --- + + // Load all inputs as DN (ColMajor) + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijDN, mijGlobalDN); + TLOAD(lijDN, lijGlobalDN); + TLOAD(miDN, miGlobalDN); + TLOAD(liDN, liGlobalDN); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic + TileScalarRow miRow, mijRow, liRow, lijRow; + TRESHAPE(miRow, miDN); + TRESHAPE(mijRow, mijDN); + TRESHAPE(liRow, liDN); + TRESHAPE(lijRow, lijDN); + + // Scalar arithmetic in RowMajor (1, M) layout + TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow; + TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes); + TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes); + TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes); + TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes); + + TMAX(miNewRow, miRow, mijRow); // mi_new = max(mi, mij) + pipe_barrier(PIPE_V); + // alphaRow and betaRow write to independent UB addresses; both only read miNewRow + TSUB(alphaRow, miRow, miNewRow); // alpha_exp = mi - mi_new + TSUB(betaRow, mijRow, miNewRow); // beta_exp = mij - mi_new + pipe_barrier(PIPE_V); + // TEXP on independent UB addresses + TEXP(alphaRow, alphaRow); // alpha = exp(mi - mi_new) + 
TEXP(betaRow, betaRow); // beta = exp(mij - mi_new) + pipe_barrier(PIPE_V); + // tmpRow and liNewRow write to independent UB addresses + TMUL(tmpRow, alphaRow, liRow); // alpha * li + TMUL(liNewRow, betaRow, lijRow); // beta * lij + pipe_barrier(PIPE_V); + TADD(liNewRow, tmpRow, liNewRow); // li_new = alpha*li + beta*lij + + // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL + pipe_barrier(PIPE_V); + TRESHAPE(alphaDN, alphaRow); + TRESHAPE(betaDN, betaRow); + + // Scale data tiles using row-broadcast multiply + TROWEXPANDMUL(oiTile, oiTile, alphaDN); // oi *= alpha + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); // oi_new *= beta + pipe_barrier(PIPE_V); + TADD(oiTile, oiTile, oiNewTile); // oi = alpha*oi + beta*oi_new + + // Store mi_new and li_new to GM (ND format) + // Alias ND tiles to the same UB locations as miNewRow and liNewRow + TileScalarND miNewND, liNewND; + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes); + TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes); + + if (is_last) { + // Normalize and output: dst = oi / li_new + TRESHAPE(liNewDN, liNewRow); + pipe_barrier(PIPE_V); + TROWEXPANDDIV(oiTile, oiTile, liNewDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new + TSTORE(dstGlobal, oiTile); + } else { + // Store updated accumulators + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); // persist mi_new + TSTORE(liGlobalND, liNewND); // persist li_new + TSTORE(oiGlobal, oiTile); + } + } + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor 
*>(args[2]); + __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]); + __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + uint64_t q_tile_size = static_cast(mij->shapes[0]); + // head_dim is not passed in args; it is fixed at 128 by the template dispatch below + + if (q_tile_size == 16) { + online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } else { + online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 000000000..c18957ee5 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Two-Pass Softmax Kernel (AIV) for n_blocks tiles +// +// Input: sij_buf (n_blocks * M, N) fp32 — QK results stacked vertically +// Output: pij_buf (n_blocks * M, N) bf16 — attention weights per block +// mij (M,) fp32 — global row max across all blocks +// lij (M,) fp32 — total row sum across all blocks +// +// Pass 1: Iterate over n_blocks tiles, mask last block, +// find global m = scale * max over all blocks of rowmax(S_i) +// Defers scale to after the loop (single M-element TMULS vs n_blocks M×N). +// Uses double-buffered sij tiles and TRESHAPE for DN↔Row conversion. +// Pass 2: Iterate again, compute P_i = exp(S_i * scale - m) -> bf16, +// accumulate l = sum over all blocks of rowsum(P_i) +// Uses double-buffered sij tiles to overlap TLOAD with computation. +// +// Two-pass ensures all P_i tiles share the same scale (global max), +// enabling direct TMATMUL_ACC accumulation in the PV kernel. 
+// +// Supports two tile configurations via runtime dispatch: +// Case1: M=16, N=128 (q_tile=16, block_size=128) +// Case2: M=64, N=64 (q_tile=64, block_size=64) + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void softmax_prepare_n_impl( + __gm__ float *sij_base, float scale_value, __gm__ bfloat16_t *pij_base, __gm__ float *mij_addr, + __gm__ float *lij_addr, uint64_t n_blocks, uint64_t valid_len_last +) { + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + + // --- GlobalTensor types --- + using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; + using GlobalScalarND = + GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; + + // --- Tile types --- + using TileSijDyn = Tile; + using TileSijPad = Tile; + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + using TileScalarND = + Tile; + // RowMajor (1, M) tile for element-wise arithmetic via TRESHAPE + using TileScalarRow = Tile; + + // --- UB memory layout (double-buffered sij) --- + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarDNBytes = kAlignedRows * sizeof(float); + + // Double-buffered sij tiles + TileVecMxN sijTile_A; + TileSijPad sijPadTile_A; + TileVecMxN sijTile_B; + TileSijPad sijPadTile_B; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileVecMxN sumAccTile; + TileScalarDN localMaxDN; + TileScalarDN globalMaxDN; + TileScalarDN sumDN; + TileVecMxN_bf16 pijBf16Tile; + + // TRESHAPE aliases (same UB address as their DN counterparts) + TileScalarRow localMaxRow; + TileScalarRow globalMaxRow; + + // ND alias for storing 
globalMax to GM + TileScalarND globalMaxND; + + TASSIGN(sijTile_A, 0x0); + TASSIGN(sijPadTile_A, 0x0); + TASSIGN(sijTile_B, kDataBytes); + TASSIGN(sijPadTile_B, kDataBytes); + TASSIGN(pijTile, 2 * kDataBytes); + TASSIGN(tmpTile, 3 * kDataBytes); + TASSIGN(sumAccTile, 4 * kDataBytes); + int scalarBase = 5 * kDataBytes; + TASSIGN(localMaxDN, scalarBase); + TASSIGN(localMaxRow, scalarBase); // alias: same UB as localMaxDN + TASSIGN(globalMaxDN, scalarBase + kScalarDNBytes); + TASSIGN(globalMaxRow, scalarBase + kScalarDNBytes); // alias: same UB as globalMaxDN + TASSIGN(globalMaxND, scalarBase + kScalarDNBytes); // alias: same UB as globalMaxDN + TASSIGN(sumDN, scalarBase + 2 * kScalarDNBytes); + TASSIGN(pijBf16Tile, scalarBase + 3 * kScalarDNBytes); + + // GM aliases (mij/lij output buffers) + GlobalScalarND mijGlobalND(mij_addr); + GlobalScalarDN lijGlobalDN(lij_addr); + + // ======== Pass 1: Find global row max (unscaled) with double-buffered sij ======== + // rowmax(S*scale) = scale * rowmax(S) since scale > 0, so defer scale to after loop. 
+ GlobalDataMxN sijGlobal_p1_0(sij_base); + TLOAD(sijTile_A, sijGlobal_p1_0); + + for (uint64_t i = 0; i < n_blocks; i++) { + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + if (i == n_blocks - 1 && valid_len_last < static_cast(N)) { + TileSijDyn sijDynTile(static_cast(valid_len_last)); + if (i % 2 == 0) { + TASSIGN(sijDynTile, 0x0); + TFILLPAD_INPLACE(sijPadTile_A, sijDynTile); + } else { + TASSIGN(sijDynTile, static_cast(kDataBytes)); + TFILLPAD_INPLACE(sijPadTile_B, sijDynTile); + } + pipe_barrier(PIPE_V); + } + + // Compute unscaled TROWMAX on current buffer + if (i % 2 == 0) { + TROWMAX(localMaxDN, sijTile_A, tmpTile); + } else { + TROWMAX(localMaxDN, sijTile_B, tmpTile); + } + pipe_barrier(PIPE_V); + + // Prefetch next sij into alternate buffer (overlaps with V pipe scalar ops) + if (i + 1 < n_blocks) { + GlobalDataMxN sijGlobal_next(sij_base + (i + 1) * M * N); + if (i % 2 == 0) { + TLOAD(sijTile_B, sijGlobal_next); + } else { + TLOAD(sijTile_A, sijGlobal_next); + } + } + + // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise TMAX + TRESHAPE(localMaxRow, localMaxDN); + if (i == 0) { + TMAX(globalMaxRow, localMaxRow, localMaxRow); + } else { + TMAX(globalMaxRow, globalMaxRow, localMaxRow); + } + pipe_barrier(PIPE_V); + } + + // Apply scale once to the global max vector (M elements, not n_blocks × M × N) + TMULS(globalMaxRow, globalMaxRow, scale_value); + pipe_barrier(PIPE_V); + + // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for Pass 2's TROWEXPANDSUB + TRESHAPE(globalMaxDN, globalMaxRow); + + // Store final global max to mij for online_update to consume + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobalND, globalMaxND); + + // ======== Pass 2: Compute softmax with double-buffered sij ======== + // globalMaxDN is already in UB from TRESHAPE — no reload needed. + // Sync MTE3→MTE2 to ensure the mij TSTORE completed before first sij TLOAD. 
+ set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + + // Pre-load first sij tile into buffer A + GlobalDataMxN sijGlobal_0(sij_base); + TLOAD(sijTile_A, sijGlobal_0); + + for (uint64_t i = 0; i < n_blocks; i++) { + GlobalDataMxN_bf16 pijGlobal(pij_base + i * M * N); + + // Wait for current tile's TLOAD to complete + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + // TFILLPAD on current buffer if last block with partial valid length + if (i == n_blocks - 1 && valid_len_last < static_cast(N)) { + TileSijDyn curSijDyn(static_cast(valid_len_last)); + if (i % 2 == 0) { + TASSIGN(curSijDyn, 0x0); + TFILLPAD_INPLACE(sijPadTile_A, curSijDyn); + } else { + TASSIGN(curSijDyn, static_cast(kDataBytes)); + TFILLPAD_INPLACE(sijPadTile_B, curSijDyn); + } + pipe_barrier(PIPE_V); + } + + // Compute on current buffer (select A or B based on iteration parity) + if (i % 2 == 0) { + TMULS(sijTile_A, sijTile_A, scale_value); + pipe_barrier(PIPE_V); + TROWEXPANDSUB(pijTile, sijTile_A, globalMaxDN); + } else { + TMULS(sijTile_B, sijTile_B, scale_value); + pipe_barrier(PIPE_V); + TROWEXPANDSUB(pijTile, sijTile_B, globalMaxDN); + } + pipe_barrier(PIPE_V); + TEXP(pijTile, pijTile); + pipe_barrier(PIPE_V); + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + pipe_barrier(PIPE_V); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + + pipe_barrier(PIPE_V); + if (i == 0) { + TMULS(sumAccTile, pijTile, 1.0f); + } else { + TADD(sumAccTile, sumAccTile, pijTile); + } + + // Store pij (must complete before next iteration's TCVT overwrites pijBf16Tile) + pipe_barrier(PIPE_V); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(pijGlobal, pijBf16Tile); + + // Prefetch next sij into alternate buffer (after TSTORE to avoid UB race) + if (i + 1 < n_blocks) { + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + GlobalDataMxN 
sijGlobal_next(sij_base + (i + 1) * M * N); + if (i % 2 == 0) { + TLOAD(sijTile_B, sijGlobal_next); + } else { + TLOAD(sijTile_A, sijGlobal_next); + } + } + } + + // Compute final row sum from accumulated pij values + pipe_barrier(PIPE_V); + TROWSUM(sumDN, sumAccTile, tmpTile); + + // Store lij (total sum). mij already stored after Pass 1. + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(lijGlobalDN, sumDN); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *sij_buf = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *pij_buf = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast(args[4]); + float scale_value = scale_conv.f; + uint64_t n_blocks = static_cast(args[5]); + uint64_t valid_len_last = static_cast(args[6]); + + __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_buf->buffer.addr) + sij_buf->start_offset; + __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_buf->buffer.addr) + pij_buf->start_offset; + __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr) + mij->start_offset; + __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr) + lij->start_offset; + + uint64_t q_tile_size = static_cast(sij_buf->shapes[0]); + + if (q_tile_size == 16) { + softmax_prepare_n_impl<16, 128>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last); + } else { + softmax_prepare_n_impl<64, 64>(sij_base, scale_value, pij_base, mij_addr, lij_addr, n_blocks, valid_len_last); + } +} diff --git 
a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 000000000..82bc89f37 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,352 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Paged Attention Orchestration Function V2 - N_UNROLL=64, 4 Tasks Per Group + * + * Batches up to N_UNROLL blocks per group. Each group submits exactly 4 tasks: + * 1. QK matmul: qi @ K^T for n_blocks → sij_buf (q_tile, n_blocks * block_size) + * 2. Softmax: two-pass over sij_buf → pij_buf, mi, li + * 3. PV matmul: SplitK accumulated P @ V → oi_new (q_tile, head_dim) + * 4. 
Update: online softmax accumulation with group-level mi, li, oi_new + * + * Memory Layout: + * Query: (batch * num_heads, head_dim) bf16 + * Key: (total_blocks, block_size, head_dim) bf16 (stored as K^T for QK) + * Value: (total_blocks, block_size, head_dim) bf16 + */ + +#include +#include +#include + +#include "pto_orchestration_api.h" + +#define N_UNROLL 64 + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz + +inline double cycles_to_us(uint64_t cycles) { + return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; +} + +inline uint64_t get_sys_cnt_aicpu() { + uint64_t ticks; + asm volatile("mrs %0, cntvct_el0" : "=r"(ticks)); + return ticks; +} + +#ifdef ENABLE_PROFILING +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) +#else +#define CYCLE_COUNT_START() (void)0 +#define CYCLE_COUNT_LAP(acc) (void)0 +#endif + +extern "C" { +/** + * Orchestration config — the executor reads these values to set up + * shared memory and runtime before calling aicpu_orchestration_entry. 
+ */ +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { +#ifdef ENABLE_PROFILING + uint64_t prof_param_extract = 0; + uint64_t prof_ext_tensor = 0; + uint64_t prof_make_tensor = 0; + uint64_t prof_tensor_view = 0; + uint64_t prof_param_setup = 0; + uint64_t prof_submit_task = 0; + uint64_t prof_scope_and_loop = 0; + int prof_submit_count = 0; + int prof_make_count = 0; + int prof_view_count = 0; +#endif + + CYCLE_COUNT_START(); + + // Read dimensions from tensor metadata + // query: shape=[batch, num_heads, head_dim] + uint64_t batch = orch_args.tensor(0).ref().shapes[0]; + uint64_t num_heads = orch_args.tensor(0).ref().shapes[1]; + uint64_t head_dim = orch_args.tensor(0).ref().shapes[2]; + DataType data_type = orch_args.tensor(0).ref().dtype; + + // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim] + uint64_t block_size = orch_args.tensor(1).ref().shapes[1]; + + // block_table: shape=[batch, max_num_blocks_per_req] + uint64_t block_num = orch_args.tensor(3).ref().shapes[1]; + + // scale from scalar arg + uint64_t scale_value = orch_args.scalar(0); + uint64_t q_head_num = num_heads; + uint64_t q_tile = std::min(num_heads, 128UL); + uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile; + CYCLE_COUNT_LAP(prof_param_extract); + + // Reshape tensors for kernel consumption (2D flattened) + void *query_ptr = orch_args.tensor(0).ref().data_as(); + void *kc_ptr = orch_args.tensor(1).ref().data_as(); + void *vc_ptr = orch_args.tensor(2).ref().data_as(); + void *out_ptr = orch_args.tensor(5).ref().data_as(); + + uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0]; + + uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + uint32_t key_cache_shapes[2] = { + 
static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t value_cache_shapes[2] = { + static_cast(total_blocks_count * block_size), static_cast(head_dim) + }; + uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type, false); + Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type, false); + Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type, false); + Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32); + + uint32_t bt_shapes[2] = {static_cast(batch), static_cast(block_num)}; + Tensor block_table = + make_tensor_external(orch_args.tensor(3).ref().data_as(), bt_shapes, 2, DataType::INT32, false); + uint32_t cl_shapes[1] = {static_cast(batch)}; + Tensor context_lens = + make_tensor_external(orch_args.tensor(4).ref().data_as(), cl_shapes, 1, DataType::INT32, false); + +#ifdef ENABLE_PROFILING + CYCLE_COUNT_LAP(prof_ext_tensor); +#endif + + // Create infos are loop-invariant — shapes depend only on q_tile/head_dim + uint32_t oi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; + uint32_t li_shapes[1] = {static_cast(q_tile)}; + TensorCreateInfo tile2d_ci(oi_shapes, 2, DataType::FLOAT32); + TensorCreateInfo scalar_ci(li_shapes, 1, DataType::FLOAT32); +#ifdef ENABLE_PROFILING + prof_make_count += 2; + CYCLE_COUNT_LAP(prof_make_tensor); +#endif + + for (uint64_t b_idx = 0; b_idx < batch; b_idx++) { + uint32_t cl_idx[1] = {static_cast(b_idx)}; + uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); + uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size; + + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + CYCLE_COUNT_LAP(prof_scope_and_loop); + PTO2_SCOPE(PTO2ScopeMode::MANUAL) { + uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile; + + uint32_t qi_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; + uint32_t 
qi_offsets[2] = {static_cast(cur_offset), 0}; + Tensor qi = query.view(qi_shapes, qi_offsets); + uint32_t out_view_shapes[2] = {static_cast(q_tile), static_cast(head_dim)}; + uint32_t out_view_offsets[2] = {static_cast(cur_offset), 0}; + Tensor out_view = out.view(out_view_shapes, out_view_offsets, true); +#ifdef ENABLE_PROFILING + prof_view_count += 2; + CYCLE_COUNT_LAP(prof_tensor_view); +#endif + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci); + const Tensor &oi = alloc_outs.get_ref(0); + const Tensor &li_update = alloc_outs.get_ref(1); + const Tensor &mi_update = alloc_outs.get_ref(2); + PTO2TaskId pre_task_id; +#ifdef ENABLE_PROFILING + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); +#endif + + // Reusable Arg objects — reset() before each use avoids + // repeated stack-frame construction in the inner loop. + L0TaskArgs params_qk, params_sf, params_pv, params_up; + + for (uint64_t bn = 0; bn < bn_this_batch; bn += N_UNROLL) { + uint64_t n_blocks = std::min(static_cast(N_UNROLL), bn_this_batch - bn); + + // Valid length for last block in this group + uint64_t last_block_seq_start = (bn + n_blocks - 1) * block_size; + uint64_t valid_len_last = std::min(block_size, cur_seq - last_block_seq_start); + CYCLE_COUNT_LAP(prof_param_extract); + + // === Task 1: Batched QK matmul === + uint32_t sij_buf_shapes[2] = { + static_cast(q_tile), static_cast(n_blocks * block_size) + }; + TensorCreateInfo sij_buf_ci(sij_buf_shapes, 2, DataType::FLOAT32); +#ifdef ENABLE_PROFILING + prof_make_count += 1; + CYCLE_COUNT_LAP(prof_make_tensor); +#endif + + params_qk.reset(); + params_qk.add_input(qi); + params_qk.add_input(key_cache); + params_qk.add_input(block_table); + params_qk.add_output(sij_buf_ci); + params_qk.add_scalar(n_blocks); + params_qk.add_scalar(b_idx * block_num + bn); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk); + const 
Tensor &sij_buf = qk_outs.get_ref(0); +#ifdef ENABLE_PROFILING + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); +#endif + + // === Task 2: Two-pass softmax over all blocks in group === + uint32_t pij_buf_shapes[2] = { + static_cast(q_tile), static_cast(n_blocks * block_size) + }; + TensorCreateInfo pij_buf_ci(pij_buf_shapes, 2, data_type); +#ifdef ENABLE_PROFILING + prof_make_count += 1; + CYCLE_COUNT_LAP(prof_make_tensor); +#endif + + params_sf.reset(); + params_sf.add_input(sij_buf); + params_sf.add_output(pij_buf_ci); + params_sf.add_output(scalar_ci); + params_sf.add_output(scalar_ci); + PTO2TaskId sf_deps[] = {qk_outs.task_id()}; + params_sf.set_dependencies(sf_deps, 1); + params_sf.add_scalar(scale_value); + params_sf.add_scalar(n_blocks); + params_sf.add_scalar(valid_len_last); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf); + const Tensor &pij_buf = sf_outs.get_ref(0); + const Tensor &mi = sf_outs.get_ref(1); + const Tensor &li = sf_outs.get_ref(2); +#ifdef ENABLE_PROFILING + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); +#endif + + // === Task 3: SplitK PV matmul (accumulated P @ V) === + params_pv.reset(); + params_pv.add_input(pij_buf); + params_pv.add_input(value_cache); + params_pv.add_input(block_table); + params_pv.add_output(tile2d_ci); + PTO2TaskId pv_deps[] = {sf_outs.task_id()}; + params_pv.set_dependencies(pv_deps, 1); + params_pv.add_scalar(n_blocks); + params_pv.add_scalar(b_idx * block_num + bn); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv); + const Tensor &oi_new = pv_outs.get_ref(0); +#ifdef ENABLE_PROFILING + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); +#endif + + // === Task 4: Online update (per-group) === + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn + n_blocks >= bn_this_batch) ? 
1 : 0; + + params_up.reset(); + params_up.add_input(mi); + params_up.add_input(li); + params_up.add_input(oi_new); + params_up.add_inout(mi_update); + params_up.add_inout(li_update); + params_up.add_inout(oi); + params_up.add_inout(out_view); + PTO2TaskId up_deps[3]; + uint32_t up_dep_count = 0; + up_deps[up_dep_count++] = pv_outs.task_id(); + if (!is_first) { + up_deps[up_dep_count++] = pre_task_id; + } + // alloc completes inline; this dep only keeps the scratch buffers alive until the last consumer. + if (is_last) { + up_deps[up_dep_count++] = alloc_outs.task_id(); + } + params_up.set_dependencies(up_deps, up_dep_count); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + CYCLE_COUNT_LAP(prof_param_setup); + TaskOutputTensors update_outs = rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up); + pre_task_id = update_outs.task_id(); +#ifdef ENABLE_PROFILING + prof_submit_count++; + CYCLE_COUNT_LAP(prof_submit_task); +#endif + } + } + CYCLE_COUNT_LAP(prof_scope_and_loop); + } + } + CYCLE_COUNT_LAP(prof_scope_and_loop); + +#ifdef ENABLE_PROFILING + uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup + + prof_submit_task + prof_scope_and_loop; + LOG_INFO_V9( + "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count, + prof_make_count, prof_view_count, cycles_to_us(total) + ); + if (total > 0) { + LOG_INFO_V9( + " param_extract : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract), + prof_param_extract * 100.0 / total + ); + LOG_INFO_V9( + " ext_tensor(x4) : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total + ); + LOG_INFO_V9( + " create_info(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor), + prof_make_tensor * 100.0 / total, + prof_make_count > 0 ? 
cycles_to_us(prof_make_tensor) / prof_make_count : 0.0 + ); + LOG_INFO_V9( + " tensor_view(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view), + prof_tensor_view * 100.0 / total, + prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0 + ); + LOG_INFO_V9( + " param_setup : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total + ); + LOG_INFO_V9( + " submit_task(x%d) : %7.3fus (%5.1f%%) avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task), + prof_submit_task * 100.0 / total, + prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0 + ); + LOG_INFO_V9( + " scope_and_loop : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope_and_loop), + prof_scope_and_loop * 100.0 / total + ); + } +#endif + +#undef CYCLE_COUNT_START +#undef CYCLE_COUNT_LAP +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/test_paged_attention_unroll.py b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/test_paged_attention_unroll.py new file mode 100644 index 000000000..34cbdde6c --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/paged_attention_unroll_manual_scope/test_paged_attention_unroll.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Paged attention unroll manual-scope scene test for the A2A3 fully_distributed_within_core runtime.""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestPagedAttentionUnrollManualScope(SceneTestCase): + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "name": "QK", + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "name": "SF", + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 2, + "name": "PV", + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 3, + "name": "UP", + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { 
+ "batch": 
64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case3", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/down_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/down_proj.cpp new file mode 100644 index 000000000..968515353 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/down_proj.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: down_proj - v3[16x128] fp32 = v1[16x17408] bf16 x v2[17408 rows, cols v4..v4+127] bf16
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>  // fixed-width integer types (int32_t, int64_t, uint64_t)
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>  // Tile, GlobalTensor, TLOAD, ... - NOTE(review): confirm header path
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,            // tail is pipe_barrier(PIPE_ALL)
+    kSetWaitMte3ToSEvent0 = 1,  // tail is set_flag + wait_flag on (PIPE_MTE3, PIPE_S, EVENT_ID0)
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+        case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+            set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+            break;
+        case PTOAutoSyncTailMode::kBarrierAll:
+        default:  // unknown modes fall back to the full barrier
+            pipe_barrier(PIPE_ALL);
+            break;
+    }
+}
+
+static __aicore__ void down_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4) {
+    unsigned v5 = 0;            // zero base offset used in every address expression
+    const int32_t v6 = 68;      // K-loop bound: 68 chunks x 256 elements = 17408 (v12)
+    const int32_t v7 = 0;       // zero row/column index passed to TEXTRACT
+    const int32_t v8 = 256;     // K elements loaded per chunk
+    const int32_t v9 = 128;     // half-chunk depth and output tile width
+    const int32_t v10 = 5120;   // row stride of v2, in elements
+    const int32_t v11 = 1;      // unit stride / loop step
+    const int32_t v12 = 17408;  // row stride of v1, in elements
+    const int32_t v13 = 16;     // tile row count
+    const int64_t v14 = 32768;  // TASSIGN address of the second Right half-tile
+    const int64_t v15 = 4096;   // TASSIGN address of the second Left half-tile
+    const int64_t v16 = 8192;   // TASSIGN address of the Mat tile holding the v2 chunk
+    const int64_t v17 = 0;      // TASSIGN address 0: v1 Mat tile, first Left/Right half-tiles, accumulator
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v18 = (size_t)v11;
+    Tile<
+        TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v19 = Tile<
+            TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v13, v8);
+    uint64_t v20 = (uint64_t)v17;
+    TASSIGN(v19, v20);
+    pto::Shape<1, 1, 1, 16, 256> v21 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<278528, 278528, 278528, 17408, 1> v22 = pto::Stride<278528, 278528, 278528, 17408, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>
+        v23 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>(
+            v1 + (v5 + v5 * (unsigned)v12 + v5 * (unsigned)v11), v21, v22
+        );
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    TLOAD(v19, v23);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    Tile<
+        TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v24 = Tile<
+            TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v8, v9);
+    uint64_t v25 = (uint64_t)v16;
+    TASSIGN(v24, v25);
+    pto::Shape<1, 1, 1, 256, 128> v26 = pto::Shape<1, 1, 1, 256, 128>();
+    pto::Stride<1310720, 1310720, 1310720, 5120, 1> v27 = pto::Stride<1310720, 1310720, 1310720, 5120, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>, pto::Layout::ND>
+        v28 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>,
+            pto::Layout::ND>(v2 + (v5 + v5 * (unsigned)v10 + (unsigned)v4 * (unsigned)v11), v26, v27);
+    TLOAD(v24, v28);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v29 = Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v30 = (uint64_t)v17;
+    TASSIGN(v29, v30);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TEXTRACT(v29, v19, v7, v7);
+    Tile<
+        TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v31 = Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v9, v9);
+    uint64_t v32 = (uint64_t)v17;
+    TASSIGN(v31, v32);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TEXTRACT(v31, v24, v7, v7);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v33 = Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v34 = (uint64_t)v15;
+    TASSIGN(v33, v34);
+    TEXTRACT(v33, v19, v7, v9);
+    Tile<
+        TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v35 = Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v9, v9);
+    uint64_t v36 = (uint64_t)v14;
+    TASSIGN(v35, v36);
+    TEXTRACT(v35, v24, v9, v7);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    Tile<
+        TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v37 = Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v38 = (uint64_t)v17;
+    TASSIGN(v37, v38);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    TMATMUL(v37, v29, v31);
+    Tile<
+        TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v39 = Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v13, v9);
+    uint64_t v40 = (uint64_t)v17;
+    TASSIGN(v39, v40);
+    pipe_barrier(PIPE_M);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    TMATMUL_ACC(v39, v39, v33, v35);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    for (size_t v41 = v18; v41 < ((size_t)v6); v41 += v18) {  // chunks 1..67; chunk 0 is the peeled code above
+        int32_t v42 = (int32_t)((uint32_t)((int32_t)v41) * (uint32_t)v8);  // chunk start: v41 * 256
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v43 = Tile<
+                TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v13, v8);
+        uint64_t v44 = (uint64_t)v17;
+        TASSIGN(v43, v44);
+        pto::Shape<1, 1, 1, 16, 256> v45 = pto::Shape<1, 1, 1, 16, 256>();
+        pto::Stride<278528, 278528, 278528, 17408, 1> v46 = pto::Stride<278528, 278528, 278528, 17408, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>
+            v47 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>,
+                pto::Layout::ND>(v1 + (v5 + v5 * (unsigned)v12 + (unsigned)v42 * (unsigned)v11), v45, v46);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(v43, v47);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        Tile<
+            TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v48 = Tile<
+                TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v8, v9);
+        uint64_t v49 = (uint64_t)v16;
+        TASSIGN(v48, v49);
+        pto::Shape<1, 1, 1, 256, 128> v50 = pto::Shape<1, 1, 1, 256, 128>();
+        pto::Stride<1310720, 1310720, 1310720, 5120, 1> v51 = pto::Stride<1310720, 1310720, 1310720, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>, pto::Layout::ND>
+            v52 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<1310720, 1310720, 1310720, 5120, 1>,
+                pto::Layout::ND>(v2 + (v5 + (unsigned)v42 * (unsigned)v10 + (unsigned)v4 * (unsigned)v11), v50, v51);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v48, v52);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v53 = Tile<
+                TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v54 = (uint64_t)v17;
+        TASSIGN(v53, v54);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TEXTRACT(v53, v43, v7, v7);
+        Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v55 = Tile<
+                TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                PadValue::Null, CompactMode::Null>(v9, v9);
+        uint64_t v56 = (uint64_t)v17;
+        TASSIGN(v55, v56);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        TEXTRACT(v55, v48, v7, v7);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v57 = Tile<
+                TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v58 = (uint64_t)v15;
+        TASSIGN(v57, v58);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+        TEXTRACT(v57, v43, v7, v9);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        Tile<
+            TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v59 = Tile<
+                TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512,
+                PadValue::Null, CompactMode::Null>(v9, v9);
+        uint64_t v60 = (uint64_t)v14;
+        TASSIGN(v59, v60);
+        TEXTRACT(v59, v48, v9, v7);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v61 = Tile<
+                TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v62 = (uint64_t)v17;
+        TASSIGN(v61, v62);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        pipe_barrier(PIPE_M);
+        TMATMUL_ACC(v61, v61, v53, v55);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v63 = Tile<
+                TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v13, v9);
+        uint64_t v64 = (uint64_t)v17;
+        TASSIGN(v63, v64);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        TMATMUL_ACC(v63, v63, v57, v59);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    }
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 128> v65 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<2048, 2048, 2048, 128, 1> v66 = pto::Stride<2048, 2048, 2048, 128, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> v67 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>(
+            v3 + (v5 + v5 * (unsigned)v9 + v5 * (unsigned)v11), v65, v66
+        );
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(v67, v39);  // v39 shares TASSIGN address 0 with v37/v61/v63, i.e. the running accumulator
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point: args[0..2] are Tensor pointers, args[3] is a raw 64-bit scalar ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: mlp_tile__rv_v2
+    __gm__ Tensor *mlp_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *mlp_tile__rv_v2 = reinterpret_cast<__gm__ bfloat16_t *>(mlp_tile__rv_v2_tensor->buffer.addr) +
+                                         mlp_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: w_down__ssa_v0
+    __gm__ Tensor *w_down__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *w_down__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(w_down__ssa_v0_tensor->buffer.addr) + w_down__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: fp32_chunk_gm__ssa_v0
+    __gm__ Tensor *fp32_chunk_gm__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *fp32_chunk_gm__ssa_v0 = reinterpret_cast<__gm__ float *>(fp32_chunk_gm__ssa_v0_tensor->buffer.addr) +
+                                          fp32_chunk_gm__ssa_v0_tensor->start_offset;
+
+    // Unpack scalar: d0__ssa_v0
+    union {
+        uint64_t u64;
+        int64_t val;
+    } d0__ssa_v0_conv;
+    d0__ssa_v0_conv.u64 = args[3];
+    int64_t d0__ssa_v0 = d0__ssa_v0_conv.val;
+
+    // Forward to ptoas-generated function (the int64_t scalar narrows implicitly to the int32_t v4 parameter)
+    down_proj(mlp_tile__rv_v2, w_down__ssa_v0, fp32_chunk_gm__ssa_v0, d0__ssa_v0);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/gate_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/gate_proj.cpp
new file mode 100644
index 000000000..11a0493e3
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/gate_proj.cpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+// Kernel Function: gate_proj - v3[16x256] fp32 = v1[16x5120] bf16 x v2[5120 rows, cols v4..v4+255] bf16
+// Generated by PyPTO IR Compiler (PTO backend)
+
+#include <cstdint>  // fixed-width integer types (int32_t, int64_t, uint64_t)
+
+#ifndef __gm__
+#define __gm__
+#endif
+
+#ifndef __aicore__
+#if defined(__CPU_SIM)
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+#endif
+
+#include <pto/pto-inst.hpp>  // Tile, GlobalTensor, TLOAD, ... - NOTE(review): confirm header path
+#include "tensor.h"
+
+using namespace pto;
+
+// --- ptoas-generated code ---
+
+enum class PTOAutoSyncTailMode : int {
+    kBarrierAll = 0,            // tail is pipe_barrier(PIPE_ALL)
+    kSetWaitMte3ToSEvent0 = 1,  // tail is set_flag + wait_flag on (PIPE_MTE3, PIPE_S, EVENT_ID0)
+};
+
+static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) {
+    switch (mode) {
+        case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0:
+            set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0);
+            break;
+        case PTOAutoSyncTailMode::kBarrierAll:
+        default:  // unknown modes fall back to the full barrier
+            pipe_barrier(PIPE_ALL);
+            break;
+    }
+}
+
+static __aicore__ void gate_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4) {
+    unsigned v5 = 0;            // zero base offset used in every address expression
+    const int32_t v6 = 40;      // K-loop bound: 40 chunks x 128 elements = 5120 (v13)
+    const int32_t v7 = 64;      // half-chunk depth (TEXTRACT offset of the second half)
+    const int32_t v8 = 0;       // zero row/column index passed to TEXTRACT
+    const int32_t v9 = 128;     // K elements loaded per chunk
+    const int32_t v10 = 256;    // output tile width
+    const int32_t v11 = 17408;  // row stride of v2, in elements
+    const int32_t v12 = 1;      // unit stride / loop step
+    const int32_t v13 = 5120;   // row stride of v1, in elements
+    const int32_t v14 = 16;     // tile row count
+    const int64_t v15 = 32768;  // TASSIGN address of the second Right half-tile
+    const int64_t v16 = 2048;   // TASSIGN address of the second Left half-tile
+    const int64_t v17 = 4096;   // TASSIGN address of the Mat tile holding the v2 chunk
+    const int64_t v18 = 0;      // TASSIGN address 0: v1 Mat tile, first Left/Right half-tiles, accumulator
+    using T = float;
+
+#if defined(__DAV_CUBE__)
+    size_t v19 = (size_t)v12;
+    Tile<
+        TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v20 = Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v9);
+    uint64_t v21 = (uint64_t)v18;
+    TASSIGN(v20, v21);
+    pto::Shape<1, 1, 1, 16, 128> v22 = pto::Shape<1, 1, 1, 16, 128>();
+    pto::Stride<81920, 81920, 81920, 5120, 1> v23 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+    GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+        v24 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+            v1 + (v5 + v5 * (unsigned)v13 + v5 * (unsigned)v12), v22, v23
+        );
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    TLOAD(v20, v24);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    Tile<
+        TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v25 = Tile<
+            TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v9, v10);
+    uint64_t v26 = (uint64_t)v17;
+    TASSIGN(v25, v26);
+    pto::Shape<1, 1, 1, 128, 256> v27 = pto::Shape<1, 1, 1, 128, 256>();
+    pto::Stride<2228224, 2228224, 2228224, 17408, 1> v28 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>();
+    GlobalTensor<
+        bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>, pto::Layout::ND>
+        v29 = GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+            pto::Layout::ND>(v2 + (v5 + v5 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v27, v28);
+    TLOAD(v25, v29);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v30 = Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v7);
+    uint64_t v31 = (uint64_t)v18;
+    TASSIGN(v30, v31);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    TEXTRACT(v30, v20, v8, v8);
+    Tile<
+        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v32 = Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v7, v10);
+    uint64_t v33 = (uint64_t)v18;
+    TASSIGN(v32, v33);
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
+    TEXTRACT(v32, v25, v8, v8);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    Tile<
+        TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v34 = Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>(v14, v7);
+    uint64_t v35 = (uint64_t)v16;
+    TASSIGN(v34, v35);
+    TEXTRACT(v34, v20, v8, v7);
+    Tile<
+        TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+        CompactMode::Null>
+        v36 = Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>(v7, v10);
+    uint64_t v37 = (uint64_t)v15;
+    TASSIGN(v36, v37);
+    TEXTRACT(v36, v25, v7, v8);
+    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    Tile<
+        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v38 = Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v14, v10);
+    uint64_t v39 = (uint64_t)v18;
+    TASSIGN(v38, v39);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
+    TMATMUL(v38, v30, v32);
+    Tile<
+        TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+        CompactMode::Null>
+        v40 = Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>(v14, v10);
+    uint64_t v41 = (uint64_t)v18;
+    TASSIGN(v40, v41);
+    pipe_barrier(PIPE_M);
+    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1);
+    TMATMUL_ACC(v40, v40, v34, v36);
+    set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0);
+    for (size_t v42 = v19; v42 < ((size_t)v6); v42 += v19) {  // chunks 1..39; chunk 0 is the peeled code above
+        int32_t v43 = (int32_t)((uint32_t)((int32_t)v42) * (uint32_t)v9);  // chunk start: v42 * 128
+        Tile<
+            TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v44 = Tile<
+                TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v9);
+        uint64_t v45 = (uint64_t)v18;
+        TASSIGN(v44, v45);
+        pto::Shape<1, 1, 1, 16, 128> v46 = pto::Shape<1, 1, 1, 16, 128>();
+        pto::Stride<81920, 81920, 81920, 5120, 1> v47 = pto::Stride<81920, 81920, 81920, 5120, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>
+            v48 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>(
+                v1 + (v5 + v5 * (unsigned)v13 + (unsigned)v43 * (unsigned)v12), v46, v47
+            );
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        TLOAD(v44, v48);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        Tile<
+            TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v49 = Tile<
+                TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v9, v10);
+        uint64_t v50 = (uint64_t)v17;
+        TASSIGN(v49, v50);
+        pto::Shape<1, 1, 1, 128, 256> v51 = pto::Shape<1, 1, 1, 128, 256>();
+        pto::Stride<2228224, 2228224, 2228224, 17408, 1> v52 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>();
+        GlobalTensor<
+            bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+            pto::Layout::ND>
+            v53 = GlobalTensor<
+                bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>,
+                pto::Layout::ND>(v2 + (v5 + (unsigned)v43 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v51, v52);
+        wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        TLOAD(v49, v53);
+        set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v54 = Tile<
+                TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v7);
+        uint64_t v55 = (uint64_t)v18;
+        TASSIGN(v54, v55);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        TEXTRACT(v54, v44, v8, v8);
+        Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v56 = Tile<
+                TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v57 = (uint64_t)v18;
+        TASSIGN(v56, v57);
+        wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3);
+        TEXTRACT(v56, v49, v8, v8);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        Tile<
+            TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v58 = Tile<
+                TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null,
+                CompactMode::Null>(v14, v7);
+        uint64_t v59 = (uint64_t)v16;
+        TASSIGN(v58, v59);
+        wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+        TEXTRACT(v58, v44, v8, v7);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+        Tile<
+            TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+            CompactMode::Null>
+            v60 = Tile<
+                TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null,
+                CompactMode::Null>(v7, v10);
+        uint64_t v61 = (uint64_t)v15;
+        TASSIGN(v60, v61);
+        TEXTRACT(v60, v49, v7, v8);
+        set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+        set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v62 = Tile<
+                TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v14, v10);
+        uint64_t v63 = (uint64_t)v18;
+        TASSIGN(v62, v63);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2);
+        pipe_barrier(PIPE_M);
+        TMATMUL_ACC(v62, v62, v54, v56);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+        Tile<
+            TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+            CompactMode::Null>
+            v64 = Tile<
+                TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null,
+                CompactMode::Null>(v14, v10);
+        uint64_t v65 = (uint64_t)v18;
+        TASSIGN(v64, v65);
+        pipe_barrier(PIPE_M);
+        wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3);
+        TMATMUL_ACC(v64, v64, v58, v60);
+        set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+    }
+    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    pto::Shape<1, 1, 1, 16, 256> v66 = pto::Shape<1, 1, 1, 16, 256>();
+    pto::Stride<4096, 4096, 4096, 256, 1> v67 = pto::Stride<4096, 4096, 4096, 256, 1>();
+    GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> v68 =
+        GlobalTensor<float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>(
+            v3 + (v5 + v5 * (unsigned)v10 + v5 * (unsigned)v12), v66, v67
+        );
+    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
+    TSTORE(v68, v40);  // v40 shares TASSIGN address 0 with v38/v62/v64, i.e. the running accumulator
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1);
+    wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1);
+    wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2);
+#endif  // __DAV_CUBE__
+
+    ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll);
+    return;
+}
+
+// --- Kernel entry point: args[0..2] are Tensor pointers, args[3] is a raw 64-bit scalar ---
+extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) {
+    // Unpack tensor: post_norm_tile__rv_v2
+    __gm__ Tensor *post_norm_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
+    __gm__ bfloat16_t *post_norm_tile__rv_v2 =
+        reinterpret_cast<__gm__ bfloat16_t *>(post_norm_tile__rv_v2_tensor->buffer.addr) +
+        post_norm_tile__rv_v2_tensor->start_offset;
+
+    // Unpack tensor: w_gate__ssa_v0
+    __gm__ Tensor *w_gate__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
+    __gm__ bfloat16_t *w_gate__ssa_v0 =
+        reinterpret_cast<__gm__ bfloat16_t *>(w_gate__ssa_v0_tensor->buffer.addr) + w_gate__ssa_v0_tensor->start_offset;
+
+    // Unpack tensor: ret0__out
+    __gm__ Tensor *ret0__out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]);
+    __gm__ float *ret0__out =
+        reinterpret_cast<__gm__ float *>(ret0__out_tensor->buffer.addr) + ret0__out_tensor->start_offset;
+
+    // Unpack scalar: o0__ssa_v1
+    union {
+        uint64_t u64;
+        int64_t val;
+    } o0__ssa_v1_conv;
+    o0__ssa_v1_conv.u64 = args[3];
+    int64_t o0__ssa_v1 = o0__ssa_v1_conv.val;
+
+    // Forward to ptoas-generated function (the int64_t scalar narrows implicitly to the int32_t v4 parameter)
+    gate_proj(post_norm_tile__rv_v2, w_gate__ssa_v0, ret0__out, o0__ssa_v1);
+}
diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/kv_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/kv_proj.cpp
new file mode 100644
index 000000000..2e865ac03
--- /dev/null
+++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/kv_proj.cpp
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: kv_proj +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void kv_proj( + __gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, __gm__ bfloat16_t *v4, __gm__ bfloat16_t *v5, int32_t v6, + int32_t v7 +) { + unsigned v8 = 0; + const int32_t v9 = 10; + const int32_t v10 = 256; + const int32_t v11 = 0; + const int32_t v12 = 512; + const int32_t v13 = 64; + const int32_t v14 = 4; + const int32_t v15 = 5120; + const int32_t v16 = 1; + const int32_t v17 = 1024; + const int32_t v18 = 16; + const int64_t v19 = 32768; + const int64_t v20 = 8192; + const int64_t v21 = 4096; + const int64_t v22 = 16384; + const int64_t v23 = 0; + using T = float; + +#if defined(__DAV_CUBE__) + size_t v24 = (size_t)v16; + size_t v25 = (size_t)v9; + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_M, 
PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID6); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID7); + for (size_t v26 = (size_t)v6; v26 < ((size_t)((int32_t)(uint32_t)v6 + (uint32_t)v14)); v26 += v24) { + int32_t v27 = (int32_t)((uint32_t)((int32_t)v26) * (uint32_t)v13); + Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v28 = Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v18, v12); + uint64_t v29 = (uint64_t)v23; + TASSIGN(v28, v29); + pto::Shape<1, 1, 1, 16, 512> v30 = pto::Shape<1, 1, 1, 16, 512>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v31 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v32 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v3 + (v8 + v8 * (unsigned)v15 + v8 * (unsigned)v16), v30, v31 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v28, v32); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v33 = Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v12, v13); + uint64_t v34 = (uint64_t)v22; + TASSIGN(v33, v34); + pto::Shape<1, 1, 1, 512, 64> v35 = pto::Shape<1, 1, 1, 512, 64>(); + pto::Stride<524288, 524288, 524288, 1024, 1> v36 = pto::Stride<524288, 524288, 
524288, 1024, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND> + v37 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, + pto::Layout::ND>(v4 + (v8 + v8 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v35, v36); + TLOAD(v33, v37); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v38 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v18, v10); + uint64_t v39 = (uint64_t)v20; + TASSIGN(v38, v39); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TEXTRACT(v38, v28, v11, v11); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v40 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v10, v13); + uint64_t v41 = (uint64_t)v19; + TASSIGN(v40, v41); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TEXTRACT(v40, v33, v11, v11); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v42 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v18, v10); + uint64_t v43 = (uint64_t)v23; + TASSIGN(v42, v43); + TEXTRACT(v42, v28, v11, v10); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v44 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v10, v13); + 
uint64_t v45 = (uint64_t)v23; + TASSIGN(v44, v45); + TEXTRACT(v44, v33, v10, v11); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v46 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v47 = (uint64_t)v21; + TASSIGN(v46, v47); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v46, v38, v40); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v48 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v49 = (uint64_t)v21; + TASSIGN(v48, v49); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL_ACC(v48, v48, v42, v44); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + for (size_t v50 = v24; v50 < v25; v50 += v24) { + int32_t v51 = (int32_t)((uint32_t)((int32_t)v50) * (uint32_t)v12); + Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v52 = Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v18, v12); + uint64_t v53 = (uint64_t)v23; + TASSIGN(v52, v53); + pto::Shape<1, 1, 1, 16, 512> v54 = pto::Shape<1, 1, 1, 16, 512>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v55 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v56 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 
81920, 81920, 5120, 1>, + pto::Layout::ND>(v3 + (v8 + v8 * (unsigned)v15 + (unsigned)v51 * (unsigned)v16), v54, v55); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + TLOAD(v52, v56); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v57 = Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v12, v13); + uint64_t v58 = (uint64_t)v22; + TASSIGN(v57, v58); + pto::Shape<1, 1, 1, 512, 64> v59 = pto::Shape<1, 1, 1, 512, 64>(); + pto::Stride<524288, 524288, 524288, 1024, 1> v60 = pto::Stride<524288, 524288, 524288, 1024, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND> + v61 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, + pto::Layout::ND>( + v4 + (v8 + (unsigned)v51 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v59, v60 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + TLOAD(v57, v61); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v62 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v18, v10); + uint64_t v63 = (uint64_t)v20; + TASSIGN(v62, v63); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + TEXTRACT(v62, v52, v11, v11); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v64 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v10, v13); + uint64_t v65 = (uint64_t)v19; + TASSIGN(v64, v65); + 
wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + TEXTRACT(v64, v57, v11, v11); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v66 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v18, v10); + uint64_t v67 = (uint64_t)v23; + TASSIGN(v66, v67); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); + TEXTRACT(v66, v52, v11, v10); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v68 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v10, v13); + uint64_t v69 = (uint64_t)v23; + TASSIGN(v68, v69); + TEXTRACT(v68, v57, v10, v11); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v70 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v71 = (uint64_t)v21; + TASSIGN(v70, v71); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2); + pipe_barrier(PIPE_M); + TMATMUL_ACC(v70, v70, v62, v64); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v72 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v73 = (uint64_t)v21; + TASSIGN(v72, v73); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3); + TMATMUL_ACC(v72, v72, v66, v68); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); + }; + 
set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID4); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID4); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 64> v74 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<16384, 16384, 16384, 1024, 1> v75 = pto::Stride<16384, 16384, 16384, 1024, 1>(); + GlobalTensor, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND> + v76 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>( + v1 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v74, v75 + ); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(v76, v48); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v77 = Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v18, v12); + uint64_t v78 = (uint64_t)v23; + TASSIGN(v77, v78); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID4); + TLOAD(v77, v32); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID4); + Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v79 = Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v12, v13); + uint64_t v80 = (uint64_t)v22; + TASSIGN(v79, v80); + pto::Shape<1, 1, 1, 512, 64> v81 = pto::Shape<1, 1, 1, 512, 64>(); + pto::Stride<524288, 524288, 524288, 1024, 1> v82 = pto::Stride<524288, 524288, 524288, 1024, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND> + v83 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, + pto::Layout::ND>(v5 + (v8 + v8 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v81, v82); + TLOAD(v79, v83); + 
set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID5); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v84 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v18, v10); + uint64_t v85 = (uint64_t)v20; + TASSIGN(v84, v85); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID4); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID4); + TEXTRACT(v84, v77, v11, v11); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v86 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v10, v13); + uint64_t v87 = (uint64_t)v19; + TASSIGN(v86, v87); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID5); + TEXTRACT(v86, v79, v11, v11); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID4); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v88 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v18, v10); + uint64_t v89 = (uint64_t)v23; + TASSIGN(v88, v89); + TEXTRACT(v88, v77, v11, v10); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v90 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v10, v13); + uint64_t v91 = (uint64_t)v23; + TASSIGN(v90, v91); + TEXTRACT(v90, v79, v10, v11); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID5); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID5); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v92 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, 
-1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v93 = (uint64_t)v21; + TASSIGN(v92, v93); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID4); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID1); + TMATMUL(v92, v84, v86); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v94 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v95 = (uint64_t)v21; + TASSIGN(v94, v95); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID5); + TMATMUL_ACC(v94, v94, v88, v90); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID5); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID5); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID5); + for (size_t v96 = v24; v96 < v25; v96 += v24) { + int32_t v97 = (int32_t)((uint32_t)((int32_t)v96) * (uint32_t)v12); + Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v98 = Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v18, v12); + uint64_t v99 = (uint64_t)v23; + TASSIGN(v98, v99); + pto::Shape<1, 1, 1, 16, 512> v100 = pto::Shape<1, 1, 1, 16, 512>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v101 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v102 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, + pto::Layout::ND>(v3 + (v8 + v8 * (unsigned)v15 + (unsigned)v97 * (unsigned)v16), v100, v101); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6); + TLOAD(v98, v102); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID6); + Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + 
v103 = Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v12, v13); + uint64_t v104 = (uint64_t)v22; + TASSIGN(v103, v104); + pto::Shape<1, 1, 1, 512, 64> v105 = pto::Shape<1, 1, 1, 512, 64>(); + pto::Stride<524288, 524288, 524288, 1024, 1> v106 = pto::Stride<524288, 524288, 524288, 1024, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, pto::Layout::ND> + v107 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<524288, 524288, 524288, 1024, 1>, + pto::Layout::ND>( + v5 + (v8 + (unsigned)v97 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v105, v106 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7); + TLOAD(v103, v107); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID7); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v108 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v18, v10); + uint64_t v109 = (uint64_t)v20; + TASSIGN(v108, v109); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID6); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID6); + TEXTRACT(v108, v98, v11, v11); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v110 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v10, v13); + uint64_t v111 = (uint64_t)v19; + TASSIGN(v110, v111); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID7); + TEXTRACT(v110, v103, v11, v11); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID6); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v112 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, 
SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v18, v10); + uint64_t v113 = (uint64_t)v23; + TASSIGN(v112, v113); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID7); + TEXTRACT(v112, v98, v11, v10); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v114 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v10, v13); + uint64_t v115 = (uint64_t)v23; + TASSIGN(v114, v115); + TEXTRACT(v114, v103, v10, v11); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID7); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v116 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v117 = (uint64_t)v21; + TASSIGN(v116, v117); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID6); + pipe_barrier(PIPE_M); + TMATMUL_ACC(v116, v116, v108, v110); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID6); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v118 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v18, v13); + uint64_t v119 = (uint64_t)v21; + TASSIGN(v118, v119); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID7); + TMATMUL_ACC(v118, v118, v112, v114); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID7); + }; + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID1); + pto::Shape<1, 1, 1, 16, 64> v120 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<16384, 16384, 16384, 1024, 1> v121 = pto::Stride<16384, 16384, 16384, 1024, 1>(); + GlobalTensor, 
pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND> + v122 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>( + v2 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v27 * (unsigned)v16), v120, v121 + ); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID1); + TSTORE(v122, v94); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID6); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID7); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID6); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID7); +#endif // __DAV_CUBE__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: k_proj__iter_v3 + __gm__ Tensor *k_proj__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *k_proj__iter_v3 = + reinterpret_cast<__gm__ float *>(k_proj__iter_v3_tensor->buffer.addr) + k_proj__iter_v3_tensor->start_offset; + + // Unpack tensor: v_proj__iter_v3 + __gm__ Tensor *v_proj__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *v_proj__iter_v3 = + reinterpret_cast<__gm__ float *>(v_proj__iter_v3_tensor->buffer.addr) + v_proj__iter_v3_tensor->start_offset; + + // Unpack tensor: normed_tile__rv_v2 + __gm__ Tensor *normed_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ bfloat16_t *normed_tile__rv_v2 = + reinterpret_cast<__gm__ bfloat16_t *>(normed_tile__rv_v2_tensor->buffer.addr) + + normed_tile__rv_v2_tensor->start_offset; + + // Unpack tensor: wk__ssa_v0 + __gm__ Tensor *wk__ssa_v0_tensor = 
reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ bfloat16_t *wk__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(wk__ssa_v0_tensor->buffer.addr) + wk__ssa_v0_tensor->start_offset; + + // Unpack tensor: wv__ssa_v0 + __gm__ Tensor *wv__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ bfloat16_t *wv__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(wv__ssa_v0_tensor->buffer.addr) + wv__ssa_v0_tensor->start_offset; + + // Unpack scalar: ob_chunk__idx_v0 + union { + uint64_t u64; + int64_t val; + } ob_chunk__idx_v0_conv; + ob_chunk__idx_v0_conv.u64 = args[5]; + int64_t ob_chunk__idx_v0 = ob_chunk__idx_v0_conv.val; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[6]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Forward to ptoas-generated function + kv_proj(k_proj__iter_v3, v_proj__iter_v3, normed_tile__rv_v2, wk__ssa_v0, wv__ssa_v0, ob_chunk__idx_v0, b0__idx_v0); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/out_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/out_proj.cpp new file mode 100644 index 000000000..327c26af5 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/out_proj.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: out_proj +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void +out_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4, int32_t v5) { + unsigned v6 = 0; + const int32_t v7 = 40; + const int32_t v8 = 128; + const int32_t v9 = 64; + const int32_t v10 = 1; + const int32_t v11 = 5120; + const int32_t v12 = 16; + const int64_t v13 = 4096; + const int64_t v14 = 0; + using T = float; + +#if defined(__DAV_CUBE__) + size_t v15 = (size_t)v10; + Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v16 = Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v12, v8); + uint64_t v17 = (uint64_t)v14; + TASSIGN(v16, v17); + pto::Shape<1, 1, 1, 16, 128> v18 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v19 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v20 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, 
pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v6 + (unsigned)v4 * (unsigned)v11 + v6 * (unsigned)v10), v18, v19 + ); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TLOAD(v16, v20); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile< + TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v21 = Tile< + TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v8, v9); + uint64_t v22 = (uint64_t)v13; + TASSIGN(v21, v22); + pto::Shape<1, 1, 1, 128, 64> v23 = pto::Shape<1, 1, 1, 128, 64>(); + pto::Stride<655360, 655360, 655360, 5120, 1> v24 = pto::Stride<655360, 655360, 655360, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>, pto::Layout::ND> + v25 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>, pto::Layout::ND>( + v2 + (v6 + v6 * (unsigned)v11 + (unsigned)v5 * (unsigned)v10), v23, v24 + ); + TLOAD(v21, v25); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v26 = Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v12, v8); + uint64_t v27 = (uint64_t)v14; + TASSIGN(v26, v27); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(v26, v16); + Tile< + TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v28 = Tile< + TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v8, v9); + uint64_t v29 = (uint64_t)v14; + TASSIGN(v28, v29); + 
wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(v28, v21); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v30 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v12, v9); + uint64_t v31 = (uint64_t)v14; + TASSIGN(v30, v31); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + TMATMUL(v30, v26, v28); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + for (size_t v32 = v15; v32 < ((size_t)v7); v32 += v15) { + int32_t v33 = (int32_t)((uint32_t)((int32_t)v32) * (uint32_t)v8); + Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v34 = Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v12, v8); + uint64_t v35 = (uint64_t)v14; + TASSIGN(v34, v35); + pto::Shape<1, 1, 1, 16, 128> v36 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v37 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v38 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v6 + (unsigned)v4 * (unsigned)v11 + (unsigned)v33 * (unsigned)v10), v36, v37 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + TLOAD(v34, v38); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + Tile< + TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v39 = Tile< + TileType::Mat, bfloat16_t, 128, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, 
PadValue::Null, + CompactMode::Null>(v8, v9); + uint64_t v40 = (uint64_t)v13; + TASSIGN(v39, v40); + pto::Shape<1, 1, 1, 128, 64> v41 = pto::Shape<1, 1, 1, 128, 64>(); + pto::Stride<655360, 655360, 655360, 5120, 1> v42 = pto::Stride<655360, 655360, 655360, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>, pto::Layout::ND> + v43 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 64>, pto::Stride<655360, 655360, 655360, 5120, 1>, + pto::Layout::ND>(v2 + (v6 + (unsigned)v33 * (unsigned)v11 + (unsigned)v5 * (unsigned)v10), v41, v42); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + TLOAD(v39, v43); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v44 = Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v12, v8); + uint64_t v45 = (uint64_t)v14; + TASSIGN(v44, v45); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TMOV(v44, v34); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + Tile< + TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v46 = Tile< + TileType::Right, bfloat16_t, 128, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v8, v9); + uint64_t v47 = (uint64_t)v14; + TASSIGN(v46, v47); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + TMOV(v46, v39); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v48 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v12, v9); + uint64_t v49 = 
(uint64_t)v14; + TASSIGN(v48, v49); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL_ACC(v48, v48, v44, v46); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + } + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 64> v50 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<1024, 1024, 1024, 64, 1> v51 = pto::Stride<1024, 1024, 1024, 64, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND> v52 = + GlobalTensor, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND>( + v3 + (v6 + v6 * (unsigned)v9 + v6 * (unsigned)v10), v50, v51 + ); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(v52, v30); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); +#endif // __DAV_CUBE__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: attn_out__rv_v2 + __gm__ Tensor *attn_out__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *attn_out__rv_v2 = reinterpret_cast<__gm__ bfloat16_t *>(attn_out__rv_v2_tensor->buffer.addr) + + attn_out__rv_v2_tensor->start_offset; + + // Unpack tensor: wo__ssa_v0 + __gm__ Tensor *wo__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *wo__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(wo__ssa_v0_tensor->buffer.addr) + wo__ssa_v0_tensor->start_offset; + + // Unpack tensor: ret0__out + __gm__ Tensor *ret0__out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *ret0__out = + reinterpret_cast<__gm__ float *>(ret0__out_tensor->buffer.addr) + ret0__out_tensor->start_offset; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[3]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Unpack scalar: o0__ssa_v0 + union { + uint64_t u64; + 
int64_t val; + } o0__ssa_v0_conv; + o0__ssa_v0_conv.u64 = args[4]; + int64_t o0__ssa_v0 = o0__ssa_v0_conv.val; + + // Forward to ptoas-generated function + out_proj(attn_out__rv_v2, wo__ssa_v0, ret0__out, b0__idx_v0, o0__ssa_v0); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/q_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/q_proj.cpp new file mode 100644 index 000000000..6bfcd7cb6 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/q_proj.cpp @@ -0,0 +1,357 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: q_proj +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void q_proj(__gm__ float *v1, __gm__ bfloat16_t *v2, __gm__ bfloat16_t *v3, int32_t v4, int32_t v5) { + unsigned v6 = 0; + const int32_t v7 = 10; + const int32_t v8 = 256; + const int32_t v9 = 0; + const int32_t v10 = 512; + const int32_t v11 = 64; + const int32_t v12 = 4; + const int32_t v13 = 1; + const int32_t v14 = 5120; + const int32_t v15 = 16; + const int64_t v16 = 32768; + const int64_t v17 = 8192; + const int64_t v18 = 16384; + const int64_t v19 = 0; + using T = float; + +#if defined(__DAV_CUBE__) + size_t v20 = (size_t)v13; + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); + for (size_t v21 = (size_t)v4; v21 < ((size_t)((int32_t)(uint32_t)v4 + (uint32_t)v12)); v21 += v20) { + int32_t v22 = (int32_t)((uint32_t)((int32_t)v21) * (uint32_t)v11); + Tile< + TileType::Mat, bfloat16_t, 
16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v23 = Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v15, v10); + uint64_t v24 = (uint64_t)v19; + TASSIGN(v23, v24); + pto::Shape<1, 1, 1, 16, 512> v25 = pto::Shape<1, 1, 1, 16, 512>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v26 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v27 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v2 + (v6 + v6 * (unsigned)v14 + v6 * (unsigned)v13), v25, v26 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v23, v27); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v28 = Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v10, v11); + uint64_t v29 = (uint64_t)v18; + TASSIGN(v28, v29); + pto::Shape<1, 1, 1, 512, 64> v30 = pto::Shape<1, 1, 1, 512, 64>(); + pto::Stride<2621440, 2621440, 2621440, 5120, 1> v31 = pto::Stride<2621440, 2621440, 2621440, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>, pto::Layout::ND> + v32 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>, + pto::Layout::ND>(v3 + (v6 + v6 * (unsigned)v14 + (unsigned)v22 * (unsigned)v13), v30, v31); + TLOAD(v28, v32); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v33 = Tile< + TileType::Left, bfloat16_t, 16, 256, 
BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v15, v8); + uint64_t v34 = (uint64_t)v19; + TASSIGN(v33, v34); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TEXTRACT(v33, v23, v9, v9); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v35 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v8, v11); + uint64_t v36 = (uint64_t)v19; + TASSIGN(v35, v36); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TEXTRACT(v35, v28, v9, v9); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v37 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v15, v8); + uint64_t v38 = (uint64_t)v17; + TASSIGN(v37, v38); + TEXTRACT(v37, v23, v9, v8); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v39 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v8, v11); + uint64_t v40 = (uint64_t)v16; + TASSIGN(v39, v40); + TEXTRACT(v39, v28, v8, v9); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v41 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v15, v11); + uint64_t v42 = (uint64_t)v19; + TASSIGN(v41, v42); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v41, v33, v35); + 
Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v43 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v15, v11); + uint64_t v44 = (uint64_t)v19; + TASSIGN(v43, v44); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL_ACC(v43, v43, v37, v39); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + for (size_t v45 = v20; v45 < ((size_t)v7); v45 += v20) { + int32_t v46 = (int32_t)((uint32_t)((int32_t)v45) * (uint32_t)v10); + Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v47 = Tile< + TileType::Mat, bfloat16_t, 16, 512, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v15, v10); + uint64_t v48 = (uint64_t)v19; + TASSIGN(v47, v48); + pto::Shape<1, 1, 1, 16, 512> v49 = pto::Shape<1, 1, 1, 16, 512>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v50 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v51 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, + pto::Layout::ND>(v2 + (v6 + v6 * (unsigned)v14 + (unsigned)v46 * (unsigned)v13), v49, v50); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + TLOAD(v47, v51); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v52 = Tile< + TileType::Mat, bfloat16_t, 512, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v10, v11); + uint64_t v53 = (uint64_t)v18; + TASSIGN(v52, v53); + pto::Shape<1, 1, 1, 512, 64> v54 = 
pto::Shape<1, 1, 1, 512, 64>(); + pto::Stride<2621440, 2621440, 2621440, 5120, 1> v55 = pto::Stride<2621440, 2621440, 2621440, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>, + pto::Layout::ND> + v56 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 512, 64>, pto::Stride<2621440, 2621440, 2621440, 5120, 1>, + pto::Layout::ND>( + v3 + (v6 + (unsigned)v46 * (unsigned)v14 + (unsigned)v22 * (unsigned)v13), v54, v55 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + TLOAD(v52, v56); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v57 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v15, v8); + uint64_t v58 = (uint64_t)v19; + TASSIGN(v57, v58); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + TEXTRACT(v57, v47, v9, v9); + Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v59 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v8, v11); + uint64_t v60 = (uint64_t)v19; + TASSIGN(v59, v60); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + TEXTRACT(v59, v52, v9, v9); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2); + Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v61 = Tile< + TileType::Left, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v15, v8); + uint64_t v62 = (uint64_t)v17; + TASSIGN(v61, v62); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); + TEXTRACT(v61, v47, v9, v8); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + Tile< + 
TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v63 = Tile< + TileType::Right, bfloat16_t, 256, 64, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v8, v11); + uint64_t v64 = (uint64_t)v16; + TASSIGN(v63, v64); + TEXTRACT(v63, v52, v8, v9); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v65 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v15, v11); + uint64_t v66 = (uint64_t)v19; + TASSIGN(v65, v66); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2); + pipe_barrier(PIPE_M); + TMATMUL_ACC(v65, v65, v57, v59); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v67 = Tile< + TileType::Acc, float, 16, 64, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v15, v11); + uint64_t v68 = (uint64_t)v19; + TASSIGN(v67, v68); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3); + TMATMUL_ACC(v67, v67, v61, v63); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); + }; + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 64> v69 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v70 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v71 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v6 + (unsigned)v5 * (unsigned)v14 + (unsigned)v22 * (unsigned)v13), v69, v70 + ); + wait_flag(PIPE_M, 
PIPE_FIX, EVENT_ID0); + TSTORE(v71, v43); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID3); +#endif // __DAV_CUBE__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: q_proj__iter_v3 + __gm__ Tensor *q_proj__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *q_proj__iter_v3 = + reinterpret_cast<__gm__ float *>(q_proj__iter_v3_tensor->buffer.addr) + q_proj__iter_v3_tensor->start_offset; + + // Unpack tensor: normed_tile__rv_v2 + __gm__ Tensor *normed_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *normed_tile__rv_v2 = + reinterpret_cast<__gm__ bfloat16_t *>(normed_tile__rv_v2_tensor->buffer.addr) + + normed_tile__rv_v2_tensor->start_offset; + + // Unpack tensor: wq__ssa_v0 + __gm__ Tensor *wq__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ bfloat16_t *wq__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(wq__ssa_v0_tensor->buffer.addr) + wq__ssa_v0_tensor->start_offset; + + // Unpack scalar: ob_chunk__idx_v0 + union { + uint64_t u64; + int64_t val; + } ob_chunk__idx_v0_conv; + ob_chunk__idx_v0_conv.u64 = args[3]; + int64_t ob_chunk__idx_v0 = ob_chunk__idx_v0_conv.val; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[4]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Forward to ptoas-generated function + q_proj(q_proj__iter_v3, normed_tile__rv_v2, wq__ssa_v0, ob_chunk__idx_v0, b0__idx_v0); +} diff --git 
a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/qk_matmul.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/qk_matmul.cpp new file mode 100644 index 000000000..bb82ee9fb --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/qk_matmul.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: qk_matmul +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void qk_matmul( + __gm__ float *v1, __gm__ bfloat16_t *v2, 
__gm__ int32_t *v3, __gm__ bfloat16_t *v4, int32_t v5, int32_t v6, + int32_t v7, int32_t v8, int32_t v9, int32_t v10 +) { + unsigned v11 = 0; + const int32_t v12 = 2; + const int32_t v13 = 64; + const int32_t v14 = 16; + const int32_t v15 = 8; + const int32_t v16 = 0; + const int32_t v17 = 128; + const int32_t v18 = 1; + const int32_t v19 = 256; + const int64_t v20 = 2048; + const int64_t v21 = 32768; + const int64_t v22 = 4096; + const int64_t v23 = 0; + using T = float; + +#if defined(__DAV_CUBE__) + size_t v24 = (size_t)v18; + size_t v25 = (size_t)v16; + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + for (size_t v26 = v25; v26 < ((size_t)v15); v26 += v24) { + int32_t v27 = (int32_t)v26; + Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v28 = Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v14, v17); + uint64_t v29 = (uint64_t)v23; + TASSIGN(v28, v29); + pto::Shape<1, 1, 1, 16, 128> v30 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<2048, 2048, 2048, 128, 1> v31 = pto::Stride<2048, 2048, 2048, 128, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> + v32 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>( + v2 + (v11 + + (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)v5 * (uint32_t)v17) + + (uint32_t)((int32_t)(uint32_t)v27 * (uint32_t)v14)) * + (unsigned)v17 + + v11 * (unsigned)v18), + v30, v31 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + pipe_barrier(PIPE_MTE2); + TLOAD(v28, v32); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + for (size_t v33 = v25; v33 < ((size_t)v13); v33 += v24) 
{ + int32_t v34 = (int32_t)((uint32_t)v6 + (uint32_t)((int32_t)v33)); + __gm__ float *v35; + if (v34 < v7) { + int32_t v36 = v3[(int32_t)((uint32_t)v8 + (uint32_t)v34)]; + Tile< + TileType::Mat, bfloat16_t, 128, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null> + v37 = Tile< + TileType::Mat, bfloat16_t, 128, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v17, v19); + uint64_t v38 = (uint64_t)v22; + TASSIGN(v37, v38); + pto::Shape<1, 1, 1, 128, 256> v39 = pto::Shape<1, 1, 1, 128, 256>(); + pto::Stride<128, 128, 128, 1, 128> v40 = pto::Stride<128, 128, 128, 1, 128>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<128, 128, 128, 1, 128>, pto::Layout::DN> + v41 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<128, 128, 128, 1, 128>, pto::Layout::DN>( + v4 + (v11 + v11 * (unsigned)v18 + + (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v36 * + (uint32_t)v15) + + (uint32_t)v27) * + (uint32_t)v19) * + (unsigned)v17), + v39, v40 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + TLOAD(v37, v41); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null> + v42 = Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v14, v13); + uint64_t v43 = (uint64_t)v23; + TASSIGN(v42, v43); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TEXTRACT(v42, v28, v16, v16); + Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null> + v44 = Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v13, v19); + uint64_t v45 = (uint64_t)v21; + TASSIGN(v44, v45); + wait_flag(PIPE_MTE2, 
PIPE_MTE1, EVENT_ID1); + TEXTRACT(v44, v37, v16, v16); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null> + v46 = Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v14, v13); + uint64_t v47 = (uint64_t)v20; + TASSIGN(v46, v47); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TEXTRACT(v46, v28, v16, v13); + Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null> + v48 = Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v13, v19); + uint64_t v49 = (uint64_t)v23; + TASSIGN(v48, v49); + TEXTRACT(v48, v37, v13, v16); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v50 = Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, + PadValue::Null, CompactMode::Null>(v14, v19); + uint64_t v51 = (uint64_t)v23; + TASSIGN(v50, v51); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v50, v42, v44); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v52 = Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, + PadValue::Null, CompactMode::Null>(v14, v19); + uint64_t v53 = (uint64_t)v23; + TASSIGN(v52, v53); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL_ACC(v52, v52, v46, v48); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 256> 
v54 = pto::Shape<1, 1, 1, 16, 256>(); + pto::Stride<4096, 4096, 4096, 256, 1> v55 = pto::Stride<4096, 4096, 4096, 256, 1>(); + GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> + v56 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>( + v1 + (v11 + + (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v27 * + (uint32_t)v12) + + (uint32_t)v34) * + (uint32_t)v14) * + (unsigned)v19 + + v11 * (unsigned)v18), + v54, v55 + ); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(v56, v52); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + v35 = v1; + } else { + v35 = v1; + }; + }; + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); +#endif // __DAV_CUBE__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: all_raw_scores__iter_v1 + __gm__ Tensor *all_raw_scores__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *all_raw_scores__iter_v1 = + reinterpret_cast<__gm__ float *>(all_raw_scores__iter_v1_tensor->buffer.addr) + + all_raw_scores__iter_v1_tensor->start_offset; + + // Unpack tensor: all_q_padded__rv_v7 + __gm__ Tensor *all_q_padded__rv_v7_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *all_q_padded__rv_v7 = + reinterpret_cast<__gm__ bfloat16_t *>(all_q_padded__rv_v7_tensor->buffer.addr) + + all_q_padded__rv_v7_tensor->start_offset; + + // Unpack tensor: block_table__ssa_v0 + __gm__ Tensor *block_table__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ int32_t *block_table__ssa_v0 = 
reinterpret_cast<__gm__ int32_t *>(block_table__ssa_v0_tensor->buffer.addr) + + block_table__ssa_v0_tensor->start_offset; + + // Unpack tensor: k_cache__rv_v6 + __gm__ Tensor *k_cache__rv_v6_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ bfloat16_t *k_cache__rv_v6 = + reinterpret_cast<__gm__ bfloat16_t *>(k_cache__rv_v6_tensor->buffer.addr) + k_cache__rv_v6_tensor->start_offset; + + // Unpack scalar: b__idx_v0 + union { + uint64_t u64; + int64_t val; + } b__idx_v0_conv; + b__idx_v0_conv.u64 = args[4]; + int64_t b__idx_v0 = b__idx_v0_conv.val; + + // Unpack scalar: sb_chunk__idx_v0 + union { + uint64_t u64; + int64_t val; + } sb_chunk__idx_v0_conv; + sb_chunk__idx_v0_conv.u64 = args[5]; + int64_t sb_chunk__idx_v0 = sb_chunk__idx_v0_conv.val; + + // Unpack scalar: ctx_blocks__ssa_v0 + union { + uint64_t u64; + int64_t val; + } ctx_blocks__ssa_v0_conv; + ctx_blocks__ssa_v0_conv.u64 = args[6]; + int64_t ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val; + + // Unpack scalar: block_table_base__ssa_v0 + union { + uint64_t u64; + int64_t val; + } block_table_base__ssa_v0_conv; + block_table_base__ssa_v0_conv.u64 = args[7]; + int64_t block_table_base__ssa_v0 = block_table_base__ssa_v0_conv.val; + + // Extract dynamic dim: BLOCK_TABLE_FLAT_DYN + int64_t BLOCK_TABLE_FLAT_DYN = static_cast(block_table__ssa_v0_tensor->shapes[0]); + + // Extract dynamic dim: KV_CACHE_ROWS_DYN + int64_t KV_CACHE_ROWS_DYN = static_cast(k_cache__rv_v6_tensor->shapes[0]); + + // Forward to ptoas-generated function + qk_matmul( + all_raw_scores__iter_v1, all_q_padded__rv_v7, block_table__ssa_v0, k_cache__rv_v6, b__idx_v0, sb_chunk__idx_v0, + ctx_blocks__ssa_v0, block_table_base__ssa_v0, BLOCK_TABLE_FLAT_DYN, KV_CACHE_ROWS_DYN + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/sv_matmul.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/sv_matmul.cpp new file mode 100644 index 000000000..7186dd07e --- /dev/null +++ 
b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/sv_matmul.cpp @@ -0,0 +1,291 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: sv_matmul +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void sv_matmul( + __gm__ float *v1, __gm__ int32_t *v2, __gm__ bfloat16_t *v3, __gm__ bfloat16_t *v4, int32_t v5, int32_t v6, + int32_t v7, int32_t v8, int32_t v9 +) { + unsigned v10 = 0; + const int32_t v11 = 2; + const int32_t v12 = 16; + const int32_t v13 = 64; + const int32_t v14 = 8; + const int32_t 
v15 = 0; + const int32_t v16 = 1; + const int32_t v17 = 128; + const int32_t v18 = 256; + const int64_t v19 = 4096; + const int64_t v20 = 32768; + const int64_t v21 = 8192; + const int64_t v22 = 0; + using T = float; + +#if defined(__DAV_CUBE__) + size_t v23 = (size_t)v16; + size_t v24 = (size_t)v15; + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + for (size_t v25 = v24; v25 < ((size_t)v14); v25 += v23) { + int32_t v26 = (int32_t)v25; + for (size_t v27 = v24; v27 < ((size_t)v13); v27 += v23) { + int32_t v28 = (int32_t)((uint32_t)v5 + (uint32_t)((int32_t)v27)); + __gm__ float *v29; + if (v28 < v6) { + int32_t v30 = v2[(int32_t)((uint32_t)v7 + (uint32_t)v28)]; + Tile< + TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null> + v31 = Tile< + TileType::Mat, bfloat16_t, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v12, v18); + uint64_t v32 = (uint64_t)v22; + TASSIGN(v31, v32); + int32_t v33 = + (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v26 * (uint32_t)v11) + (uint32_t)v28) * + (uint32_t)v12); + pto::Shape<1, 1, 1, 16, 256> v34 = pto::Shape<1, 1, 1, 16, 256>(); + pto::Stride<4096, 4096, 4096, 256, 1> v35 = pto::Stride<4096, 4096, 4096, 256, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> + v36 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, + pto::Layout::ND>(v3 + (v10 + (unsigned)v33 * (unsigned)v18 + v10 * (unsigned)v16), v34, v35); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + TLOAD(v31, v36); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile< + TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + 
PadValue::Null, CompactMode::Null> + v37 = Tile< + TileType::Mat, bfloat16_t, 256, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v18, v17); + uint64_t v38 = (uint64_t)v21; + TASSIGN(v37, v38); + pto::Shape<1, 1, 1, 256, 128> v39 = pto::Shape<1, 1, 1, 256, 128>(); + pto::Stride<32768, 32768, 32768, 128, 1> v40 = pto::Stride<32768, 32768, 32768, 128, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<32768, 32768, 32768, 128, 1>, + pto::Layout::ND> + v41 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 256, 128>, pto::Stride<32768, 32768, 32768, 128, 1>, + pto::Layout::ND>( + v4 + (v10 + + (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v30 * + (uint32_t)v14) + + (uint32_t)v26) * + (uint32_t)v18) * + (unsigned)v17 + + v10 * (unsigned)v16), + v39, v40 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + TLOAD(v37, v41); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null> + v42 = Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v12, v17); + uint64_t v43 = (uint64_t)v22; + TASSIGN(v42, v43); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + TEXTRACT(v42, v31, v15, v15); + Tile< + TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null> + v44 = Tile< + TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v17, v17); + uint64_t v45 = (uint64_t)v20; + TASSIGN(v44, v45); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TEXTRACT(v44, v37, v15, v15); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + 
PadValue::Null, CompactMode::Null> + v46 = Tile< + TileType::Left, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, + PadValue::Null, CompactMode::Null>(v12, v17); + uint64_t v47 = (uint64_t)v19; + TASSIGN(v46, v47); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TEXTRACT(v46, v31, v15, v17); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + Tile< + TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null> + v48 = Tile< + TileType::Right, bfloat16_t, 128, 128, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, + PadValue::Null, CompactMode::Null>(v17, v17); + uint64_t v49 = (uint64_t)v22; + TASSIGN(v48, v49); + TEXTRACT(v48, v37, v17, v15); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + Tile< + TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v50 = Tile< + TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, + PadValue::Null, CompactMode::Null>(v12, v17); + uint64_t v51 = (uint64_t)v22; + TASSIGN(v50, v51); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + TMATMUL(v50, v42, v44); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + Tile< + TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v52 = Tile< + TileType::Acc, float, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, + PadValue::Null, CompactMode::Null>(v12, v17); + uint64_t v53 = (uint64_t)v22; + TASSIGN(v52, v53); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL_ACC(v52, v52, v46, v48); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 128> v54 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<2048, 2048, 2048, 128, 1> v55 = pto::Stride<2048, 2048, 2048, 128, 1>(); + GlobalTensor< + 
float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> + v56 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>( + v1 + (v10 + (unsigned)v33 * (unsigned)v17 + v10 * (unsigned)v16), v54, v55 + ); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(v56, v52); + set_flag(PIPE_FIX, PIPE_M, EVENT_ID0); + v29 = v1; + } else { + v29 = v1; + }; + }; + } + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_FIX, PIPE_M, EVENT_ID0); +#endif // __DAV_CUBE__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: all_oi_tmp__iter_v1 + __gm__ Tensor *all_oi_tmp__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *all_oi_tmp__iter_v1 = reinterpret_cast<__gm__ float *>(all_oi_tmp__iter_v1_tensor->buffer.addr) + + all_oi_tmp__iter_v1_tensor->start_offset; + + // Unpack tensor: block_table__ssa_v0 + __gm__ Tensor *block_table__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ int32_t *block_table__ssa_v0 = reinterpret_cast<__gm__ int32_t *>(block_table__ssa_v0_tensor->buffer.addr) + + block_table__ssa_v0_tensor->start_offset; + + // Unpack tensor: all_exp_padded__rv_v2 + __gm__ Tensor *all_exp_padded__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ bfloat16_t *all_exp_padded__rv_v2 = + reinterpret_cast<__gm__ bfloat16_t *>(all_exp_padded__rv_v2_tensor->buffer.addr) + + all_exp_padded__rv_v2_tensor->start_offset; + + // Unpack tensor: v_cache__rv_v6 + __gm__ Tensor *v_cache__rv_v6_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ bfloat16_t *v_cache__rv_v6 = + reinterpret_cast<__gm__ bfloat16_t 
*>(v_cache__rv_v6_tensor->buffer.addr) + v_cache__rv_v6_tensor->start_offset; + + // Unpack scalar: sb_chunk__idx_v0 + union { + uint64_t u64; + int64_t val; + } sb_chunk__idx_v0_conv; + sb_chunk__idx_v0_conv.u64 = args[4]; + int64_t sb_chunk__idx_v0 = sb_chunk__idx_v0_conv.val; + + // Unpack scalar: ctx_blocks__ssa_v0 + union { + uint64_t u64; + int64_t val; + } ctx_blocks__ssa_v0_conv; + ctx_blocks__ssa_v0_conv.u64 = args[5]; + int64_t ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val; + + // Unpack scalar: block_table_base__ssa_v0 + union { + uint64_t u64; + int64_t val; + } block_table_base__ssa_v0_conv; + block_table_base__ssa_v0_conv.u64 = args[6]; + int64_t block_table_base__ssa_v0 = block_table_base__ssa_v0_conv.val; + + // Extract dynamic dim: BLOCK_TABLE_FLAT_DYN + int64_t BLOCK_TABLE_FLAT_DYN = static_cast(block_table__ssa_v0_tensor->shapes[0]); + + // Extract dynamic dim: KV_CACHE_ROWS_DYN + int64_t KV_CACHE_ROWS_DYN = static_cast(v_cache__rv_v6_tensor->shapes[0]); + + // Forward to ptoas-generated function + sv_matmul( + all_oi_tmp__iter_v1, block_table__ssa_v0, all_exp_padded__rv_v2, v_cache__rv_v6, sb_chunk__idx_v0, + ctx_blocks__ssa_v0, block_table_base__ssa_v0, BLOCK_TABLE_FLAT_DYN, KV_CACHE_ROWS_DYN + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/up_proj.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/up_proj.cpp new file mode 100644 index 000000000..04e42ab45 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aic/up_proj.cpp @@ -0,0 +1,331 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: up_proj +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void up_proj(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4) { + unsigned v5 = 0; + const int32_t v6 = 40; + const int32_t v7 = 64; + const int32_t v8 = 0; + const int32_t v9 = 128; + const int32_t v10 = 256; + const int32_t v11 = 17408; + const int32_t v12 = 1; + const int32_t v13 = 5120; + const int32_t v14 = 16; + const int64_t v15 = 32768; + const int64_t v16 = 2048; + const int64_t v17 = 4096; + const int64_t v18 = 0; + using T = float; + +#if defined(__DAV_CUBE__) + size_t v19 = (size_t)v12; + Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v20 = Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, 
SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v14, v9); + uint64_t v21 = (uint64_t)v18; + TASSIGN(v20, v21); + pto::Shape<1, 1, 1, 16, 128> v22 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v23 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v24 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v5 + v5 * (unsigned)v13 + v5 * (unsigned)v12), v22, v23 + ); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + TLOAD(v20, v24); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + Tile< + TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v25 = Tile< + TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v9, v10); + uint64_t v26 = (uint64_t)v17; + TASSIGN(v25, v26); + pto::Shape<1, 1, 1, 128, 256> v27 = pto::Shape<1, 1, 1, 128, 256>(); + pto::Stride<2228224, 2228224, 2228224, 17408, 1> v28 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>, pto::Layout::ND> + v29 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>, + pto::Layout::ND>(v2 + (v5 + v5 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v27, v28); + TLOAD(v25, v29); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v30 = Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v14, 
v7); + uint64_t v31 = (uint64_t)v18; + TASSIGN(v30, v31); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TEXTRACT(v30, v20, v8, v8); + Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v32 = Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v7, v10); + uint64_t v33 = (uint64_t)v18; + TASSIGN(v32, v33); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TEXTRACT(v32, v25, v8, v8); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v34 = Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v14, v7); + uint64_t v35 = (uint64_t)v16; + TASSIGN(v34, v35); + TEXTRACT(v34, v20, v8, v7); + Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v36 = Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v7, v10); + uint64_t v37 = (uint64_t)v15; + TASSIGN(v36, v37); + TEXTRACT(v36, v25, v7, v8); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v38 = Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v14, v10); + uint64_t v39 = (uint64_t)v18; + TASSIGN(v38, v39); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + TMATMUL(v38, v30, v32); + Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v40 = Tile< + TileType::Acc, float, 16, 256, 
BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v14, v10); + uint64_t v41 = (uint64_t)v18; + TASSIGN(v40, v41); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID1); + TMATMUL_ACC(v40, v40, v34, v36); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID0); + for (size_t v42 = v19; v42 < ((size_t)v6); v42 += v19) { + int32_t v43 = (int32_t)((uint32_t)((int32_t)v42) * (uint32_t)v9); + Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v44 = Tile< + TileType::Mat, bfloat16_t, 16, 128, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v14, v9); + uint64_t v45 = (uint64_t)v18; + TASSIGN(v44, v45); + pto::Shape<1, 1, 1, 16, 128> v46 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v47 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v48 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v5 + v5 * (unsigned)v13 + (unsigned)v43 * (unsigned)v12), v46, v47 + ); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + TLOAD(v44, v48); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + Tile< + TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v49 = Tile< + TileType::Mat, bfloat16_t, 128, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v9, v10); + uint64_t v50 = (uint64_t)v17; + TASSIGN(v49, v50); + pto::Shape<1, 1, 1, 128, 256> v51 = pto::Shape<1, 1, 1, 128, 256>(); + pto::Stride<2228224, 2228224, 2228224, 17408, 1> v52 = pto::Stride<2228224, 2228224, 2228224, 17408, 1>(); + GlobalTensor< + bfloat16_t, 
pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>, + pto::Layout::ND> + v53 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 128, 256>, pto::Stride<2228224, 2228224, 2228224, 17408, 1>, + pto::Layout::ND>(v2 + (v5 + (unsigned)v43 * (unsigned)v11 + (unsigned)v4 * (unsigned)v12), v51, v52); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + TLOAD(v49, v53); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v54 = Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v14, v7); + uint64_t v55 = (uint64_t)v18; + TASSIGN(v54, v55); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + TEXTRACT(v54, v44, v8, v8); + Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v56 = Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v7, v10); + uint64_t v57 = (uint64_t)v18; + TASSIGN(v56, v57); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID3); + TEXTRACT(v56, v49, v8, v8); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID2); + Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null> + v58 = Tile< + TileType::Left, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::RowMajor, 512, PadValue::Null, + CompactMode::Null>(v14, v7); + uint64_t v59 = (uint64_t)v16; + TASSIGN(v58, v59); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + TEXTRACT(v58, v44, v8, v7); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + Tile< + TileType::Right, bfloat16_t, 64, 256, BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null> + v60 = Tile< + TileType::Right, bfloat16_t, 64, 256, 
BLayout::RowMajor, -1, -1, SLayout::ColMajor, 512, PadValue::Null, + CompactMode::Null>(v7, v10); + uint64_t v61 = (uint64_t)v15; + TASSIGN(v60, v61); + TEXTRACT(v60, v49, v7, v8); + set_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID3); + Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v62 = Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v14, v10); + uint64_t v63 = (uint64_t)v18; + TASSIGN(v62, v63); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID2); + pipe_barrier(PIPE_M); + TMATMUL_ACC(v62, v62, v54, v56); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null> + v64 = Tile< + TileType::Acc, float, 16, 256, BLayout::ColMajor, -1, -1, SLayout::RowMajor, 1024, PadValue::Null, + CompactMode::Null>(v14, v10); + uint64_t v65 = (uint64_t)v18; + TASSIGN(v64, v65); + pipe_barrier(PIPE_M); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID3); + TMATMUL_ACC(v64, v64, v58, v60); + set_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); + } + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 256> v66 = pto::Shape<1, 1, 1, 16, 256>(); + pto::Stride<4096, 4096, 4096, 256, 1> v67 = pto::Stride<4096, 4096, 4096, 256, 1>(); + GlobalTensor, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> v68 = + GlobalTensor, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>( + v3 + (v5 + v5 * (unsigned)v10 + v5 * (unsigned)v12), v66, v67 + ); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + TSTORE(v68, v40); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE1, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID1); + wait_flag(PIPE_M, PIPE_MTE1, EVENT_ID2); +#endif // __DAV_CUBE__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry 
point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: post_norm_tile__rv_v2 + __gm__ Tensor *post_norm_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *post_norm_tile__rv_v2 = + reinterpret_cast<__gm__ bfloat16_t *>(post_norm_tile__rv_v2_tensor->buffer.addr) + + post_norm_tile__rv_v2_tensor->start_offset; + + // Unpack tensor: w_up__ssa_v0 + __gm__ Tensor *w_up__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *w_up__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(w_up__ssa_v0_tensor->buffer.addr) + w_up__ssa_v0_tensor->start_offset; + + // Unpack tensor: ret0__out + __gm__ Tensor *ret0__out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *ret0__out = + reinterpret_cast<__gm__ float *>(ret0__out_tensor->buffer.addr) + ret0__out_tensor->start_offset; + + // Unpack scalar: o0__ssa_v1 + union { + uint64_t u64; + int64_t val; + } o0__ssa_v1_conv; + o0__ssa_v1_conv.u64 = args[3]; + int64_t o0__ssa_v1 = o0__ssa_v1_conv.val; + + // Forward to ptoas-generated function + up_proj(post_norm_tile__rv_v2, w_up__ssa_v0, ret0__out, o0__ssa_v1); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/attention_writeback.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/attention_writeback.cpp new file mode 100644 index 000000000..b4d9f0ba5 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/attention_writeback.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: attention_writeback +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void attention_writeback(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2) { + unsigned v3 = 0; + const int32_t v4 = 128; + const int32_t v5 = 2048; + const int32_t v6 = 640; + const int32_t v7 = 5; + const int32_t v8 = 8; + const int32_t v9 = 0; + const int32_t v10 = 16384; + const int32_t v11 = 5120; + const int32_t v12 = 1; + const int64_t v13 = 0; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + for (size_t v14 = (size_t)v9; v14 < ((size_t)v8); v14 += (size_t)v12) { + int32_t v15 = (int32_t)v14; + Tile< + TileType::Vec, bfloat16_t, 1, 640, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v16 = 
Tile< + TileType::Vec, bfloat16_t, 1, 640, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v6); + uint64_t v17 = (uint64_t)v13; + TASSIGN(v16, v17); + pto::Shape<1, 1, 1, 1, 640> v18 = pto::Shape<1, 1, 1, 1, 640>(); + pto::Stride<16384, 16384, 16384, 16384, 1> v19 = pto::Stride<16384, 16384, 16384, 16384, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 1, 640>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND> + v20 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 1, 640>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND>( + v2 + (v3 + v3 * (unsigned)v10 + (unsigned)((int32_t)(uint32_t)v15 * (uint32_t)v5) * (unsigned)v12), v18, + v19 + ); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v16, v20); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 640> v21 = pto::Shape<1, 1, 1, 1, 640>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v22 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> + v23 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 1, 640>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>( + v1 + (v3 + v3 * (unsigned)v11 + + (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)v15 * (uint32_t)v7) * (uint32_t)v4) * + (unsigned)v12), + v21, v22 + ); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(v23, v16); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: attn_row__ssa_v0 + __gm__ Tensor *attn_row__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *attn_row__ssa_v0 = reinterpret_cast<__gm__ bfloat16_t *>(attn_row__ssa_v0_tensor->buffer.addr) + + 
attn_row__ssa_v0_tensor->start_offset; + + // Unpack tensor: attn_row_padded__rv_v2 + __gm__ Tensor *attn_row_padded__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *attn_row_padded__rv_v2 = + reinterpret_cast<__gm__ bfloat16_t *>(attn_row_padded__rv_v2_tensor->buffer.addr) + + attn_row_padded__rv_v2_tensor->start_offset; + + // Forward to ptoas-generated function + attention_writeback(attn_row__ssa_v0, attn_row_padded__rv_v2); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_hidden.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_hidden.cpp new file mode 100644 index 000000000..59fa858fe --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_hidden.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: copy_hidden +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void copy_hidden(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, int32_t v3, int32_t v4, int32_t v5) { + unsigned v6 = 5120; + unsigned v7 = 0; + const int32_t v8 = 128; + const int32_t v9 = 40; + const int32_t v10 = 0; + const int32_t v11 = 1; + const int32_t v12 = 5120; + const int64_t v13 = 0; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + for (size_t v14 = (size_t)v10; v14 < ((size_t)v9); v14 += (size_t)v11) { + int32_t v15 = (int32_t)((uint32_t)((int32_t)v14) * (uint32_t)v8); + Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v16 = Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v4, v8); + uint64_t v17 = (uint64_t)v13; + TASSIGN(v16, v17); + unsigned v18 = (unsigned)v4 * v6; + pto::Shape<1, 1, 1, -1, 128> v19 = pto::Shape<1, 1, 1, -1, 128>(v4); + pto::Stride<-1, -1, -1, 
5120, 1> v20 = pto::Stride<-1, -1, -1, 5120, 1>(v18, v18, v18); + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v21 = + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>( + v2 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v19, v20 + ); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v16, v21); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + unsigned v22 = (unsigned)v4 * v6; + pto::Shape<1, 1, 1, -1, 128> v23 = pto::Shape<1, 1, 1, -1, 128>(v4); + pto::Stride<-1, -1, -1, 5120, 1> v24 = pto::Stride<-1, -1, -1, 5120, 1>(v22, v22, v22); + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v25 = + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>( + v1 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v23, v24 + ); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(v25, v16); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: current_hidden__iter_v1 + __gm__ Tensor *current_hidden__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *current_hidden__iter_v1 = + reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__iter_v1_tensor->buffer.addr) + + current_hidden__iter_v1_tensor->start_offset; + + // Unpack tensor: hidden_states__ssa_v0 + __gm__ Tensor *hidden_states__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *hidden_states__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(hidden_states__ssa_v0_tensor->buffer.addr) + + hidden_states__ssa_v0_tensor->start_offset; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[2]; + int64_t b0__idx_v0 
= b0__idx_v0_conv.val; + + // Unpack scalar: cur_valid__ssa_v0 + union { + uint64_t u64; + int64_t val; + } cur_valid__ssa_v0_conv; + cur_valid__ssa_v0_conv.u64 = args[3]; + int64_t cur_valid__ssa_v0 = cur_valid__ssa_v0_conv.val; + + // Extract dynamic dim: USER_BATCH_DYN + int64_t USER_BATCH_DYN = static_cast(hidden_states__ssa_v0_tensor->shapes[0]); + + // Forward to ptoas-generated function + copy_hidden(current_hidden__iter_v1, hidden_states__ssa_v0, b0__idx_v0, cur_valid__ssa_v0, USER_BATCH_DYN); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_out.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_out.cpp new file mode 100644 index 000000000..4299251a2 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/copy_out.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: copy_out +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void copy_out(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, int32_t v3, int32_t v4, int32_t v5) { + unsigned v6 = 5120; + unsigned v7 = 0; + const int32_t v8 = 128; + const int32_t v9 = 40; + const int32_t v10 = 0; + const int32_t v11 = 1; + const int32_t v12 = 5120; + const int64_t v13 = 0; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + for (size_t v14 = (size_t)v10; v14 < ((size_t)v9); v14 += (size_t)v11) { + int32_t v15 = (int32_t)((uint32_t)((int32_t)v14) * (uint32_t)v8); + Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v16 = Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v4, v8); + uint64_t v17 = (uint64_t)v13; + TASSIGN(v16, v17); + unsigned v18 = (unsigned)v4 * v6; + pto::Shape<1, 1, 1, -1, 128> v19 = pto::Shape<1, 1, 1, -1, 128>(v4); + pto::Stride<-1, -1, -1, 5120, 1> 
v20 = pto::Stride<-1, -1, -1, 5120, 1>(v18, v18, v18); + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v21 = + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>( + v2 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v19, v20 + ); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v16, v21); + set_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + unsigned v22 = (unsigned)v4 * v6; + pto::Shape<1, 1, 1, -1, 128> v23 = pto::Shape<1, 1, 1, -1, 128>(v4); + pto::Stride<-1, -1, -1, 5120, 1> v24 = pto::Stride<-1, -1, -1, 5120, 1>(v22, v22, v22); + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v25 = + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>( + v1 + (v7 + (unsigned)v3 * (unsigned)v12 + (unsigned)v15 * (unsigned)v11), v23, v24 + ); + wait_flag(PIPE_MTE2, PIPE_MTE3, EVENT_ID0); + TSTORE(v25, v16); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: out__iter_v1 + __gm__ Tensor *out__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *out__iter_v1 = + reinterpret_cast<__gm__ bfloat16_t *>(out__iter_v1_tensor->buffer.addr) + out__iter_v1_tensor->start_offset; + + // Unpack tensor: current_hidden__ssa_v8 + __gm__ Tensor *current_hidden__ssa_v8_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *current_hidden__ssa_v8 = + reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__ssa_v8_tensor->buffer.addr) + + current_hidden__ssa_v8_tensor->start_offset; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[2]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Unpack scalar: 
cur_valid__ssa_v3 + union { + uint64_t u64; + int64_t val; + } cur_valid__ssa_v3_conv; + cur_valid__ssa_v3_conv.u64 = args[3]; + int64_t cur_valid__ssa_v3 = cur_valid__ssa_v3_conv.val; + + // Extract dynamic dim: USER_BATCH_DYN + int64_t USER_BATCH_DYN = static_cast(out__iter_v1_tensor->shapes[0]); + + // Forward to ptoas-generated function + copy_out(out__iter_v1, current_hidden__ssa_v8, b0__idx_v0, cur_valid__ssa_v3, USER_BATCH_DYN); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/down_proj_residual.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/down_proj_residual.cpp new file mode 100644 index 000000000..451288cd7 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/down_proj_residual.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: down_proj_residual +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void +down_proj_residual(__gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, int32_t v4, int32_t v5) { + RoundMode v6 = RoundMode::CAST_ROUND; + unsigned v7 = 0; + const int32_t v8 = 5120; + const int32_t v9 = 1; + const int32_t v10 = 128; + const int32_t v11 = 16; + const int64_t v12 = 16384; + const int64_t v13 = 8192; + const int64_t v14 = 0; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v15 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v16 = (uint64_t)v14; + TASSIGN(v15, v16); + pto::Shape<1, 1, 1, 16, 128> v17 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<2048, 2048, 2048, 128, 1> v18 = pto::Stride<2048, 2048, 2048, 128, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> v19 = + GlobalTensor, 
pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>( + v1 + (v7 + v7 * (unsigned)v10 + v7 * (unsigned)v9), v17, v18 + ); + TLOAD(v15, v19); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v20 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v21 = (uint64_t)v13; + TASSIGN(v20, v21); + pto::Shape<1, 1, 1, 16, 128> v22 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v23 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> v24 = + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v2 + (v7 + v7 * (unsigned)v8 + (unsigned)v4 * (unsigned)v9), v22, v23 + ); + TLOAD(v20, v24); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v25 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v26 = (uint64_t)v14; + TASSIGN(v25, v26); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(v25, v15, v20); + Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v27 = Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v28 = (uint64_t)v12; + TASSIGN(v27, v28); + pipe_barrier(PIPE_V); + TCVT(v27, v25, v6); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 128> v29 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v30 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v31 = 
GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v3 + (v7 + (unsigned)v5 * (unsigned)v8 + (unsigned)v4 * (unsigned)v9), v29, v30 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v31, v27); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: fp32_chunk_gm__ssa_v1 + __gm__ Tensor *fp32_chunk_gm__ssa_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *fp32_chunk_gm__ssa_v1 = reinterpret_cast<__gm__ float *>(fp32_chunk_gm__ssa_v1_tensor->buffer.addr) + + fp32_chunk_gm__ssa_v1_tensor->start_offset; + + // Unpack tensor: resid1_tile__rv_v2 + __gm__ Tensor *resid1_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *resid1_tile__rv_v2 = reinterpret_cast<__gm__ float *>(resid1_tile__rv_v2_tensor->buffer.addr) + + resid1_tile__rv_v2_tensor->start_offset; + + // Unpack tensor: next_hidden__iter_v3 + __gm__ Tensor *next_hidden__iter_v3_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ bfloat16_t *next_hidden__iter_v3 = + reinterpret_cast<__gm__ bfloat16_t *>(next_hidden__iter_v3_tensor->buffer.addr) + + next_hidden__iter_v3_tensor->start_offset; + + // Unpack scalar: d0__ssa_v0 + union { + uint64_t u64; + int64_t val; + } d0__ssa_v0_conv; + d0__ssa_v0_conv.u64 = args[3]; + int64_t d0__ssa_v0 = d0__ssa_v0_conv.val; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[4]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Forward to ptoas-generated function + down_proj_residual(fp32_chunk_gm__ssa_v1, resid1_tile__rv_v2, next_hidden__iter_v3, d0__ssa_v0, b0__idx_v0); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/online_softmax.cpp 
b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/online_softmax.cpp new file mode 100644 index 000000000..5af6c32c7 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/online_softmax.cpp @@ -0,0 +1,550 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: online_softmax +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void +online_softmax(__gm__ bfloat16_t *v1, __gm__ float *v2, __gm__ float *v3, __gm__ float *v4, int32_t v5) { + RoundMode v6 = 
RoundMode::CAST_ROUND; + unsigned v7 = 0; + const int32_t v8 = 2048; + const int32_t v9 = 16; + const int32_t v10 = 32; + const int32_t v11 = 8; + const int32_t v12 = 0; + const int32_t v13 = 128; + const int32_t v14 = 256; + const int32_t v15 = 16384; + const int32_t v16 = 1; + const int64_t v17 = 8512; + const int64_t v18 = 320; + const int64_t v19 = 256; + const int64_t v20 = 192; + const int64_t v21 = 128; + const int64_t v22 = 64; + const int64_t v23 = 0; + const int64_t v24 = 29184; + const int64_t v25 = 29120; + const int64_t v26 = 20928; + const int64_t v27 = 20864; + const int64_t v28 = 20800; + const int64_t v29 = 12608; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + size_t v30 = (size_t)v16; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + for (size_t v31 = (size_t)v12; v31 < ((size_t)v11); v31 += v30) { + int32_t v32 = (int32_t)v31; + int32_t v33 = (int32_t)((uint32_t)v32 * (uint32_t)v10); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v34 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v13); + uint64_t v35 = (uint64_t)v29; + TASSIGN(v34, v35); + pto::Shape<1, 1, 1, 16, 128> v36 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<2048, 2048, 2048, 128, 1> v37 = pto::Stride<2048, 2048, 2048, 128, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> v38 = + GlobalTensor, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>( + v2 + (v7 + (unsigned)v33 * (unsigned)v13 + v7 * (unsigned)v16), v36, v37 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v34, v38); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v39 = Tile< + TileType::Vec, float, 16, 1, 
BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v40 = (uint64_t)v28; + TASSIGN(v39, v40); + pto::Shape<1, 1, 1, 16, 1> v41 = pto::Shape<1, 1, 1, 16, 1>(); + pto::Stride<16, 16, 16, 1, 256> v42 = pto::Stride<16, 16, 16, 1, 256>(); + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v43 = + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>( + v3 + (v7 + (unsigned)v33 * (unsigned)v16 + v7 * (unsigned)v14), v41, v42 + ); + TLOAD(v39, v43); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v44 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v45 = (uint64_t)v27; + TASSIGN(v44, v45); + pto::Shape<1, 1, 1, 16, 1> v46 = pto::Shape<1, 1, 1, 16, 1>(); + pto::Stride<16, 16, 16, 1, 256> v47 = pto::Stride<16, 16, 16, 1, 256>(); + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v48 = + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>( + v4 + (v7 + (unsigned)v33 * (unsigned)v16 + v7 * (unsigned)v14), v46, v47 + ); + TLOAD(v44, v48); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + for (size_t v49 = v30; v49 < ((size_t)v5); v49 += v30) { + int32_t v50 = (int32_t)((uint32_t)v33 + (uint32_t)((int32_t)(uint32_t)((int32_t)v49) * (uint32_t)v9)); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v51 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v13); + uint64_t v52 = (uint64_t)v26; + TASSIGN(v51, v52); + pto::Shape<1, 1, 1, 16, 128> v53 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<2048, 2048, 2048, 128, 1> v54 = pto::Stride<2048, 2048, 2048, 128, 1>(); + GlobalTensor, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND> + v55 = 
GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<2048, 2048, 2048, 128, 1>, pto::Layout::ND>( + v2 + (v7 + (unsigned)v50 * (unsigned)v13 + v7 * (unsigned)v16), v53, v54 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + TLOAD(v51, v55); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v56 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v57 = (uint64_t)v25; + TASSIGN(v56, v57); + pto::Shape<1, 1, 1, 16, 1> v58 = pto::Shape<1, 1, 1, 16, 1>(); + pto::Stride<16, 16, 16, 1, 256> v59 = pto::Stride<16, 16, 16, 1, 256>(); + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v60 = + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>( + v3 + (v7 + (unsigned)v50 * (unsigned)v16 + v7 * (unsigned)v14), v58, v59 + ); + TLOAD(v56, v60); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v61 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v62 = (uint64_t)v24; + TASSIGN(v61, v62); + pto::Shape<1, 1, 1, 16, 1> v63 = pto::Shape<1, 1, 1, 16, 1>(); + pto::Stride<16, 16, 16, 1, 256> v64 = pto::Stride<16, 16, 16, 1, 256>(); + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v65 = + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>( + v4 + (v7 + (unsigned)v50 * (unsigned)v16 + v7 * (unsigned)v14), v63, v64 + ); + TLOAD(v61, v65); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v66 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, 
v9); + uint64_t v67 = (uint64_t)v28; + TASSIGN(v66, v67); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v68 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v69 = (uint64_t)v25; + TASSIGN(v68, v69); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v70 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v71 = (uint64_t)v23; + TASSIGN(v70, v71); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_V); + TMAX(v70, v66, v68); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v72 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v73 = (uint64_t)v28; + TASSIGN(v72, v73); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v74 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v75 = (uint64_t)v23; + TASSIGN(v74, v75); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v76 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v77 = (uint64_t)v22; + TASSIGN(v76, v77); + pipe_barrier(PIPE_V); + TSUB(v76, v72, v74); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v78 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, 
SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v79 = (uint64_t)v22; + TASSIGN(v78, v79); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v80 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v81 = (uint64_t)v22; + TASSIGN(v80, v81); + pipe_barrier(PIPE_V); + TEXP(v80, v78); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v82 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v83 = (uint64_t)v22; + TASSIGN(v82, v83); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v84 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v85 = (uint64_t)v25; + TASSIGN(v84, v85); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v86 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v87 = (uint64_t)v23; + TASSIGN(v86, v87); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v88 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v89 = (uint64_t)v21; + TASSIGN(v88, v89); + TSUB(v88, v84, v86); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v90 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, 
SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v91 = (uint64_t)v21; + TASSIGN(v90, v91); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v92 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v93 = (uint64_t)v21; + TASSIGN(v92, v93); + pipe_barrier(PIPE_V); + TEXP(v92, v90); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v94 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v95 = (uint64_t)v21; + TASSIGN(v94, v95); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v96 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v97 = (uint64_t)v22; + TASSIGN(v96, v97); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v98 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v99 = (uint64_t)v27; + TASSIGN(v98, v99); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v100 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v101 = (uint64_t)v20; + TASSIGN(v100, v101); + TMUL(v100, v96, v98); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v102 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, 
SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v103 = (uint64_t)v21; + TASSIGN(v102, v103); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v104 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v105 = (uint64_t)v24; + TASSIGN(v104, v105); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v106 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v107 = (uint64_t)v19; + TASSIGN(v106, v107); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v106, v102, v104); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v108 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v109 = (uint64_t)v20; + TASSIGN(v108, v109); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v110 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v111 = (uint64_t)v19; + TASSIGN(v110, v111); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v112 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v9); + uint64_t v113 = (uint64_t)v20; + TASSIGN(v112, v113); + pipe_barrier(PIPE_V); + TADD(v112, v108, v110); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, 
+ CompactMode::Null> + v114 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v115 = (uint64_t)v20; + TASSIGN(v114, v115); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v116 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v13); + uint64_t v117 = (uint64_t)v18; + TASSIGN(v116, v117); + TROWEXPANDMUL(v116, v34, v82); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v118 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v13); + uint64_t v119 = (uint64_t)v26; + TASSIGN(v118, v119); + TROWEXPANDMUL(v118, v51, v94); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v120 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v13); + uint64_t v121 = (uint64_t)v29; + TASSIGN(v120, v121); + pipe_barrier(PIPE_V); + TADD(v120, v116, v118); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v122 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v123 = (uint64_t)v23; + TASSIGN(v122, v123); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v124 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v125 = (uint64_t)v27; + 
TASSIGN(v124, v125); + TMOV(v124, v114); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v126 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v16); + uint64_t v127 = (uint64_t)v28; + TASSIGN(v126, v127); + TMOV(v126, v122); + }; + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v128 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v9, v13); + uint64_t v129 = (uint64_t)v29; + TASSIGN(v128, v129); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TROWEXPANDDIV(v128, v34, v44); + Tile< + TileType::Vec, float, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v130 = Tile< + TileType::Vec, float, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v8); + uint64_t v131 = (uint64_t)v29; + TASSIGN(v130, v131); + Tile< + TileType::Vec, bfloat16_t, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v132 = Tile< + TileType::Vec, bfloat16_t, 1, 2048, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v8); + uint64_t v133 = (uint64_t)v17; + TASSIGN(v132, v133); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TCVT(v132, v130, v6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 2048> v134 = pto::Shape<1, 1, 1, 1, 2048>(); + pto::Stride<16384, 16384, 16384, 16384, 1> v135 = pto::Stride<16384, 16384, 16384, 16384, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 1, 2048>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND> + v136 = GlobalTensor< + bfloat16_t, 
pto::Shape<1, 1, 1, 1, 2048>, pto::Stride<16384, 16384, 16384, 16384, 1>, pto::Layout::ND>( + v1 + (v7 + v7 * (unsigned)v15 + (unsigned)((int32_t)(uint32_t)v32 * (uint32_t)v8) * (unsigned)v16), + v134, v135 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v136, v132); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: attn_row_padded__ssa_v0 + __gm__ Tensor *attn_row_padded__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *attn_row_padded__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(attn_row_padded__ssa_v0_tensor->buffer.addr) + + attn_row_padded__ssa_v0_tensor->start_offset; + + // Unpack tensor: all_oi_tmp__rv_v2 + __gm__ Tensor *all_oi_tmp__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *all_oi_tmp__rv_v2 = reinterpret_cast<__gm__ float *>(all_oi_tmp__rv_v2_tensor->buffer.addr) + + all_oi_tmp__rv_v2_tensor->start_offset; + + // Unpack tensor: all_cur_mi__rv_v2 + __gm__ Tensor *all_cur_mi__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *all_cur_mi__rv_v2 = reinterpret_cast<__gm__ float *>(all_cur_mi__rv_v2_tensor->buffer.addr) + + all_cur_mi__rv_v2_tensor->start_offset; + + // Unpack tensor: all_cur_li__rv_v2 + __gm__ Tensor *all_cur_li__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ float *all_cur_li__rv_v2 = reinterpret_cast<__gm__ float *>(all_cur_li__rv_v2_tensor->buffer.addr) + + all_cur_li__rv_v2_tensor->start_offset; + + // Unpack scalar: ctx_blocks__ssa_v0 + union { + uint64_t u64; + int64_t val; + } ctx_blocks__ssa_v0_conv; + ctx_blocks__ssa_v0_conv.u64 = args[4]; + int64_t 
ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val; + + // Forward to ptoas-generated function + online_softmax( + attn_row_padded__ssa_v0, all_oi_tmp__rv_v2, all_cur_mi__rv_v2, all_cur_li__rv_v2, ctx_blocks__ssa_v0 + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/out_proj_residual.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/out_proj_residual.cpp new file mode 100644 index 000000000..e34aeaa10 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/out_proj_residual.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: out_proj_residual +// Generated by PyPTO IR Compiler (PTO backend) /* NOTE(review): both bare "#include" lines below have lost their header names in this copy of the patch; restore them from the generator output. */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { /* End-of-kernel synchronisation: a set_flag/wait_flag pair on (PIPE_MTE3, PIPE_S, EVENT_ID0) for kSetWaitMte3ToSEvent0, otherwise pipe_barrier(PIPE_ALL). */ + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void +out_proj_residual(__gm__ bfloat16_t *v1, __gm__ float *v2, __gm__ float *v3, int32_t v4, int32_t v5, int32_t v6) { /* Residual add: loads a 16x64 float tile from v2 (row stride 64) and a v6-row x 64 bfloat16 tile from v1 at element offset v4*5120 plus v5 (row stride 5120), converts the bfloat16 tile to float with TCVT (CAST_ROUND), TADDs the two and stores the 16x64 float result to v3 at element offset v5 (row stride 5120). Tiles are bound with TASSIGN to addresses 0, 4096 and 6144 - presumably on-chip buffer byte offsets, confirm against the PTO tile library. The body is compiled only when __DAV_VEC__ is defined. */ + RoundMode v7 = RoundMode::CAST_ROUND; + unsigned v8 = 5120; + unsigned v9 = 0; + const int32_t v10 = 64; + const int32_t v11 = 1; + const int32_t v12 = 5120; + const int32_t v13 = 16; + const int64_t v14 = 6144; + const int64_t v15 = 4096; + const int64_t v16 = 0; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile< + TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v17 = Tile< + TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v13, v10); + uint64_t v18 = (uint64_t)v16; + TASSIGN(v17, v18); + pto::Shape<1, 1, 1, 16, 64> v19 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<1024, 1024, 1024, 64, 1> v20 = pto::Stride<1024, 1024, 1024, 64, 1>(); + GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND> v21 = + 
GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<1024, 1024, 1024, 64, 1>, pto::Layout::ND>( + v2 + (v9 + v9 * (unsigned)v10 + v9 * (unsigned)v11), v19, v20 + ); + TLOAD(v17, v21); + Tile< + TileType::Vec, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v22 = Tile< + TileType::Vec, bfloat16_t, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v6, v10); + uint64_t v23 = (uint64_t)v15; + TASSIGN(v22, v23); + unsigned v24 = (unsigned)v6 * v8; + pto::Shape<1, 1, 1, -1, 64> v25 = pto::Shape<1, 1, 1, -1, 64>(v6); + pto::Stride<-1, -1, -1, 5120, 1> v26 = pto::Stride<-1, -1, -1, 5120, 1>(v24, v24, v24); + GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 64>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v27 = + GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, -1, 64>, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>( + v1 + (v9 + (unsigned)v4 * (unsigned)v12 + (unsigned)v5 * (unsigned)v11), v25, v26 + ); + TLOAD(v22, v27); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v28 = Tile< + TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v13, v10); + uint64_t v29 = (uint64_t)v14; + TASSIGN(v28, v29); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v28, v22, v7); /* bfloat16 tile v22 converted into float tile v28 */ + Tile< + TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v30 = Tile< + TileType::Vec, float, 16, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v13, v10); + uint64_t v31 = (uint64_t)v16; + TASSIGN(v30, v31); + pipe_barrier(PIPE_V); + TADD(v30, v17, v28); /* v30 is bound to the same address as v17, so the sum overwrites the loaded float tile */ + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 64> v32 = pto::Shape<1, 1, 1, 16, 64>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v33 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<81920, 81920, 81920, 5120, 1>, 
pto::Layout::ND> v34 = + GlobalTensor<float, pto::Shape<1, 1, 1, 16, 64>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v3 + (v9 + v9 * (unsigned)v12 + (unsigned)v5 * (unsigned)v11), v32, v33 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v34, v30); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { /* Unpacks args[0..2] as Tensor descriptors (data pointer is buffer.addr offset by start_offset elements) and args[3..5] as int64 scalars, then forwards to out_proj_residual; the int64 scalars are narrowed to that function's int32_t parameters at the call. */ + // Unpack tensor: current_hidden__rv_v2 + __gm__ Tensor *current_hidden__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *current_hidden__rv_v2 = + reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__rv_v2_tensor->buffer.addr) + + current_hidden__rv_v2_tensor->start_offset; + + // Unpack tensor: o_acc__rv_v2 + __gm__ Tensor *o_acc__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *o_acc__rv_v2 = + reinterpret_cast<__gm__ float *>(o_acc__rv_v2_tensor->buffer.addr) + o_acc__rv_v2_tensor->start_offset; + + // Unpack tensor: resid1_tile__iter_v1 + __gm__ Tensor *resid1_tile__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *resid1_tile__iter_v1 = reinterpret_cast<__gm__ float *>(resid1_tile__iter_v1_tensor->buffer.addr) + + resid1_tile__iter_v1_tensor->start_offset; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[3]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Unpack scalar: o0__ssa_v0 + union { + uint64_t u64; + int64_t val; + } o0__ssa_v0_conv; + o0__ssa_v0_conv.u64 = args[4]; + int64_t o0__ssa_v0 = o0__ssa_v0_conv.val; + + // Unpack scalar: cur_valid__ssa_v2 + union { + uint64_t u64; + int64_t val; + } cur_valid__ssa_v2_conv; + cur_valid__ssa_v2_conv.u64 = args[5]; + int64_t cur_valid__ssa_v2 = cur_valid__ssa_v2_conv.val; + + // Forward to ptoas-generated function + out_proj_residual( + current_hidden__rv_v2, o_acc__rv_v2, 
resid1_tile__iter_v1, b0__idx_v0, o0__ssa_v0, cur_valid__ssa_v2 + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/post_rmsnorm.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/post_rmsnorm.cpp new file mode 100644 index 000000000..e4f07e151 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/post_rmsnorm.cpp @@ -0,0 +1,324 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: post_rmsnorm +// Generated by PyPTO IR Compiler (PTO backend) /* NOTE(review): both bare "#include" lines below have lost their header names in this copy of the patch; restore them from the generator output. */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { /* End-of-kernel synchronisation: a set_flag/wait_flag pair on (PIPE_MTE3, PIPE_S, EVENT_ID0) for kSetWaitMte3ToSEvent0, otherwise pipe_barrier(PIPE_ALL). */ + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void post_rmsnorm(__gm__ float *v1, __gm__ bfloat16_t *v2, __gm__ float *v3) { /* Two-pass RMS normalisation over 16 rows x 5120 columns, walked as 40 chunks of 128 columns. Pass 1: per chunk, load a 16x128 float tile from v1, square it (TMUL), row-sum it (TROWSUM) and accumulate into a 16-element running sum that TEXPANDS initialised to 0.0. The sum is then scaled by 1.95312503E-4 (1/5120), offset by 9.99999997E-7, and passed through TSQRT and TRECIP. Pass 2: per chunk, reload the tile from v1, load 128 float weights from v3, apply TROWEXPANDMUL with the per-row factor and TCOLEXPANDMUL with the weights, convert to bfloat16 (TCVT, CAST_ROUND) and TSTORE to v2. Operation semantics are read from the PTO op names - confirm against the tile library. The body is compiled only when __DAV_VEC__ is defined. */ + RoundMode v4 = RoundMode::CAST_ROUND; + unsigned v5 = 0; + const float v6 = 9.99999997E-7f; + const float v7 = 1.95312503E-4f; + const int32_t v8 = 128; + const int32_t v9 = 40; + const int32_t v10 = 0; + const float v11 = 0.0f; + const int32_t v12 = 1; + const int32_t v13 = 5120; + const int32_t v14 = 16; + const int64_t v15 = 576; + const int64_t v16 = 64; + const int64_t v17 = 0; + const int64_t v18 = 21120; + const int64_t v19 = 12928; + const int64_t v20 = 4736; + const int64_t v21 = 4672; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + size_t v22 = (size_t)v12; + size_t v23 = (size_t)v10; + size_t v24 = (size_t)v9; + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v25 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v14); + uint64_t v26 = 
(uint64_t)v21; + TASSIGN(v25, v26); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TEXPANDS(v25, v11); /* running sum-of-squares starts at 0.0 */ + for (size_t v27 = v23; v27 < v24; v27 += v22) { + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v28 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v8); + uint64_t v29 = (uint64_t)v20; + TASSIGN(v28, v29); + pto::Shape<1, 1, 1, 16, 128> v30 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v31 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v32 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v5 + v5 * (unsigned)v13 + + (unsigned)((int32_t)(uint32_t)((int32_t)v27) * (uint32_t)v8) * (unsigned)v12), + v30, v31 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v28, v32); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v33 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v8); + uint64_t v34 = (uint64_t)v20; + TASSIGN(v33, v34); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TMUL(v33, v28, v28); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v35 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v8); + uint64_t v36 = (uint64_t)v19; + TASSIGN(v35, v36); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + 
v37 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v12); + uint64_t v38 = (uint64_t)v18; + TASSIGN(v37, v38); + pipe_barrier(PIPE_V); + TROWSUM(v37, v33, v35); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v39 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v14); + uint64_t v40 = (uint64_t)v18; + TASSIGN(v39, v40); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v41 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v14); + uint64_t v42 = (uint64_t)v21; + TASSIGN(v41, v42); + pipe_barrier(PIPE_V); + TADD(v41, v25, v39); + } + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v43 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v14); + uint64_t v44 = (uint64_t)v21; + TASSIGN(v43, v44); + pipe_barrier(PIPE_V); + TMULS(v43, v25, v7); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v45 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v14); + uint64_t v46 = (uint64_t)v21; + TASSIGN(v45, v46); + pipe_barrier(PIPE_V); + TADDS(v45, v43, v6); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v47 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + 
CompactMode::Null>(v12, v14); + uint64_t v48 = (uint64_t)v21; + TASSIGN(v47, v48); + pipe_barrier(PIPE_V); + TSQRT(v47, v45); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v49 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v14); + uint64_t v50 = (uint64_t)v17; + TASSIGN(v49, v50); + pipe_barrier(PIPE_V); + TRECIP(v49, v47); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v51 = v23; v51 < v24; v51 += v22) { + int32_t v52 = (int32_t)((uint32_t)((int32_t)v51) * (uint32_t)v8); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v53 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v8); + uint64_t v54 = (uint64_t)v20; + TASSIGN(v53, v54); + pto::Shape<1, 1, 1, 16, 128> v55 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v56 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor<float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v57 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v5 + v5 * (unsigned)v13 + (unsigned)v52 * (unsigned)v12), v55, v56 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v53, v57); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v58 = Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v8); + uint64_t v59 = (uint64_t)v16; + TASSIGN(v58, v59); + pto::Shape<1, 1, 1, 1, 128> v60 = pto::Shape<1, 1, 1, 1, 128>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v61 = pto::Stride<5120, 5120, 5120, 5120, 
1>(); + GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v62 = + GlobalTensor<float, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>( + v3 + (v5 + v5 * (unsigned)v13 + (unsigned)v52 * (unsigned)v12), v60, v61 + ); + TLOAD(v58, v62); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v63 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v12); + uint64_t v64 = (uint64_t)v17; + TASSIGN(v63, v64); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v65 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v8); + uint64_t v66 = (uint64_t)v20; + TASSIGN(v65, v66); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_V); + TROWEXPANDMUL(v65, v53, v63); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v67 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v8); + uint64_t v68 = (uint64_t)v20; + TASSIGN(v67, v68); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCOLEXPANDMUL(v67, v65, v58); + Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v69 = Tile< + TileType::Vec, bfloat16_t, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v14, v8); + uint64_t v70 = (uint64_t)v15; + TASSIGN(v69, v70); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TCVT(v69, v67, v4); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 
16, 128> v71 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v72 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v73 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v2 + (v5 + v5 * (unsigned)v13 + (unsigned)v52 * (unsigned)v12), v71, v72 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v73, v69); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { /* Unpacks args[0..2] as Tensor descriptors (data pointer is buffer.addr offset by start_offset elements) and forwards them to post_rmsnorm as input, bfloat16 output and weights respectively. */ + // Unpack tensor: resid1_tile__rv_v2 + __gm__ Tensor *resid1_tile__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *resid1_tile__rv_v2 = reinterpret_cast<__gm__ float *>(resid1_tile__rv_v2_tensor->buffer.addr) + + resid1_tile__rv_v2_tensor->start_offset; + + // Unpack tensor: post_norm_tile__ssa_v0 + __gm__ Tensor *post_norm_tile__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *post_norm_tile__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(post_norm_tile__ssa_v0_tensor->buffer.addr) + + post_norm_tile__ssa_v0_tensor->start_offset; + + // Unpack tensor: post_rms_weight__ssa_v0 + __gm__ Tensor *post_rms_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *post_rms_weight__ssa_v0 = + reinterpret_cast<__gm__ float *>(post_rms_weight__ssa_v0_tensor->buffer.addr) + + post_rms_weight__ssa_v0_tensor->start_offset; + + // Forward to ptoas-generated function + post_rmsnorm(resid1_tile__rv_v2, post_norm_tile__ssa_v0, post_rms_weight__ssa_v0); +} diff --git 
a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/q_pad.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/q_pad.cpp new file mode 100644 index 000000000..5bbc66c08 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/q_pad.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: q_pad +// Generated by PyPTO IR Compiler (PTO backend) /* NOTE(review): both bare "#include" lines below have lost their header names in this copy of the patch; restore them from the generator output. */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { /* End-of-kernel synchronisation: a set_flag/wait_flag pair on (PIPE_MTE3, PIPE_S, EVENT_ID0) for kSetWaitMte3ToSEvent0, otherwise pipe_barrier(PIPE_ALL). */ + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void q_pad(__gm__ bfloat16_t *v1) { /* Zero-fills padding rows: for each of 128 iterations i, an 11x128 float tile is filled with 0.0 (TEXPANDS), converted to bfloat16 (TCVT, CAST_ROUND) and stored to v1 at element offset (i*16 plus 5)*128 with row stride 128, i.e. 11 rows starting at row 5 of every 16-row group. The body is compiled only when __DAV_VEC__ is defined. */ + unsigned v2 = 0; + RoundMode v3 = 
RoundMode::CAST_ROUND; + const int32_t v4 = 5; + const int32_t v5 = 16; + const float v6 = 0.0f; + const int32_t v7 = 11; + const int32_t v8 = 0; + const int32_t v9 = 1; + const int32_t v10 = 128; + const int64_t v11 = 5632; + const int64_t v12 = 0; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + for (size_t v13 = (size_t)v8; v13 < ((size_t)v10); v13 += (size_t)v9) { + Tile< + TileType::Vec, float, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v14 = Tile< + TileType::Vec, float, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v7, v10); + uint64_t v15 = (uint64_t)v12; + TASSIGN(v14, v15); + pipe_barrier(PIPE_V); + TEXPANDS(v14, v6); + Tile< + TileType::Vec, bfloat16_t, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v16 = Tile< + TileType::Vec, bfloat16_t, 11, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v7, v10); + uint64_t v17 = (uint64_t)v11; + TASSIGN(v16, v17); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TCVT(v16, v14, v3); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 11, 128> v18 = pto::Shape<1, 1, 1, 11, 128>(); + pto::Stride<1408, 1408, 1408, 128, 1> v19 = pto::Stride<1408, 1408, 1408, 128, 1>(); + GlobalTensor<bfloat16_t, pto::Shape<1, 1, 1, 11, 128>, pto::Stride<1408, 1408, 1408, 128, 1>, pto::Layout::ND> + v20 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 11, 128>, pto::Stride<1408, 1408, 1408, 128, 1>, pto::Layout::ND>( + v1 + (v2 + + (unsigned)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)v13) * (uint32_t)v5) + (uint32_t)v4) * + (unsigned)v10 + + v2 * (unsigned)v9), + v18, v19 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v20, v16); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); +#endif // __DAV_VEC__ + + 
ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { /* Unpacks args[0] as a Tensor descriptor (data pointer is buffer.addr offset by start_offset elements) and forwards the bfloat16 pointer to q_pad. */ + // Unpack tensor: all_q_padded__ssa_v0 + __gm__ Tensor *all_q_padded__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *all_q_padded__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(all_q_padded__ssa_v0_tensor->buffer.addr) + + all_q_padded__ssa_v0_tensor->start_offset; + + // Forward to ptoas-generated function + q_pad(all_q_padded__ssa_v0); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/qk_norm.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/qk_norm.cpp new file mode 100644 index 000000000..f375142f7 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/qk_norm.cpp @@ -0,0 +1,456 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: qk_norm +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void qk_norm( + __gm__ float *v1, __gm__ float *v2, __gm__ float *v3, __gm__ float *v4, __gm__ float *v5, __gm__ float *v6, + int32_t v7 +) { + unsigned v8 = 0; + const int32_t v9 = 8; + const float v10 = 9.99999997E-7f; + const float v11 = 0.0078125f; + const int32_t v12 = 40; + const int32_t v13 = 0; + const int32_t v14 = 1024; + const int32_t v15 = 128; + const int32_t v16 = 1; + const int32_t v17 = 5120; + const int32_t v18 = 16; + const int64_t v19 = 64; + const int64_t v20 = 0; + const int64_t v21 = 25152; + const int64_t v22 = 16960; + const int64_t v23 = 8768; + const int64_t v24 = 576; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + size_t v25 = (size_t)v16; + size_t v26 = (size_t)v13; + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + for (size_t v27 = v26; v27 < ((size_t)v12); v27 += v25) { + int32_t v28 = (int32_t)((uint32_t)((int32_t)v27) * (uint32_t)v15); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, 
SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v29 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v30 = (uint64_t)v24; + TASSIGN(v29, v30); + pto::Shape<1, 1, 1, 16, 128> v31 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v32 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v33 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v2 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v28 * (unsigned)v16), v31, v32 + ); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + TLOAD(v29, v33); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v34 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v35 = (uint64_t)v23; + TASSIGN(v34, v35); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TMUL(v34, v29, v29); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v36 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v37 = (uint64_t)v22; + TASSIGN(v36, v37); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v38 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v16); + uint64_t v39 = (uint64_t)v21; + TASSIGN(v38, v39); + pipe_barrier(PIPE_V); + TROWSUM(v38, v34, v36); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, 
PadValue::Null, + CompactMode::Null> + v40 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v41 = (uint64_t)v21; + TASSIGN(v40, v41); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v42 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v43 = (uint64_t)v20; + TASSIGN(v42, v43); + pipe_barrier(PIPE_V); + TMULS(v42, v40, v11); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v44 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v45 = (uint64_t)v20; + TASSIGN(v44, v45); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v46 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v47 = (uint64_t)v20; + TASSIGN(v46, v47); + pipe_barrier(PIPE_V); + TADDS(v46, v44, v10); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v48 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v49 = (uint64_t)v20; + TASSIGN(v48, v49); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v50 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v51 = (uint64_t)v20; + TASSIGN(v50, v51); + pipe_barrier(PIPE_V); + TRSQRT(v50, v48); + Tile< + 
TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v52 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v16); + uint64_t v53 = (uint64_t)v20; + TASSIGN(v52, v53); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v54 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v55 = (uint64_t)v24; + TASSIGN(v54, v55); + pipe_barrier(PIPE_V); + TROWEXPANDMUL(v54, v29, v52); + Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v56 = Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v15); + uint64_t v57 = (uint64_t)v19; + TASSIGN(v56, v57); + pto::Shape<1, 1, 1, 1, 128> v58 = pto::Shape<1, 1, 1, 1, 128>(); + pto::Stride<128, 128, 128, 128, 1> v59 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v60 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v3 + (v8 + v8 * (unsigned)v15 + v8 * (unsigned)v16), v58, v59 + ); + TLOAD(v56, v60); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v61 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v62 = (uint64_t)v24; + TASSIGN(v61, v62); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_V); + TCOLEXPANDMUL(v61, v54, v56); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 128> v63 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<81920, 
81920, 81920, 5120, 1> v64 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v65 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v1 + (v8 + (unsigned)v7 * (unsigned)v17 + (unsigned)v28 * (unsigned)v16), v63, v64 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v65, v61); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + } + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1); + for (size_t v66 = v26; v66 < ((size_t)v9); v66 += v25) { + int32_t v67 = (int32_t)((uint32_t)((int32_t)v66) * (uint32_t)v15); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v68 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v69 = (uint64_t)v24; + TASSIGN(v68, v69); + pto::Shape<1, 1, 1, 16, 128> v70 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<16384, 16384, 16384, 1024, 1> v71 = pto::Stride<16384, 16384, 16384, 1024, 1>(); + GlobalTensor, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND> + v72 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>( + v5 + (v8 + (unsigned)v7 * (unsigned)v14 + (unsigned)v67 * (unsigned)v16), v70, v71 + ); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + TLOAD(v68, v72); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v73 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v74 = (uint64_t)v23; + TASSIGN(v73, v74); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TMUL(v73, v68, v68); + Tile< + TileType::Vec, 
float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v75 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v76 = (uint64_t)v22; + TASSIGN(v75, v76); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v77 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v16); + uint64_t v78 = (uint64_t)v21; + TASSIGN(v77, v78); + pipe_barrier(PIPE_V); + TROWSUM(v77, v73, v75); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v79 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v80 = (uint64_t)v21; + TASSIGN(v79, v80); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v81 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v82 = (uint64_t)v20; + TASSIGN(v81, v82); + pipe_barrier(PIPE_V); + TMULS(v81, v79, v11); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v83 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v84 = (uint64_t)v20; + TASSIGN(v83, v84); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v85 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v86 = (uint64_t)v20; + 
TASSIGN(v85, v86); + pipe_barrier(PIPE_V); + TADDS(v85, v83, v10); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v87 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v88 = (uint64_t)v20; + TASSIGN(v87, v88); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v89 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v18); + uint64_t v90 = (uint64_t)v20; + TASSIGN(v89, v90); + pipe_barrier(PIPE_V); + TRSQRT(v89, v87); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v91 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v16); + uint64_t v92 = (uint64_t)v20; + TASSIGN(v91, v92); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v93 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v94 = (uint64_t)v24; + TASSIGN(v93, v94); + pipe_barrier(PIPE_V); + TROWEXPANDMUL(v93, v68, v91); + Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v95 = Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v16, v15); + uint64_t v96 = (uint64_t)v19; + TASSIGN(v95, v96); + pto::Shape<1, 1, 1, 1, 128> v97 = pto::Shape<1, 1, 1, 1, 128>(); + pto::Stride<128, 128, 128, 128, 1> v98 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> 
v99 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v6 + (v8 + v8 * (unsigned)v15 + v8 * (unsigned)v16), v97, v98 + ); + TLOAD(v95, v99); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v100 = Tile< + TileType::Vec, float, 16, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v18, v15); + uint64_t v101 = (uint64_t)v24; + TASSIGN(v100, v101); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + pipe_barrier(PIPE_V); + TCOLEXPANDMUL(v100, v93, v95); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pto::Shape<1, 1, 1, 16, 128> v102 = pto::Shape<1, 1, 1, 16, 128>(); + pto::Stride<16384, 16384, 16384, 1024, 1> v103 = pto::Stride<16384, 16384, 16384, 1024, 1>(); + GlobalTensor, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND> + v104 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, 128>, pto::Stride<16384, 16384, 16384, 1024, 1>, pto::Layout::ND>( + v4 + (v8 + (unsigned)v7 * (unsigned)v14 + (unsigned)v67 * (unsigned)v16), v102, v103 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v104, v100); + set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); + } + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: q_proj_norm__iter_v1 + __gm__ Tensor *q_proj_norm__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *q_proj_norm__iter_v1 = reinterpret_cast<__gm__ float *>(q_proj_norm__iter_v1_tensor->buffer.addr) + + q_proj_norm__iter_v1_tensor->start_offset; + + // Unpack tensor: q_proj__rv_v2 + __gm__ Tensor *q_proj__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float 
*q_proj__rv_v2 = + reinterpret_cast<__gm__ float *>(q_proj__rv_v2_tensor->buffer.addr) + q_proj__rv_v2_tensor->start_offset; + + // Unpack tensor: q_norm_weight__ssa_v0 + __gm__ Tensor *q_norm_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *q_norm_weight__ssa_v0 = reinterpret_cast<__gm__ float *>(q_norm_weight__ssa_v0_tensor->buffer.addr) + + q_norm_weight__ssa_v0_tensor->start_offset; + + // Unpack tensor: k_proj_norm__iter_v1 + __gm__ Tensor *k_proj_norm__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ float *k_proj_norm__iter_v1 = reinterpret_cast<__gm__ float *>(k_proj_norm__iter_v1_tensor->buffer.addr) + + k_proj_norm__iter_v1_tensor->start_offset; + + // Unpack tensor: k_proj__rv_v2 + __gm__ Tensor *k_proj__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ float *k_proj__rv_v2 = + reinterpret_cast<__gm__ float *>(k_proj__rv_v2_tensor->buffer.addr) + k_proj__rv_v2_tensor->start_offset; + + // Unpack tensor: k_norm_weight__ssa_v0 + __gm__ Tensor *k_norm_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]); + __gm__ float *k_norm_weight__ssa_v0 = reinterpret_cast<__gm__ float *>(k_norm_weight__ssa_v0_tensor->buffer.addr) + + k_norm_weight__ssa_v0_tensor->start_offset; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[6]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Forward to ptoas-generated function + qk_norm( + q_proj_norm__iter_v1, q_proj__rv_v2, q_norm_weight__ssa_v0, k_proj_norm__iter_v1, k_proj__rv_v2, + k_norm_weight__ssa_v0, b0__idx_v0 + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rmsnorm.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rmsnorm.cpp new file mode 100644 index 000000000..677bee8e4 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rmsnorm.cpp @@ -0,0 
+1,383 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: rmsnorm +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void rmsnorm(__gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ float *v3, int32_t v4, int32_t v5) { + RoundMode v6 = RoundMode::CAST_ROUND; + unsigned v7 = 5120; + unsigned v8 = 0; + const float v9 = 9.99999997E-7f; + const float v10 = 1.95312503E-4f; + const int32_t v11 = 512; + const int32_t v12 = 10; + const int32_t v13 = 0; + const float v14 = 0.0f; + const int32_t v15 = 1; + const int32_t v16 = 
5120; + const int32_t v17 = 16; + const int64_t v18 = 2112; + const int64_t v19 = 64; + const int64_t v20 = 0; + const int64_t v21 = 100480; + const int64_t v22 = 67712; + const int64_t v23 = 34944; + const int64_t v24 = 18560; + const int64_t v25 = 18496; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + size_t v26 = (size_t)v15; + size_t v27 = (size_t)v13; + size_t v28 = (size_t)v12; + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v29 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v30 = (uint64_t)v25; + TASSIGN(v29, v30); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TEXPANDS(v29, v14); + for (size_t v31 = v27; v31 < v28; v31 += v26) { + Tile< + TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v32 = Tile< + TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v5, v11); + uint64_t v33 = (uint64_t)v24; + TASSIGN(v32, v33); + unsigned v34 = (unsigned)v5 * v7; + pto::Shape<1, 1, 1, -1, 512> v35 = pto::Shape<1, 1, 1, -1, 512>(v5); + pto::Stride<-1, -1, -1, 5120, 1> v36 = pto::Stride<-1, -1, -1, 5120, 1>(v34, v34, v34); + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v37 = + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>( + v1 + (v8 + (unsigned)v4 * (unsigned)v16 + + (unsigned)((int32_t)(uint32_t)((int32_t)v31) * (uint32_t)v11) * (unsigned)v15), + v35, v36 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v32, v37); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, 
SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v38 = Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v11); + uint64_t v39 = (uint64_t)v23; + TASSIGN(v38, v39); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCVT(v38, v32, v6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v40 = Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v11); + uint64_t v41 = (uint64_t)v23; + TASSIGN(v40, v41); + pipe_barrier(PIPE_V); + TMUL(v40, v38, v38); + Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v42 = Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v11); + uint64_t v43 = (uint64_t)v22; + TASSIGN(v42, v43); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v44 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v15); + uint64_t v45 = (uint64_t)v21; + TASSIGN(v44, v45); + pipe_barrier(PIPE_V); + TROWSUM(v44, v40, v42); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v46 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v47 = (uint64_t)v21; + TASSIGN(v46, v47); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v48 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + 
CompactMode::Null>(v15, v17); + uint64_t v49 = (uint64_t)v25; + TASSIGN(v48, v49); + pipe_barrier(PIPE_V); + TADD(v48, v29, v46); + } + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v50 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v51 = (uint64_t)v25; + TASSIGN(v50, v51); + pipe_barrier(PIPE_V); + TMULS(v50, v29, v10); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v52 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v53 = (uint64_t)v25; + TASSIGN(v52, v53); + pipe_barrier(PIPE_V); + TADDS(v52, v50, v9); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v54 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v55 = (uint64_t)v25; + TASSIGN(v54, v55); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v56 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v57 = (uint64_t)v25; + TASSIGN(v56, v57); + pipe_barrier(PIPE_V); + TSQRT(v56, v54); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v58 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v59 = (uint64_t)v25; + TASSIGN(v58, v59); + Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, 
SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v60 = Tile< + TileType::Vec, float, 1, 16, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v17); + uint64_t v61 = (uint64_t)v20; + TASSIGN(v60, v61); + pipe_barrier(PIPE_V); + TRECIP(v60, v58); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v62 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v15); + uint64_t v63 = (uint64_t)v20; + TASSIGN(v62, v63); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + for (size_t v64 = v27; v64 < v28; v64 += v26) { + int32_t v65 = (int32_t)((uint32_t)((int32_t)v64) * (uint32_t)v11); + Tile< + TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v66 = Tile< + TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v5, v11); + uint64_t v67 = (uint64_t)v24; + TASSIGN(v66, v67); + unsigned v68 = (unsigned)v5 * v7; + pto::Shape<1, 1, 1, -1, 512> v69 = pto::Shape<1, 1, 1, -1, 512>(v5); + pto::Stride<-1, -1, -1, 5120, 1> v70 = pto::Stride<-1, -1, -1, 5120, 1>(v68, v68, v68); + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND> v71 = + GlobalTensor, pto::Stride<-1, -1, -1, 5120, 1>, pto::Layout::ND>( + v1 + (v8 + (unsigned)v4 * (unsigned)v16 + (unsigned)v65 * (unsigned)v15), v69, v70 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v66, v71); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v72 = Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v11); + uint64_t v73 = (uint64_t)v23; + TASSIGN(v72, v73); + wait_flag(PIPE_MTE2, 
PIPE_V, EVENT_ID1); + pipe_barrier(PIPE_V); + TCVT(v72, v66, v6); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + Tile< + TileType::Vec, float, 1, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v74 = Tile< + TileType::Vec, float, 1, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v15, v11); + uint64_t v75 = (uint64_t)v19; + TASSIGN(v74, v75); + pto::Shape<1, 1, 1, 1, 512> v76 = pto::Shape<1, 1, 1, 1, 512>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v77 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> v78 = + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>( + v3 + (v8 + v8 * (unsigned)v16 + (unsigned)v65 * (unsigned)v15), v76, v77 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + TLOAD(v74, v78); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v79 = Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v11); + uint64_t v80 = (uint64_t)v23; + TASSIGN(v79, v80); + pipe_barrier(PIPE_V); + TROWEXPANDMUL(v79, v72, v62); + Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v81 = Tile< + TileType::Vec, float, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v11); + uint64_t v82 = (uint64_t)v23; + TASSIGN(v81, v82); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCOLEXPANDMUL(v81, v79, v74); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + Tile< + TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v83 = Tile< + TileType::Vec, bfloat16_t, 16, 512, BLayout::RowMajor, -1, -1, 
SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v17, v11); + uint64_t v84 = (uint64_t)v18; + TASSIGN(v83, v84); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TCVT(v83, v81, v6); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 512> v85 = pto::Shape<1, 1, 1, 16, 512>(); + pto::Stride<81920, 81920, 81920, 5120, 1> v86 = pto::Stride<81920, 81920, 81920, 5120, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND> + v87 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 512>, pto::Stride<81920, 81920, 81920, 5120, 1>, pto::Layout::ND>( + v2 + (v8 + v8 * (unsigned)v16 + (unsigned)v65 * (unsigned)v15), v85, v86 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v87, v83); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID3); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: current_hidden__rv_v2 + __gm__ Tensor *current_hidden__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *current_hidden__rv_v2 = + reinterpret_cast<__gm__ bfloat16_t *>(current_hidden__rv_v2_tensor->buffer.addr) + + current_hidden__rv_v2_tensor->start_offset; + + // Unpack tensor: normed_tile__ssa_v0 + __gm__ Tensor *normed_tile__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *normed_tile__ssa_v0 = + reinterpret_cast<__gm__ bfloat16_t *>(normed_tile__ssa_v0_tensor->buffer.addr) + + normed_tile__ssa_v0_tensor->start_offset; + + // Unpack tensor: input_rms_weight__ssa_v0 + __gm__ Tensor *input_rms_weight__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor 
*>(args[2]); + __gm__ float *input_rms_weight__ssa_v0 = + reinterpret_cast<__gm__ float *>(input_rms_weight__ssa_v0_tensor->buffer.addr) + + input_rms_weight__ssa_v0_tensor->start_offset; + + // Unpack scalar: b0__idx_v0 + union { + uint64_t u64; + int64_t val; + } b0__idx_v0_conv; + b0__idx_v0_conv.u64 = args[3]; + int64_t b0__idx_v0 = b0__idx_v0_conv.val; + + // Unpack scalar: cur_valid__ssa_v1 + union { + uint64_t u64; + int64_t val; + } cur_valid__ssa_v1_conv; + cur_valid__ssa_v1_conv.u64 = args[4]; + int64_t cur_valid__ssa_v1 = cur_valid__ssa_v1_conv.val; + + // Forward to ptoas-generated function + rmsnorm(current_hidden__rv_v2, normed_tile__ssa_v0, input_rms_weight__ssa_v0, b0__idx_v0, cur_valid__ssa_v1); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rope_kv_cache.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rope_kv_cache.cpp new file mode 100644 index 000000000..54bce68c8 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/rope_kv_cache.cpp @@ -0,0 +1,593 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: rope_kv_cache +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void rope_kv_cache( + __gm__ bfloat16_t *v1, __gm__ bfloat16_t *v2, __gm__ bfloat16_t *v3, __gm__ float *v4, __gm__ float *v5, + __gm__ float *v6, __gm__ float *v7, __gm__ float *v8, __gm__ float *v9, __gm__ float *v10, int32_t v11, int32_t v12, + int32_t v13, int32_t v14, int32_t v15 +) { + unsigned v16 = 64; + RoundMode v17 = RoundMode::CAST_ROUND; + unsigned v18 = 0; + const int32_t v19 = 5; + const int32_t v20 = 256; + const int32_t v21 = 8; + const int32_t v22 = 0; + const int32_t v23 = 5120; + const int32_t v24 = 64; + const int32_t v25 = 1024; + const int32_t v26 = 16; + const int32_t v27 = 1; + const int32_t v28 = 128; + const int64_t v29 = 2944; + const int64_t v30 = 2688; + const int64_t v31 = 2176; + const int64_t v32 = 2048; + const int64_t v33 = 1792; + const int64_t v34 = 1536; + const int64_t v35 = 1280; + const int64_t v36 = 1024; + const int64_t v37 = 768; + const int64_t v38 = 512; + const int64_t v39 = 256; + const int64_t v40 = 0; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + 
set_vector_mask(-1, -1); + size_t v41 = (size_t)v27; + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v42 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v43 = (uint64_t)v40; + TASSIGN(v42, v43); + pto::Shape<1, 1, 1, 1, 64> v44 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v45 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v46 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v5 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v44, v45 + ); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); + TLOAD(v42, v46); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v47 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v48 = (uint64_t)v39; + TASSIGN(v47, v48); + pto::Shape<1, 1, 1, 1, 64> v49 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v50 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v51 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v6 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v49, v50 + ); + TLOAD(v47, v51); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v52 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v53 = (uint64_t)v38; + TASSIGN(v52, v53); + 
pto::Shape<1, 1, 1, 1, 64> v54 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v55 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v56 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v7 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v54, v55 + ); + TLOAD(v52, v56); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v57 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v58 = (uint64_t)v37; + TASSIGN(v57, v58); + pto::Shape<1, 1, 1, 1, 64> v59 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v60 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v61 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v8 + (v18 + v18 * (unsigned)v28 + v18 * (unsigned)v27), v59, v60 + ); + TLOAD(v57, v61); + for (size_t v62 = (size_t)v11; v62 < ((size_t)((int32_t)(uint32_t)v11 + (uint32_t)v21)); v62 += v41) { + int32_t v63 = (int32_t)v62; + int32_t v64 = (int32_t)((uint32_t)v63 * (uint32_t)v28); + int32_t v65 = + (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v12 * (uint32_t)v21) + + (uint32_t)v63) * + (uint32_t)v20) + + (uint32_t)v13); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v66 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v67 = (uint64_t)v36; + TASSIGN(v66, v67); + pto::Shape<1, 1, 1, 1, 64> v68 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<1024, 1024, 1024, 1024, 1> v69 = pto::Stride<1024, 1024, 1024, 1024, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 1024, 1>, 
pto::Layout::ND> v70 = + GlobalTensor, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND>( + v4 + (v18 + (unsigned)v14 * (unsigned)v25 + (unsigned)v64 * (unsigned)v27), v68, v69 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v66, v70); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v71 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v72 = (uint64_t)v35; + TASSIGN(v71, v72); + pto::Shape<1, 1, 1, 1, 64> v73 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<1024, 1024, 1024, 1024, 1> v74 = pto::Stride<1024, 1024, 1024, 1024, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND> v75 = + GlobalTensor, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND>( + v4 + (v18 + (unsigned)v14 * (unsigned)v25 + + (unsigned)((int32_t)(uint32_t)v64 + (uint32_t)v24) * (unsigned)v27), + v73, v74 + ); + TLOAD(v71, v75); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v76 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v77 = (uint64_t)v34; + TASSIGN(v76, v77); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TCOLEXPANDMUL(v76, v66, v42); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v78 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v79 = (uint64_t)v33; + TASSIGN(v78, v79); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TCOLEXPANDMUL(v78, v71, v47); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, 
PadValue::Null, + CompactMode::Null> + v80 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v81 = (uint64_t)v34; + TASSIGN(v80, v81); + pipe_barrier(PIPE_V); + TSUB(v80, v76, v78); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v82 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v83 = (uint64_t)v35; + TASSIGN(v82, v83); + TCOLEXPANDMUL(v82, v71, v52); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v84 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v85 = (uint64_t)v36; + TASSIGN(v84, v85); + TCOLEXPANDMUL(v84, v66, v57); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v86 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v87 = (uint64_t)v36; + TASSIGN(v86, v87); + pipe_barrier(PIPE_V); + TADD(v86, v82, v84); + Tile< + TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v88 = Tile< + TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v89 = (uint64_t)v32; + TASSIGN(v88, v89); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TCVT(v88, v80, v17); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 1, 64> v90 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v91 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, 
pto::Layout::ND> v92 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v2 + (v18 + (unsigned)v65 * (unsigned)v28 + v18 * (unsigned)v27), v90, v91 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v92, v88); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + Tile< + TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v93 = Tile< + TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v94 = (uint64_t)v32; + TASSIGN(v93, v94); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TCVT(v93, v86, v17); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pto::Shape<1, 1, 1, 1, 64> v95 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v96 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> v97 = + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v2 + (v18 + (unsigned)v65 * (unsigned)v28 + v16 * (unsigned)v27), v95, v96 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v97, v93); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v98 = Tile< + TileType::Vec, float, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v28); + uint64_t v99 = (uint64_t)v31; + TASSIGN(v98, v99); + pto::Shape<1, 1, 1, 1, 128> v100 = pto::Shape<1, 1, 1, 1, 128>(); + pto::Stride<1024, 1024, 1024, 1024, 1> v101 = pto::Stride<1024, 1024, 1024, 1024, 1>(); + GlobalTensor, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND> v102 = + GlobalTensor, pto::Stride<1024, 1024, 1024, 1024, 1>, pto::Layout::ND>( + v9 + (v18 + (unsigned)v14 * (unsigned)v25 + (unsigned)v64 * (unsigned)v27), v100, v101 + ); + TLOAD(v98, v102); + 
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + Tile< + TileType::Vec, bfloat16_t, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v103 = Tile< + TileType::Vec, bfloat16_t, 1, 128, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v28); + uint64_t v104 = (uint64_t)v30; + TASSIGN(v103, v104); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID2); + TCVT(v103, v98, v17); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + pto::Shape<1, 1, 1, 1, 128> v105 = pto::Shape<1, 1, 1, 1, 128>(); + pto::Stride<128, 128, 128, 128, 1> v106 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> + v107 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 1, 128>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v3 + (v18 + (unsigned)v65 * (unsigned)v28 + v18 * (unsigned)v27), v105, v106 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID2); + TSTORE(v107, v103); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID2); + for (size_t v108 = (size_t)v22; v108 < ((size_t)v19); v108 += v41) { + int32_t v109 = (int32_t)v108; + int32_t v110 = + (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v63 * (uint32_t)v19) + (uint32_t)v109) * + (uint32_t)v28); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v111 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v112 = (uint64_t)v36; + TASSIGN(v111, v112); + pto::Shape<1, 1, 1, 1, 64> v113 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v114 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> + v115 = GlobalTensor< + float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>( + v10 + (v18 + 
(unsigned)v14 * (unsigned)v23 + (unsigned)v110 * (unsigned)v27), v113, v114 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + TLOAD(v111, v115); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v116 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v117 = (uint64_t)v35; + TASSIGN(v116, v117); + pto::Shape<1, 1, 1, 1, 64> v118 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<5120, 5120, 5120, 5120, 1> v119 = pto::Stride<5120, 5120, 5120, 5120, 1>(); + GlobalTensor, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND> + v120 = GlobalTensor< + float, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<5120, 5120, 5120, 5120, 1>, pto::Layout::ND>( + v10 + (v18 + (unsigned)v14 * (unsigned)v23 + + (unsigned)((int32_t)(uint32_t)v110 + (uint32_t)v24) * (unsigned)v27), + v118, v119 + ); + TLOAD(v116, v120); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v121 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v122 = (uint64_t)v34; + TASSIGN(v121, v122); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID3); + TCOLEXPANDMUL(v121, v111, v42); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v123 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v124 = (uint64_t)v33; + TASSIGN(v123, v124); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID4); + TCOLEXPANDMUL(v123, v116, v47); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v125 
= Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v126 = (uint64_t)v34; + TASSIGN(v125, v126); + pipe_barrier(PIPE_V); + TSUB(v125, v121, v123); + Tile< + TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v127 = Tile< + TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v128 = (uint64_t)v32; + TASSIGN(v127, v128); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + TCVT(v127, v125, v17); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v129 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v130 = (uint64_t)v35; + TASSIGN(v129, v130); + TCOLEXPANDMUL(v129, v116, v52); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v131 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v132 = (uint64_t)v36; + TASSIGN(v131, v132); + TCOLEXPANDMUL(v131, v111, v57); + Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v133 = Tile< + TileType::Vec, float, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v134 = (uint64_t)v36; + TASSIGN(v133, v134); + pipe_barrier(PIPE_V); + TADD(v133, v129, v131); + Tile< + TileType::Vec, bfloat16_t, 1, 64, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v135 = Tile< + TileType::Vec, bfloat16_t, 1, 64, 
BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v27, v24); + uint64_t v136 = (uint64_t)v29; + TASSIGN(v135, v136); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); + TCVT(v135, v133, v17); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + int32_t v137 = (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)v14 * (uint32_t)v28) + + (uint32_t)((int32_t)(uint32_t)v63 * (uint32_t)v26)) + + (uint32_t)v109); + pto::Shape<1, 1, 1, 1, 64> v138 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v139 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> + v140 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v1 + (v18 + (unsigned)v137 * (unsigned)v28 + v18 * (unsigned)v27), v138, v139 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID3); + pipe_barrier(PIPE_MTE3); + TSTORE(v140, v127); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + pto::Shape<1, 1, 1, 1, 64> v141 = pto::Shape<1, 1, 1, 1, 64>(); + pto::Stride<128, 128, 128, 128, 1> v142 = pto::Stride<128, 128, 128, 128, 1>(); + GlobalTensor, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND> + v143 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 1, 64>, pto::Stride<128, 128, 128, 128, 1>, pto::Layout::ND>( + v1 + (v18 + (unsigned)v137 * (unsigned)v28 + v16 * (unsigned)v27), v141, v142 + ); + pipe_barrier(PIPE_MTE3); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID4); + TSTORE(v143, v135); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); + }; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID2); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID3); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID4); +#endif // __DAV_VEC__ + + 
ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: all_q_padded__iter_v6 + __gm__ Tensor *all_q_padded__iter_v6_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ bfloat16_t *all_q_padded__iter_v6 = + reinterpret_cast<__gm__ bfloat16_t *>(all_q_padded__iter_v6_tensor->buffer.addr) + + all_q_padded__iter_v6_tensor->start_offset; + + // Unpack tensor: k_cache__iter_v5 + __gm__ Tensor *k_cache__iter_v5_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ bfloat16_t *k_cache__iter_v5 = reinterpret_cast<__gm__ bfloat16_t *>(k_cache__iter_v5_tensor->buffer.addr) + + k_cache__iter_v5_tensor->start_offset; + + // Unpack tensor: v_cache__iter_v5 + __gm__ Tensor *v_cache__iter_v5_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ bfloat16_t *v_cache__iter_v5 = reinterpret_cast<__gm__ bfloat16_t *>(v_cache__iter_v5_tensor->buffer.addr) + + v_cache__iter_v5_tensor->start_offset; + + // Unpack tensor: k_proj_norm__rv_v2 + __gm__ Tensor *k_proj_norm__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ float *k_proj_norm__rv_v2 = reinterpret_cast<__gm__ float *>(k_proj_norm__rv_v2_tensor->buffer.addr) + + k_proj_norm__rv_v2_tensor->start_offset; + + // Unpack tensor: cos_lo__ssa_v0 + __gm__ Tensor *cos_lo__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ float *cos_lo__ssa_v0 = + reinterpret_cast<__gm__ float *>(cos_lo__ssa_v0_tensor->buffer.addr) + cos_lo__ssa_v0_tensor->start_offset; + + // Unpack tensor: sin_lo__ssa_v0 + __gm__ Tensor *sin_lo__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[5]); + __gm__ float *sin_lo__ssa_v0 = + reinterpret_cast<__gm__ float *>(sin_lo__ssa_v0_tensor->buffer.addr) + sin_lo__ssa_v0_tensor->start_offset; + + // Unpack tensor: cos_hi__ssa_v0 + __gm__ Tensor *cos_hi__ssa_v0_tensor = reinterpret_cast<__gm__ 
Tensor *>(args[6]); + __gm__ float *cos_hi__ssa_v0 = + reinterpret_cast<__gm__ float *>(cos_hi__ssa_v0_tensor->buffer.addr) + cos_hi__ssa_v0_tensor->start_offset; + + // Unpack tensor: sin_hi__ssa_v0 + __gm__ Tensor *sin_hi__ssa_v0_tensor = reinterpret_cast<__gm__ Tensor *>(args[7]); + __gm__ float *sin_hi__ssa_v0 = + reinterpret_cast<__gm__ float *>(sin_hi__ssa_v0_tensor->buffer.addr) + sin_hi__ssa_v0_tensor->start_offset; + + // Unpack tensor: v_proj__rv_v2 + __gm__ Tensor *v_proj__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[8]); + __gm__ float *v_proj__rv_v2 = + reinterpret_cast<__gm__ float *>(v_proj__rv_v2_tensor->buffer.addr) + v_proj__rv_v2_tensor->start_offset; + + // Unpack tensor: q_proj_norm__rv_v2 + __gm__ Tensor *q_proj_norm__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[9]); + __gm__ float *q_proj_norm__rv_v2 = reinterpret_cast<__gm__ float *>(q_proj_norm__rv_v2_tensor->buffer.addr) + + q_proj_norm__rv_v2_tensor->start_offset; + + // Unpack scalar: ki_chunk__idx_v0 + union { + uint64_t u64; + int64_t val; + } ki_chunk__idx_v0_conv; + ki_chunk__idx_v0_conv.u64 = args[10]; + int64_t ki_chunk__idx_v0 = ki_chunk__idx_v0_conv.val; + + // Unpack scalar: slot_block__ssa_v0 + union { + uint64_t u64; + int64_t val; + } slot_block__ssa_v0_conv; + slot_block__ssa_v0_conv.u64 = args[11]; + int64_t slot_block__ssa_v0 = slot_block__ssa_v0_conv.val; + + // Unpack scalar: slot_offset__ssa_v0 + union { + uint64_t u64; + int64_t val; + } slot_offset__ssa_v0_conv; + slot_offset__ssa_v0_conv.u64 = args[12]; + int64_t slot_offset__ssa_v0 = slot_offset__ssa_v0_conv.val; + + // Unpack scalar: b__idx_v0 + union { + uint64_t u64; + int64_t val; + } b__idx_v0_conv; + b__idx_v0_conv.u64 = args[13]; + int64_t b__idx_v0 = b__idx_v0_conv.val; + + // Extract dynamic dim: KV_CACHE_ROWS_DYN + int64_t KV_CACHE_ROWS_DYN = static_cast(k_cache__iter_v5_tensor->shapes[0]); + + // Forward to ptoas-generated function + rope_kv_cache( + all_q_padded__iter_v6, 
k_cache__iter_v5, v_cache__iter_v5, k_proj_norm__rv_v2, cos_lo__ssa_v0, sin_lo__ssa_v0, + cos_hi__ssa_v0, sin_hi__ssa_v0, v_proj__rv_v2, q_proj_norm__rv_v2, ki_chunk__idx_v0, slot_block__ssa_v0, + slot_offset__ssa_v0, b__idx_v0, KV_CACHE_ROWS_DYN + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/silu.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/silu.cpp new file mode 100644 index 000000000..69af06ad5 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/silu.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: silu +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void silu(__gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, int32_t v4) { + RoundMode v5 = RoundMode::CAST_ROUND; + unsigned v6 = 0; + const float v7 = 1.0f; + const int32_t v8 = 17408; + const int32_t v9 = 1; + const int32_t v10 = 256; + const int32_t v11 = 16; + const int64_t v12 = 0; + const int64_t v13 = 57344; + const int64_t v14 = 40960; + const int64_t v15 = 24576; + const int64_t v16 = 8192; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v17 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v18 = (uint64_t)v16; + TASSIGN(v17, v18); + pto::Shape<1, 1, 1, 16, 256> v19 = pto::Shape<1, 1, 1, 16, 256>(); + pto::Stride<4096, 4096, 4096, 256, 1> v20 = pto::Stride<4096, 4096, 4096, 256, 1>(); + GlobalTensor, pto::Stride<4096, 4096, 4096, 256, 1>, 
pto::Layout::ND> v21 = + GlobalTensor, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>( + v1 + (v6 + v6 * (unsigned)v10 + v6 * (unsigned)v9), v19, v20 + ); + TLOAD(v17, v21); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v22 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v23 = (uint64_t)v15; + TASSIGN(v22, v23); + pto::Shape<1, 1, 1, 16, 256> v24 = pto::Shape<1, 1, 1, 16, 256>(); + pto::Stride<4096, 4096, 4096, 256, 1> v25 = pto::Stride<4096, 4096, 4096, 256, 1>(); + GlobalTensor, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> v26 = + GlobalTensor, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>( + v2 + (v6 + v6 * (unsigned)v10 + v6 * (unsigned)v9), v24, v25 + ); + TLOAD(v22, v26); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v27 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v28 = (uint64_t)v14; + TASSIGN(v27, v28); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TNEG(v27, v17); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v29 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v30 = (uint64_t)v14; + TASSIGN(v29, v30); + pipe_barrier(PIPE_V); + TEXP(v29, v27); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v31 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + 
CompactMode::Null>(v11, v10); + uint64_t v32 = (uint64_t)v14; + TASSIGN(v31, v32); + pipe_barrier(PIPE_V); + TADDS(v31, v29, v7); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v33 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v34 = (uint64_t)v13; + TASSIGN(v33, v34); + pipe_barrier(PIPE_V); + TRECIP(v33, v31); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v35 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v36 = (uint64_t)v16; + TASSIGN(v35, v36); + pipe_barrier(PIPE_V); + TMUL(v35, v17, v33); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v37 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v38 = (uint64_t)v16; + TASSIGN(v37, v38); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1); + TMUL(v37, v35, v22); + Tile< + TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v39 = Tile< + TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v11, v10); + uint64_t v40 = (uint64_t)v12; + TASSIGN(v39, v40); + pipe_barrier(PIPE_V); + TCVT(v39, v37, v5); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 256> v41 = pto::Shape<1, 1, 1, 16, 256>(); + pto::Stride<278528, 278528, 278528, 17408, 1> v42 = pto::Stride<278528, 278528, 278528, 17408, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, 
pto::Layout::ND> + v43 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<278528, 278528, 278528, 17408, 1>, pto::Layout::ND>( + v3 + (v6 + v6 * (unsigned)v8 + (unsigned)v4 * (unsigned)v9), v41, v42 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v43, v39); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: gate_acc__rv_v2 + __gm__ Tensor *gate_acc__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *gate_acc__rv_v2 = + reinterpret_cast<__gm__ float *>(gate_acc__rv_v2_tensor->buffer.addr) + gate_acc__rv_v2_tensor->start_offset; + + // Unpack tensor: up_acc__rv_v2 + __gm__ Tensor *up_acc__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *up_acc__rv_v2 = + reinterpret_cast<__gm__ float *>(up_acc__rv_v2_tensor->buffer.addr) + up_acc__rv_v2_tensor->start_offset; + + // Unpack tensor: mlp_tile__iter_v1 + __gm__ Tensor *mlp_tile__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ bfloat16_t *mlp_tile__iter_v1 = + reinterpret_cast<__gm__ bfloat16_t *>(mlp_tile__iter_v1_tensor->buffer.addr) + + mlp_tile__iter_v1_tensor->start_offset; + + // Unpack scalar: o0__ssa_v1 + union { + uint64_t u64; + int64_t val; + } o0__ssa_v1_conv; + o0__ssa_v1_conv.u64 = args[3]; + int64_t o0__ssa_v1 = o0__ssa_v1_conv.val; + + // Forward to ptoas-generated function + silu(gate_acc__rv_v2, up_acc__rv_v2, mlp_tile__iter_v1, o0__ssa_v1); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/softmax.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/softmax.cpp new file mode 100644 index 000000000..e94af05e1 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/aiv/softmax.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) 
PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Kernel Function: softmax +// Generated by PyPTO IR Compiler (PTO backend) + +#include + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#if defined(__CPU_SIM) +#define __aicore__ +#else +#define __aicore__ [aicore] +#endif +#endif + +#include +#include "tensor.h" + +using namespace pto; + +// --- ptoas-generated code --- + +enum class PTOAutoSyncTailMode : int { + kBarrierAll = 0, + kSetWaitMte3ToSEvent0 = 1, +}; + +static __aicore__ inline void ptoas_auto_sync_tail(PTOAutoSyncTailMode mode = PTOAutoSyncTailMode::kBarrierAll) { + switch (mode) { + case PTOAutoSyncTailMode::kSetWaitMte3ToSEvent0: + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID0); + break; + case PTOAutoSyncTailMode::kBarrierAll: + default: + pipe_barrier(PIPE_ALL); + break; + } +} + +static __aicore__ void softmax( + __gm__ float *v1, __gm__ float *v2, __gm__ bfloat16_t *v3, __gm__ float *v4, int32_t v5, int32_t v6, int32_t v7 +) { + RoundMode v8 = RoundMode::CAST_ROUND; + unsigned v9 = 0; + const float v10 = 0.0883883461f; + const int32_t v11 = 2; + const int32_t v12 = 16; + const int32_t v13 = 64; + const int32_t v14 = 8; + const int32_t v15 = 0; + const int32_t v16 = 1; + const int32_t v17 = 256; + const int64_t v18 = 8192; + 
const int64_t v19 = 0; + const int64_t v20 = 57408; + const int64_t v21 = 41024; + const int64_t v22 = 24640; + const int64_t v23 = 8256; + using T = float; + +#if defined(__DAV_VEC__) + set_mask_norm(); + set_vector_mask(-1, -1); + size_t v24 = (size_t)v16; + size_t v25 = (size_t)v15; + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + for (size_t v26 = v25; v26 < ((size_t)v14); v26 += v24) { + for (size_t v27 = v25; v27 < ((size_t)v13); v27 += v24) { + int32_t v28 = (int32_t)((uint32_t)v5 + (uint32_t)((int32_t)v27)); + __gm__ float *v29; + __gm__ float *v30; + __gm__ bfloat16_t *v31; + if (v28 < v6) { + int32_t v32 = (int32_t)((uint32_t)v7 - (uint32_t)((int32_t)(uint32_t)v28 * (uint32_t)v17)); + int32_t v33 = v32 < v17 ? v32 : v17; + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v34 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v33); + uint64_t v35 = (uint64_t)v23; + TASSIGN(v34, v35); + int32_t v36 = + (int32_t)((uint32_t)((int32_t)(uint32_t)((int32_t)(uint32_t)((int32_t)v26) * (uint32_t)v11) + + (uint32_t)v28) * + (uint32_t)v12); + unsigned v37 = (unsigned)v33; + pto::Shape<1, 1, 1, 16, -1> v38 = pto::Shape<1, 1, 1, 16, -1>(v33); + pto::Stride<4096, 4096, 4096, 256, 1> v39 = pto::Stride<4096, 4096, 4096, 256, 1>(); + GlobalTensor, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND> + v40 = GlobalTensor< + float, pto::Shape<1, 1, 1, 16, -1>, pto::Stride<4096, 4096, 4096, 256, 1>, pto::Layout::ND>( + v4 + (v9 + (unsigned)v36 * (unsigned)v17 + v9 * (unsigned)v16), v38, v39 + ); + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + TLOAD(v34, v40); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null> + v41 = Tile< + 
TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null>(v12, v17); + uint64_t v42 = (uint64_t)v22; + TASSIGN(v41, v42); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + pipe_barrier(PIPE_V); + TFILLPAD(v41, v34); + set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null> + v43 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null>(v12, v17); + uint64_t v44 = (uint64_t)v22; + TASSIGN(v43, v44); + pipe_barrier(PIPE_V); + TMULS(v43, v41, v10); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v45 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v17); + uint64_t v46 = (uint64_t)v21; + TASSIGN(v45, v46); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v47 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v16); + uint64_t v48 = (uint64_t)v20; + TASSIGN(v47, v48); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + TROWMAX(v47, v43, v45); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null> + v49 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null>(v12, v17); + uint64_t v50 = (uint64_t)v22; + TASSIGN(v49, v50); + pipe_barrier(PIPE_V); + TROWEXPANDSUB(v49, v43, v47); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null> + v51 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, 
-1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null>(v12, v17); + uint64_t v52 = (uint64_t)v22; + TASSIGN(v51, v52); + pipe_barrier(PIPE_V); + TEXP(v51, v49); + Tile< + TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null> + v53 = Tile< + TileType::Vec, bfloat16_t, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, + PadValue::Min, CompactMode::Null>(v12, v17); + uint64_t v54 = (uint64_t)v19; + TASSIGN(v53, v54); + pipe_barrier(PIPE_V); + TCVT(v53, v51, v8); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null> + v55 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Min, + CompactMode::Null>(v12, v17); + uint64_t v56 = (uint64_t)v22; + TASSIGN(v55, v56); + pipe_barrier(PIPE_V); + TCVT(v55, v53, v8); + Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v57 = Tile< + TileType::Vec, float, 16, 256, BLayout::RowMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v17); + uint64_t v58 = (uint64_t)v21; + TASSIGN(v57, v58); + Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null> + v59 = Tile< + TileType::Vec, float, 16, 1, BLayout::ColMajor, -1, -1, SLayout::NoneBox, 512, PadValue::Null, + CompactMode::Null>(v12, v16); + uint64_t v60 = (uint64_t)v18; + TASSIGN(v59, v60); + pipe_barrier(PIPE_V); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TROWSUM(v59, v55, v57); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + pto::Shape<1, 1, 1, 16, 256> v61 = pto::Shape<1, 1, 1, 16, 256>(); + pto::Stride<4096, 4096, 4096, 256, 1> v62 = pto::Stride<4096, 4096, 4096, 256, 1>(); + GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 
1>, pto::Layout::ND> + v63 = GlobalTensor< + bfloat16_t, pto::Shape<1, 1, 1, 16, 256>, pto::Stride<4096, 4096, 4096, 256, 1>, + pto::Layout::ND>(v3 + (v9 + (unsigned)v36 * (unsigned)v17 + v9 * (unsigned)v16), v61, v62); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(v63, v53); + pto::Shape<1, 1, 1, 16, 1> v64 = pto::Shape<1, 1, 1, 16, 1>(); + pto::Stride<16, 16, 16, 1, 256> v65 = pto::Stride<16, 16, 16, 1, 256>(); + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v66 = + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>( + v2 + (v9 + (unsigned)v36 * (unsigned)v16 + v9 * (unsigned)v17), v64, v65 + ); + TSTORE(v66, v47); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + pto::Shape<1, 1, 1, 16, 1> v67 = pto::Shape<1, 1, 1, 16, 1>(); + pto::Stride<16, 16, 16, 1, 256> v68 = pto::Stride<16, 16, 16, 1, 256>(); + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN> v69 = + GlobalTensor, pto::Stride<16, 16, 16, 1, 256>, pto::Layout::DN>( + v1 + (v9 + (unsigned)v36 * (unsigned)v16 + v9 * (unsigned)v17), v67, v68 + ); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(v69, v59); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + v29 = v1; + v30 = v2; + v31 = v3; + } else { + v29 = v1; + v30 = v2; + v31 = v3; + }; + }; + } + wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); +#endif // __DAV_VEC__ + + ptoas_auto_sync_tail(PTOAutoSyncTailMode::kBarrierAll); + return; +} + +// --- Kernel entry point --- +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack tensor: all_cur_li__iter_v1 + __gm__ Tensor *all_cur_li__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ float *all_cur_li__iter_v1 = reinterpret_cast<__gm__ float *>(all_cur_li__iter_v1_tensor->buffer.addr) + + all_cur_li__iter_v1_tensor->start_offset; + + // Unpack tensor: all_cur_mi__iter_v1 + __gm__ Tensor *all_cur_mi__iter_v1_tensor = 
reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *all_cur_mi__iter_v1 = reinterpret_cast<__gm__ float *>(all_cur_mi__iter_v1_tensor->buffer.addr) + + all_cur_mi__iter_v1_tensor->start_offset; + + // Unpack tensor: all_exp_padded__iter_v1 + __gm__ Tensor *all_exp_padded__iter_v1_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ bfloat16_t *all_exp_padded__iter_v1 = + reinterpret_cast<__gm__ bfloat16_t *>(all_exp_padded__iter_v1_tensor->buffer.addr) + + all_exp_padded__iter_v1_tensor->start_offset; + + // Unpack tensor: all_raw_scores__rv_v2 + __gm__ Tensor *all_raw_scores__rv_v2_tensor = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ float *all_raw_scores__rv_v2 = reinterpret_cast<__gm__ float *>(all_raw_scores__rv_v2_tensor->buffer.addr) + + all_raw_scores__rv_v2_tensor->start_offset; + + // Unpack scalar: sb_chunk__idx_v0 + union { + uint64_t u64; + int64_t val; + } sb_chunk__idx_v0_conv; + sb_chunk__idx_v0_conv.u64 = args[4]; + int64_t sb_chunk__idx_v0 = sb_chunk__idx_v0_conv.val; + + // Unpack scalar: ctx_blocks__ssa_v0 + union { + uint64_t u64; + int64_t val; + } ctx_blocks__ssa_v0_conv; + ctx_blocks__ssa_v0_conv.u64 = args[5]; + int64_t ctx_blocks__ssa_v0 = ctx_blocks__ssa_v0_conv.val; + + // Unpack scalar: ctx_len__ssa_v0 + union { + uint64_t u64; + int32_t val; + } ctx_len__ssa_v0_conv; + ctx_len__ssa_v0_conv.u64 = args[6]; + int32_t ctx_len__ssa_v0 = ctx_len__ssa_v0_conv.val; + + // Forward to ptoas-generated function + softmax( + all_cur_li__iter_v1, all_cur_mi__iter_v1, all_exp_padded__iter_v1, all_raw_scores__rv_v2, sb_chunk__idx_v0, + ctx_blocks__ssa_v0, ctx_len__ssa_v0 + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/orchestration/qwen3_decode.cpp b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/orchestration/qwen3_decode.cpp new file mode 100644 index 000000000..c4899e663 --- /dev/null +++ 
b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/kernels/orchestration/qwen3_decode.cpp @@ -0,0 +1,455 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Orchestration Function: qwen3_decode +// Generated by PyPTO IR Compiler + +#include "runtime.h" +#include + +#include +#include +#include + +#include "pto_orchestration_api.h" + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 20, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + // External tensors + const Tensor &ext_hidden_states = orch_args.tensor(0).ref(); + const Tensor &ext_input_rms_weight = orch_args.tensor(1).ref(); + const Tensor &ext_wq = orch_args.tensor(2).ref(); + const Tensor &ext_wk = orch_args.tensor(3).ref(); + const Tensor &ext_wv = orch_args.tensor(4).ref(); + const Tensor &ext_q_norm_weight = orch_args.tensor(5).ref(); + const Tensor &ext_k_norm_weight = orch_args.tensor(6).ref(); + const Tensor &ext_seq_lens = orch_args.tensor(7).ref(); + const Tensor &ext_block_table = orch_args.tensor(8).ref(); + const Tensor &ext_slot_mapping = orch_args.tensor(9).ref(); 
+ const Tensor &ext_rope_cos = orch_args.tensor(10).ref(); + const Tensor &ext_rope_sin = orch_args.tensor(11).ref(); + const Tensor &ext_k_cache = orch_args.tensor(12).ref(); + const Tensor &ext_v_cache = orch_args.tensor(13).ref(); + const Tensor &ext_wo = orch_args.tensor(14).ref(); + const Tensor &ext_post_rms_weight = orch_args.tensor(15).ref(); + const Tensor &ext_w_gate = orch_args.tensor(16).ref(); + const Tensor &ext_w_up = orch_args.tensor(17).ref(); + const Tensor &ext_w_down = orch_args.tensor(18).ref(); + const Tensor &ext_out = orch_args.tensor(19).ref(); + + PTO2_SCOPE() { + uint32_t current_hidden_ci_shapes[2] = {16, 5120}; + TensorCreateInfo current_hidden_ci(current_hidden_ci_shapes, 2, DataType::BFLOAT16); + uint32_t next_hidden_ci_shapes[2] = {16, 5120}; + TensorCreateInfo next_hidden_ci(next_hidden_ci_shapes, 2, DataType::BFLOAT16); + uint32_t q_proj_ci_shapes[2] = {16, 5120}; + TensorCreateInfo q_proj_ci(q_proj_ci_shapes, 2, DataType::FLOAT32); + uint32_t k_proj_ci_shapes[2] = {16, 1024}; + TensorCreateInfo k_proj_ci(k_proj_ci_shapes, 2, DataType::FLOAT32); + uint32_t v_proj_ci_shapes[2] = {16, 1024}; + TensorCreateInfo v_proj_ci(v_proj_ci_shapes, 2, DataType::FLOAT32); + uint32_t q_proj_norm_ci_shapes[2] = {16, 5120}; + TensorCreateInfo q_proj_norm_ci(q_proj_norm_ci_shapes, 2, DataType::FLOAT32); + uint32_t k_proj_norm_ci_shapes[2] = {16, 1024}; + TensorCreateInfo k_proj_norm_ci(k_proj_norm_ci_shapes, 2, DataType::FLOAT32); + uint32_t attn_out_ci_shapes[2] = {16, 5120}; + TensorCreateInfo attn_out_ci(attn_out_ci_shapes, 2, DataType::BFLOAT16); + uint32_t all_q_padded_ci_shapes[2] = {2048, 128}; + TensorCreateInfo all_q_padded_ci(all_q_padded_ci_shapes, 2, DataType::BFLOAT16); + TaskOutputTensors alloc_0 = alloc_tensors( + current_hidden_ci, next_hidden_ci, q_proj_ci, k_proj_ci, v_proj_ci, q_proj_norm_ci, k_proj_norm_ci, + attn_out_ci, all_q_padded_ci + ); + const Tensor &current_hidden = alloc_0.get_ref(0); + const Tensor &next_hidden = 
alloc_0.get_ref(1); + const Tensor &q_proj = alloc_0.get_ref(2); + const Tensor &k_proj = alloc_0.get_ref(3); + const Tensor &v_proj = alloc_0.get_ref(4); + const Tensor &q_proj_norm = alloc_0.get_ref(5); + const Tensor &k_proj_norm = alloc_0.get_ref(6); + const Tensor &attn_out = alloc_0.get_ref(7); + const Tensor &all_q_padded = alloc_0.get_ref(8); + int64_t user_batch = (int64_t)orch_args.tensor(0).ref().shapes[0]; + int64_t batch_padded = (((user_batch + 15) / 16) * 16); + for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) { + PTO2_SCOPE() { + int64_t cur_valid = std::min((user_batch - b0), 16); + + // Task 0: copy_hidden + L0TaskArgs params_t0; + params_t0.add_output(current_hidden); + params_t0.add_input(ext_hidden_states); + params_t0.add_scalar(b0); + params_t0.add_scalar(cur_valid); + rt_submit_aiv_task(0, params_t0); + const Tensor &current_hidden__rv_v4 = current_hidden; + } + } + for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) { + PTO2_SCOPE() { + uint32_t normed_tile_ci_shapes[2] = {16, 5120}; + TensorCreateInfo normed_tile_ci(normed_tile_ci_shapes, 2, DataType::BFLOAT16); + TaskOutputTensors alloc_1 = alloc_tensors(normed_tile_ci); + const Tensor &normed_tile = alloc_1.get_ref(0); + int64_t cur_valid__ssa_v1 = std::min((user_batch - b0), 16); + + // Task 1: rmsnorm + L0TaskArgs params_t1; + params_t1.add_input(current_hidden); + params_t1.add_output(normed_tile); + params_t1.add_input(ext_input_rms_weight); + params_t1.add_scalar(b0); + params_t1.add_scalar(cur_valid__ssa_v1); + rt_submit_aiv_task(1, params_t1); + const Tensor &normed_tile__rv_v2 = normed_tile; + for (int64_t ob_chunk = 0; ob_chunk < 80; ob_chunk += 4) { + PTO2_SCOPE() { + // Task 2: q_proj + L0TaskArgs params_t2; + params_t2.add_output(q_proj); + params_t2.add_input(normed_tile__rv_v2); + params_t2.add_input(ext_wq); + params_t2.add_scalar(ob_chunk); + params_t2.add_scalar(b0); + rt_submit_aic_task(2, params_t2); + const Tensor &q_proj__rv_v6 = q_proj; + } + } + for (int64_t ob_chunk = 0; 
ob_chunk < 16; ob_chunk += 4) { + PTO2_SCOPE() { + // Task 3: kv_proj + L0TaskArgs params_t3; + params_t3.add_output(k_proj); + params_t3.add_output(v_proj); + params_t3.add_input(normed_tile__rv_v2); + params_t3.add_input(ext_wk); + params_t3.add_input(ext_wv); + params_t3.add_scalar(ob_chunk); + params_t3.add_scalar(b0); + rt_submit_aic_task(3, params_t3); + const Tensor &k_proj__rv_v6 = k_proj; + const Tensor &v_proj__rv_v6 = v_proj; + } + } + } + } + for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) { + PTO2_SCOPE() { + // Task 4: qk_norm + L0TaskArgs params_t4; + params_t4.add_output(q_proj_norm); + params_t4.add_input(q_proj); + params_t4.add_input(ext_q_norm_weight); + params_t4.add_output(k_proj_norm); + params_t4.add_input(k_proj); + params_t4.add_input(ext_k_norm_weight); + params_t4.add_scalar(b0); + rt_submit_aiv_task(4, params_t4); + const Tensor &q_proj_norm__rv_v4 = q_proj_norm; + const Tensor &k_proj_norm__rv_v4 = k_proj_norm; + } + } + + // Task 5: q_pad + L0TaskArgs params_t5; + params_t5.add_output(all_q_padded); + rt_submit_aiv_task(5, params_t5); + const Tensor &all_q_padded__rv_v2 = all_q_padded; + for (int64_t b = 0; b < user_batch; b += 1) { + PTO2_SCOPE() { + uint32_t attn_row_padded_ci_shapes[2] = {1, 16384}; + TensorCreateInfo attn_row_padded_ci(attn_row_padded_ci_shapes, 2, DataType::BFLOAT16); + uint32_t all_raw_scores_ci_shapes[2] = {256, 256}; + TensorCreateInfo all_raw_scores_ci(all_raw_scores_ci_shapes, 2, DataType::FLOAT32); + uint32_t all_exp_padded_ci_shapes[2] = {256, 256}; + TensorCreateInfo all_exp_padded_ci(all_exp_padded_ci_shapes, 2, DataType::BFLOAT16); + uint32_t all_oi_tmp_ci_shapes[2] = {256, 128}; + TensorCreateInfo all_oi_tmp_ci(all_oi_tmp_ci_shapes, 2, DataType::FLOAT32); + uint32_t all_cur_mi_ci_shapes[2] = {256, 1}; + TensorCreateInfo all_cur_mi_ci(all_cur_mi_ci_shapes, 2, DataType::FLOAT32); + uint32_t all_cur_li_ci_shapes[2] = {256, 1}; + TensorCreateInfo all_cur_li_ci(all_cur_li_ci_shapes, 2, DataType::FLOAT32); 
+ TaskOutputTensors alloc_2 = alloc_tensors( + attn_row_padded_ci, all_raw_scores_ci, all_exp_padded_ci, all_oi_tmp_ci, all_cur_mi_ci, + all_cur_li_ci + ); + const Tensor &attn_row_padded = alloc_2.get_ref(0); + const Tensor &all_raw_scores = alloc_2.get_ref(1); + const Tensor &all_exp_padded = alloc_2.get_ref(2); + const Tensor &all_oi_tmp = alloc_2.get_ref(3); + const Tensor &all_cur_mi = alloc_2.get_ref(4); + const Tensor &all_cur_li = alloc_2.get_ref(5); + size_t idx_ctx_len = b; + int32_t ctx_len = static_cast(orch_args.tensor(7).ref().data_as())[idx_ctx_len]; + int64_t pos = (static_cast(ctx_len) - 1); + int64_t ctx_blocks = ((static_cast(ctx_len) + 255) / 256); + int64_t block_table_base = (b * 2); + size_t idx_slot = b; + int32_t slot = static_cast(orch_args.tensor(9).ref().data_as())[idx_slot]; + int64_t slot_block = (static_cast(slot) / 256); + int64_t slot_offset = (static_cast(slot) - (slot_block * 256)); + uint32_t cos_row_shapes[2] = {1, 128}; + uint32_t cos_row_offsets[2] = {static_cast(pos), 0}; + Tensor cos_row = ext_rope_cos.view(cos_row_shapes, cos_row_offsets); + uint32_t sin_row_shapes[2] = {1, 128}; + uint32_t sin_row_offsets[2] = {static_cast(pos), 0}; + Tensor sin_row = ext_rope_sin.view(sin_row_shapes, sin_row_offsets); + uint32_t cos_lo_shapes[2] = {1, 64}; + uint32_t cos_lo_offsets[2] = {0, 0}; + Tensor cos_lo = cos_row.view(cos_lo_shapes, cos_lo_offsets); + uint32_t cos_hi_shapes[2] = {1, 64}; + uint32_t cos_hi_offsets[2] = {0, 64}; + Tensor cos_hi = cos_row.view(cos_hi_shapes, cos_hi_offsets); + uint32_t sin_lo_shapes[2] = {1, 64}; + uint32_t sin_lo_offsets[2] = {0, 0}; + Tensor sin_lo = sin_row.view(sin_lo_shapes, sin_lo_offsets); + uint32_t sin_hi_shapes[2] = {1, 64}; + uint32_t sin_hi_offsets[2] = {0, 64}; + Tensor sin_hi = sin_row.view(sin_hi_shapes, sin_hi_offsets); + for (int64_t ki_chunk = 0; ki_chunk < 8; ki_chunk += 8) { + PTO2_SCOPE() { + // Task 6: rope_kv_cache + L0TaskArgs params_t6; + 
params_t6.add_inout(all_q_padded__rv_v2); + params_t6.add_output(ext_k_cache); + params_t6.add_output(ext_v_cache); + params_t6.add_input(k_proj_norm); + params_t6.add_input(cos_lo); + params_t6.add_input(sin_lo); + params_t6.add_input(cos_hi); + params_t6.add_input(sin_hi); + params_t6.add_input(v_proj); + params_t6.add_input(q_proj_norm); + params_t6.add_scalar(ki_chunk); + params_t6.add_scalar(slot_block); + params_t6.add_scalar(slot_offset); + params_t6.add_scalar(b); + rt_submit_aiv_task(6, params_t6); + const Tensor &all_q_padded__rv_v9 = all_q_padded__rv_v2; + const Tensor &k_cache__rv_v8 = ext_k_cache; + const Tensor &v_cache__rv_v8 = ext_v_cache; + } + } + uint32_t attn_row_shapes[2] = {1, 5120}; + uint32_t attn_row_offsets[2] = {static_cast(b), 0}; + Tensor attn_row = attn_out.view(attn_row_shapes, attn_row_offsets); + for (int64_t sb_chunk = 0; sb_chunk < ctx_blocks; sb_chunk += 64) { + PTO2_SCOPE() { + // Task 7: qk_matmul + L0TaskArgs params_t7; + params_t7.add_output(all_raw_scores); + params_t7.add_input(all_q_padded__rv_v2); + params_t7.add_input(ext_block_table); + params_t7.add_input(ext_k_cache); + params_t7.add_scalar(b); + params_t7.add_scalar(sb_chunk); + params_t7.add_scalar(ctx_blocks); + params_t7.add_scalar(block_table_base); + rt_submit_aic_task(7, params_t7); + const Tensor &all_raw_scores__rv_v4 = all_raw_scores; + } + } + for (int64_t sb_chunk = 0; sb_chunk < ctx_blocks; sb_chunk += 64) { + PTO2_SCOPE() { + // Task 8: softmax + L0TaskArgs params_t8; + params_t8.add_output(all_cur_li); + params_t8.add_output(all_cur_mi); + params_t8.add_output(all_exp_padded); + params_t8.add_input(all_raw_scores); + params_t8.add_scalar(sb_chunk); + params_t8.add_scalar(ctx_blocks); + params_t8.add_scalar(ctx_len); + rt_submit_aiv_task(8, params_t8); + const Tensor &all_cur_li__rv_v4 = all_cur_li; + const Tensor &all_cur_mi__rv_v4 = all_cur_mi; + const Tensor &all_exp_padded__rv_v4 = all_exp_padded; + } + } + for (int64_t sb_chunk = 0; sb_chunk < 
ctx_blocks; sb_chunk += 64) { + PTO2_SCOPE() { + // Task 9: sv_matmul + L0TaskArgs params_t9; + params_t9.add_output(all_oi_tmp); + params_t9.add_input(ext_block_table); + params_t9.add_input(all_exp_padded); + params_t9.add_input(ext_v_cache); + params_t9.add_scalar(sb_chunk); + params_t9.add_scalar(ctx_blocks); + params_t9.add_scalar(block_table_base); + rt_submit_aic_task(9, params_t9); + const Tensor &all_oi_tmp__rv_v4 = all_oi_tmp; + } + } + + // Task 10: online_softmax + L0TaskArgs params_t10; + params_t10.add_output(attn_row_padded); + params_t10.add_input(all_oi_tmp); + params_t10.add_input(all_cur_mi); + params_t10.add_input(all_cur_li); + params_t10.add_scalar(ctx_blocks); + rt_submit_aiv_task(10, params_t10); + const Tensor &attn_row_padded__rv_v2 = attn_row_padded; + + // Task 11: attention_writeback + L0TaskArgs params_t11; + params_t11.add_output(attn_row); + params_t11.add_input(attn_row_padded__rv_v2); + rt_submit_aiv_task(11, params_t11); + } + } + for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) { + PTO2_SCOPE() { + uint32_t resid1_tile_ci_shapes[2] = {16, 5120}; + TensorCreateInfo resid1_tile_ci(resid1_tile_ci_shapes, 2, DataType::FLOAT32); + uint32_t post_norm_tile_ci_shapes[2] = {16, 5120}; + TensorCreateInfo post_norm_tile_ci(post_norm_tile_ci_shapes, 2, DataType::BFLOAT16); + uint32_t mlp_tile_ci_shapes[2] = {16, 17408}; + TensorCreateInfo mlp_tile_ci(mlp_tile_ci_shapes, 2, DataType::BFLOAT16); + TaskOutputTensors alloc_3 = alloc_tensors(resid1_tile_ci, post_norm_tile_ci, mlp_tile_ci); + const Tensor &resid1_tile = alloc_3.get_ref(0); + const Tensor &post_norm_tile = alloc_3.get_ref(1); + const Tensor &mlp_tile = alloc_3.get_ref(2); + int64_t cur_valid__ssa_v2 = std::min((user_batch - b0), 16); + for (int64_t ob = 0; ob < 80; ob += 1) { + PTO2_SCOPE() { + uint32_t ret0__out_ci_shapes[2] = {16, 64}; + TensorCreateInfo ret0__out_ci(ret0__out_ci_shapes, 2, DataType::FLOAT32); + TaskOutputTensors alloc_4 = alloc_tensors(ret0__out_ci); + const 
Tensor &ret0__out = alloc_4.get_ref(0); + int64_t o0 = (ob * 64); + + // Task 12: out_proj + L0TaskArgs params_t12; + params_t12.add_input(attn_out); + params_t12.add_input(ext_wo); + params_t12.add_inout(ret0__out); + params_t12.add_scalar(b0); + params_t12.add_scalar(o0); + rt_submit_aic_task(12, params_t12); + const Tensor &o_acc = ret0__out; + + // Task 13: out_proj_residual + L0TaskArgs params_t13; + params_t13.add_input(current_hidden); + params_t13.add_input(o_acc); + params_t13.add_inout(resid1_tile); + params_t13.add_scalar(b0); + params_t13.add_scalar(o0); + params_t13.add_scalar(cur_valid__ssa_v2); + rt_submit_aiv_task(13, params_t13); + const Tensor &resid1_tile__ssa_v3 = resid1_tile; + } + } + + // Task 14: post_rmsnorm + L0TaskArgs params_t14; + params_t14.add_input(resid1_tile); + params_t14.add_output(post_norm_tile); + params_t14.add_input(ext_post_rms_weight); + rt_submit_aiv_task(14, params_t14); + const Tensor &post_norm_tile__rv_v2 = post_norm_tile; + for (int64_t ob = 0; ob < 68; ob += 1) { + PTO2_SCOPE() { + uint32_t ret0__out_1_ci_shapes[2] = {16, 256}; + TensorCreateInfo ret0__out_1_ci(ret0__out_1_ci_shapes, 2, DataType::FLOAT32); + uint32_t ret0__out_2_ci_shapes[2] = {16, 256}; + TensorCreateInfo ret0__out_2_ci(ret0__out_2_ci_shapes, 2, DataType::FLOAT32); + TaskOutputTensors alloc_5 = alloc_tensors(ret0__out_1_ci, ret0__out_2_ci); + const Tensor &ret0__out_1 = alloc_5.get_ref(0); + const Tensor &ret0__out_2 = alloc_5.get_ref(1); + int64_t o0__ssa_v1 = (ob * 256); + + // Task 15: gate_proj + L0TaskArgs params_t15; + params_t15.add_input(post_norm_tile__rv_v2); + params_t15.add_input(ext_w_gate); + params_t15.add_inout(ret0__out_1); + params_t15.add_scalar(o0__ssa_v1); + rt_submit_aic_task(15, params_t15); + const Tensor &gate_acc = ret0__out_1; + + // Task 16: up_proj + L0TaskArgs params_t16; + params_t16.add_input(post_norm_tile__rv_v2); + params_t16.add_input(ext_w_up); + params_t16.add_inout(ret0__out_2); + 
params_t16.add_scalar(o0__ssa_v1); + rt_submit_aic_task(16, params_t16); + const Tensor &up_acc = ret0__out_2; + + // Task 17: silu + L0TaskArgs params_t17; + params_t17.add_input(gate_acc); + params_t17.add_input(up_acc); + params_t17.add_inout(mlp_tile); + params_t17.add_scalar(o0__ssa_v1); + rt_submit_aiv_task(17, params_t17); + const Tensor &mlp_tile__ssa_v3 = mlp_tile; + } + } + for (int64_t dob = 0; dob < 40; dob += 1) { + PTO2_SCOPE() { + uint32_t fp32_chunk_gm_ci_shapes[2] = {16, 128}; + TensorCreateInfo fp32_chunk_gm_ci(fp32_chunk_gm_ci_shapes, 2, DataType::FLOAT32); + TaskOutputTensors alloc_6 = alloc_tensors(fp32_chunk_gm_ci); + const Tensor &fp32_chunk_gm = alloc_6.get_ref(0); + int64_t d0 = (dob * 128); + + // Task 18: down_proj + L0TaskArgs params_t18; + params_t18.add_input(mlp_tile); + params_t18.add_input(ext_w_down); + params_t18.add_inout(fp32_chunk_gm); + params_t18.add_scalar(d0); + rt_submit_aic_task(18, params_t18); + const Tensor &fp32_chunk_gm__ssa_v1 = fp32_chunk_gm; + + // Task 19: down_proj_residual + L0TaskArgs params_t19; + params_t19.add_input(fp32_chunk_gm__ssa_v1); + params_t19.add_input(resid1_tile); + params_t19.add_inout(next_hidden); + params_t19.add_scalar(d0); + params_t19.add_scalar(b0); + rt_submit_aiv_task(19, params_t19); + const Tensor &next_hidden__ssa_v5 = next_hidden; + } + } + } + } + Tensor current_hidden__ssa_v8 = next_hidden; + for (int64_t b0 = 0; b0 < batch_padded; b0 += 16) { + PTO2_SCOPE() { + int64_t cur_valid__ssa_v3 = std::min((user_batch - b0), 16); + + // Task 20: copy_out + L0TaskArgs params_t20; + params_t20.add_output(ext_out); + params_t20.add_input(current_hidden__ssa_v8); + params_t20.add_scalar(b0); + params_t20.add_scalar(cur_valid__ssa_v3); + rt_submit_aiv_task(20, params_t20); + const Tensor &out = ext_out; + } + } + } +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/test_qwen3_14b_decode.py 
b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/test_qwen3_14b_decode.py new file mode 100644 index 000000000..8ef7c8b34 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/qwen3_14b_decode/test_qwen3_14b_decode.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Qwen3-14B single-layer decode — fully_distributed_within_core SceneTestCase. + +A single fused decode step (21 kernels: 8 AIC + 13 AIV) covering +RMSNorm → QKV → per-head Q/K RMS → RoPE → paged KV-cache write → paged +attention (online softmax) → output projection + residual → post-RMSNorm +→ SwiGLU FFN → down-proj + residual, against the production Qwen3-14B +hidden/intermediate/head shapes (HIDDEN=5120, INTERMEDIATE=17408, +NUM_HEADS=40 / NUM_KV_HEADS=8, HEAD_DIM=128, BLOCK_SIZE=256). 
+""" + +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, scene_test +from simpler_setup.goldens.qwen3_14b_decode import ( + compute_golden as _decode_golden, +) +from simpler_setup.goldens.qwen3_14b_decode import ( + generate_inputs as _decode_generate_inputs, +) + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestQwen314BDecode(SceneTestCase): + """Single-layer Qwen3-14B decode against a torch reference.""" + + # Bf16 deep-transformer drift over 21 kernels in series — paged attention + # plus FFN accumulate, so values O(10) settle in the ~1e-1 absolute range. + RTOL = 5e-2 + ATOL = 1e-1 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/qwen3_decode.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [ + D.IN, # 0 hidden_states + D.IN, # 1 input_rms_weight + D.IN, # 2 wq + D.IN, # 3 wk + D.IN, # 4 wv + D.IN, # 5 q_norm_weight + D.IN, # 6 k_norm_weight + D.IN, # 7 seq_lens + D.IN, # 8 block_table + D.IN, # 9 slot_mapping + D.IN, # 10 rope_cos + D.IN, # 11 rope_sin + D.INOUT, # 12 k_cache + D.INOUT, # 13 v_cache + D.IN, # 14 wo + D.IN, # 15 post_rms_weight + D.IN, # 16 w_gate + D.IN, # 17 w_up + D.IN, # 18 w_down + D.OUT, # 19 out + ], + }, + "incores": [ + { + "func_id": 0, + "name": "copy_hidden", + "source": "kernels/aiv/copy_hidden.cpp", + "core_type": "aiv", + "signature": [D.OUT, D.IN, D.SCALAR, D.SCALAR], + }, + { + "func_id": 1, + "name": "rmsnorm", + "source": "kernels/aiv/rmsnorm.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.IN, D.SCALAR, D.SCALAR], + }, + { + "func_id": 2, + "name": "q_proj", + "source": "kernels/aic/q_proj.cpp", + "core_type": "aic", + "signature": [D.OUT, D.IN, D.IN, D.SCALAR, D.SCALAR], + }, + { + "func_id": 3, + "name": "kv_proj", + "source": "kernels/aic/kv_proj.cpp", + "core_type": "aic", + "signature": [D.OUT, D.OUT, D.IN, D.IN, D.IN, D.SCALAR, D.SCALAR], + }, + { + "func_id": 4, + "name": "qk_norm", + 
"source": "kernels/aiv/qk_norm.cpp", + "core_type": "aiv", + "signature": [D.OUT, D.IN, D.IN, D.OUT, D.IN, D.IN, D.SCALAR], + }, + { + "func_id": 5, + "name": "q_pad", + "source": "kernels/aiv/q_pad.cpp", + "core_type": "aiv", + "signature": [D.OUT], + }, + { + "func_id": 6, + "name": "rope_kv_cache", + "source": "kernels/aiv/rope_kv_cache.cpp", + "core_type": "aiv", + "signature": [ + D.INOUT, + D.OUT, + D.OUT, + D.IN, + D.IN, + D.IN, + D.IN, + D.IN, + D.IN, + D.IN, + D.SCALAR, + D.SCALAR, + D.SCALAR, + D.SCALAR, + ], + }, + { + "func_id": 7, + "name": "qk_matmul", + "source": "kernels/aic/qk_matmul.cpp", + "core_type": "aic", + "signature": [D.OUT, D.IN, D.IN, D.IN, D.SCALAR, D.SCALAR, D.SCALAR, D.SCALAR], + }, + { + "func_id": 8, + "name": "softmax", + "source": "kernels/aiv/softmax.cpp", + "core_type": "aiv", + "signature": [D.OUT, D.OUT, D.OUT, D.IN, D.SCALAR, D.SCALAR, D.SCALAR], + }, + { + "func_id": 9, + "name": "sv_matmul", + "source": "kernels/aic/sv_matmul.cpp", + "core_type": "aic", + "signature": [D.OUT, D.IN, D.IN, D.IN, D.SCALAR, D.SCALAR, D.SCALAR], + }, + { + "func_id": 10, + "name": "online_softmax", + "source": "kernels/aiv/online_softmax.cpp", + "core_type": "aiv", + "signature": [D.OUT, D.IN, D.IN, D.IN, D.SCALAR], + }, + { + "func_id": 11, + "name": "attention_writeback", + "source": "kernels/aiv/attention_writeback.cpp", + "core_type": "aiv", + "signature": [D.OUT, D.IN], + }, + { + "func_id": 12, + "name": "out_proj", + "source": "kernels/aic/out_proj.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.INOUT, D.SCALAR, D.SCALAR], + }, + { + "func_id": 13, + "name": "out_proj_residual", + "source": "kernels/aiv/out_proj_residual.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.INOUT, D.SCALAR, D.SCALAR, D.SCALAR], + }, + { + "func_id": 14, + "name": "post_rmsnorm", + "source": "kernels/aiv/post_rmsnorm.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.IN], + }, + { + "func_id": 15, + "name": "gate_proj", + "source": 
"kernels/aic/gate_proj.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.INOUT, D.SCALAR], + }, + { + "func_id": 16, + "name": "up_proj", + "source": "kernels/aic/up_proj.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.INOUT, D.SCALAR], + }, + { + "func_id": 17, + "name": "silu", + "source": "kernels/aiv/silu.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.INOUT, D.SCALAR], + }, + { + "func_id": 18, + "name": "down_proj", + "source": "kernels/aic/down_proj.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.INOUT, D.SCALAR], + }, + { + "func_id": 19, + "name": "down_proj_residual", + "source": "kernels/aiv/down_proj_residual.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.INOUT, D.SCALAR, D.SCALAR], + }, + { + "func_id": 20, + "name": "copy_out", + "source": "kernels/aiv/copy_out.cpp", + "core_type": "aiv", + "signature": [D.OUT, D.IN, D.SCALAR, D.SCALAR], + }, + ], + } + + CASES = [ + { + "name": "SmallSingle", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"user_batch": 1, "seq_len": 8}, + }, + ] + + def generate_args(self, params): + return _decode_generate_inputs(params["user_batch"], params["seq_len"]) + + def compute_golden(self, args, params): + _decode_golden(args, params["user_batch"], params["seq_len"]) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_add.cpp b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..8a119554d --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_add.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Element-wise Tensor Addition Kernel + * + * Implements: out[i] = src0[i] + src1[i] + * + * This kernel performs element-wise addition of two tensors. It's compiled + * separately as a standalone kernel and linked with the dispatcher using + * function pointers, demonstrating the separation pattern used in production + * systems where kernel binaries are loaded dynamically. + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +/** + * Element-wise addition kernel implementation + * + * Unified signature: all arguments passed via int64_t array + * @param args Argument array: + * args[0] = src0 pointer (first input tensor) + * args[1] = src1 pointer (second input tensor) + * args[2] = out pointer (output tensor) + * args[3] = size (number of elements) + */ +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack arguments (Tensor* pointers from runtime) + __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float *src1 = reinterpret_cast<__gm__ float 
*>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + // Configuration: float, 128, 128, 128, 128 + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + pipe_sync(); +} diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_noop.cpp b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_noop.cpp new file mode 100644 index 000000000..8187197c4 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/aiv/kernel_noop.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * No-op Kernel + * + * Empty kernel used to trigger runtime allocation for tensors passed + * as OUTPUT/INOUT via add_inout(). The runtime allocates HeapRing memory + * and writes initial values before dispatching this task; the kernel + * itself does not read or modify any data. + */ + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { (void)args; } diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp new file mode 100644 index 000000000..a0a8ed7d8 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp @@ -0,0 +1,265 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Scalar Data Dependency Test Orchestration + * + * End-to-end test for get_tensor_data, set_tensor_data, and add_inout + * with runtime-created outputs and initial value support. + * + * Flow: + * 1. c = a + b (kernel_add, runtime-created tensor) + * 2. get_tensor_data(c, {0}) → check[0] = 2.0 + * 3. get_tensor_data(c, {100}) → check[1] = 102.0 + * 4. scalar_tensor = alloc_tensors(TensorCreateInfo with initial value 77.0f) + * 5. get_tensor_data(scalar_tensor, {0}) → check[2] = 77.0 + * 6. add_inout(scalar_tensor) (INOUT path), submit noop + * 7. get_tensor_data(scalar_tensor, {0}) → check[3] = 77.0 + * 8. check[4] = 2.0 + 77.0 = 79.0 (orchestration arithmetic) + * 9. set_tensor_data(scalar_tensor, {0}, 42.0), get_tensor_data → check[5] = 42.0 + * 10. Orch set_tensor_data(d, {0}, 10.0) → kernel_add(d, a) → check[6] = 12.0 + * 11. WAW+WAR: kernel_add reads c → set_tensor_data(c, 88.0) auto-waits → check[7] = 88.0 + * 12. External WAR with OUTPUT: noop(ext_b as OUTPUT) → set_tensor_data(ext_b) → check[8] = 55.0 + * 13.
result = a + b (kernel_add, external output via add_output) + */ + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_ADD 0 +#define FUNC_NOOP 1 + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 4, // a, b, result, check + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + // External tensors from golden.py + const Tensor &ext_a = orch_args.tensor(0).ref(); + const Tensor &ext_b = orch_args.tensor(1).ref(); + const Tensor &ext_result = orch_args.tensor(2).ref(); + const Tensor &ext_check = orch_args.tensor(3).ref(); + + uint32_t SIZE = orch_args.tensor(0).ref().shapes[0]; + LOG_INFO_V0("scalar_data_test: SIZE=%u, check_size=%u", SIZE, orch_args.tensor(3).ref().shapes[0]); + + uint32_t inter_shapes[1] = {SIZE}; + TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32); + + // ========================================================= + // Step 1: c = a + b (runtime-created tensor, kernel_add) + // ========================================================= + L0TaskArgs params_c; + params_c.add_input(ext_a); + params_c.add_input(ext_b); + params_c.add_output(inter_ci); + TaskOutputTensors c_outs = rt_submit_aiv_task(FUNC_ADD, params_c); + const Tensor &c = c_outs.get_ref(0); + + // ========================================================= + // Step 2: get_tensor_data(c, {0}) → check[0] + // Tests TensorMap lookup + spin-wait for kernel completion + // ========================================================= + uint32_t idx[1] = {0}; + float c0_val = get_tensor_data(c, 1, idx); + LOG_INFO_V0("get_tensor_data(c, {0}) = %f (expected 2.0)", static_cast(c0_val)); + + uint32_t check_idx[1] = {0}; + set_tensor_data(ext_check, 1, check_idx, c0_val); + + //
========================================================= + // Step 3: get_tensor_data(c, {100}) → check[1] + // Tests flat offset calculation for non-zero index + // ========================================================= + idx[0] = 100; + float c100_val = get_tensor_data(c, 1, idx); + LOG_INFO_V0("get_tensor_data(c, {100}) = %f (expected 102.0)", static_cast(c100_val)); + + check_idx[0] = 1; + set_tensor_data(ext_check, 1, check_idx, c100_val); + + // ========================================================= + // Step 4: Runtime-created scalar output with initial value + // Runtime allocates HeapRing buffer, writes 77.0 to element [0] + // ========================================================= + uint32_t scalar_shapes[1] = {1}; + TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32); + scalar_ci.set_initial_value(77.0f); + TaskOutputTensors scalar_alloc_outs = alloc_tensors(scalar_ci); + const Tensor &scalar_tensor = scalar_alloc_outs.get_ref(0); + + // ========================================================= + // Step 5: get_tensor_data(scalar_tensor, {0}) → check[2] + // Verifies initial value was written correctly + // ========================================================= + idx[0] = 0; + float s0_val = get_tensor_data(scalar_tensor, 1, idx); + LOG_INFO_V0("get_tensor_data(scalar_tensor, {0}) after init = %f (expected 77.0)", static_cast(s0_val)); + + check_idx[0] = 2; + set_tensor_data(ext_check, 1, check_idx, s0_val); + + // ========================================================= + // Step 6: add_inout(scalar_tensor) second use → INOUT path + // Buffer already exists, so the noop just registers dependency + // ========================================================= + { + L0TaskArgs args; + args.add_inout(scalar_tensor); + rt_submit_aiv_task(FUNC_NOOP, args); + } + + // ========================================================= + // Step 7: get_tensor_data(scalar_tensor, {0}) → check[3] + // Value should be preserved (noop kernel didn't 
modify it) + // ========================================================= + float s1_val = get_tensor_data(scalar_tensor, 1, idx); + LOG_INFO_V0("get_tensor_data(scalar_tensor, {0}) after 2nd noop = %f (expected 77.0)", static_cast(s1_val)); + + check_idx[0] = 3; + set_tensor_data(ext_check, 1, check_idx, s1_val); + + // ========================================================= + // Step 8: set_tensor_data with orchestration-computed value → check[4] + // Tests set_tensor_data write + orchestration arithmetic + // ========================================================= + float combined = c0_val + s0_val; // 2.0 + 77.0 = 79.0 + LOG_INFO_V0( + "Orchestration arithmetic: %f + %f = %f", static_cast(c0_val), static_cast(s0_val), + static_cast(combined) + ); // NOLINT(whitespace/line_length) + + check_idx[0] = 4; + set_tensor_data(ext_check, 1, check_idx, combined); + + // ========================================================= + // Step 9: Orch set→get round-trip on internal tensor + // Validates that set_tensor_data writes are visible to get_tensor_data + // on the same tensor. Uses scalar_tensor (currently 77.0), overwrites to 42.0. 
+ // ========================================================= + set_tensor_data(scalar_tensor, 1, idx, 42.0f); + float rw_val = get_tensor_data(scalar_tensor, 1, idx); + LOG_INFO_V0("set_tensor_data→get_tensor_data round-trip = %f (expected 42.0)", static_cast(rw_val)); + + check_idx[0] = 5; + set_tensor_data(ext_check, 1, check_idx, rw_val); + + // ========================================================= + // Step 10: Orch→AICore RAW (set_tensor_data → kernel reads) + // Orchestration writes d[0]=10.0 via set_tensor_data, then + // kernel_add reads d as input: e[0] = d[0] + a[0] = 12.0 + // ========================================================= + TaskOutputTensors d_alloc_outs = alloc_tensors(inter_ci); + const Tensor &d = d_alloc_outs.get_ref(0); + + idx[0] = 0; + set_tensor_data(d, 1, idx, 10.0f); + + L0TaskArgs params_e; + params_e.add_input(d); + params_e.add_input(ext_a); + params_e.add_output(inter_ci); + TaskOutputTensors e_outs = rt_submit_aiv_task(FUNC_ADD, params_e); + const Tensor &e = e_outs.get_ref(0); + + float e0_val = get_tensor_data(e, 1, idx); + LOG_INFO_V0("Orch→AICore RAW: e[0] = %f (expected 12.0)", static_cast(e0_val)); + + check_idx[0] = 6; + set_tensor_data(ext_check, 1, check_idx, e0_val); + + // ========================================================= + // Step 11: WAW + WAR on internal tensor + // c was written by Step 1 (kernel_add, TensorMap has producer entry). + // Submit a new kernel that reads c as INPUT (creates consumer dep). + // Then set_tensor_data(c) — no manual get_tensor_data sync. + // set_tensor_data internally waits for: + // - WAW: producer (Step 1) COMPLETED + // - WAR: consumer (this kernel) done (fanout_refcount check) + // + // NOTE on external tensors: ext_a was read by Step 1 as INPUT, + // but TensorMap has no producer entry for ext_a (only consumers). + // set_tensor_data(ext_a) would NOT detect the reader — data race. 
+ // To ensure WAR safety on external tensors, use add_inout() + // instead of add_input() so TensorMap tracks the access chain. + // ========================================================= + { + L0TaskArgs args; + args.add_input(c); + args.add_input(ext_b); + args.add_output(inter_ci); + (void)rt_submit_aiv_task(FUNC_ADD, args); // NOLINT(readability/casting) + } + + // set_tensor_data auto-waits for producer + consumer before writing + idx[0] = 0; + set_tensor_data(c, 1, idx, 88.0f); + float waw_val = get_tensor_data(c, 1, idx); + LOG_INFO_V0("WAW+WAR: set_tensor_data(c, 88.0) after consumer = %f (expected 88.0)", static_cast(waw_val)); + + check_idx[0] = 7; + set_tensor_data(ext_check, 1, check_idx, waw_val); + + // ========================================================= + // Step 12: External tensor WAR — must use add_output or add_inout, not add_input + // + // For external tensors, using add_input() does NOT create a + // TensorMap entry. set_tensor_data would then write immediately + // without waiting for the reader kernel — a WAR data race. + // + // Using add_output() (or add_inout()) creates a TensorMap entry, + // enabling set_tensor_data to detect the producer via TensorMap lookup + // and wait for fanout_refcount (all consumers done). + // + // Here we submit noop with ext_b as write-only output (noop doesn't + // read data), then set_tensor_data overwrites ext_b[0] = 55.0. + // set_tensor_data auto-waits for the noop to complete. + // ========================================================= + { + L0TaskArgs args; + args.add_output(ext_b); // write-only: creates TensorMap entry (not add_input!) 
+ rt_submit_aiv_task(FUNC_NOOP, args); + } + + idx[0] = 0; + set_tensor_data(ext_b, 1, idx, 55.0f); + float ext_war_val = get_tensor_data(ext_b, 1, idx); + LOG_INFO_V0( + "External WAR (INOUT): set_tensor_data(ext_b, 55.0) = %f (expected 55.0)", static_cast(ext_war_val) + ); + + check_idx[0] = 8; + set_tensor_data(ext_check, 1, check_idx, ext_war_val); + + // Restore ext_b[0] for final result comparison + set_tensor_data(ext_b, 1, idx, 0.0f); + + // ========================================================= + // Step 13: result = a + b (external output via add_output, kernel_add) + // ========================================================= + { + L0TaskArgs args; + args.add_input(ext_a); + args.add_input(ext_b); + args.add_output(ext_result); + rt_submit_aiv_task(FUNC_ADD, args); + } + + LOG_INFO_V0("scalar_data_test: orchestration complete"); +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/scalar_data_test/test_scalar_data.py b/examples/a2a3/fully_distributed_within_core/scalar_data_test/test_scalar_data.py new file mode 100644 index 000000000..4cce2af1d --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/scalar_data_test/test_scalar_data.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Scalar data dependency test: GetTensorData, SetTensorData, add_inout. + +Tests orchestration-level data manipulation: scalar initialization, +Get/Set round-trips, WAW+WAR dependency auto-wait, and external tensor WAR. +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestScalarData(SceneTestCase): + """Scalar data dependency: Get/SetTensorData, add_inout with initial value.""" + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/scalar_data_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_noop.cpp", + "core_type": "aiv", + "signature": [], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {}, + }, + ] + + def generate_args(self, params): + SIZE = 128 * 128 + return TaskArgsBuilder( + Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)), + Tensor("b", torch.arange(SIZE, dtype=torch.float32)), + Tensor("result", torch.zeros(SIZE, dtype=torch.float32)), + Tensor("check", torch.zeros(10, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # result = a + b (computed by kernel_add) + args.result[:] = args.a + args.b + + # check values written by orchestration via SetTensorData + args.check[0] = 2.0 # GetTensorData(c, {0}): c = a + b, c[0] = 2.0+0.0 + args.check[1] = 102.0 # GetTensorData(c, {100}): c[100] = 2.0+100.0 + args.check[2] = 77.0 # runtime-created scalar output initialized to 77.0 + 
args.check[3] = 77.0 # second noop via add_inout preserves the value + args.check[4] = 79.0 # orchestration arithmetic: 2.0 + 77.0 + args.check[5] = 42.0 # Orch set->get round-trip: SetTensorData then GetTensorData + args.check[6] = 12.0 # Orch->AICore RAW: SetTensorData(d,10.0) + kernel_add(d,a) -> 10.0+2.0 + args.check[7] = 88.0 # WAW+WAR: kernel reads c, SetTensorData(c,88.0) auto-waits + args.check[8] = 55.0 # External WAR: noop(ext_b INOUT) -> SetTensorData(ext_b,55.0) auto-waits + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_consumer.cpp b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_consumer.cpp new file mode 100644 index 000000000..ef56d934e --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_consumer.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include + +#include "tensor.h" + +using namespace pto; + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *result_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + + __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset; + __gm__ float *result = reinterpret_cast<__gm__ float *>(result_tensor->buffer.addr) + result_tensor->start_offset; + + constexpr int kTotalRows = 128; + constexpr int kRows = 64; + constexpr int kCols = 128; + constexpr int kIters = kTotalRows / kRows; + using DynShapeDim5 = Shape<1, 1, 1, kRows, kCols>; + using DynStrideDim5 = Stride<1, 1, 1, kCols, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src_tile(kRows, kCols); + TileData result_tile(kRows, kCols); + TASSIGN(src_tile, 0x0); + TASSIGN(result_tile, 0x10000); + + constexpr int kChunkElems = kRows * kCols; + for (int iter = 0; iter < kIters; ++iter) { + GlobalData src_global(src + iter * kChunkElems); + GlobalData result_global(result + iter * kChunkElems); + TLOAD(src_tile, src_global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + TADDS(result_tile, src_tile, 1.0f); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + + TSTORE(result_global, result_tile); + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + } +} diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_sdma_tget_async.cpp 
b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_sdma_tget_async.cpp new file mode 100644 index 000000000..eb8b5aeb3 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/aiv/kernel_sdma_tget_async.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#ifndef __gm__ +#define __gm__ +#endif +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +#include + +#include "backend/sdma/sdma_completion_kernel.h" +#include "platform_comm/comm_context.h" +#include "pto_async_kernel_api.h" +#include "tensor.h" + +using namespace pto; + +template +static inline __aicore__ __gm__ T *comm_remote_ptr(__gm__ CommContext *ctx, __gm__ T *local_ptr, int peer_rank) { + uint64_t local_base = ctx->windowsIn[ctx->rankId]; + uint64_t offset = reinterpret_cast(local_ptr) - local_base; + return reinterpret_cast<__gm__ T *>(ctx->windowsIn[peer_rank] + offset); +} + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *in_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ CommContext *comm_ctx = reinterpret_cast<__gm__ CommContext 
*>(args[2]); + + __gm__ float *local_in = reinterpret_cast<__gm__ float *>(in_tensor->buffer.addr) + in_tensor->start_offset; + __gm__ float *local_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + int rank = static_cast(comm_ctx->rankId); + int nranks = static_cast(comm_ctx->rankNum); + if (nranks != 2 || comm_ctx->workSpace == 0) { + pipe_barrier(PIPE_ALL); + return; + } + int peer_rank = 1 - rank; + + constexpr int kElems = 128 * 128; + using FlatShape = Shape<1, 1, 1, 1, kElems>; + using FlatStride = Stride; + using GlobalData = GlobalTensor; + using ScratchTile = Tile; + + __gm__ float *remote_in = comm_remote_ptr(comm_ctx, local_in, peer_rank); + GlobalData remote_global(remote_in); + GlobalData local_global(local_out); + + ScratchTile scratch_tile; + TASSIGN(scratch_tile, 0x0); + + AsyncCtx async_ctx = get_async_ctx(args); + send_request_entry( + async_ctx, + SdmaTget(local_global, remote_global, scratch_tile, reinterpret_cast<__gm__ uint8_t *>(comm_ctx->workSpace)) + ); +} diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/orchestration/sdma_async_completion_orch.cpp b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/orchestration/sdma_async_completion_orch.cpp new file mode 100644 index 000000000..a33c96730 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/kernels/orchestration/sdma_async_completion_orch.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include + +#include "platform_comm/comm_context.h" +#include "pto_orchestration_api.h" + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig +sdma_async_completion_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{.expected_arg_count = 4}; +} + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + return sdma_async_completion_orchestration_config(orch_args); +} + +__attribute__((visibility("default"))) void sdma_async_completion_orchestration(const L2TaskArgs &orch_args) { + if (orch_args.tensor_count() + orch_args.scalar_count() != 4) { + LOG_ERROR("sdma_async_completion_demo: expected 4 args"); + return; + } + + const Tensor &input = orch_args.tensor(0).ref(); + const Tensor &out = orch_args.tensor(1).ref(); + const Tensor &result = orch_args.tensor(2).ref(); + auto *comm_ctx = reinterpret_cast(static_cast(orch_args.scalar(0))); + + L0TaskArgs producer_args; + producer_args.add_input(input); + producer_args.add_output(out); + producer_args.add_scalar(reinterpret_cast(comm_ctx)); + rt_submit_aiv_task(0, producer_args); + + L0TaskArgs consumer_args; + consumer_args.add_input(out); + consumer_args.add_output(result); + rt_submit_aiv_task(1, consumer_args); +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/test_sdma_async_completion_demo.py b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/test_sdma_async_completion_demo.py 
new file mode 100644 index 000000000..f727d3a72 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/sdma_async_completion_demo/test_sdma_async_completion_demo.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""SDMA deferred completion smoke test for onboard a2a3. + +Each rank stages its input inside the HCCL window. The deferred producer +TGET_ASYNCs the peer rank's input into local ``out`` and registers the PTO +AsyncEvent through ``defer_pto_async_event``. The consumer depends on the +producer output and writes ``result = out + 1``. Correct ``out`` and +``result`` therefore validate both the SDMA completion polling and the +deferred-release dependency path. 
+""" + +from __future__ import annotations + +import argparse +import os + +import pytest +import torch +from simpler.task_interface import ( + ArgDirection, + CallConfig, + ChipCallable, + CommBufferSpec, + CoreCallable, + DataType, + TaskArgs, + Tensor, + TensorArgType, +) +from simpler.worker import Worker + +from simpler_setup.elf_parser import extract_text_section +from simpler_setup.kernel_compiler import KernelCompiler +from simpler_setup.pto_isa import ensure_pto_isa_root +from simpler_setup.torch_interop import make_tensor_arg + +HERE = os.path.dirname(os.path.abspath(__file__)) +N = 128 * 128 +DTYPE_NBYTES = 4 + + +def parse_device_range(spec: str) -> list[int]: + if "," in spec: + return [int(x) for x in spec.split(",") if x] + if "-" in spec: + lo, hi = (int(x) for x in spec.split("-")) + return list(range(lo, hi + 1)) + return [int(spec)] + + +def build_chip_callable(platform: str, pto_isa_commit: str | None, clone_protocol: str) -> ChipCallable: + kc = KernelCompiler(platform=platform) + runtime = "fully_distributed_within_core" + pto_isa_root = ensure_pto_isa_root(commit=pto_isa_commit, clone_protocol=clone_protocol) + include_dirs = kc.get_orchestration_include_dirs(runtime) + extra_includes = list(include_dirs) + [str(kc.project_root / "src" / "common")] + + children = [] + for func_id, rel in [ + (0, "kernels/aiv/kernel_sdma_tget_async.cpp"), + (1, "kernels/aiv/kernel_consumer.cpp"), + ]: + kernel = kc.compile_incore( + source_path=os.path.join(HERE, rel), + core_type="aiv", + pto_isa_root=pto_isa_root, + extra_include_dirs=extra_includes, + ) + if not platform.endswith("sim"): + kernel = extract_text_section(kernel) + children.append( + ( + func_id, + CoreCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN], + binary=kernel, + ), + ) + ) + + orch = kc.compile_orchestration( + runtime_name=runtime, + source_path=os.path.join(HERE, "kernels/orchestration/sdma_async_completion_orch.cpp"), + 
extra_include_dirs=[str(kc.project_root / "src" / "common")], + ) + return ChipCallable.build( + signature=[ArgDirection.IN, ArgDirection.OUT, ArgDirection.OUT, ArgDirection.IN], + func_name="sdma_async_completion_orchestration", + binary=orch, + children=children, + ) + + +def run( + platform: str = "a2a3", + device_ids: list[int] | None = None, + pto_isa_commit: str | None = None, +) -> int: + if device_ids is None: + device_ids = [0, 1] + nranks = len(device_ids) + if nranks != 2: + raise ValueError(f"sdma_async_completion_demo needs exactly 2 devices, got {device_ids}") + if platform.endswith("sim"): + raise ValueError("sdma_async_completion_demo requires onboard a2a3 hardware") + + input_nbytes = N * DTYPE_NBYTES + window_size = max(input_nbytes, 4 * 1024) + + # `inputs` must live in shared memory: `orch.copy_to` stages each rank's + # data into its HCCL window from the forked chip child, which reads `src` + # out of its own address space. + inputs = [ + torch.tensor([float(rank * 1000 + (i % 251)) / 10.0 for i in range(N)], dtype=torch.float32).share_memory_() + for rank in range(nranks) + ] + out = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)] + result = [torch.zeros(N, dtype=torch.float32).share_memory_() for _ in range(nranks)] + + chip_callable = build_chip_callable(platform, pto_isa_commit, "https") + worker = Worker( + level=3, + platform=platform, + runtime="fully_distributed_within_core", + device_ids=device_ids, + num_sub_workers=0, + ) + chip_handle = worker.register(chip_callable) + try: + worker.init() + + def orch_fn(orch, _args, cfg): + with orch.allocate_domain( + name="default", + workers=list(range(nranks)), + window_size=window_size, + buffers=[ + CommBufferSpec(name="input_window", dtype="float32", count=N, nbytes=input_nbytes), + ], + ) as handle: + # Stage every rank's input window before submitting any kernel: + # each producer TGET_ASYNCs the *peer* rank's window, so all + # windows must hold real data 
before execution begins. + for rank in range(nranks): + orch.copy_to( + rank, + dst=handle[rank].buffer_ptrs["input_window"], + src=inputs[rank].data_ptr(), + size=input_nbytes, + ) + for rank in range(nranks): + domain = handle[rank] + args = TaskArgs() + args.add_tensor( + Tensor.make( + data=domain.buffer_ptrs["input_window"], + shapes=(N,), + dtype=DataType.FLOAT32, + child_memory=True, + ), + TensorArgType.INPUT, + ) + args.add_tensor(make_tensor_arg(out[rank]), TensorArgType.OUTPUT_EXISTING) + args.add_tensor(make_tensor_arg(result[rank]), TensorArgType.OUTPUT_EXISTING) + args.add_scalar(domain.device_ctx) + orch.submit_next_level(chip_handle, args, cfg, worker=rank) + + worker.run(orch_fn, args=None, config=CallConfig()) + + ok = True + for rank in range(nranks): + peer = 1 - rank + expected_out = inputs[peer] + expected_result = expected_out + 1.0 + max_out = float(torch.max(torch.abs(out[rank] - expected_out))) + max_result = float(torch.max(torch.abs(result[rank] - expected_result))) + print(f"[sdma_async_completion_demo] rank {rank}: max_out={max_out:.3e} max_result={max_result:.3e}") + ok = ok and max_out <= 1e-3 and max_result <= 1e-3 + return 0 if ok else 1 + finally: + worker.close() + + +@pytest.mark.platforms(["a2a3"]) +@pytest.mark.runtime("fully_distributed_within_core") +@pytest.mark.device_count(2) +def test_sdma_async_completion_demo(st_device_ids, st_platform) -> None: + assert run(st_platform, [int(d) for d in st_device_ids]) == 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--platform", default="a2a3") + parser.add_argument("-d", "--device", default="0-1") + parser.add_argument("--pto-isa-commit", default=None) + args = parser.parse_args() + return run(args.platform, parse_device_range(args.device), args.pto_isa_commit) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add.cpp 
b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..8a119554d --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Element-wise Tensor Addition Kernel + * + * Implements: out[i] = src0[i] + src1[i] + * + * This kernel performs element-wise addition of two tensors. It's compiled + * separately as a standalone kernel and linked with the dispatcher using + * function pointers, demonstrating the separation pattern used in production + * systems where kernel binaries are loaded dynamically. 
+ */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +/** + * Element-wise addition kernel implementation + * + * Unified signature: all arguments passed via int64_t array + * @param args Argument array: + * args[0] = src0 pointer (first input tensor) + * args[1] = src1 pointer (second input tensor) + * args[2] = out pointer (output tensor) + * args[3] = size (number of elements; not read by this kernel, which processes a fixed 128x128 tile) + */ +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack arguments (Tensor* pointers from runtime) + __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + // Configuration: float, 128, 128, 128, 128 + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V,
EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + pipe_sync(); +} diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add_scalar.cpp b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add_scalar.cpp new file mode 100644 index 000000000..42ec41bcc --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_add_scalar.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Scalar Addition Kernel + * + * Implements: out[i] = src[i] + scalar + * + * This kernel adds a scalar value to each element of a tensor. It's compiled + * separately as a standalone kernel and linked with the dispatcher using + * function pointers, demonstrating the separation pattern used in production + * systems where kernel binaries are loaded dynamically. 
+ */ + +#include +#include + +#include "tensor.h" // NOLINT(build/include_subdir) + +// NOLINTNEXTLINE(build/namespaces) +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +/** + * Scalar addition kernel implementation + * + * Unified signature: all arguments passed via int64_t array + * @param args Argument array: + * args[0] = src pointer (input tensor) + * args[1] = out pointer (output tensor) + * args[2] = scalar value (as uint64_t, needs conversion to float) + * args[3] = size (number of elements) + */ +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack arguments (Tensor* pointers from runtime) + __gm__ Tensor *src_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ float *src = reinterpret_cast<__gm__ float *>(src_tensor->buffer.addr) + src_tensor->start_offset; + __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + // Convert scalar from uint64_t to float + float scalar = from_u64(static_cast(args[2])); + + // Configuration: float, 128, 128, 128, 128 + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData srcTile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(srcTile, 0x0); + TASSIGN(dstTile, 0x10000); + + GlobalData srcGlobal(src); + GlobalData dstGlobal(out); + + TLOAD(srcTile, srcGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADDS(dstTile, srcTile, scalar); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, 
EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + pipe_sync(); +} diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_mul.cpp b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_mul.cpp new file mode 100644 index 000000000..d48c63e27 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/aiv/kernel_mul.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Element-wise Tensor Multiplication Kernel + * + * Implements: out[i] = src0[i] * src1[i] + * + * This kernel performs element-wise multiplication of two tensors. It's + * compiled separately as a standalone kernel and linked with the dispatcher + * using function pointers, demonstrating the separation pattern used in + * production systems where kernel binaries are loaded dynamically. 
+ */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +/** + * Element-wise multiplication kernel implementation + * + * Unified signature: all arguments passed via int64_t array + * @param args Argument array: + * args[0] = src0 pointer (first input tensor) + * args[1] = src1 pointer (second input tensor) + * args[2] = out pointer (output tensor) + * args[3] = size (number of elements) + */ +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + // Unpack arguments (Tensor* pointers from runtime) + __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float *src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float *out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + // Configuration: float, 128, 128, 128, 128 + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, 
PIPE_V, EVENT_ID0); + TMUL(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + pipe_sync(); +} diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/kernels/orchestration/example_orchestration.cpp b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/orchestration/example_orchestration.cpp new file mode 100644 index 000000000..a4b865326 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/vector_example/kernels/orchestration/example_orchestration.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Example: aicpu_orchestration_entry (device-side orchestration) + * + * DAG structure for formula: (a + b + 1) * (a + b + 2) + (a + b) + * t0: c = a + b (func_id=0, kernel_add) [outer scope] + * t1: d = c + 1 (func_id=1, kernel_add_scalar) [inner scope] + * t2: e = c + 2 (func_id=1, kernel_add_scalar) [inner scope] + * t3: g = d * e (func_id=2, kernel_mul) [inner scope] + * t4: f = g + c (func_id=0, kernel_add) [inner scope] + * Dependencies: t0->t1, t0->t2, t1->t3, t2->t3, t0->t4, t3->t4 + * + * Nested scope demonstration: + * - Inner scope owns t1, t2, t3, t4; intermediates d, e, g release on inner scope end + * - Outer scope owns t0; c persists across inner scope for t1, t2, t4 + * - c flows from outer to inner scope (outer-scope tensors are visible to inner scopes) + * + * This file compiles as a standalone .so with zero runtime link dependencies. + * All runtime calls go through the PTO2RuntimeOps function-pointer table. + */ + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +extern "C" { + +/** + * Orchestration config — the executor reads these values to set up + * shared memory and runtime before calling aicpu_orchestration_entry. + */ +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 3, // the entry below reads tensor(0..2): a, b, f + }; +} + +/** + * Orchestration entry — runtime is bound implicitly by the framework. + * The executor wraps this call in PTO2_SCOPE, so we are already inside + * the outer scope on entry.
+ */ +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + // golden shape = kernel shape, use orch_args.tensor(i).ref() directly + const Tensor &ext_a = orch_args.tensor(0).ref(); + const Tensor &ext_b = orch_args.tensor(1).ref(); + const Tensor &ext_f = orch_args.tensor(2).ref(); + + uint32_t SIZE = orch_args.tensor(0).ref().shapes[0]; + LOG_INFO_V0("===============SIZE=%u", SIZE); + + uint32_t inter_shapes[1] = {SIZE}; + TensorCreateInfo inter_ci(inter_shapes, 1, DataType::FLOAT32); + + // t0: c = a + b (func_id=0, kernel_add) [outer scope] + L0TaskArgs params_t0; + params_t0.add_input(ext_a); + params_t0.add_input(ext_b); + params_t0.add_output(inter_ci); + TaskOutputTensors outs_t0 = rt_submit_aiv_task(0, params_t0); // kernel_add + const Tensor &c = outs_t0.get_ref(0); + + // Inner scope: owns t1, t2, t3, t4; intermediates d, e, g release on scope end. + // c flows in from outer scope (outer-scope tensors are visible to inner scopes).
+ PTO2_SCOPE() { + // t1: d = c + 1 (func_id=1, kernel_add_scalar) + L0TaskArgs params_t1; + params_t1.add_input(c); + params_t1.add_output(inter_ci); + params_t1.add_scalar(1.0f); + params_t1.add_scalar(3u); // NOTE(review): extra scalar; kernel_add_scalar only reads args[0..2] — confirm it is needed + TaskOutputTensors outs_t1 = rt_submit_aiv_task(1, params_t1); // kernel_add_scalar + const Tensor &d = outs_t1.get_ref(0); + + // t2: e = c + 2 (func_id=1, kernel_add_scalar) + L0TaskArgs params_t2; + params_t2.add_input(c); + params_t2.add_output(inter_ci); + params_t2.add_scalar(2.0f); + params_t2.add_scalar(3u); // NOTE(review): extra scalar; kernel_add_scalar only reads args[0..2] — confirm it is needed + TaskOutputTensors outs_t2 = rt_submit_aiv_task(1, params_t2); // kernel_add_scalar + const Tensor &e = outs_t2.get_ref(0); + + // t3: g = d * e (func_id=2, kernel_mul) + L0TaskArgs params_t3; + params_t3.add_input(d); + params_t3.add_input(e); + params_t3.add_output(inter_ci); + params_t3.add_scalar(3u); // NOTE(review): extra scalar; kernel_mul only reads args[0..2] — confirm it is needed + TaskOutputTensors outs_t3 = rt_submit_aiv_task(2, params_t3); // kernel_mul + const Tensor &g = outs_t3.get_ref(0); + + // t4: f = g + c (func_id=0, kernel_add) + L0TaskArgs params_t4; + params_t4.add_input(g); + params_t4.add_input(c); + params_t4.add_output(ext_f); + rt_submit_aiv_task(0, params_t4); // kernel_add + } // inner scope ends: releases d, e, g +} + +} // extern "C" diff --git a/examples/a2a3/fully_distributed_within_core/vector_example/test_vector_example.py b/examples/a2a3/fully_distributed_within_core/vector_example/test_vector_example.py new file mode 100644 index 000000000..15a92d667 --- /dev/null +++ b/examples/a2a3/fully_distributed_within_core/vector_example/test_vector_example.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Tensormap-and-ringbuffer vector example: f = (a+b+1)*(a+b+2) + (a+b).""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="fully_distributed_within_core") +class TestVectorExample(SceneTestCase): + """f = (a+b+1)*(a+b+2) + (a+b), where a=2.0, b=3.0 -> f=47.0.""" + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": "kernels/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {}, + }, + ] + + def generate_args(self, params): + SIZE = 128 * 128 + return TaskArgsBuilder( + Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)), + Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)), + Tensor("f", torch.zeros(SIZE, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + +if __name__ == "__main__": + 
SceneTestCase.run_module(__name__) diff --git a/src/a2a3/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp b/src/a2a3/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp new file mode 100644 index 000000000..21fb63133 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include "aicore/aicore.h" +#include "aicore/aicore_profiling_state.h" +#include "aicore/l2_swimlane_collector_aicore.h" +#include "aicore/pmu_collector_aicore.h" +#include "common/l2_swimlane_profiling.h" +#include "common/platform_config.h" // Register-based communication +#include "pto2_dispatch_payload.h" +#include "runtime.h" + +/** + * AICore main execution loop (fully_distributed_within_core) + * + * Handshakes with the AICPU, then runs the distributed engine on this core: + * 1. Wait for AICPU ready signal via handshake buffer + * 2. Report physical core ID and core type, signal AICore ready + * 3. Wait for runtime->dist.go, then call runtime->dist.core_main_fn + * 4. Poll DATA_MAIN_BASE register until the exit signal, then ack EXITED + * + * No task is dispatched through hank->task or DATA_MAIN_BASE in this + * routine.
The register is only read to detect AICORE_EXIT_SIGNAL. If + * core_main_fn is null, the worker only increments runtime->dist.done_count + * before entering the teardown wait. + * + * Profiling state (enable flag, L2 swimlane rotation channel) is published into the platform + * via set_aicore_profiling_flag / set_aicore_l2_swimlane_ring at kernel entry — + * consumers read it through the matching getters (this routine does not), so neither Handshake + * nor this signature carry profiling fields. + * + * @param runtime Pointer to Runtime in global memory + * @param block_idx Block index (core ID) + * @param core_type Core type (AIC or AIV) + */ +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type) { + __gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[block_idx]); + + // Phase 1: Wait for AICPU initialization signal + while (my_hank->aicpu_ready == 0) { + dcci(my_hank, SINGLE_CACHE_LINE); + SPIN_WAIT_HINT(); + } + + // Phase 2: Report physical core ID, signal ready + my_hank->physical_core_id = get_physical_core_id(); + OUT_OF_ORDER_STORE_BARRIER(); + my_hank->aicore_regs_ready = 1; + dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT); + while (my_hank->aicpu_regs_ready == 0) { + dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE); + SPIN_WAIT_HINT(); + } + // Report initial idle status via register + write_reg(RegId::COND, AICORE_IDLE_VALUE); + + // Phase 3: Report core type, signal ready + my_hank->core_type = core_type; + OUT_OF_ORDER_STORE_BARRIER(); + my_hank->aicore_done = block_idx + 1; // Signal ready (use block_idx + 1 to avoid 0) + + dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + + // =========================================================================== + // fully_distributed_within_core: run orchestration + scheduling + execution + // ON this AICore worker (SPMD).
Instead of polling DATA_MAIN_BASE for + // AICPU-dispatched tasks, each worker invokes the distributed engine entry + // (compiled into the AICPU .so, but executed here on the AICore thread so + // kernels run with this thread's sim TLS in place). The engine replays the + // orchestration submit stream, claims/builds the tasks it wins, and executes + // them; on return it has set this worker's completion flags. The worker then + // honors the existing teardown protocol (wait for EXIT, ack EXITED) so the + // AICPU scheduler/shutdown path is reused unchanged. + // See runtime/dist_engine.* and docs/fully_distributed_within_core.md. + // =========================================================================== + while (runtime->dist.go == 0) { + dcci(&runtime->dist, SINGLE_CACHE_LINE); + SPIN_WAIT_HINT(); + } + { + DistCoreMainFn core_main = reinterpret_cast(runtime->dist.core_main_fn); + if (core_main != nullptr) { + core_main(runtime, block_idx, static_cast(core_type)); + } else { + __atomic_add_fetch(&runtime->dist.done_count, 1, __ATOMIC_ACQ_REL); // NOTE(review): no engine entry; presumably counted as done so the AICPU-side wait completes — confirm + } + } + + // Teardown: wait for the AICPU EXIT signal on DATA_MAIN_BASE and ack. + while (true) { + uint32_t reg_val = static_cast(read_reg(RegId::DATA_MAIN_BASE)); + if (reg_val == AICORE_EXIT_SIGNAL) { + write_reg(RegId::COND, AICORE_EXITED_VALUE); + break; + } + SPIN_WAIT_HINT(); + } + dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp new file mode 100644 index 000000000..7ca06dce8 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp @@ -0,0 +1,873 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details.
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif + +#include "aicpu/device_time.h" +#include "aicpu/orch_so_file.h" +#include "callable_protocol.h" +#include "pto2_dispatch_payload.h" +#include "runtime.h" +#include "spin_hint.h" + +// Runtime headers (full struct definition for create/destroy + PTO2_SCOPE) +#include "pto_runtime2.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" + +// fully_distributed_within_core engine — orchestration + scheduling + execution +// run on the AICore workers; this AICPU thread only wires the engine and waits. +#include "dist_engine.h" + +// Performance profiling headers +#include "aicpu/l2_swimlane_collector_aicpu.h" +#include "aicpu/scope_stats_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" +#include "aicpu/dep_gen_collector_aicpu.h" +#include "common/l2_swimlane_profiling.h" +#include "common/unified_log.h" + +// Register-based communication +#include "aicpu/platform_regs.h" +#include "common/platform_config.h" + +// Core type definitions +#include "common/core_type.h" + +// CoreCallable for resolved dispatch address +#include "callable.h" + +// Scheduler data structures (CoreExecState, CoreTracker, etc.) +#include "scheduler/scheduler_types.h" + +// Scheduler context class +#include "scheduler/scheduler_context.h" + +// Device orchestration function signature (loaded via dlopen). 
+// The executor binds the current thread's PTO2Runtime into orchestration TLS +// before calling the user entry. +typedef void (*DeviceOrchestrationFunc)(const L2TaskArgs &orch_args); +typedef void (*DeviceOrchestrationBindRuntimeFunc)(PTO2Runtime *rt); + +// Config function exported by orchestration .so +typedef PTO2OrchestrationConfig (*DeviceOrchestrationConfigFunc)(const L2TaskArgs &orch_args); + +// From orchestration/common.cpp linked into this DSO — updates g_current_runtime here (distinct from +// framework_bind_runtime in the dlopen'd libdevice_orch_*.so). +extern "C" void framework_bind_runtime(PTO2Runtime *rt); + +constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry"; +constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config"; + +static int32_t read_pto2_runtime_status(Runtime *runtime) { // returns 0 when runtime or its shared-memory pointer is null + if (runtime == nullptr) { + return 0; + } + + void *sm = runtime->get_gm_sm_ptr(); + if (sm == nullptr) { + return 0; + } + + auto *header = static_cast(sm); + int32_t orch_error_code = header->orch_error_code.load(std::memory_order_acquire); + int32_t sched_error_code = header->sched_error_code.load(std::memory_order_acquire); + return runtime_status_from_error_codes(orch_error_code, sched_error_code); +} + +static PTO2Runtime *rt{nullptr}; + +// Per-callable_id orchestration SO table. The executor dispatches +// `orch_so_table_[active_callable_id_]` (created on first sighting of +// that callable_id, kept warm across runs). +// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values +// (mailbox uint32 callable_id, register() returns small ints) and is shared +// with the host bounds check in DeviceRunner::register_callable — +// see src/common/task_interface/callable_protocol.h.
+ +struct OrchSoEntry { + bool in_use{false}; + void *handle{nullptr}; + char path[256]{}; + DeviceOrchestrationFunc func{nullptr}; + DeviceOrchestrationBindRuntimeFunc bind{nullptr}; + DeviceOrchestrationConfigFunc config_func{nullptr}; +}; + +struct AicpuExecutor { + int32_t sched_thread_num_; + bool orch_to_sched_{false}; + + // ===== Thread management state ===== + std::atomic thread_idx_{0}; + std::atomic initialized_{false}; + std::atomic init_done_{false}; + std::atomic init_failed_{false}; + std::atomic finished_{false}; + + int32_t aicpu_thread_num_{0}; + + // ===== Task queue state (managed by scheduler ready queues) ===== + + std::atomic finished_count_{0}; + std::atomic runtime_init_ready_{false}; + + // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox + // sub-regions (created in runtime_create_from_sm, released in runtime_destroy). + // Default-constructed: libc-backed backend, no ctx. + DeviceArena runtime_arena_; + + // Entry-arg L2TaskArgs built (via create_from_chip_args) from get_orch_args() + // before scheduler init; consumed by the (*p_func)(orch_args_cached_) below. + L2TaskArgs orch_args_cached_; + + // Per-callable_id table. Single orch thread today, so first-write/read + // race is not possible; if multiple orch threads are ever introduced, + // guard the in_use=false→true transition with a mutex. + OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]; + + // ===== Scheduler context (owns all dispatch/completion/drain state) ===== + SchedulerContext sched_ctx_; + + // ===== Methods ===== + int32_t init(Runtime *runtime); + int32_t run(Runtime *runtime); + void deinit(Runtime *runtime); + + ~AicpuExecutor() { + // Process-wide teardown (the single static instance dies here). Every + // in-use callable_id slot is dlclose()'d here; each is otherwise kept + // alive across runs for cache-hit reuse.
+ for (auto &e : orch_so_table_) { + if (!e.in_use) continue; + if (e.handle != nullptr) dlclose(e.handle); + if (e.path[0] != '\0') unlink(e.path); + e = OrchSoEntry{}; + } + } +}; + +static AicpuExecutor g_aicpu_executor; + +// ===== AicpuExecutor Method Implementations ===== + +int32_t AicpuExecutor::init(Runtime *runtime) { + bool expected = false; + if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) { + return 0; + } + + LOG_INFO_V0("AicpuExecutor: Initializing"); + + if (runtime == nullptr) { + LOG_ERROR("runtime is nullptr"); + init_failed_.store(true, std::memory_order_release); + return -1; + } + + // Read execution parameters from runtime. The 0 → 1 fixup runs before the + // sched_thread_num_ derivation so a zero input doesn't leave the scheduler + // count at -1. + aicpu_thread_num_ = runtime->aicpu_thread_num; + if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1; + sched_thread_num_ = aicpu_thread_num_ - 1; + orch_to_sched_ = runtime->orch_to_sched; + + if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) { + LOG_ERROR("Invalid aicpu_thread_num: %d", aicpu_thread_num_); + init_failed_.store(true, std::memory_order_release); + return -1; + } + + if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) { + init_failed_.store(true, std::memory_order_release); + return -1; + } + + finished_count_.store(0, std::memory_order_release); + + init_done_.store(true, std::memory_order_release); + LOG_INFO_V0("AicpuExecutor: Init complete"); + return 0; +} + +/** + * Per-thread entry point: threads with thread_idx >= sched_thread_num_ take the orchestrator path. + */ +int32_t AicpuExecutor::run(Runtime *runtime) { + int32_t thread_idx = thread_idx_++; + int32_t run_rc = 0; + LOG_INFO_V0("Thread %d: Start", thread_idx); + + // Orchestrator check + if (thread_idx >= sched_thread_num_) { +#if PTO2_PROFILING + uint64_t orch_cycle_start = 0; + int32_t
pto2_submitted_tasks = -1; +#endif + // Orchestrator thread: load + run the device orchestration SO. The braces + // scope the per-callable dlopen / SO-table locals to this block. + { + // Per-callable_id dispatch: the orch SO state lives in + // `orch_so_table_[callable_id]` keyed by registration order; + // reload is governed by `register_new_callable_id_`. + const int32_t callable_id = runtime->get_active_callable_id(); + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + void **p_handle = &orch_so_table_[callable_id].handle; + char *p_path = orch_so_table_[callable_id].path; + DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func; + DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind; + DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; + const bool reload_so = runtime->register_new_callable_id(); + + if (reload_so) { + LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; + *p_func = nullptr; + *p_bind = nullptr; + if (p_path[0] != '\0') { + // Unlink the old file so the new open() lands on a + // fresh inode — protects against SIGBUS / ETXTBSY when + // the kernel still has the old mapping pinned. + unlink(p_path); + p_path[0] = '\0'; + } + } + + const void *so_data = reinterpret_cast(runtime->get_dev_orch_so_addr()); + size_t so_size = runtime->get_dev_orch_so_size(); + + if (so_data == nullptr || so_size == 0) { + LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + // Try multiple paths that may allow execution on AICPU. + char so_path[256]; + bool file_created = false; + const char *candidate_dirs[] = { + "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" + }; + const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); + + for (int32_t i = 0; i < num_candidates && !file_created; i++) { + int32_t fd = create_orch_so_file( + candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path) + ); + if (fd < 0) { + LOG_INFO_V0( + "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + continue; + } + ssize_t written = write(fd, so_data, so_size); + close(fd); + if (written != static_cast(so_size)) { + LOG_INFO_V0( + "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + unlink(so_path); + continue; + } + file_created = true; + LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); + } + + if (!file_created) { + LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + dlerror(); + void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + const char *dlopen_err = dlerror(); + if (handle == nullptr) { + LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown"); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); + + // Unlink the on-disk SO immediately: dlopen has already mmap'd + // the image, so the kernel keeps the inode alive until the + // matching dlclose / process exit. This prevents stale + // libdevice_orch__.so files from accumulating in + // /tmp when child processes exit via os._exit(0), which skips + // ~AicpuExecutor (worker.py: _sub/_chip/_child loops). + unlink(so_path); + + const char *entry_symbol = runtime->get_device_orch_func_name(); + if (entry_symbol == nullptr || entry_symbol[0] == '\0') { + entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL; + } + const char *config_symbol = runtime->get_device_orch_config_name(); + if (config_symbol == nullptr || config_symbol[0] == '\0') { + config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL; + } + + dlerror(); + DeviceOrchestrationFunc orch_func = + reinterpret_cast(dlsym(handle, entry_symbol)); + const char *entry_dlsym_error = dlerror(); + if (entry_dlsym_error != nullptr) { + LOG_ERROR( + "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error + ); + dlclose(handle); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + if (orch_func == nullptr) { + LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol); + dlclose(handle); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + dlerror(); + auto config_func = reinterpret_cast(dlsym(handle, config_symbol)); + const char *config_dlsym_error = dlerror(); + if (config_dlsym_error != nullptr || config_func == nullptr) { + LOG_ERROR( + "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol, + config_dlsym_error ? config_dlsym_error : "NULL function pointer" + ); + config_func = nullptr; + } + + dlerror(); + auto bind_runtime_func = + reinterpret_cast(dlsym(handle, "framework_bind_runtime")); + const char *bind_runtime_error = dlerror(); + if (bind_runtime_error != nullptr) { + LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error); + bind_runtime_func = nullptr; + } + + *p_handle = handle; + *p_func = orch_func; + *p_bind = bind_runtime_func; + *p_config_func = config_func; + snprintf(p_path, 256, "%s", so_path); + orch_so_table_[callable_id].in_use = true; + } else { + LOG_INFO_V0( + "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id + ); + if (*p_handle == nullptr || *p_func == nullptr) { + LOG_ERROR( + "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx, + callable_id + ); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + } + + // Build the entry-arg once per run; both the config call below and + // the orchestration entry (consumed at orch_args_cached_) use it. + orch_args_cached_.create_from_chip_args(runtime->get_orch_args()); + + // Validate arg count on every run (reload or cache hit). 
+ if (*p_config_func != nullptr) { + PTO2OrchestrationConfig cfg = (*p_config_func)(orch_args_cached_); + LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count); + if (cfg.expected_arg_count > 0) { + const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); + int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count(); + if (actual_arg_count < cfg.expected_arg_count) { + LOG_ERROR( + "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count, + cfg.expected_arg_count + ); + // Clean up cached state so a subsequent run does a full reload. + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; + } + if (p_path[0] != '\0') { + unlink(p_path); + p_path[0] = '\0'; + } + *p_func = nullptr; + *p_bind = nullptr; + *p_config_func = nullptr; + orch_so_table_[callable_id].in_use = false; + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + } + } else { + LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx); + } + + // sm_handle / rt are bound to *this* run's memory and must be + // (re)created every run, regardless of whether the SO itself was + // reused above. 
+ const ChipStorageTaskArgs &args = runtime->get_orch_args(); + int32_t arg_count = args.tensor_count() + args.scalar_count(); + LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count); + for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { + const Tensor &t = args.tensor(i); + LOG_INFO_V0( + "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i, + static_cast(t.buffer.addr), t.ndims, static_cast(t.dtype) + ); + } + for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { + LOG_INFO_V0( + "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i, + static_cast(args.scalar(i)) + ); + } + + void *sm_ptr = runtime->get_gm_sm_ptr(); + + // Prebuilt-arena fast path. Host has pre-populated the entire + // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map + // sub-regions + sm_handle wrapper + mailbox) and uploaded it via + // rtMemcpy into the pooled runtime_arena buffer. We attach to it, + // wire arena-internal pointers to their device addresses, reset + // the SM, and finalize the few device-only fields the host could + // not know at image-build time. + void *prebuilt_arena = runtime->get_prebuilt_arena_base(); + size_t off_runtime = runtime->get_prebuilt_runtime_offset(); + if (prebuilt_arena == nullptr) { + LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign); + rt = reinterpret_cast(static_cast(prebuilt_arena) + off_runtime); + + // Wire every arena-internal pointer field (host wrote host-mirror + // addresses; we overwrite them with device addresses). 
+ runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt); + uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.task_window_sizes); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) { + LOG_INFO_V0( + "Thread %d: Ring %d sizes: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%d", thread_idx, r, + rt->prebuilt_layout.task_window_sizes[r], rt->prebuilt_layout.heap_sizes[r], + rt->prebuilt_layout.dep_pool_capacities[r] + ); + } + + // Reset SM state. setup_pointers + init_header_per_ring restore + // ring flow-control counters, layout metadata, error flags, and + // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero — previously done inside + // RingSchedState::init). + memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); + if (!rt->sm_handle->init_per_ring( + sm_ptr, sm_size, rt->prebuilt_layout.task_window_sizes, rt->prebuilt_layout.heap_sizes + )) { + LOG_ERROR("Thread %d: sm_handle->init_per_ring failed", thread_idx); + rt = nullptr; + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + // AICore completion mailbox lives in the arena; reset it each + // boot so stale completion notifications from a previous run do + // not leak. + memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); + + // Fill ops / core counts (host can't resolve s_runtime_ops's + // device address nor know the SchedulerContext's core fan-out). 
+ runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); +#if PTO2_PROFILING + rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level(); + { + auto &orch = rt->orchestrator; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &alloc = orch.rings[r].task_allocator; + scope_stats_set_ring_capacity( + r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacities[r] + ); + } + scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity()); + } +#endif + + // With multi-ring, slot_states are per-ring inside the scheduler. + runtime->set_slot_states_ptr(nullptr); + + // Wire scheduler context to the newly created PTO2Runtime before + // releasing scheduler threads from runtime_init_ready_. + sched_ctx_.bind_runtime(rt); + + runtime_init_ready_.store(true, std::memory_order_release); + + // Wait for scheduler's one-time init to complete + sched_ctx_.wait_pto2_init_complete(); + +#if PTO2_PROFILING + if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { + l2_swimlane_aicpu_set_orch_thread_idx(thread_idx); + } +#endif + + // dep_gen plugs into the orchestrator thread (single-instance subsystem): + // set the per-thread queue index and pop the initial buffer before any + // submit_task can fire inside orch_func_. + if (is_dep_gen_enabled()) { + dep_gen_aicpu_set_orch_thread_idx(thread_idx); + dep_gen_aicpu_init(); + } + +#if PTO2_PROFILING + // scope_stats streams scope_end records off the orchestrator thread: + // record the per-thread ready_queue index. No-op (writer shared + // state null) when scope_stats is disabled; the current buffer is + // popped lazily on the first scope_end append. 
+ scope_stats_aicpu_set_orch_thread_idx(thread_idx); +#endif + +#if PTO2_PROFILING + orch_cycle_start = get_sys_cnt_aicpu(); +#endif + framework_bind_runtime(rt); + if (*p_bind != nullptr) { + (*p_bind)(rt); + } + + // ---- fully_distributed_within_core handoff ---- + // Instead of running orchestration here, wire the distributed engine + // (resets cursors/flags/heap, points rt->ops at the distributed + // submit path) and hand the per-core entry off to the AICore worker + // threads, which replay orchestration in SPMD fashion and execute + // the tasks they win. This AICPU thread then waits for all workers. + // See runtime/dist_engine.* and docs/fully_distributed_within_core.md. + { + const int32_t num_workers = runtime->worker_count; + // dist_engine_register repoints rt->ops at the distributed submit + // table for the duration of the on-core orchestration replay. Save + // the centralized ops so the scheduler-handoff calls below + // (rt_orchestration_done / on_orchestration_done) work unchanged. + const PTO2RuntimeOps *saved_ops = rt->ops; + void *core_main = + dist_engine_register(rt, *p_func, &orch_args_cached_, num_workers, runtime); + runtime->dist.core_main_fn = reinterpret_cast(core_main); + runtime->dist.num_workers = num_workers; + __atomic_store_n(&runtime->dist.done_count, 0, __ATOMIC_RELEASE); + __atomic_store_n(&runtime->dist.go, 1u, __ATOMIC_RELEASE); + const bool dist_trace = (getenv("PTO_DIST_TRACE") != nullptr); + if (dist_trace) + fprintf(stderr, "[dist] Thread %d: engine wired, %d workers launched\n", thread_idx, num_workers); + while (__atomic_load_n(&runtime->dist.done_count, __ATOMIC_ACQUIRE) < num_workers) { + SPIN_WAIT_HINT(); + } + rt->ops = saved_ops; + if (dist_trace) + fprintf(stderr, "[dist] Thread %d: all %d distributed workers finished\n", thread_idx, num_workers); + } + + // Flush the (potentially partially-filled) DepGenBuffer so the host + // collector can pick it up before this orchestrator thread joins. 
+ if (is_dep_gen_enabled()) { + dep_gen_aicpu_flush(); + } +#if PTO2_PROFILING + // Push the partially-filled scope_stats buffer so the host gets the + // final scope_end records. Idempotent / no-op when disabled. + scope_stats_aicpu_flush_buffers(); +#endif +#if PTO2_PROFILING + uint64_t orch_cycle_end = get_sys_cnt_aicpu(); + (void)orch_cycle_end; +#endif + + // Print orchestrator profiling data +#if PTO2_ORCH_PROFILING + PTO2OrchProfilingData p = orchestrator_get_profiling(); + uint64_t total = + p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle; + if (total == 0) total = 1; // avoid div-by-zero + LOG_INFO_V9( + "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx, + static_cast(p.submit_count), cycles_to_us(total) + ); + LOG_INFO_V9( + "Thread %d: task+heap_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", + thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, + cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), + static_cast(p.alloc_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle), + p.sync_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: lookup+dep : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle), + p.lookup_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: tensormap_ins : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle), + p.insert_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, + cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", thread_idx, + cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, + cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle) + ); + LOG_INFO_V9( 
+ "Thread %d: avg/task : %.3fus", thread_idx, + p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0 + ); + +#if PTO2_TENSORMAP_PROFILING + PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); + LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx); + LOG_INFO_V9( + "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx, + static_cast(tp.lookup_count), static_cast(tp.insert_count) + ); + LOG_INFO_V9( + "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx, + static_cast(tp.lookup_chain_total), + tp.lookup_count > 0 ? static_cast(tp.lookup_chain_total) / tp.lookup_count : 0.0, + tp.lookup_chain_max + ); + LOG_INFO_V9( + "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx, + static_cast(tp.overlap_checks), static_cast(tp.overlap_hits), + tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0 + ); +#endif +#endif // PTO2_ORCH_PROFILING + + // Latch task count from PTO2 shared memory to hand off to the + // scheduler. The orchestrator's run window (start_time / end_time / + // submit_count) is no longer published to shared memory — the + // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line + // below carries the same envelope info for debugging, and + // host-side swimlane derives per-phase timing from the per-event + // L2SwimlaneAicpuSchedPhaseRecord[] + L2SwimlaneAicpuOrchPhaseRecord[] + // streams that already cover everything inside submit_task(). 
+ int32_t total_tasks = 0; + if (rt->orchestrator.sm_header) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + total_tasks += + rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); + } + } + +#if PTO2_PROFILING + pto2_submitted_tasks = total_tasks; +#endif + + // Signal completion to the orchestrator state machine + rt_orchestration_done(rt); + + sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks); + } +#if PTO2_PROFILING + uint64_t orch_end_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9( + "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx, + static_cast(orch_cycle_start), static_cast(orch_end_ts), + cycles_to_us(orch_end_ts - orch_cycle_start) + ); + if (pto2_submitted_tasks >= 0) { + LOG_INFO_V9( + "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks, + sched_ctx_.completed_tasks_count() + ); + } +#endif + LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx); + } + + // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) + if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) { + // Device orchestration: wait for the primary orchestrator to initialize the SM header + while (!runtime_init_ready_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } + if (rt == nullptr) { + LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx); + } else { + sched_ctx_.bind_runtime(rt); + int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx); + if (completed < 0) { + LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed); + run_rc = completed; + } else { + LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed); + } + } + } + + // Always shutdown AICore — even if sched_ctx_.completed_ was already true. 
/**
 * Reset the executor so the next launch starts from a clean state.
 *
 * Invalidates the cached view of the Runtime object, resets the scheduler
 * context, and clears every per-run counter / flag owned by AicpuExecutor.
 * The per-callable orchestration SO cache (orch_so_table_) is deliberately
 * left untouched.
 *
 * @param runtime Runtime object whose address range is invalidated in the
 *                AICPU cache.
 */
void AicpuExecutor::deinit(Runtime *runtime) {
    // 1. Invalidate AICPU cache for Runtime address range.
    //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
    //    bypasses this cache. Invalidating now ensures next round reads from HBM.
    cache_invalidate_range(runtime, sizeof(Runtime));

    // Reset all SchedulerContext-owned state in one place.
    sched_ctx_.deinit();

    // Per-run thread bookkeeping: completion counter and the "runtime is
    // wired" gate that scheduler threads spin on in run().
    finished_count_.store(0, std::memory_order_release);
    runtime_init_ready_.store(false, std::memory_order_release);

    aicpu_thread_num_ = 0;
    sched_thread_num_ = 0;
    orch_to_sched_ = false;

    orch_args_cached_.reset();
    // orch_so_table_ entries are intentionally preserved across deinit: the
    // next run reuses cached handles when register_new_callable_id() returns
    // false. The destructor releases them at process teardown.

    // Clear file-scope PTO2Runtime pointer. The runtime itself was already
    // destroyed by the last thread to finish run(), before deinit is called.
    rt = nullptr;

    // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled.
    dep_gen_aicpu_finalize();

    LOG_INFO_V0("DeInit: Runtime execution state reset");

    // Re-arm the init / finish handshake flags read by aicpu_execute().
    initialized_.store(false, std::memory_order_release);
    init_done_.store(false, std::memory_order_release);
    init_failed_.store(false, std::memory_order_release);
    thread_idx_.store(0, std::memory_order_release);
    finished_.store(false, std::memory_order_release);

    LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
}
Cleanup when last thread finishes + * + * @param runtime Pointer to Runtime structure + * @return 0 on success, non-zero on error + */ +extern "C" int32_t aicpu_execute(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("%s", "Invalid argument: null Runtime pointer"); + return -1; + } + + LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution"); + + g_aicpu_executor.init(runtime); + + while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) { + if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) { + LOG_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution"); + return -1; + } + } + + int32_t rc = g_aicpu_executor.run(runtime); + if (rc != 0) { + LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc); + } + + int32_t runtime_rc = read_pto2_runtime_status(runtime); + + // Last thread cleans up + if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) { + LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up"); + g_aicpu_executor.deinit(runtime); + } + + if (runtime_rc != 0) { + LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc); + return runtime_rc; + } + + if (rc != 0) { + return rc; + } + + LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully"); + return 0; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/build_config.py b/src/a2a3/runtime/fully_distributed_within_core/build_config.py new file mode 100644 index 000000000..da34f14f9 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/build_config.py @@ -0,0 +1,32 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# fully_distributed_within_core runtime build configuration +# All paths are relative to this file's directory (src/a2a3/runtime/fully_distributed_within_core/) +# +# Goal: orchestration + scheduling + execution run on the AI cores themselves in +# SPMD fashion, removing AICPU from orchestration/scheduling. See the design spec: +# docs/fully_distributed_within_core.md +# +# This tree is currently re-based on the tensormap_and_ringbuffer runtime so it +# is discoverable and compiles; it reuses TensorMap, MixedKernels/ActiveMask, +# L0TaskArgs, the pto_orchestration_api submit API, and kernel-address +# resolution. The distributed model (claim race + per-core TensorMap + private +# task ring + global completion-flag ring) is layered on incrementally per the +# spec; the AICPU is reduced to an init/teardown stub. +# +# The "orchestration" directory contains source files compiled into every +# runtime target (aicore, aicpu, host) AND the orchestration .so (e.g., tensor +# methods needed by the Tensor constructor's validation logic).
# Build inputs per target. Each entry lists the include directories and the
# source directories for that target; every path is relative to this file.
BUILD_CONFIG = {
    "aicore": {
        "include_dirs": ["runtime", "common", ".."],
        "source_dirs": ["aicore", "orchestration"],
    },
    "aicpu": {
        "include_dirs": ["runtime", "common", ".."],
        "source_dirs": ["aicpu", "runtime", "orchestration"],
    },
    # The host target takes only the runtime/shared subdirectory, not all of runtime.
    "host": {
        "include_dirs": ["runtime", "common", ".."],
        "source_dirs": ["host", "runtime/shared", "orchestration"],
    },
    # The orchestration target additionally puts "orchestration" on its include path.
    "orchestration": {
        "include_dirs": ["runtime", "orchestration", "common", ".."],
        "source_dirs": ["orchestration"],
    },
}
+ * + * 2. Local per-dispatch context (changes each dispatch): + * - block_idx : which logical block the current worker is executing + * - block_num : total number of blocks in this task (= block_dim) + * Written by build_payload() before each dispatch. + * + * Both categories are injected via two pointer slots appended at the tail + * of the kernel args[] array: + * + * args layout: + * [0 .. tensor_count-1] = tensor GM pointers + * [tensor_count .. +scalar_count-1] = scalar values + * ... + * [SPMD_LOCAL_CONTEXT_INDEX] = (uint64_t)&LocalContext (per-dispatch) + * [SPMD_GLOBAL_CONTEXT_INDEX] = (uint64_t)&GlobalContext (per-core) + * + * The suffix positions are compile-time constants and do not depend on the + * runtime tensor_count or scalar_count. + * + * Include this header in AICore kernel source files to use the get_* accessors. + * Do NOT depend on the raw index constants; always use the accessor functions. + * + * On CCEC (real hardware), __gm__ and __aicore__ must be defined before + * including this header (e.g. via a toolchain header or a manual #define). + * The #ifndef guards below provide fallbacks for non-kernel builds + * (AICPU, HOST) where these qualifiers are not needed. + * + * IMPORTANT — do NOT mix these with the CCE built-in topology intrinsics + * (`get_subblockid()`, `get_block_idx()`, `get_block_num()` declared in + * `kernel_operator.h` / tikcfw). Those intrinsics read AICore hardware + * registers that simpler's tensormap_and_ringbuffer runtime does NOT + * program. Specifically: + * + * - CCE `get_subblockid()` returns whatever stale value the AICore + * sub-block register holds — under simpler's MIX dispatch it is 0 + * for BOTH AIV0 and AIV1 of every cluster, so a kernel that uses + * it to partition heads will silently have AIV1 redo AIV0's work + * and the AIV1 share of the output is never written.
This is the + * exact failure mode that produced the partial-zero output in + * issue #900 (PR #899 spmd_paged_attention_highperf); the kernel + * compiled, ran without error, and produced wrong output. Use + * `get_sub_block_id(args)` instead, which reads from the runtime's + * `GlobalContext.sub_block_id` that the scheduler initializes per + * AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`. + * + * - `get_block_idx()` and `get_block_num()` are not redirected to + * simpler's LocalContext either — use the `(args)` variants below + * so the values reflect simpler's logical block_dim (which can + * differ from `RUNTIME_CONFIG.block_dim`, the physical core count). + * + * If you are porting a kernel originally written for native CANN dispatch + * (AscendC, ascend-transformer-boost, etc.), every reference to those + * three CCE intrinsics needs to be rewritten against this header. See + * `docs/aicore-kernel-programming.md` for the full author contract, + * porting checklist, and the worked example from PR #899 / issue #900. + */ + +#pragma once + +#include + +#include "aicore_completion_mailbox_types.h" +#include "pto_task_id.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ +#endif + +/** Number of extra pointer slots appended to the args[] tail (LocalContext + GlobalContext). */ +static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2; + +/** + * Args[] suffix indices for context pointers. + * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16). + * Users should not depend on these values; use the Get* functions below. + */ +static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48; +static constexpr int32_t SPMD_GLOBAL_CONTEXT_INDEX = 49; +static constexpr int32_t PAYLOAD_LOCAL_CONTEXT_INDEX = SPMD_LOCAL_CONTEXT_INDEX; +static constexpr int32_t PAYLOAD_GLOBAL_CONTEXT_INDEX = SPMD_GLOBAL_CONTEXT_INDEX; + +/** + * Per-core global context, stored in PTO2DispatchPayload. 
+ * Initialized once at runtime startup (init_global_context) based on each + * core's cluster position. Never modified after initialization. + */ +struct GlobalContext { + // AIV lane within cluster: 0=AIV0(left), 1=AIV1(right). + // Used by AIV to select the correct intra-cluster hw instruction. + // Not meaningful for AIC kernels or single-AIV tasks. + int32_t sub_block_id; +}; + +struct AsyncCtx { + volatile __gm__ uint32_t *completion_count; + volatile __gm__ int32_t *completion_error_code; + volatile __gm__ DeferredCompletionEntry *completion_entries; + uint32_t completion_capacity; + PTO2TaskId task_token; + + static inline AsyncCtx make(PTO2TaskId task_token, volatile __gm__ DeferredCompletionSlab *buffer) { + AsyncCtx ctx{}; + ctx.task_token = task_token; + if (buffer == nullptr) { + ctx.task_token = PTO2TaskId::invalid(); + return ctx; + } + ctx.completion_count = &buffer->count; + ctx.completion_error_code = &buffer->error_code; + ctx.completion_entries = &buffer->entries[0]; + ctx.completion_capacity = MAX_COMPLETIONS_PER_TASK; + return ctx; + } +}; + +/** + * Per-dispatch local context, stored in PTO2DispatchPayload. + * Written by build_payload() before each dispatch. Different blocks of the + * same task receive different block_idx values but the same block_num. + * + */ +struct LocalContext { + int32_t block_idx; // Logical block index within the task [0, block_num) + int32_t block_num; // How many logical blocks this task requires. + // Currently fixed to 1 (block_dim > 1 not yet implemented). + // NOT the same as RUNTIME_CONFIG.block_dim in kernel_config.py, + // which controls how many physical cores the runtime launches. + AsyncCtx async_ctx; +}; + +/** + * Return the AIV lane index within the cluster. + * In a MIX 1C2V task: AIV0(left)=0, AIV1(right)=1. + * + * This value is only meaningful for AIV kernels in MIX tasks. 
It tells + * the AIV whether it is the left lane or the right lane within the cluster, + * which determines the correct hardware instruction for intra-cluster + * communication. + * + * AIC kernels should NOT call this function. + * Single-AIV tasks have no intra-cluster communication, so sub_block_id + * has no meaning and should not be used. + */ +static __aicore__ inline int32_t get_sub_block_id(__gm__ int64_t *args) { + __gm__ GlobalContext *ctx = + reinterpret_cast<__gm__ GlobalContext *>(static_cast(args[SPMD_GLOBAL_CONTEXT_INDEX])); + return ctx->sub_block_id; +} + +/** + * Return the logical block index assigned to the current worker. + * Range: [0, get_block_num(args)). + * Within the same task, different blocks receive different indices. + */ +static __aicore__ inline int32_t get_block_idx(__gm__ int64_t *args) { + __gm__ LocalContext *ctx = + reinterpret_cast<__gm__ LocalContext *>(static_cast(args[SPMD_LOCAL_CONTEXT_INDEX])); + return ctx->block_idx; +} + +/** + * Return how many logical blocks the current task requires. + * All blocks of the same task see the same value. + * Currently always returns 1 (block_dim>1 not yet implemented). + * + * Note: this is NOT the same as RUNTIME_CONFIG.block_dim in + * kernel_config.py, which controls how many physical cores are launched. + */ +static __aicore__ inline int32_t get_block_num(__gm__ int64_t *args) { + __gm__ LocalContext *ctx = + reinterpret_cast<__gm__ LocalContext *>(static_cast(args[SPMD_LOCAL_CONTEXT_INDEX])); + return ctx->block_num; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/common/pto_runtime_status.h b/src/a2a3/runtime/fully_distributed_within_core/common/pto_runtime_status.h new file mode 100644 index 000000000..5d33fe18d --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/common/pto_runtime_status.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO2 Runtime Status Helpers + * + * Shared error-code contract used inside the fully_distributed_within_core runtime. + */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_ + +#include + +// Orchestrator errors (1-99): detected in orchestrator thread +#define PTO2_ERROR_NONE 0 // Explicitly means "no error"; it is not an "unknown/unspecified" error code. 
+#define PTO2_ERROR_SCOPE_DEADLOCK 1 +#define PTO2_ERROR_HEAP_RING_DEADLOCK 2 +#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3 +#define PTO2_ERROR_DEP_POOL_OVERFLOW 4 +#define PTO2_ERROR_INVALID_ARGS 5 // Arg construction error (invalid args) +#define PTO2_ERROR_DEPENDENCY_OVERFLOW 6 // Too many unique fanin dependencies for one task +#define PTO2_ERROR_REQUIRE_SYNC_START_INVALID 7 +#define PTO2_ERROR_TENSOR_WAIT_TIMEOUT 8 +#define PTO2_ERROR_EXPLICIT_ORCH_FATAL 9 +#define PTO2_ERROR_SCOPE_TASKS_OVERFLOW 10 // scope_tasks buffer saturated (all rings full) + +// Scheduler errors (100+): detected in scheduler threads +#define PTO2_ERROR_SCHEDULER_TIMEOUT 100 +#define PTO2_ERROR_ASYNC_COMPLETION_INVALID 101 +#define PTO2_ERROR_ASYNC_WAIT_OVERFLOW 102 +#define PTO2_ERROR_ASYNC_REGISTRATION_FAILED 103 + +static inline int32_t runtime_status_from_error_codes(int32_t orch_error_code, int32_t sched_error_code) { + if (orch_error_code != PTO2_ERROR_NONE) { + return orch_error_code < 0 ? orch_error_code : -orch_error_code; + } + if (sched_error_code != PTO2_ERROR_NONE) { + return sched_error_code < 0 ? sched_error_code : -sched_error_code; + } + return 0; +} + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/MULTI_RING.md b/src/a2a3/runtime/fully_distributed_within_core/docs/MULTI_RING.md new file mode 100644 index 000000000..0de4f96ba --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/docs/MULTI_RING.md @@ -0,0 +1,330 @@ +# Multi-Ring Buffer Architecture + +> Extension to the PTO2 runtime. For the base architecture, see [RUNTIME_LOGIC.md](RUNTIME_LOGIC.md). + +## 1. Problem + +The single-ring design uses one `last_task_alive` watermark shared by HeapRing, TaskRing, and DepPool. When tasks from an inner scope (e.g., per-block iteration) complete, their resources cannot be reclaimed until **all** prior tasks — including those from the outer scope — also complete. 
This wastes ring capacity and can trigger deadlocks when ring sizes are small. + +## 2. Solution + +Split HeapRing, TaskRing, and DepPool into arrays of `PTO2_MAX_RING_DEPTH` (4) independent instances. Each scope depth maps to its own ring, with an independent `last_task_alive` watermark. + +```text +Scope depth 0 ──► rings[0] = { HeapRing, TaskRing, DepPool } +Scope depth 1 ──► rings[1] = { HeapRing, TaskRing, DepPool } +Scope depth 2 ──► rings[2] = { HeapRing, TaskRing, DepPool } +Scope depth ≥3 ──► rings[3] = { HeapRing, TaskRing, DepPool } (clamped) +``` + +Inner-scope tasks can now be reclaimed independently without waiting for outer-scope tasks to complete. + +## 3. Task ID Encoding + +Task IDs are widened from 32-bit to 64-bit to carry the ring identity: + +```text +task_id.raw = (ring_id << 32) | local_id +``` + +`PTO2TaskId` exposes direct accessors in `pto_runtime2_types.h`: + +| API | Purpose | +| --- | ------- | +| `PTO2TaskId::make(ring_id, local_id)` | Compose a 64-bit task ID (`PTO2TaskId`) | +| `task_id.ring()` | Extract `ring_id` (bits 63-32) | +| `task_id.local()` | Extract `local_id` (bits 31-0) | +| `task_id.raw` | Access the packed 64-bit encoding | + +Type changes: + +| Field | Before | After | +| ----- | ------ | ----- | +| `PTO2TaskDescriptor.task_id` | `int32_t` | `PTO2TaskId` | +| `PTO2TensorMapEntry.producer_task_id` | `int32_t` | `PTO2TaskId` | +| `PTO2TaskSlotState.ring_id` | N/A | `uint8_t` (new, denormalized for fast access) | + +## 4. 
Data Structures + +### 4.1 PTO2RingSet (new) + +Bundles the three per-ring resources into a single aggregate (`pto_ring_buffer.h`): + +```cpp +struct PTO2RingSet { + PTO2HeapRing heap_ring; + PTO2TaskRing task_ring; + PTO2FaninPool fanin_pool; +}; +``` + +### 4.2 PTO2OrchestratorState (modified) + +```cpp +// Before: single ring +PTO2HeapRing heap_ring; +PTO2TaskRing task_ring; +PTO2DepListPool dep_pool; + +// After: per-ring array (dep_pool moved to scheduler, see §4.5) +PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; +``` + +Ring selection: `current_ring_id() = min(scope_stack_top, PTO2_MAX_RING_DEPTH - 1)`. + +### 4.3 PTO2SharedMemoryHeader (modified) + +Per-ring flow control and per-ring layout info are grouped together: + +```cpp +struct PTO2RingFlowControl { + std::atomic current_task_index; // task ring head + std::atomic last_task_alive; // task ring tail + std::atomic heap_top; // heap alloc pointer + std::atomic heap_tail; // heap reclaim pointer +}; + +struct alignas(64) PTO2SharedMemoryRingHeader { + PTO2RingFlowControl fc; + + // Layout metadata (set once at init) + uint64_t task_window_size; + int32_t task_window_mask; // task_window_size - 1 + uint64_t heap_size; + uint64_t task_descriptors_offset; + + // Per-ring data pointers (host-side, set by setup_pointers) + PTO2TaskDescriptor *task_descriptors; + PTO2TaskPayload *task_payloads; + PTO2TaskSlotState *slot_states; + + // Accessors (slot = local_id & task_window_mask) + PTO2TaskDescriptor &get_task_by_slot(int32_t slot); + PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id); + PTO2TaskPayload &get_payload_by_slot(int32_t slot); + PTO2TaskPayload &get_payload_by_task_id(int32_t local_id); + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot); + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id); +}; + +// In header: +PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH]; +``` + +Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving watermark 
writes within the same ring. `FaninPool`/`DepListPool` `reclaim`/`ensure_space` take `PTO2SharedMemoryRingHeader&` directly (no `ring_id` or `fc` parameters). + +### 4.4 PTO2SharedMemoryHandle (lifecycle-only) + +Slimmed to lifecycle management only. Per-ring data pointers now live in `PTO2SharedMemoryRingHeader` (§4.3). Runtime components (orchestrator, scheduler) store `PTO2SharedMemoryHeader*` directly, eliminating one indirection on every per-ring access. + +```cpp +struct PTO2SharedMemoryHandle { + void *sm_base; + uint64_t sm_size; + PTO2SharedMemoryHeader *header; + bool is_owner; +}; +``` + +### 4.5 PTO2SchedulerState (modified) + +```cpp +struct RingSchedState { + // Cache Line 0: ring pointer (read-only) + hot path (read-write) + PTO2SharedMemoryRingHeader *ring; // direct pointer, no indirection + int32_t last_task_alive; + std::atomic advance_lock; // multi-thread CAS + + // Cache Line 1+: Thread 0 only (wiring dep_pool, cache-isolated) + alignas(64) PTO2DepListPool dep_pool; +}; + +RingSchedState ring_sched_states[PTO2_MAX_RING_DEPTH]; +PTO2SpscQueue wiring_queue; // global SPSC queue: orchestrator pushes, scheduler thread 0 drains +``` + +`slot_states`, `task_window_size`, and `task_window_mask` are no longer duplicated — callers access them via `ring->get_slot_state_by_*()` and other ring header accessors. The ring pointer shares cache line 0 with `last_task_alive` and `advance_lock`. 
+ +### 4.6 PTO2TensorMap (modified) + +```cpp +PTO2TensorMapEntry** task_entry_heads[PTO2_MAX_RING_DEPTH]; +int64_t last_task_alives[PTO2_MAX_RING_DEPTH]; +``` + +Entry validity checks and `cleanup_retired` operate per-ring: + +```cpp +bool entry_valid(const PTO2TensorMapEntry& e) { + int32_t ring = e.producer_task_id.ring(); + int32_t local = e.producer_task_id.local(); + return local >= last_task_alives[ring]; +} +``` + +### 4.7 Unchanged Structures + +| Structure | Reason | +| --------- | ------ | +| `PTO2DepListEntry` | Stores `PTO2TaskSlotState*` pointer — naturally crosses ring boundaries | +| `PTO2TaskPayload` | `fanin_slot_states[]` are pointers — no ring coupling | +| `PTO2ReadyQueue` | Global ready queues shared across all rings (tasks ready to dispatch regardless of origin ring) | +| `PTO2DispatchPayload` | Built per-dispatch, no ring state needed | + +## 5. Reclamation + +### 5.1 Per-Ring Watermark Advancement + +Each ring's `last_task_alive` advances independently: + +```text +advance_ring_pointers(ring_id): // protected by per-ring advance_lock + la = ring->fc.last_task_alive + while ring->get_slot_state_by_task_id(la).task_state >= CONSUMED: + reset slot for reuse + la++ + sync_to_sm() // release-store last_task_alive +``` + +Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving heap_tail writes within the same ring. 
+ +### 5.2 Cross-Ring Dependencies + +Dependency edges use `PTO2TaskSlotState*` pointers, which naturally span rings: + +- Ring 1 task depends on ring 0 producer → ring 0's `fanout_head` linked list contains a ring 1 `PTO2TaskSlotState*` +- When ring 0 task completes, it walks its fanout list and decrements ring 1 consumers' `fanin_refcount` +- No special cross-ring logic needed — pointer-based design is ring-agnostic + +### 5.3 DepPool Reclamation + +DepPool is exclusively managed by scheduler thread 0 (allocation during wiring, reclamation during watermark advancement): + +```text +// Called by scheduler thread 0 during wiring_queue drain: +dep_pool_reclaim(ring_id): + la = ring->fc.last_task_alive + newest_consumed = la - 1 + mark = ring->get_slot_state_by_task_id(newest_consumed).dep_pool_mark + if mark > 0: + ring_sched_states[ring_id].dep_pool.advance_tail(mark) +``` + +Note: dep entries from ring N's pool may appear in ring M's fanout lists. Reclamation is safe because the entries are accessed during fanout traversal (completion time), which always happens before the consumer task — and therefore the dep entry — becomes eligible for reclamation. + +## 6. AICPU Register Protocol Fix + +The AICore dispatch protocol uses 32-bit registers. With multi-ring, `task_id` truncation to 32-bit loses the `ring_id`, causing collisions: + +```text +Ring 0, local_id=0 → DATA_MAIN_BASE = 0 + 1 = 1 +Ring 1, local_id=0 → DATA_MAIN_BASE = 0 + 1 = 1 (collision!) +``` + +AICore uses `last_reg_val` to detect new dispatches — identical values cause skipped tasks and false completions from stale COND registers. + +**Fix**: Per-core monotonic dispatch counter `s_dispatch_seq[core_id]` replaces `task_id` in register writes, guaranteeing unique `DATA_MAIN_BASE` values per core regardless of ring origin. + +## 7. 
Configuration + +### 7.1 Compile-Time Defaults (per ring) + +| Constant | Default | Total (×4 rings) | +| -------- | ------- | ---------------- | +| `PTO2_TASK_WINDOW_SIZE` | 16384 | 65536 | +| `PTO2_HEAP_SIZE` | 256 MB | 1 GB | +| `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 | + +### 7.2 Runtime Overrides + +Ring sizing can be configured either uniformly for every ring or independently +per ring. Precedence is resolved independently for each resource and ring: + +```text +per-ring CallConfig value + > scalar CallConfig value + > per-ring PTO2_RING_* env value + > scalar PTO2_RING_* env value + > compile-time default +``` + +`ring_id` is the scope-depth ring selected by the runtime: + +```text +scope depth 0 -> ring 0 +scope depth 1 -> ring 1 +scope depth 2 -> ring 2 +scope depth >=3 -> ring 3 +``` + +Per-task via `CallConfig.runtime_env` — different L2 tasks in one launch can +each carry their own sizes. Invalid values raise at submit time (`validate()`). +The scalar fields preserve the old behavior and broadcast one value to all +rings: + +```python +cfg = CallConfig() +cfg.runtime_env.ring_task_window = 128 # power of 2, >= 4 +cfg.runtime_env.ring_heap = 262144 # bytes/ring, >= 1024 +cfg.runtime_env.ring_dep_pool = 256 # 4 .. INT32_MAX +orchestrator.submit_next_level(handle, args, cfg) +``` + +Set the array fields to tune the four scope-depth rings independently. Each +array must contain exactly four entries; use `0` for an entry that should fall +through to the next precedence tier. All `CallConfig` values are integer +byte/count values. 
+ +```python +cfg = CallConfig() +cfg.runtime_env.ring_task_windows = [8192, 16384, 131072, 524288] +cfg.runtime_env.ring_heaps = [ + 128 * 1024 * 1024, + 256 * 1024 * 1024, + 384 * 1024 * 1024, + 512 * 1024 * 1024, +] +cfg.runtime_env.ring_dep_pools = [4096, 8192, 16384, 32768] +orchestrator.submit_next_level(handle, args, cfg) +``` + +Scene tests set the same keys under a nested `runtime_env` block in the +per-case `config` dict: + +```python +"config": { + "runtime_env": { + "ring_task_windows": [8192, 16384, 131072, 524288], + "ring_heaps": [134217728, 268435456, 402653184, 536870912], + "ring_dep_pools": [4096, 8192, 16384, 32768], + } +} +``` + +Process-wide env fallback accepts either one scalar value or exactly four +comma-separated per-ring values. Invalid env values are logged and ignored, then +fall through to defaults. `PTO2_RING_HEAP` values are integer bytes: + +```bash +# Uniform, old behavior: +PTO2_RING_TASK_WINDOW=1024 +PTO2_RING_HEAP=1048576 +PTO2_RING_DEP_POOL=1024 + +# Per-ring, indexed by ring_id 0..3: +PTO2_RING_TASK_WINDOW=8192,16384,131072,524288 +PTO2_RING_HEAP=134217728,268435456,402653184,536870912 +PTO2_RING_DEP_POOL=4096,8192,16384,32768 +``` + +Use `--enable-scope-stats` to confirm the effective values for a real run. The +first line of `scope_stats/scope_stats.jsonl` includes `task_window_max`, +`heap_max`, and `dep_pool_max`, indexed by `ring`. 
+ +### 7.3 Sizing Guidelines + +- `task_window` must be ≥ max tasks in any single scope + headroom for concurrent scopes +- `heap` must accommodate peak output buffer allocation across all in-flight tasks on that ring +- `dep_pool` must be ≥ total dependency entries for all in-flight tasks on that ring +- On hardware, back-pressure latency is higher than in simulation — size conservatively +- Adding inner `PTO2_SCOPE` reduces peak per-ring usage, enabling smaller sizes diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md new file mode 100644 index 000000000..e6760fb1e --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md @@ -0,0 +1,39 @@ +# Runtime Logic: fully_distributed_within_core + +**Target design.** Orchestration, scheduling, and execution all run on the AI +cores in SPMD fashion; the AICPU is removed from orchestration/scheduling. The +authoritative specification is: + +- [`docs/fully_distributed_within_core.md`](../../../../docs/fully_distributed_within_core.md) + +Core elements (see the spec): + +- Task ownership via a claim race over two global cursors (`cube_cursor`, + `vector_cursor`); `owner = builder = executor`. +- Per-core full-duplicate TensorMap for dependency discovery (pull model via a + global `task_completed_flag` ring). +- Per-core private task ring + block-shared `block.won[N]` deposit table for + multi-core (MIX / 2V) co-ownership (anchor push + follower async drain). +- Deterministic, per-core-replicated GM output heap with frontier-based + reclamation. + +## Current state (re-based on tensormap_and_ringbuffer) + +This runtime is re-based on `tensormap_and_ringbuffer` to reuse its +`PTO2TensorMap`, `MixedKernels`/`ActiveMask`, `L0TaskArgs`, the +`pto_orchestration_api.h` submit API, and kernel-address resolution. 
The +distributed model is layered on incrementally: + +- `runtime/` — adds global claim cursors, a global completion-flag ring, a + deterministic GM output heap, and per-core replicated TensorMap + private task + ring on top of the reused types. +- `aicore/` — the SPMD run-ahead orchestrate+execute loop (spec section 6). +- `aicpu/` — reduced to an init/wire/signal/wait stub (no orchestration, + scheduling, or dispatch). +- `host/` — runtime maker / compile info (orchestration entry is invoked on the + cores). +- `orchestration/` — the PTO2 orchestration API (unchanged surface). + +The legacy AICPU orchestrator/scheduler sources inherited from +`tensormap_and_ringbuffer` (`runtime/scheduler/`, the orchestrator pipeline) are +progressively replaced or bypassed by the distributed path. diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md b/src/a2a3/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md new file mode 100644 index 000000000..ef1de83b4 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md @@ -0,0 +1,137 @@ +# Scalar Data Access — get/set_tensor_data Design + +## 1. Overview + +During task graph construction, orchestration sometimes needs to read InCore kernel results (for control-flow decisions) or write initial values into tensors. `get_tensor_data` / `set_tensor_data` provide **blocking** cross-layer data access, allowing orchestration to safely read and write tensor data. + +**Core design principle**: Reuse the existing TensorMap dependency tracking mechanism — no new synchronization infrastructure. + +## 2. 
API + +```cpp +// Blocking read: returns value at the given indices (default: raw uint64_t bits) +// Specify T for typed read: float val = get_tensor_data<float>(tensor, 1, idx); +template <typename T = uint64_t> +T get_tensor_data(const Tensor& tensor, uint32_t ndims, const uint32_t indices[]); + +// Blocking write: stores value at the given indices (type deduced from argument) +// Typed write: set_tensor_data(tensor, 1, idx, 42.0f); +template <typename T> +void set_tensor_data(Tensor& tensor, uint32_t ndims, const uint32_t indices[], T value); +``` + +Both call into the runtime through the ops table — orchestration .so needs no runtime symbol linkage. + +## 3. Blocking Interface Design + +### 3.1 get_tensor_data Flow + +```text +addr null-check → TensorMap lookup → spin-wait producer COMPLETED → compute flat offset → memcpy read +``` + +- **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0 +- **TensorMap lookup**: find producer task by `buffer.addr` +- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED` +- **No producer** (lookup callback never fires): skip waiting, read immediately + +### 3.2 set_tensor_data Flow + +```text +addr null-check → TensorMap lookup → spin-wait producer COMPLETED → spin-wait consumers done → memcpy write +``` + +One extra step versus get_tensor_data: wait for all consumers to finish (`fanout_refcount >= fanout_count - 1`, excluding the scope reference). + +### 3.3 Timeout + +- Uses cycle counter (`get_sys_cnt_aicpu()`), checked every 1024 spins +- Threshold: `PTO2_TENSOR_DATA_TIMEOUT_CYCLES` (~10 s at 1.5 GHz) +- On timeout: sets `orch.fatal = true`, preventing further task submission + +## 4. add_output with Initial Value + +```cpp +TensorCreateInfo ci(shapes, ndims, dtype); +ci.set_initial_value(initial_value); +args.add_output(ci); +``` + +**Mechanism**: + +1. `ci.set_initial_value(value)` marks the create-info with an initial value before submission +2. 
`add_output(ci)` stores a pointer to `ci` in `L0TaskArgs` (the original must remain valid until submit) +3. During payload init, the output tensor is materialized via `init_from_create_info()` which triggers the fill +4. Fill strategy: + - Small buffer (< 64 B): element-by-element memcpy directly into dst + - Large buffer (≥ 64 B): fill the first 64 bytes as a template block, then bulk-memcpy in 64 B chunks; partial tail copy for remainder + +**Constraint**: existing tensors are write targets only through `add_inout()`. + +## 5. Scalar Dependencies via 1-Element Tensors + +Traditional scalars (`L0TaskArgs::add_scalar`) are one-way inputs with no TensorMap tracking. For cross-task scalar values, use a 1-element tensor as the carrier: + +```cpp +uint32_t shapes[1] = {1}; +TensorCreateInfo scalar_ci(shapes, 1, DataType::FLOAT32); + +// Submit with initial value and keep the returned tensor +scalar_ci.set_initial_value(float_to_u64(77.0f)); +L0TaskArgs args; +args.add_output(scalar_ci); +TaskOutputTensors outs = rt_submit_aiv_task(FUNC_NOOP, args); +const Tensor& scalar_tensor = outs.get_ref(0); + +// Orchestration-side blocking read (waits for kernel completion) +uint32_t idx[1] = {0}; +float val = get_tensor_data<float>(scalar_tensor, 1, idx); +``` + +**Advantage**: Fully reuses existing TensorMap (producer tracking, fanin/fanout dependencies) — no new infrastructure needed. + +## 6. Data Hazard Analysis + +Three actors: + +- **Kernel**: InCore task submitted via add_input/add_output/add_inout (asynchronous execution) +- **Orch Read**: orchestration calls `get_tensor_data` (blocking read) +- **Orch Write**: orchestration calls `set_tensor_data` (blocking write) + +### Hazard Matrix (earlier operation → later operation) + +| # | Earlier Op | Later Op | Hazard | Guarantee | Safe? 
| +| - | ---------- | -------- | ------ | --------- | ----- | +| 1 | Kernel write (OUTPUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes | +| 2 | Kernel write (OUTPUT) | Orch Write | WAW | spin-wait producer COMPLETED | Yes | +| 3 | Kernel read (INPUT) | Orch Write | WAR | spin-wait fanout_refcount | **Needs INOUT** | +| 4 | Kernel read-write (INOUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes | +| 5 | Kernel read-write (INOUT) | Orch Write | WAW+WAR | spin-wait producer + consumers | Yes | +| 6 | Orch Write | Kernel read (INPUT) | RAW | blocking completes before next submit | Yes | +| 7 | Orch Write | Kernel write (OUTPUT) | WAW | same — serial guarantee | Yes | +| 8 | Orch Read | Kernel write (OUTPUT) | WAR | same — serial guarantee | Yes | +| 9–12 | Orch ↔ Orch | — | — | same-thread serial execution | Yes | + +### Key Design Points + +**Scenario #3 is the only case requiring special attention**: + +TensorMap tracks only producers (OUTPUT/INOUT), not pure INPUT consumers. If a tensor is only registered via `add_input()`, TensorMap has no producer entry for it. `set_tensor_data`'s `wait_for_tensor_ready()` finds no matching producer (the lookup callback never fires) and returns immediately — but the kernel may still be reading → **WAR data race**. + +**Solution**: For tensors that may later be written via `set_tensor_data`, use `add_inout()` instead of `add_input()`. INOUT registers a producer entry in TensorMap, enabling `set_tensor_data` to track all consumers through `fanout_refcount`. + +**Scenarios #6–8 serial guarantee**: + +get/set_tensor_data are blocking calls, and orchestration is single-threaded serial submission. After a blocking operation completes, subsequent code (including task submissions) executes strictly afterward. + +## 7. External Tensor Behavior + +`make_tensor_external()` creates tensors with a pre-set `buffer.addr` (pointing to host-allocated device memory). 
+ +| Scenario | Behavior | +| -------- | -------- | +| External tensor never submitted as OUTPUT/INOUT | No TensorMap entry — get/set execute immediately | +| External tensor previously submitted as OUTPUT/INOUT | TensorMap has producer entry — get/set spin-wait | +| External tensor submitted as INPUT, then set_tensor_data | **WAR risk** — must use INOUT instead (same as scenario #3) | + +**Key rule**: If an external tensor will later be written via `set_tensor_data`, all prior kernel accesses must use `add_inout()`, not `add_input()`. diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md b/src/a2a3/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md new file mode 100644 index 000000000..8cba7e90c --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md @@ -0,0 +1,222 @@ +# Submit by Cluster - Requirements and Main-Branch-Aligned Design + +## 1. Goal + +Define a single, main-branch-aligned specification for PTO2 cluster submission that combines: + +1. Product requirements (what must be true). +2. Runtime design (how it is implemented on current main baseline). + +The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular. + +## 2. Background and Motivation + +Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`). +The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels. + +Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster. + +## 3. Scope + +### In Scope + +1. New orchestration-facing submit API for cluster-aware mixed submission. +2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit. +3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity. +4. 
AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets). + +### Out of Scope + +1. User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs). +2. New worker types beyond AIC/AIV. +3. Cross-cluster user placement policies. +4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster. + +## 4. Main-Branch Baseline Constraints + +Design must preserve the current main runtime architecture: + +1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). + +## 5. Terminology + +1. `cluster`: one physical unit with `1 AIC + 2 AIV`. +2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots. +3. `MixedTask`: one runtime graph node created by one submit call. +4. `active_mask`: bitmask of active subtask slots. +5. `resource shape`: normalized lane demand class of a mixed task. + +## 6. API Contract + +```cpp +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +static inline void rt_submit_task(PTO2Runtime* rt, + const MixedKernels& mixed_kernels, + Arg* args, + int32_t num_args); + +static inline void rt_submit_aic_task(PTO2Runtime* rt, + int32_t kernel_id, + Arg* args, + int32_t num_args); + +static inline void rt_submit_aiv_task(PTO2Runtime* rt, + int32_t kernel_id, + Arg* args, + int32_t num_args); +``` + +Rules: + +1. One submit call creates one `MixedTask`. +2. All active slots share the same `args` and `num_args`. +3. At least one slot must be active. +4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent. +5. Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries. +6. 
Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers. +7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API. + +## 7. Data Model (Requirements + Design) + +`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state: + +1. `task_id` +2. `active_mask` +3. `completed_subtasks` (atomic counter, incremented per subtask completion) +4. `kernel_id[3]` for `(AIC, AIV0, AIV1)` +5. dependency heads/counters and packed-buffer metadata + +`PTO2TaskPayload` (cold path) carries: + +1. shared args/tensors/scalars copied once per mixed submit +2. fanin mixed-task IDs +3. other cold-path submit metadata + +Producer identity in TensorMap is mixed-task ID end-to-end. + +## 8. Scheduling Model + +### 8.1 Resource Shapes + +Runtime uses shape-based ready queues (not worker-type queues): + +1. `AIC_ONLY` +2. `AIV_X1` +3. `AIV_X2` +4. `AIC_AIV_X1` +5. `AIC_AIV_X2` + +Queueing key is normalized resource shape (not raw slot label). + +### 8.2 Atomic Cluster Dispatch + +1. Dispatch decision unit is one mixed task. +2. For multi-slot mixed tasks, partial launch is forbidden. +3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes. +4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes. + +### 8.3 Dependency and Completion + +1. Fanin release/readiness remains dependency-correct and graph-level. +2. Two-stage completion: + - `on_subtask_complete(task_id, subslot)` + - `on_task_complete(task_id)` only when `completed_subtasks == total_required_subtasks` +3. Downstream release is triggered once per mixed task completion, not once per subslot. + +## 9. Executor Ownership and Numbering + +### 9.1 Canonical Flattened Numbering (Unchanged) + +Given `block_dim` clusters: + +1. AIC IDs: `[0, block_dim)` +2. AIV IDs: `[block_dim, 3 * block_dim)` +3. 
Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}` + +This project-defined flattened numbering is kept unchanged. + +### 9.2 Cluster Ownership + +1. One cluster must be owned by one scheduler domain/thread at a time. +2. No split-cluster ownership in either: + - initial `assign_cores_to_threads()` + - post-orchestrator `reassign_cores_for_all_threads()` +3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. + +## 10. Functional Requirements + +### 10.1 Valid Mixed Shapes + +1. AIC only +2. AIV only (1 or 2 AIV lanes) +3. AIC + 1 AIV +4. AIC + 2 AIV + +### 10.2 Runtime Behavior per Submit + +1. Validate submit arguments. +2. Allocate mixed-task ID and initialize descriptor/payload/slot_state once. +3. Lookup producers via TensorMap; collect fanin metadata and increment producers' `fanout_count`. +4. Push task to scheduler's wiring queue (scheduler thread 0 asynchronously wires fanout edges and determines readiness). +5. Dispatch all active lanes atomically when resources allow. +6. Aggregate completion and release downstream once. + +## 11. Non-Functional Requirements + +1. Correctness: no dependency violation, no partial mixed-task dispatch. +2. Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent. +3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required. +4. Performance: no obvious regression for non-cluster workflows. +5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete. + +## 12. Acceptance Criteria + +Feature is accepted when: + +1. Orchestration compiles and submits via `MixedKernels` API/wrappers. +2. Scheduler dispatches each mixed task as one cluster scheduling decision. +3. Dependencies gate mixed-task readiness correctly. +4. AIV execution remains cluster-local and semantically equivalent across lanes. +5. 
Existing non-cluster workflows continue to pass without behavior regression. +6. Cluster ownership is never split across scheduler domains before/after transition. + +## 13. Verification Matrix + +Recommended validation coverage: + +1. Mapping correctness for cluster-to-core ID relation. +2. Atomic dispatch for multi-slot shapes. +3. Dependency gating and completion aggregation (`completed_subtasks == total_required_subtasks`). +4. Lane-occupancy co-residency behavior for compatible shapes. +5. Core-transition ownership stability. +6. Invalid submit handling (`always_assert` path). +7. Regression coverage for existing examples/tests. + +Milestone command (device): + +```bash +python tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py \ + -p a2a3 -d 9 +``` + +Final validation: + +```bash +pytest examples tests/st --platform a2a3 +``` + +## 14. Resolved Decisions + +1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract. +2. Invalid mixed submits fail with existing submit-time assert behavior. +3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant. +4. Submit-contract types live in one shared header-only surface. +5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee. diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/device_log_profiling.md b/src/a2a3/runtime/fully_distributed_within_core/docs/device_log_profiling.md new file mode 100644 index 000000000..af661d440 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/docs/device_log_profiling.md @@ -0,0 +1,166 @@ +# PTO2 Device Log Profiling Guide + +## How to Find Device Logs + +AICPU logs (via `LOG_INFO_V9`) are written by CANN's **dlog** subsystem and do **not** appear in the `python test_*.py` / pytest terminal output. 
They are written to CANN's device log directory: + +```text +$HOME/ascend/log/debug/device-<device_id>/device-<pid>_<timestamp>.log +``` + +Each run produces a new log file (or appends to an existing one). Find the most recent file by modification time: + +```bash +ls -lt $HOME/ascend/log/debug/device-<device_id>/ | head -5 +``` + +## Log Structure Overview + +A single run produces two profiling blocks in the device log: + +| Block | Emitted by | Function | Content | +| ----- | ---------- | -------- | ------- | +| **Orchestrator Profiling** | Thread 3 (orchestrator) | `aicpu_orchestration_entry` | Time breakdown of graph construction on device | +| **PTO2 Scheduler Summary** | Threads 0/1/2 (schedulers) | `SchedulerContext::resolve_and_dispatch` | Per-thread scheduling statistics, phase timing, and lock contention | + +All timing values are in microseconds (us), converted from AICPU cycle counters. + +--- + +## Block 1: Orchestrator Profiling + +Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_entry`, and prints a profiling summary after it returns. 
+ +### Example (from a real run: batch=64, 16704 tasks) + +```text +Thread 3: Calling aicpu_orchestration_entry from SO +Thread 3: aicpu_orchestration_entry returned, cost 20943.940us +Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us === +Thread 3: sync_tensormap : 286.300us (2.0%) +Thread 3: task_ring_alloc: 380.400us (2.6%) +Thread 3: param_copy : 2147.800us (14.7%) +Thread 3: lookup+dep : 7290.300us (49.9%) +Thread 3: heap_alloc : 701.500us (4.8%) +Thread 3: tensormap_ins : 1890.380us (12.9%) +Thread 3: fanin+ready : 1207.400us (8.3%) +Thread 3: finalize+SM : 697.500us (4.8%) +Thread 3: scope_end : 364.080us +Thread 3: avg/task : 0.874us +Thread 3: PTO2 total submitted tasks = 16704 +``` + +### Field Reference + +| Field | Source (`pto_orchestrator.cpp`) | Description | +| ----- | ------------------------------- | ----------- | +| **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead | +| **total** | Sum of all sub-steps below | Accumulated time inside `submit_task` across all tasks | +| **sync_tensormap** | `g_orch_sync_cycle` | TensorMap validity sync and optional cleanup before each submission | +| **task_ring_alloc** | `g_orch_alloc_cycle` | Allocating a task slot from the task ring buffer | +| **param_copy** | `g_orch_args_cycle` | Copying param descriptors + tensor descriptor copies into task-owned storage | +| **lookup+dep** | `g_orch_lookup_cycle` | TensorMap lookup for inputs/inouts + building fanin/fanout dependency edges | +| **heap_alloc** | `g_orch_heap_cycle` | Allocating packed output buffers from the heap ring | +| **tensormap_ins** | `g_orch_insert_cycle` | Inserting output/inout tensors into the TensorMap | +| **fanin+ready** | `g_orch_fanin_cycle` | Building the fanin list + checking if task is already ready (Step 5/5b) | +| **scope_end** | `g_orch_scope_end_cycle` | `end_scope` overhead (notifying scheduler of scope completion) | +| **avg/task** | `total / submit_count` 
| Average orchestrator time per task submission | + +### Interpreting the Numbers + +- **cost > total**: The difference is overhead outside `submit_task` (the orchestration user code itself, scope_begin/end, TensorCreateInfo construction, etc.). +- **lookup+dep** is typically the dominant cost (~50%) because it involves TensorMap hash lookups and building dependency edges with spinlock-protected fanout list insertions. +- **param_copy** scales with the number of parameters per task. +- **avg/task < 1us** indicates efficient graph construction. + +--- + +## Block 2: PTO2 Scheduler Summary + +Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**. + +### Example (Thread 0, from a different run: batch=1, 1044 tasks) + +```text +Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop) +Thread 0: --- Phase Breakdown --- +Thread 0: complete: 1485.020us (42.7%) +Thread 0: scan: 14.400us (0.4%) +Thread 0: dispatch: 1973.060us (56.7%) +Thread 0: idle: 4.940us (0.1%) +``` + +### Summary Line + +```text +Thread N: completed=X tasks in Yus (Z loops, W tasks/loop) +``` + +| Field | Description | +| ----- | ----------- | +| **completed** | Number of tasks this thread processed to completion | +| **Y us** | Total scheduler loop time (sum of all phase cycles) | +| **Z loops** | Number of scheduler loop iterations | +| **W tasks/loop** | Average tasks completed per loop iteration; higher = better throughput | + +### Phase Breakdown + +The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations. 
+ +| Phase | What it does | Inline stats | +| ----- | ------------ | ------------ | +| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(task_id, subslot)` to increment the completion counter; when `completed_subtasks == total_required_subtasks`, triggers `on_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | +| **scan** | Updates the perf profiling header with latest scheduler state | — | +| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | +| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — | + +**Interpreting phase percentages:** + +- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`). +- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement). +- **scan** is small (<1%) — only updates the perf header. +- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved. + +**Interpreting pop hit_rate:** + +- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient. +- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself. 
+ +### Per-Task Averages + +Divide each thread's phase times by its `completed` count to get per-task scheduling cost: + +| Metric | Formula | Typical value | +| ------ | ------- | ------------- | +| Scheduling overhead per task | total_time / completed | ~5-10 us/task | +| Dispatch per task | dispatch_time / completed | ~3-6 us/task | +| Complete per task | complete_time / completed | ~2-4 us/task | + +--- + +## Cross-Referencing with Host Profiling + +When `--enable-l2-swimlane` is used, the host terminal prints a **Task Statistics by Function** table with `Total_Exec` (total AICore kernel execution time). Combined with device log data: + +| Metric | Source | Description | +| ------ | ------ | ----------- | +| Avg kernel exec time | `Total_Exec / total_tasks` (host) | Time AICore spends executing each kernel | +| Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task | +| Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution | + +A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow. 
+ +--- + +## Quick Reference: Extracting Profiling Data + +```bash +# Find the latest device log for device 2 +ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1 + +# Extract orchestrator profiling (Thread 3) +grep "Thread 3:" <log_file> + +# Extract scheduler profiling (Threads 0/1/2) +grep -E "Thread [012]:" <log_file> +``` diff --git a/src/a2a3/runtime/fully_distributed_within_core/docs/profiling_levels.md b/src/a2a3/runtime/fully_distributed_within_core/docs/profiling_levels.md new file mode 100644 index 000000000..bd669f365 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/docs/profiling_levels.md @@ -0,0 +1,480 @@ +# PTO Runtime2 Profiling Levels + +This document describes the profiling macro hierarchy and logging control in the PTO Runtime2 system. + +## Overview + +PTO Runtime2 uses a hierarchical profiling system with compile-time macros to control profiling code compilation and log output. The `enable_l2_swimlane` runtime flag (integer perf_level 0–4) controls data collection granularity (performance buffers, shared memory writes) but does NOT control log output. + +## Profiling Macro Hierarchy + +Defaults and dependency validation are centralized in +`src/common/task_interface/profiling_config.h`. Runtime headers include that +file before using the macros, so both a2a3 and a5 share the same default +values and compile-time checks. 
+ +```text +PTO2_PROFILING (base level, default=1) +├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1) +│   └── PTO2_TENSORMAP_PROFILING (tensormap, default=0, requires PTO2_ORCH_PROFILING=1) +├── PTO2_SCHED_PROFILING (scheduler, default=0, requires PTO2_PROFILING=1) +└── --enable-l2-swimlane [PERF_LEVEL] (L2 swimlane data collection, 0-4, bare=4, requires PTO2_PROFILING=1) + +``` + +### Compile-Time Validation + +Each sub-level macro requires its parent macro to be enabled (`PTO2_TENSORMAP_PROFILING` requires `PTO2_ORCH_PROFILING=1`; the others require `PTO2_PROFILING=1`): + +```cpp +#if PTO2_ORCH_PROFILING && !PTO2_PROFILING +#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1" +#endif + +#if PTO2_SCHED_PROFILING && !PTO2_PROFILING +#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" +#endif + +#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING +#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" +#endif +``` + +## Profiling Levels + +### Level 0: No Profiling (PTO2_PROFILING=0) + +**What's compiled:** + +- Debug/diagnostic logs (always present) +- Progress tracking (`PTO2 progress: completed=...`) +- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget) +- Deadlock/livelock detection (`diagnose_stuck_state`, called on stall) + +**What's NOT compiled:** + +- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters) +- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING` +- Performance data collection paths (`enable_l2_swimlane` runtime flag becomes ineffective because profiling code is not compiled) + +**Log output (normal run, no stall):** + +- No `sched_start/sched_end/sched_cost` timestamps +- No `orch_start/orch_end/orch_cost` timestamps +- No `Scheduler summary: total_time=...` +- No `PTO2 total submitted tasks` log +- `PTO2 progress: completed=... 
total=...` may appear (thread 0 only, at task completion milestones) + +--- + +### Level 1: Basic Profiling (PTO2_PROFILING=1) + +**What's compiled:** + +- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) +- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) +- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true) +- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) +- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) +- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) + +**What's NOT compiled:** + +- Detailed phase breakdowns +- TensorMap statistics + +**Log output (additional lines vs Level 0, per normal run):** + +- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete +- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line +- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true` +- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary +- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread +- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) + +**LOG_INFO_V9 count (normal run):** + +- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) +- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`) + +> See the table at the end for concrete counts based on 
the `paged_attention` example. + +**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10): + +```text +Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us +Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us +PTO2 total submitted tasks = 13, already executed 13 tasks +Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us +Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6 +Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us +Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 +``` + +**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11): + +```text +Thread 3: orch_stage_end=48236915058307 +Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us +Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us +PTO2 total submitted tasks = 13, already executed 13 tasks +Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us +Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4 +Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us +Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9 +``` + +> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time). + +**Note:** + +- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. +- `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. +- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`. 
+ +--- + +### Level 2: Scheduler Detailed Profiling (PTO2_SCHED_PROFILING=1) + +**Requires:** `PTO2_PROFILING=1` + +**What's compiled:** + +- All Level 1 features +- Detailed scheduler phase counters +- Phase-specific statistics (complete, scan, dispatch, idle) +- Hit rate tracking (complete poll, ready queue pop) + +**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced) + +- Replaces scheduler summary with detailed breakdown + +**Scheduler output:** + +```text +Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks === +Thread X: complete : XXXus (XX.X%) +Thread X: poll : XXXus (XX.X%) hit=XXX, miss=XXX, hit_rate=XX.X% +Thread X: otc_lock : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: otc_fanout : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: otc_fanin : XXXus (XX.X%) atomics=XXX +Thread X: otc_self : XXXus (XX.X%) atomics=XXX +Thread X: perf : XXXus (XX.X%) +Thread X: dispatch : XXXus (XX.X%) +Thread X: poll : XXXus (XX.X%) +Thread X: pop : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: setup : XXXus (XX.X%) +Thread X: scan : XXXus (XX.X%) +Thread X: idle : XXXus (XX.X%) +Thread X: avg/complete : XXXus +Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX +``` + +Per-thread fanout / fanin edge counts and ready-queue pop hit / miss +stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json` +captured at l2_swimlane_level >= 3) and `deps.json`; consume them via +`simpler_setup/tools/sched_overhead_analysis.py`. 
+ +--- + +### Level 3: Orchestrator Detailed Profiling (PTO2_ORCH_PROFILING=1) + +**Requires:** `PTO2_PROFILING=1` + +**What's compiled:** + +- All Level 1 features +- Detailed orchestrator phase counters +- Per-phase cycle tracking +- Atomic operation counters +- Wait time tracking + +**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced) + +- Replaces basic orchestration completion with detailed breakdown + +**Orchestrator output:** + +```text +Thread X: === Orchestrator Profiling: XXX tasks, total=XXXus === +Thread X: sync_tensormap : XXXus (XX.X%) +Thread X: task_ring_alloc: XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: param_copy : XXXus (XX.X%) atomics=XXX +Thread X: lookup+dep : XXXus (XX.X%) +Thread X: heap_alloc : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: tensormap_ins : XXXus (XX.X%) +Thread X: fanin+ready : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: finalize+SM : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: scope_end : XXXus atomics=XXX +Thread X: avg/task : XXXus +``` + +**Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag. + +--- + +### Level 4: TensorMap Profiling (PTO2_TENSORMAP_PROFILING=1) + +**Requires:** `PTO2_PROFILING=1` AND `PTO2_ORCH_PROFILING=1` + +**What's compiled:** + +- All Level 3 features +- TensorMap lookup statistics +- Hash chain walk tracking +- Overlap check counters + +**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap) + +**TensorMap output:** + +```text +Thread X: === TensorMap Lookup Stats === +Thread X: lookups : XXX, inserts: XXX +Thread X: chain walked : total=XXX, avg=X.X, max=X +Thread X: overlap checks : XXX, hits=XXX (XX.X%) +``` + +--- + +## Runtime Flag: enable_l2_swimlane (perf_level) + +`--enable-l2-swimlane` accepts an integer perf_level (0–4). 
Transport +mirrors the PMU pattern — two independent channels (one binary, one int): + +- **Binary on/off** — `KernelArgs::enable_profiling_flag` bit1 + (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read + by AICore (which only needs on/off to decide whether to write timing) and + by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`. +- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level` + (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU + promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via + `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for + `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates. + +On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled` +entry point; the granular level still goes through the shared-memory +header just like on onboard. + +| Level | Collects | +| ----- | -------- | +| 0 | Nothing (disabled) | +| 1 | AICore timing only (start/end/task_token_raw) — AICPU `complete_task` is bypassed | +| 2 | + AICPU dispatch_time, finish_time | +| 3 | + Scheduler phases (`SCHED_*`) | +| 4 | + Orchestrator phases (full) | + +At level 1 the AICore record carries the full PTO2 `task_token_raw` +(`(ring_id << 32) | local_id`), read straight from +`LocalContext.async_ctx.task_token.raw` inside the AICore helper — +already in cache from the dispatch payload, so no extra GM load. +Identity fields the AICPU side used to write at level 1 (`func_id`, +`core_type`) are derived host-side: + +- `func_id` ← `deps.json`'s per-task `kernel_ids[]`, joined by + `task_id` at post-process by `swimlane_converter.py`. Same model + `fanout` already uses. +- `core_type` ← per-core static table published by the host into the + collector (`L2SwimlaneCollector::set_core_types`). + +AICore buffer rotation no longer piggy-backs on `complete_task`. 
AICPU +counts dispatches per core in the dispatch path (scheduler_dispatch in +tensormap_and_ringbuffer; aicpu_executor in host_build_graph) and rotates +the AICore buffer when the count is about to cross a +`PLATFORM_AICORE_BUFFER_SIZE` boundary — strictly before +`write_reg(DATA_MAIN_BASE)` for the first task of the new batch. The +hook is `l2_swimlane_aicpu_on_aicore_dispatch`. No AICore-side signal is +needed: AICPU has full dispatch visibility on its own. Race safety comes +from the completion-before-dispatch invariant (AICore per core is +single-threaded and AICPU does not dispatch task K+1 until K FIN'd), which +guarantees AICore has FIN'd — and `dcci`'d out — every record in the old +buffer by rotation time. This decoupling is what lets level 1 skip +`complete_task` without losing rotations. + +Fanout edges are no longer carried on the device hot path — `swimlane_converter.py` +joins them from the sibling `deps.json` (produced by dep_gen) at post-process time. + +Bare `--enable-l2-swimlane` = level 4 (backward compatible). + +### Level gating in AICPU code + +Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the +content it depends on instead of relying on magic numbers: + +```cpp +// Any level > 0: AICPU task record buffer init / flush. +// Cheap binary check, available immediately after kernel entry. +if (is_l2_swimlane_enabled()) { ... } + +// AICPU dispatch/finish timestamps. +// Granular checks below require l2_swimlane_aicpu_init to have already run +// (so the level has been promoted from the shared-memory header). +if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... } + +// Scheduler main-loop phase records (SCHED_*) +if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ... } + +// Orchestrator phase records +if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... 
} +``` + +`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with +underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level` +shared-memory field and mirrors `PmuEventType : uint32_t`): + +| Enumerator | Underlying value | +| ---------- | ---------------- | +| `DISABLED` | 0 | +| `AICORE_TIMING` | 1 | +| `AICPU_TIMING` | 2 | +| `SCHED_PHASES` | 3 | +| `ORCH_PHASES` | 4 | + +### When enable_l2_swimlane=0 + +- No performance data collection +- No shared memory writes +- Logs still print (controlled by macros only) + +--- + +## Common Profiling Configurations + +### Development (minimal overhead) + +```bash +# No profiling overhead +PTO2_PROFILING=0 +``` + +### Basic Performance Monitoring + +```bash +# Minimal overhead, summary logs only +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=0 +PTO2_SCHED_PROFILING=0 +``` + +### Scheduler Performance Analysis + +```bash +# Detailed scheduler breakdown +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=0 +PTO2_SCHED_PROFILING=1 +``` + +### Orchestrator Performance Analysis + +```bash +# Detailed orchestrator breakdown +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=1 +PTO2_SCHED_PROFILING=0 +``` + +### Full Profiling (maximum overhead) + +```bash +# All profiling features enabled +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=1 +PTO2_SCHED_PROFILING=1 +PTO2_TENSORMAP_PROFILING=1 +``` + +--- + +## Setting Profiling Macros + +### At compile time + +Pass compile definitions through the build command or CI `CXXFLAGS`. +This overrides the defaults in `profiling_config.h` without changing source. + +```bash +# Example: disable all profiling code +CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e . + +# Example: enable orchestrator and tensormap profiling +CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \ + pip install --no-build-isolation -e . +``` + +### In source code (before including headers) + +Source-level overrides are only for local experiments. 
They must appear before +any header that includes `profiling_config.h`; do not add duplicated fallback +definitions to runtime headers. + +```cpp +#define PTO2_PROFILING 1 +#define PTO2_ORCH_PROFILING 1 +#include "pto_runtime2_types.h" +``` + +--- + +## Log Output Summary + +> Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout). + +| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description | +| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- | +| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output | +| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary | +| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown | +| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats | + +--- + +## Implementation Notes + +### Key Principles + +1. **Macros control compilation and logging** + - `#if PTO2_PROFILING` controls whether profiling code is compiled + - Logs print when macro is enabled, regardless of runtime flag + +2. **Runtime flag controls data collection** + - `enable_l2_swimlane` controls performance buffer allocation + - Controls shared memory writes for host-side export + - Does NOT control log output + +3. 
**Consistent behavior across components** + - Scheduler logs: macro-controlled only + - Orchestrator logs: macro-controlled only + - Data collection: runtime flag controlled + +### Code Locations + +- Macro defaults and validation: `src/common/task_interface/profiling_config.h` +- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp` +- Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp` +- TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h` + +--- + +## Performance Impact + +### Compilation overhead + +- Level 0: No overhead +- Level 1: Minimal (counter increments, basic arithmetic) +- Level 2-4: Low to moderate (additional counters, cycle measurements) + +### Runtime overhead + +- Logging: Negligible (device logs are asynchronous) +- Data collection (`enable_l2_swimlane>0`): Low to moderate + - Performance buffer writes + - Shared memory updates + - Per-task timing measurements + +### Recommendation + +- Use Level 0 for production +- Use Level 1-2 for performance monitoring +- Use Level 3-4 for detailed performance analysis only diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp new file mode 100644 index 000000000..55565e885 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp @@ -0,0 +1,784 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file dep_gen_replay.cpp + * @brief Replay in-memory DepGenRecord stream → deps.json (strided tensor + * representation, tensor-annotated) via a host-resident PTO2TensorMap, + * with a differential check against the runtime template `compute_task_fanin`. + * + * Two passes run per record against two parallel PTO2TensorMap instances that + * evolve in lockstep: + * + * ORACLE pass (read-only contract): + * Drives `compute_task_fanin` (the same template the device orchestrator + * uses in pto_orchestrator.cpp:submit_task) against `tm_oracle`. Emits + * only PTO2TaskId values — the canonical set of producer IDs the runtime + * would have wired. We never widen this template's emit signature: this + * pass IS the contract, and any future change to `compute_task_fanin` + * automatically refreshes the oracle. + * + * ANNOT pass (this file's feature): + * Inlines the same STEP A (creator retention) + STEP B (tensormap lookup) + * against `tm_annot`, but the callback fires with the full + * `PTO2TensorMapEntry&` + the consumer Tensor* + the arg index, so the + * replay can record per-edge tensor metadata (producer/consumer + * shape/offset, dtype, version). + * + * After both passes finish per record, we compare the producer-ID set the + * oracle emitted to the producer-ID set the annot pass emitted. They MUST + * match. 
If they diverge, deps.json is not written and the function returns + * non-zero — this is the "no shotgun modifications" guarantee: anyone who + * changes `compute_task_fanin` will trip this gate immediately and know to + * mirror the change in the annot pass. + * + * STEP 1 (explicit_deps) is emitted at the call site (per pto_dep_compute.h's + * "kept at call site" note); both passes run the same explicit-deps loop, so + * the comparison covers it too. + * + * STEP 4 (`register_task_outputs`) runs on BOTH tensor maps after both passes + * complete, keeping `tm_oracle` and `tm_annot` bit-equivalent for the next + * record's INOUT+COVERED `remove_entry` mutations. + * + * Pool sizing: replay never advances last_task_alive, so each tensor map's + * entry pool must accommodate every output write across the whole trace. We + * scan the record buffer once to count INOUT + OUTPUT_EXISTING slots and size + * the pool accordingly. Both maps get the same size. + */ + +#include "dep_gen_replay.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/dep_gen.h" +#include "common/unified_log.h" +#include "data_type.h" +#include "pto_dep_compute.h" +#include "pto_task_id.h" +#include "pto_tensormap.h" +#include "tensor.h" + +namespace { + +int32_t ceil_pow2(int32_t v) { + if (v <= 1) return 1; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +// Count INOUT + OUTPUT_EXISTING slots across the record buffer — +// register_task_outputs only inserts those, and skips entries with manual_dep +// set. Counting both without inspecting manual_dep is a conservative upper +// bound (manual_dep is rare; the small over-allocation pays for itself in +// avoided pool exhaustion). 
+int32_t count_outputs(const DepGenRecord *records, size_t n) { + int32_t total = 0; + for (size_t i = 0; i < n; i++) { + const DepGenRecord &r = records[i]; + // Overflow chain slots are reinterpret_cast views with no tensor data; + // their `tensor_count` bytes are actually the overflow `dep_count` field, + // which would mislead the loop below if read as a tensor count. + if (r.flags & DEP_GEN_FLAG_OVERFLOW) continue; + for (uint16_t j = 0; j < r.tensor_count; j++) { + auto t = static_cast(r.arg_types[j]); + if (t == TensorArgType::INOUT || t == TensorArgType::OUTPUT_EXISTING) { + total++; + } + } + } + return total; +} + +// --------------------------------------------------------------------------- +// JSON output accumulators (in-memory tables that get serialized at the end) +// --------------------------------------------------------------------------- + +// Edge categories — matches the three places a runtime fanin edge is born. +enum class EdgeSource { EXPLICIT, CREATOR, TENSORMAP }; + +const char *edge_source_str(EdgeSource s) { + switch (s) { + case EdgeSource::EXPLICIT: + return "explicit"; + case EdgeSource::CREATOR: + return "creator"; + case EdgeSource::TENSORMAP: + return "tensormap"; + } + return "unknown"; +} + +const char *overlap_status_str(OverlapStatus s) { + switch (s) { + case OverlapStatus::COVERED: + return "covered"; + case OverlapStatus::OTHER: + return "other"; + case OverlapStatus::NO_OVERLAP: + return "no_overlap"; + } + return "unknown"; +} + +// One annotated edge. consumer_* always populated. producer_* populated for +// TENSORMAP source only — the explicit/creator emit paths don't have a +// matched tensormap entry to copy from. +// +// Slice description follows the strided Tensor model: (start_offset, strides[]) +// in element units. 
Byte offset of element coords[] is
+//   (start_offset + Σ coords[i] · strides[i]) · dtype_bytes
+//
+// Emit sites value-initialise the struct (`EdgeAnnot e{}`), so any field a
+// given source does not fill stays zero.
+struct EdgeAnnot {
+    uint64_t pred;  // raw task id of the producer (edge tail)
+    uint64_t succ;  // raw task id of the record being replayed (edge head)
+    int32_t consumer_arg_idx;  // -1 for EXPLICIT (not tied to a tensor arg)
+    EdgeSource source;
+    OverlapStatus overlap;  // only meaningful for TENSORMAP
+    uint64_t tensor_id;     // 0 for EXPLICIT
+    // Consumer side (the Tensor the submitting task is reading).
+    uint8_t consumer_dtype;
+    uint32_t consumer_ndims;
+    uint32_t consumer_shape[MAX_TENSOR_DIMS];
+    uint64_t consumer_start_offset;  // 1D element offset
+    uint32_t consumer_strides[MAX_TENSOR_DIMS];
+    // Producer side (the slice the producer wrote, from the tensormap entry).
+    // Only populated when source == TENSORMAP.
+    uint32_t producer_ndims;
+    uint32_t producer_shape[MAX_TENSOR_DIMS];
+    uint64_t producer_start_offset;
+    uint32_t producer_strides[MAX_TENSOR_DIMS];
+};
+
+// One entry in the tensors[] table: the underlying storage, keyed by
+// (buffer_addr, version). buffer_numel is the storage element count;
+// per-edge fields describe the slice (start_offset + stride).
+// Entries are created by register_tensor() on first sight of the key.
+struct TensorTableEntry {
+    uint64_t tensor_id;  // make_tensor_id(buffer_addr, version)
+    uint64_t buffer_addr;
+    uint64_t buffer_numel;  // storage size in elements (= buffer.size / dtype_bytes)
+    int32_t version;
+    uint8_t dtype;
+};
+
+// One arg slot of a task, captured for the `tasks[].args[]` block so
+// downstream viewers can render per-task input / output compartments without
+// having to scan every edge. `has_tensor_info` is false only for OUTPUT slots:
+// the runtime hasn't materialized a Tensor yet at submit_task time, so the
+// captured blob is zeroed.
+struct TaskArgEntry {
+    int32_t idx;  // position of the arg in the task's tensor-arg list
+    TensorArgType arg_type;
+    bool has_tensor_info;  // false => the fields below are not serialized
+    uint64_t tensor_id;
+    uint8_t dtype;
+    uint32_t ndims;
+    uint32_t shape[MAX_TENSOR_DIMS];
+    uint64_t start_offset;  // 1D element offset
+    uint32_t strides[MAX_TENSOR_DIMS];
+};
+
+// One entry in the tasks[] table: identity, scope and arg slots of a replayed
+// (non-overflow) record.
+struct TaskTableEntry {
+    uint64_t task_id;
+    bool in_manual_scope;  // serialized as "scope": "manual" / "auto"
+    int32_t kernel_id[3];  // per-subslot {AIC, AIV0, AIV1}, -1 = inactive
+    std::vector args;
+};
+
+// JSON label for a TensorArgType ("type" field of a task arg). The trailing
+// "UNKNOWN" is only reachable for a value not listed in the switch.
+const char *arg_type_str(TensorArgType t) {
+    switch (t) {
+    case TensorArgType::INPUT:
+        return "INPUT";
+    case TensorArgType::OUTPUT:
+        return "OUTPUT";
+    case TensorArgType::INOUT:
+        return "INOUT";
+    case TensorArgType::OUTPUT_EXISTING:
+        return "OUTPUT_EXISTING";
+    }
+    return "UNKNOWN";
+}
+
+// FNV-1a 64-bit hash of (buffer_addr, version) — stable tensor identity
+// across runs (no time-dependent inputs).
+//
+// The bytes of buffer_addr are folded first, then the bytes of version
+// (reinterpreted as uint32_t), each in in-memory order — so the id is only
+// comparable between hosts that share a byte order.
+uint64_t make_tensor_id(uint64_t buffer_addr, int32_t version) {
+    constexpr uint64_t FNV_OFFSET = 0xcbf29ce484222325ULL;
+    constexpr uint64_t FNV_PRIME = 0x100000001b3ULL;
+    uint64_t h = FNV_OFFSET;
+    const uint8_t *p;
+    p = reinterpret_cast(&buffer_addr);
+    for (size_t i = 0; i < sizeof(buffer_addr); i++) {
+        h ^= p[i];  // FNV-1a: xor the byte in first ...
+        h *= FNV_PRIME;  // ... then multiply by the prime
+    }
+    uint32_t v = static_cast(version);
+    p = reinterpret_cast(&v);
+    for (size_t i = 0; i < sizeof(v); i++) {
+        h ^= p[i];
+        h *= FNV_PRIME;
+    }
+    return h;
+}
+
+// Register a tensor in the tensors[] table on first sight of (addr,
+// version). buffer_numel describes the underlying storage size in elements;
+// per-edge fields describe the slice via (start_offset, strides[]). Subsequent
+// sightings of the same (addr, version) are no-ops.
+// Returns the tensor id in both cases (new entry or already registered).
+// `index_by_id` maps tensor id -> position in `table`.
+uint64_t register_tensor(
+    std::unordered_map &index_by_id, std::vector &table, const Tensor &t
+) {
+    uint64_t id = make_tensor_id(t.buffer.addr, t.version);
+    auto it = index_by_id.find(id);
+    if (it != index_by_id.end()) {
+        return id;
+    }
+    TensorTableEntry e;
+    e.tensor_id = id;
+    e.buffer_addr = t.buffer.addr;
+    e.version = t.version;
+    e.dtype = static_cast(t.dtype);
+    // Guard the division: an element size of 0 yields buffer_numel = 0.
+    const uint64_t elem_size = get_element_size(t.dtype);
+    e.buffer_numel = (elem_size == 0) ? 0 : (t.buffer.size / elem_size);
+    index_by_id[id] = table.size();
+    table.push_back(e);
+    return id;
+}
+
+// Copy a Tensor's slice description (shape + start_offset + stride) into an
+// EdgeAnnot's consumer_* fields.
+//
+// consumer_ndims is clamped to MAX_TENSOR_DIMS: only that many shape/stride
+// slots exist, and the JSON writer later reads consumer_ndims elements from
+// them, so an out-of-range ndims in a corrupted blob must not be stored as-is.
+void fill_consumer(EdgeAnnot &e, const Tensor &t) {
+    e.consumer_dtype = static_cast(t.dtype);
+    e.consumer_ndims = t.ndims;
+    if (e.consumer_ndims > MAX_TENSOR_DIMS) {
+        e.consumer_ndims = MAX_TENSOR_DIMS;
+    }
+    e.consumer_start_offset = t.start_offset;
+    for (uint32_t i = 0; i < e.consumer_ndims; i++) {
+        e.consumer_shape[i] = t.shapes[i];
+        e.consumer_strides[i] = t.strides[i];
+    }
+}
+
+// Copy a PTO2TensorMapEntry's slice description into an EdgeAnnot's producer_*
+// fields. Only called from the TENSORMAP emit path.
+void fill_producer(EdgeAnnot &e, const PTO2TensorMapEntry &entry) { + e.producer_ndims = entry.ndims; + e.producer_start_offset = entry.start_offset; + for (uint32_t i = 0; i < entry.ndims && i < MAX_TENSOR_DIMS; i++) { + e.producer_shape[i] = entry.shapes[i]; + e.producer_strides[i] = entry.strides[i]; + } +} + +// --------------------------------------------------------------------------- +// JSON writer +// --------------------------------------------------------------------------- + +void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) { + out << '['; + for (uint32_t i = 0; i < n; i++) { + if (i > 0) out << ','; + out << data[i]; + } + out << ']'; +} + +bool write_deps_json( + const char *path, const std::vector &tasks, const std::vector &tensors, + const std::vector &edges +) { + std::ofstream out(path, std::ios::out | std::ios::trunc); + if (!out) { + LOG_ERROR("dep_gen replay: failed to open '%s' for write", path); + return false; + } + // Strided tensor representation. tensors[].buffer_numel is the underlying + // storage element count; tasks[].args[] and edges[] carry per-slice + // geometry as (start_offset uint64, strides[] uint32 — runtime invariant + // forbids zero / negative strides, see runtime/tensor.h). + out << "{\"tasks\":["; + for (size_t i = 0; i < tasks.size(); i++) { + if (i > 0) out << ','; + const auto &t = tasks[i]; + // uint64 fields are quoted as strings — task_id/tensor_id/buffer_addr/ + // pred/succ can exceed Number.MAX_SAFE_INTEGER (2^53-1), silently + // losing precision in JS-based JSON parsers. Python consumers already + // pass these through int(...) and don't care which form they receive. + out << "{\"task_id\":\"" << t.task_id << '"'; + out << ",\"scope\":\"" << (t.in_manual_scope ? "manual" : "auto") << '"'; + // Per-subslot kernel ids {AIC, AIV0, AIV1}; INVALID_KERNEL_ID = -1 for + // inactive subslots. 
Emitted as a plain int triple — downstream viewers + // (and the swimlane host post-processor) use it to resolve task_id → + // kernel without the AICore record carrying the field itself. + out << ",\"kernel_ids\":[" << t.kernel_id[0] << ',' << t.kernel_id[1] << ',' << t.kernel_id[2] << ']'; + out << ",\"args\":["; + for (size_t a = 0; a < t.args.size(); a++) { + if (a > 0) out << ','; + const auto &arg = t.args[a]; + out << "{\"idx\":" << arg.idx; + out << ",\"type\":\"" << arg_type_str(arg.arg_type) << '"'; + if (arg.has_tensor_info) { + out << ",\"tensor_id\":\"" << arg.tensor_id << '"'; + out << ",\"dtype\":\"" << get_dtype_name(static_cast(arg.dtype)) << '"'; + out << ",\"shape\":"; + write_uint_array(out, arg.shape, arg.ndims); + out << ",\"start_offset\":\"" << arg.start_offset << '"'; + out << ",\"strides\":"; + write_uint_array(out, arg.strides, arg.ndims); + } + out << '}'; + } + out << "]}"; + } + out << ']'; + + out << ",\"tensors\":["; + for (size_t i = 0; i < tensors.size(); i++) { + if (i > 0) out << ','; + const auto &t = tensors[i]; + out << "{\"tensor_id\":\"" << t.tensor_id << '"'; + out << ",\"buffer_addr\":\"" << t.buffer_addr << '"'; + out << ",\"version\":" << t.version; + out << ",\"dtype\":\"" << get_dtype_name(static_cast(t.dtype)) << '"'; + out << ",\"buffer_numel\":\"" << t.buffer_numel << '"'; + out << '}'; + } + out << ']'; + + out << ",\"edges\":["; + for (size_t i = 0; i < edges.size(); i++) { + if (i > 0) out << ','; + const auto &e = edges[i]; + out << "{\"pred\":\"" << e.pred << "\",\"succ\":\"" << e.succ << '"'; + out << ",\"arg\":" << e.consumer_arg_idx; + out << ",\"source\":\"" << edge_source_str(e.source) << '"'; + if (e.source == EdgeSource::TENSORMAP) { + out << ",\"overlap\":\"" << overlap_status_str(e.overlap) << '"'; + } + if (e.source != EdgeSource::EXPLICIT) { + out << ",\"tensor_id\":\"" << e.tensor_id << '"'; + out << ",\"consumer_dtype\":\"" << get_dtype_name(static_cast(e.consumer_dtype)) << '"'; + out << 
",\"consumer_shape\":"; + write_uint_array(out, e.consumer_shape, e.consumer_ndims); + out << ",\"consumer_start_offset\":\"" << e.consumer_start_offset << '"'; + out << ",\"consumer_strides\":"; + write_uint_array(out, e.consumer_strides, e.consumer_ndims); + } + if (e.source == EdgeSource::TENSORMAP) { + out << ",\"producer_shape\":"; + write_uint_array(out, e.producer_shape, e.producer_ndims); + out << ",\"producer_start_offset\":\"" << e.producer_start_offset << '"'; + out << ",\"producer_strides\":"; + write_uint_array(out, e.producer_strides, e.producer_ndims); + } + out << '}'; + } + out << "]}\n"; + return static_cast(out); +} + +// --------------------------------------------------------------------------- +// Annot pass — mirrors compute_task_fanin step-by-step against tm_annot. +// Must stay bit-equivalent to pto_dep_compute.h::compute_task_fanin in terms +// of which producer IDs are emitted (the differential check enforces this). +// --------------------------------------------------------------------------- + +template +void annot_pass( + const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, EmitCreator emit_creator, + EmitTM emit_tensormap +) { + if (in_manual_scope) { + return; + } + for (int32_t i = 0; i < inputs.tensor_count; i++) { + TensorArgType ptype = inputs.arg_types[i]; + if (ptype == TensorArgType::OUTPUT) { + continue; + } + const Tensor *tensor = &inputs.tensors[i].ref(); + + // STEP A: creator retention. + PTO2TaskId owner = tensor->owner_task_id; + if (owner.is_valid()) { + emit_creator(owner, i, *tensor); + } + + // STEP B: tensormap lookup (only INPUT/INOUT, skip manual_dep). 
+ if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) { + continue; + } + if (tensor->manual_dep) { + continue; + } + + tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool { + emit_tensormap(entry.producer_task_id, i, *tensor, entry, overlap_status); + if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) { + tensor_map.remove_entry(entry); + } + return true; + }); + } +} + +} // namespace + +extern "C" int +dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, const char *deps_json_path) { + if (deps_json_path == nullptr) { + LOG_ERROR("dep_gen replay: null deps_json_path"); + return -1; + } + if (num_records > 0 && records == nullptr) { + LOG_ERROR("dep_gen replay: num_records=%zu but records pointer is null", num_records); + return -1; + } + LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (dual-pass)", num_records); + + // Per-ring task window sizes — tensormap masks slot indices and requires + // each to be a power of two. Auto-size from the records themselves so each + // ring's window comfortably covers its observed max local_id (no slot + // aliasing during INOUT+COVERED remove_from_task). Same sizes feed both + // maps so they stay in lockstep. + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint32_t max_local[PTO2_MAX_RING_DEPTH] = {0}; + for (size_t i = 0; i < num_records; i++) { + PTO2TaskId tid{records[i].task_id}; + uint8_t ring = tid.ring(); + uint32_t local = tid.local(); + if (ring < PTO2_MAX_RING_DEPTH && local > max_local[ring]) { + max_local[ring] = local; + } + } + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t need = static_cast(max_local[r] + 1); + task_window_sizes[r] = ceil_pow2(need < 16 ? 
16 : need); + } + + int32_t output_count = count_outputs(records, num_records); + int32_t pool_size = output_count + (output_count / 10) + 64; + if (pool_size < PTO2_TENSORMAP_POOL_SIZE) { + pool_size = PTO2_TENSORMAP_POOL_SIZE; + } + + PTO2TensorMap tm_oracle; + PTO2TensorMap tm_annot; + std::memset(&tm_oracle, 0, sizeof(tm_oracle)); + std::memset(&tm_annot, 0, sizeof(tm_annot)); + + // Libc-backed arena (default ctor) that owns both replay tensormaps' + // storage. Released by the arena destructor when this function returns. + DeviceArena replay_arena; + + auto oracle_layout = + PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); + auto annot_layout = + PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); + if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) || + !tm_annot.init_data_from_layout(annot_layout, replay_arena)) { + LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size); + return -3; + } + // Replay tensormaps live entirely on host; only arena-internal pointer + // fields need wiring (no parent-orch back-reference exists anymore). + tm_oracle.wire_arena_pointers(oracle_layout, replay_arena); + tm_annot.wire_arena_pointers(annot_layout, replay_arena); + + // JSON output accumulators. + std::vector task_table; + std::vector tensor_table; + std::unordered_map tensor_index; // tensor_id → table idx + std::vector annot_edges; + annot_edges.reserve(num_records * 2); + + TensorRef tref_buf[CORE_MAX_TENSOR_ARGS]; + TensorArgType atype_buf[CORE_MAX_TENSOR_ARGS]; + + // Per-record dedup of producer IDs — must match runtime's + // PTO2FaninBuilder::append_fanin_or_fail semantics, which collapses STEP 1 + // (explicit_deps) + STEP A (creator retention) + STEP B (tensormap lookup) + // into a single per-task fanin list. 
Both oracle and annot use this same + // semantics so the divergence check is meaningful. + std::unordered_set oracle_preds; + std::unordered_set annot_preds; + + // Scratch buffer for assembling full dep lists across overflow chains. + // Declared outside the loop so it can be reused (clear() keeps capacity). + std::vector full_deps_buf; + + for (size_t rec_i = 0; rec_i < num_records; rec_i++) { + const DepGenRecord &rec = records[rec_i]; + + // Overflow chain records are consumed by the preceding base; skip + // them in the main scan so we don't double-process or read the + // overflow's reinterpreted bytes as tensor/dep info. + if (rec.flags & DEP_GEN_FLAG_OVERFLOW) continue; + + PTO2TaskId task_id{rec.task_id}; + bool in_manual_scope = (rec.flags & DEP_GEN_FLAG_IN_MANUAL_SCOPE) != 0; + + oracle_preds.clear(); + annot_preds.clear(); + + int32_t tc = static_cast(rec.tensor_count); + if (tc > CORE_MAX_TENSOR_ARGS) { + tc = CORE_MAX_TENSOR_ARGS; + } + for (int32_t i = 0; i < tc; i++) { + tref_buf[i] = reinterpret_cast(&rec.tensors[i][0]); + atype_buf[i] = static_cast(rec.arg_types[i]); + } + + // Assemble the full dep list. Fast path: ≤ DEP_GEN_MAX_EXPLICIT_DEPS, + // no chain, point straight at rec.explicit_deps. Slow path: gather + // base + chain into full_deps_buf and point at the buffer. + // + // `explicit_dep_count` / `over->dep_count` originate from device + // shared memory and are bounded by the writer to the array sizes, but + // we clamp on read too so a corrupted record never drives an OOB read + // off the end of rec.explicit_deps[64] / over->deps[582]. 
+ const uint64_t *deps_data; + int32_t dc; + if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) { + full_deps_buf.clear(); + uint16_t base_dc = rec.explicit_dep_count; + if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) { + LOG_ERROR( + "dep_gen replay: clamping base explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")", + base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id + ); + base_dc = DEP_GEN_MAX_EXPLICIT_DEPS; + } + full_deps_buf.reserve(static_cast(base_dc) + DEP_GEN_OVERFLOW_DEPS_PER_RECORD); + full_deps_buf.insert(full_deps_buf.end(), rec.explicit_deps, rec.explicit_deps + base_dc); + bool chain_complete = false; + for (size_t j = rec_i + 1; j < num_records; j++) { + const DepGenRecord &maybe = records[j]; + if (!(maybe.flags & DEP_GEN_FLAG_OVERFLOW)) { + LOG_ERROR( + "dep_gen replay: unterminated overflow chain at rec_idx=%zu (task_id=%" PRIu64 ")", rec_i, + rec.task_id + ); + break; + } + if (maybe.task_id != rec.task_id) { + LOG_ERROR( + "dep_gen replay: orphan overflow at rec_idx=%zu (expected task_id=%" PRIu64 ", found %" PRIu64 + ")", + j, rec.task_id, maybe.task_id + ); + break; + } + const auto *over = reinterpret_cast(&maybe); + uint16_t over_dc = over->dep_count; + if (over_dc > DEP_GEN_OVERFLOW_DEPS_PER_RECORD) { + LOG_ERROR( + "dep_gen replay: clamping overflow dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")", + over_dc, DEP_GEN_OVERFLOW_DEPS_PER_RECORD, j, rec.task_id + ); + over_dc = DEP_GEN_OVERFLOW_DEPS_PER_RECORD; + } + full_deps_buf.insert(full_deps_buf.end(), over->deps, over->deps + over_dc); + if (over->flags & DEP_GEN_FLAG_LAST_OVERFLOW) { + chain_complete = true; + break; + } + } + if (!chain_complete) { + LOG_ERROR( + "dep_gen replay: chain for task_id=%" PRIu64 " missing LAST_OVERFLOW marker — " + "using partial dep list (%zu deps)", + rec.task_id, full_deps_buf.size() + ); + } + deps_data = full_deps_buf.data(); + dc = static_cast(full_deps_buf.size()); + } else { + deps_data = rec.explicit_deps; + uint16_t base_dc = 
rec.explicit_dep_count; + if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) { + LOG_ERROR( + "dep_gen replay: clamping no-chain explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")", + base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id + ); + base_dc = DEP_GEN_MAX_EXPLICIT_DEPS; + } + dc = static_cast(base_dc); + } + + DepInputs inputs; + inputs.tensor_count = tc; + inputs.tensors = tref_buf; + inputs.arg_types = atype_buf; + inputs.explicit_dep_count = dc; + inputs.explicit_deps = reinterpret_cast(deps_data); + + // Register tasks[] entry (with per-arg slot info) and any unseen + // tensors[] entries up-front. Tensors are registered from the + // consumer-side blob so raw_shapes / dtype are populated (the + // producer-side PTO2TensorMapEntry drops raw_shapes to fit in two + // cache lines). + TaskTableEntry task_entry; + task_entry.task_id = rec.task_id; + task_entry.in_manual_scope = in_manual_scope; + task_entry.kernel_id[0] = rec.kernel_id[0]; + task_entry.kernel_id[1] = rec.kernel_id[1]; + task_entry.kernel_id[2] = rec.kernel_id[2]; + task_entry.args.reserve(tc); + for (int32_t i = 0; i < tc; i++) { + TaskArgEntry slot{}; + slot.idx = i; + slot.arg_type = atype_buf[i]; + if (atype_buf[i] == TensorArgType::OUTPUT) { + // OUTPUT blob is zero at submit time (writer has no Tensor + // yet); leave has_tensor_info=false. Viewers render this as + // a placeholder "alloc" output slot. 
+ slot.has_tensor_info = false; + } else { + const Tensor &t = tref_buf[i].ref(); + register_tensor(tensor_index, tensor_table, t); + slot.has_tensor_info = true; + slot.tensor_id = make_tensor_id(t.buffer.addr, t.version); + slot.dtype = static_cast(t.dtype); + slot.ndims = t.ndims; + slot.start_offset = t.start_offset; + for (uint32_t d = 0; d < t.ndims && d < MAX_TENSOR_DIMS; d++) { + slot.shape[d] = t.shapes[d]; + slot.strides[d] = t.strides[d]; + } + } + task_entry.args.push_back(slot); + } + task_table.push_back(std::move(task_entry)); + + // ============ STEP 1 — explicit_deps (call-site emit) ============ + // Same loop on both passes; they MUST produce identical sets here + // because they read the same record. Annot records explicit edges + // with consumer_arg_idx = -1 (not tied to any tensor arg). Reads + // from deps_data (base record's explicit_deps[] on fast path, the + // gathered base+chain buffer on overflow path). + for (int32_t i = 0; i < dc; i++) { + uint64_t pred_raw = deps_data[i]; + if (oracle_preds.insert(pred_raw).second) { + // First time this pred is seen at runtime call site. 
+ } + if (annot_preds.insert(pred_raw).second) { + EdgeAnnot e{}; + e.pred = pred_raw; + e.succ = rec.task_id; + e.consumer_arg_idx = -1; + e.source = EdgeSource::EXPLICIT; + annot_edges.push_back(e); + } + } + + // ============ ORACLE pass — drive compute_task_fanin ============ + bool ok = compute_task_fanin(inputs, tm_oracle, in_manual_scope, [&](PTO2TaskId producer) -> bool { + oracle_preds.insert(producer.raw); + return true; + }); + if (!ok) { + LOG_ERROR("dep_gen replay: compute_task_fanin returned fatal at task_id=%" PRIu64, rec.task_id); + tm_oracle.destroy(); + tm_annot.destroy(); + return -4; + } + + // ============ ANNOT pass — inline mirror, full entry capture ============ + annot_pass( + inputs, tm_annot, in_manual_scope, + // emit_creator(producer, arg_idx, consumer_tensor) + [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer) { + if (!annot_preds.insert(producer.raw).second) { + return; // already covered by an earlier emit on this record + } + EdgeAnnot e{}; + e.pred = producer.raw; + e.succ = rec.task_id; + e.consumer_arg_idx = arg_idx; + e.source = EdgeSource::CREATOR; + e.tensor_id = make_tensor_id(consumer.buffer.addr, consumer.version); + fill_consumer(e, consumer); + annot_edges.push_back(e); + }, + // emit_tensormap(producer, arg_idx, consumer_tensor, entry, status) + [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer, const PTO2TensorMapEntry &entry, + OverlapStatus status) { + // Per-(succ, arg_idx, producer_buffer_addr, producer_version) + // dedup gives us "the same producer slice fired twice for the + // same consumer arg" collapse — but two distinct slices from + // the same producer (different version), or two different + // producers, both yield their own edges. The producer-id-set + // comparison below uses annot_preds, which dedups by pred + // only, matching runtime PTO2FaninBuilder semantics. 
+ annot_preds.insert(producer.raw); + EdgeAnnot e{}; + e.pred = producer.raw; + e.succ = rec.task_id; + e.consumer_arg_idx = arg_idx; + e.source = EdgeSource::TENSORMAP; + e.overlap = status; + e.tensor_id = make_tensor_id(entry.buffer_addr, entry.version); + fill_consumer(e, consumer); + fill_producer(e, entry); + annot_edges.push_back(e); + } + ); + + // ============ Differential check ============ + if (oracle_preds != annot_preds) { + LOG_ERROR( + "dep_gen replay: DIVERGENCE at task_id=%" PRIu64 " (rec_idx=%zu): oracle has %zu preds, annot has %zu", + rec.task_id, rec_i, oracle_preds.size(), annot_preds.size() + ); + // Log the symmetric difference for debugging. + for (uint64_t p : oracle_preds) { + if (annot_preds.find(p) == annot_preds.end()) { + LOG_ERROR(" only-in-oracle pred: %" PRIu64, p); + } + } + for (uint64_t p : annot_preds) { + if (oracle_preds.find(p) == oracle_preds.end()) { + LOG_ERROR(" only-in-annot pred: %" PRIu64, p); + } + } + tm_oracle.destroy(); + tm_annot.destroy(); + return -6; + } + + // ============ STEP 4 — publish outputs on BOTH maps ============ + register_task_outputs(inputs, task_id, tm_oracle, in_manual_scope); + register_task_outputs(inputs, task_id, tm_annot, in_manual_scope); + } + + tm_oracle.destroy(); + tm_annot.destroy(); + + if (!write_deps_json(deps_json_path, task_table, tensor_table, annot_edges)) { + return -5; + } + LOG_INFO_V0( + "dep_gen replay: wrote deps.json to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, task_table.size(), + tensor_table.size(), annot_edges.size() + ); + return 0; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.h b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.h new file mode 100644 index 000000000..2ea3d5768 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/host/dep_gen_replay.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file dep_gen_replay.h + * @brief Host-side replay of in-memory DepGenRecord stream → deps.json. + * + * Takes the records the host collector drained from the device ring buffer + * (``DepGenCollector::records()``) and runs them back through a host-resident + * PTO2TensorMap using the same ``compute_task_fanin`` / ``register_task_outputs`` + * primitives the device orchestrator uses, emitting the full + * predecessor → successor edge list to deps.json. + * + * The records buffer is passed in directly — there is no intermediate + * ``submit_trace.bin`` on disk. The host already has the records once the + * device run completes, so going through the filesystem would just be + * extra I/O and an extra file in the output directory. + * + * deps.json is the sole source of truth for fanout: the L2 swimlane hot + * path no longer records ``L2SwimlaneAicpuTaskRecord::fanout[]`` (taking the per-task + * 1 KB GM store off the scheduler critical path). Replay sees every + * submit and reconstructs the complete dependency graph. 
+ * + * Output format (deps.json, strided tensor representation): + * + * {"tasks": [{"task_id":, "scope":"auto|manual", + * "args":[{"idx":, "type":"", + * "tensor_id":, "dtype":"...", "shape":[...], + * "start_offset":, "strides":[...]}, ...]}, ...], + * "tensors": [{"tensor_id":, "buffer_addr":, "version":, + * "dtype":"FLOAT32", "buffer_numel":}, ...], + * "edges": [{"pred":, "succ":, "arg":, + * "source":"explicit|creator|tensormap", + * "overlap":"covered|other" (tensormap only), + * "tensor_id": (non-explicit), + * "consumer_dtype":"...", "consumer_shape":[...], + * "consumer_start_offset":, "consumer_strides":[...], + * "producer_shape":[...] (tensormap), + * "producer_start_offset": (tensormap), + * "producer_strides":[...] (tensormap)}, + * ...]} + * + * - All task ids are ``PTO2TaskId::raw`` values (``(ring_id << 32) | local_id``). + * - ``tensor_id`` is a stable FNV-1a hash of ``(buffer_addr, version)``. + * - ``buffer_numel`` is the underlying storage element count; tensor shapes + * are carried per-arg / per-edge alongside ``start_offset`` + ``strides``. + * - Distinct producers / arg indices / sources keep their own edges; per-record + * deduplication of producer ids mirrors the runtime + * ``PTO2FaninBuilder::append_fanin_or_fail`` semantics so the set of + * ``(pred, succ)`` pairs is identical to what the runtime would have + * recorded. + * + * Self-checking: the replay runs two parallel tensormap instances per record — + * an "oracle" map driven by the canonical ``compute_task_fanin`` template, and + * an "annotated" map driven by an inlined mirror that captures the per-edge + * tensor metadata. If the producer-id set on the two passes ever diverges, + * deps.json is NOT written and the function returns a non-zero error code. + * This is the guarantee against silent shotgun modifications: anyone who + * changes ``compute_task_fanin`` semantics has to mirror the change here too + * or the gate fires immediately. 
+ * + * The replay is single-threaded and pure CPU: no device handle is required. + */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_ + +#include +#include + +// Opaque forward decl — the canonical layout lives in common/dep_gen.h, but +// replay's API only needs to take a pointer + count. Callers who construct +// the buffer must include common/dep_gen.h themselves. +struct DepGenRecord; + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Replay an in-memory DepGenRecord stream and write deps.json. + * + * Per-ring task window sizes are auto-derived from the trace itself so each + * ring's window covers its observed max local_id without slot aliasing. + * + * @param records Pointer to a contiguous DepGenRecord array + * (typically ``DepGenCollector::records().data()``). + * @param num_records Number of records in the array. + * @param deps_json_path Output path; truncated if it exists. + * @return 0 on success; negative on error (see source for codes). + */ +int dep_gen_replay_emit_deps_json(const struct DepGenRecord *records, size_t num_records, const char *deps_json_path); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp new file mode 100644 index 000000000..dfc5590c1 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "host/platform_compile_info.h"
+#include "host/runtime_compile_info.h"
+#include
+
+extern "C" {
+
+// Toolchain used to build in-core kernels: CCEC when get_platform() reports
+// "a2a3", otherwise the host g++-15 toolchain.
+ToolchainType get_incore_compiler(void) {
+    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_CCEC;
+    return TOOLCHAIN_HOST_GXX_15;
+}
+
+// Toolchain used to build the orchestration code: the aarch64 cross g++ when
+// get_platform() reports "a2a3", otherwise the host g++.
+ToolchainType get_orchestration_compiler(void) {
+    // a2a3 needs aarch64 cross-compile (AICPU is aarch64).
+    // NOTE(review): this rationale was inherited from the
+    // tensormap_and_ringbuffer runtime — confirm it still holds for
+    // fully_distributed_within_core.
+    if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_AARCH64_GXX;
+    return TOOLCHAIN_HOST_GXX;
+}
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/host/runtime_maker.cpp b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_maker.cpp
new file mode 100644
index 000000000..8c8c9f082
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/host/runtime_maker.cpp
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Runtime Builder - rt2 Implementation (Device Orchestration)
+ *
+ * Provides prepare_callable_impl, bind_callable_to_runtime_impl and
+ * validate_runtime_impl for the rt2 runtime.
+ * Supports device orchestration where AICPU thread 3 runs the orchestrator.
+ *
+ * prepare_callable_impl (cacheable half of the former init_runtime_impl):
+ *   - Uploads the child kernel binaries and stages the orchestration SO bytes
+ *
+ * bind_callable_to_runtime_impl (per-run half of the former init_runtime_impl):
+ *   - Converts host tensor pointers to device pointers (all inputs copied H2D;
+ *     only OUTPUT/INOUT tensors are copied back D2H)
+ *   - Sets up runtime state for device orchestration
+ *
+ * validate_runtime_impl:
+ *   - Copies OUTPUT/INOUT tensors back from device to host (read-only inputs
+ *     are skipped)
+ *   - Frees device memory
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#include <atomic>
+#include <cctype>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "../common/pto_runtime_status.h"
+#include "../runtime/pto_runtime2.h"
+#include "../runtime/pto_shared_memory.h"
+#include "../runtime/runtime.h"
+#include "../../../../common/task_interface/call_config.h"
+#include "callable.h"
+#include "common/platform_config.h"
+#include "common/unified_log.h"
+#include "utils/device_arena.h"
+#include "prepare_callable_common.h"
+
+static_assert(
+    RUNTIME_ENV_RING_COUNT == PTO2_MAX_RING_DEPTH, "RuntimeEnv ring count must match PTO2 runtime ring depth"
+);
+
+// Helper: return current time in milliseconds
+static int64_t _now_ms() {
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return static_cast<int64_t>(tv.tv_sec) * 1000 + tv.tv_usec / 1000;
+}
+
+static bool is_power_of_2_u64(uint64_t value) { return value != 0 && (value & (value - 1)) == 0; }
+
+template <typename T>
+static std::string format_ring_array(const T (&values)[PTO2_MAX_RING_DEPTH]) {
+    std::string out = "[";
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
+        if (r != 0) {
+            out += ", ";
+        }
+        out += std::to_string(values[r]);
+    }
+    out += "]";
+    return out;
+}
+
+static std::string trim_copy(const std::string &input) {
+    size_t begin = 0;
+    while (begin < input.size() && std::isspace(static_cast<unsigned char>(input[begin]))) {
+        ++begin;
+    }
+    size_t end = input.size();
+    while (end > begin && std::isspace(static_cast<unsigned char>(input[end - 1]))) {
+        --end;
+    }
+    return input.substr(begin, end - begin);
+}
+
+static bool parse_uint_token(
+    const char *name, const std::string &raw, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t *out
+) {
+    std::string token = trim_copy(raw);
+    if (token.empty()) {
+        LOG_WARN("%s has an empty value in '%s', ignored", name, raw.c_str());
+        return false;
+    }
+
+    if (token[0] == '-') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    char *endptr = nullptr;
+    errno = 0;
+    unsigned long long parsed = std::strtoull(token.c_str(), &endptr, 10);
+    if (errno == ERANGE || endptr == token.c_str() || *endptr != '\0') {
+        LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str());
+        return false;
+    }
+    uint64_t val = static_cast<uint64_t>(parsed);
+
+    if (val < min_val || val > max_val) {
+        LOG_WARN(
+            "%s=%s invalid (must be in [%" PRIu64 ", %" PRIu64 "]), ignored", name, token.c_str(), min_val, max_val
+        );
+        return false;
+    }
+    if (require_power_of_2 && !is_power_of_2_u64(val)) {
+        LOG_WARN("%s=%s invalid (must be a power of 2), ignored", name, token.c_str());
+        return false;
+    }
+    *out = val;
+    return true;
+}
+
+static void apply_env_ring_values(
+    const char *name, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t out[PTO2_MAX_RING_DEPTH]
+) {
+    const char *env = std::getenv(name);
+    if (!env) return;
+
+    std::string text(env);
+    if (text.find(',') == std::string::npos) {
+        uint64_t value = 0;
+        if (!parse_uint_token(name, text, min_val, max_val, require_power_of_2, &value)) {
+            return;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            out[r] = value;
+        }
+        return;
+    }
+
+    uint64_t parsed[PTO2_MAX_RING_DEPTH]{};
+    size_t pos = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        size_t comma = text.find(',', pos);
+        std::string token = text.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos);
+        if (!parse_uint_token(name, token, min_val, max_val, require_power_of_2, &parsed[r])) {
+            return;
+        }
+        if (comma == std::string::npos) {
+            if (r != PTO2_MAX_RING_DEPTH - 1) {
+                LOG_WARN(
+                    "%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env,
+                    PTO2_MAX_RING_DEPTH
+                );
+                return;
+            }
+            pos = text.size();
+        } else {
+            pos = comma + 1;
+        }
+    }
+    if (pos < text.size() || (!text.empty() && text.back() == ',')) {
+        LOG_WARN("%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env, PTO2_MAX_RING_DEPTH);
+        return;
+    }
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        out[r] = parsed[r];
+    }
+}
+
+static bool resolve_ring_config(
+    uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool, const uint64_t *ring_task_windows,
+    const uint64_t *ring_heaps, const uint64_t *ring_dep_pools, uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH],
+    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH], int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    uint64_t dep_pool_values[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        eff_task_window_sizes[r] = PTO2_TASK_WINDOW_SIZE;
+        eff_heap_sizes[r] = PTO2_HEAP_SIZE;
+        dep_pool_values[r] = PTO2_DEP_LIST_POOL_SIZE;
+    }
+
+    apply_env_ring_values("PTO2_RING_TASK_WINDOW", 4, static_cast<uint64_t>(INT32_MAX), true, eff_task_window_sizes);
+    apply_env_ring_values("PTO2_RING_HEAP", 1024, std::numeric_limits<uint64_t>::max(), false, eff_heap_sizes);
+    apply_env_ring_values("PTO2_RING_DEP_POOL", 4, static_cast<uint64_t>(INT32_MAX), false, dep_pool_values);
+
+    if (ring_task_window != 0) {
+        if (ring_task_window < 4 || ring_task_window > static_cast<uint64_t>(INT32_MAX) ||
+            !is_power_of_2_u64(ring_task_window)) {
+            LOG_ERROR(
+                "runtime_env.ring_task_window=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", ring_task_window
+            );
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            eff_task_window_sizes[r] = ring_task_window;
+        }
+    }
+    if (ring_heap != 0) {
+        if (ring_heap < 1024) {
+            LOG_ERROR("runtime_env.ring_heap=%" PRIu64 " must be >= 1024", ring_heap);
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            eff_heap_sizes[r] = ring_heap;
+        }
+    }
+    if (ring_dep_pool != 0) {
+        if (ring_dep_pool < 4 || ring_dep_pool > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("runtime_env.ring_dep_pool=%" PRIu64 " must be in [4, INT32_MAX]", ring_dep_pool);
+            return false;
+        }
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+            dep_pool_values[r] = ring_dep_pool;
+        }
+    }
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (ring_task_windows != nullptr && ring_task_windows[r] != 0) {
+            eff_task_window_sizes[r] = ring_task_windows[r];
+        }
+        if (ring_heaps != nullptr && ring_heaps[r] != 0) {
+            eff_heap_sizes[r] = ring_heaps[r];
+        }
+        if (ring_dep_pools != nullptr && ring_dep_pools[r] != 0) {
+            dep_pool_values[r] = ring_dep_pools[r];
+        }
+
+        if (eff_task_window_sizes[r] < 4 || eff_task_window_sizes[r] > static_cast<uint64_t>(INT32_MAX) ||
+            !is_power_of_2_u64(eff_task_window_sizes[r])) {
+            LOG_ERROR(
+                "ring_task_windows[%d]=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", r, eff_task_window_sizes[r]
+            );
+            return false;
+        }
+        if (eff_heap_sizes[r] < 1024) {
+            LOG_ERROR("ring_heaps[%d]=%" PRIu64 " must be >= 1024", r, eff_heap_sizes[r]);
+            return false;
+        }
+        if (dep_pool_values[r] < 4 || dep_pool_values[r] > static_cast<uint64_t>(INT32_MAX)) {
+            LOG_ERROR("ring_dep_pools[%d]=%" PRIu64 " must be in [4, INT32_MAX]", r, dep_pool_values[r]);
+            return false;
+        }
+        eff_dep_pool_capacities[r] = static_cast<int32_t>(dep_pool_values[r]);
+    }
+
+    return true;
+}
+
+static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
+    if (runtime == nullptr || host_header == nullptr) {
+        return 0;
+    }
+
+    void *pto2_sm = runtime->get_gm_sm_ptr();
+    if (pto2_sm == nullptr) {
+        return 0;
+    }
+
+    int hdr_rc = runtime->host_api.copy_from_device(host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader));
+    if (hdr_rc != 0) {
+        LOG_WARN("Failed to copy PTO2 header from device");
+        return 0;
+    }
+
+    int32_t orch_error_code = host_header->orch_error_code.load(std::memory_order_relaxed);
+    int32_t sched_error_code = host_header->sched_error_code.load(std::memory_order_relaxed);
+    return runtime_status_from_error_codes(orch_error_code, sched_error_code);
+}
+
+/**
+ * Stage the per-callable resources (kernel binaries + orchestration SO) into
+ * the supplied CallableArtifacts so a subsequent bind_callable_to_runtime_impl
+ * can use them. This is the cacheable half of init_runtime_impl: nothing here
+ * depends on per-run argument values, so the prepare_callable / run_prepared
+ * split lets us run this once per callable_id and amortize across runs.
+ *
+ * @param callable   ChipCallable carrying the orch SO + child kernel binaries
+ * @param upload_fn  Callback handed to upload_and_collect_child_addrs to upload kernel binaries
+ * @param out        Receives the signature, kernel addresses, orch SO bytes and func/config names
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int
+prepare_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const void *), CallableArtifacts *out) {
+    if (callable == nullptr) {
+        LOG_ERROR("Callable pointer is null");
+        return -1;
+    }
+    if (upload_fn == nullptr || out == nullptr) {
+        LOG_ERROR("upload_fn or out is null");
+        return -1;
+    }
+    *out = CallableArtifacts{};
+    out->signature.assign(callable->signature_, callable->signature_ + callable->sig_count());
+
+    LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count());
+    if (upload_and_collect_child_addrs(callable, upload_fn, &out->kernel_addrs) != 0) {
+        LOG_ERROR("Failed to upload ChipCallable buffer");
+        return -1;
+    }
+    for (const ChildKernelAddr &c : out->kernel_addrs) {
+        if (c.func_id < 0 || c.func_id >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("func_id=%d is out of range [0, %d)", c.func_id, RUNTIME_MAX_FUNC_ID);
+            return -1;
+        }
+    }
+
+    const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
+    size_t orch_so_size = callable->binary_size();
+
+    if (orch_so_binary == nullptr || orch_so_size == 0) {
+        LOG_ERROR("Orchestration SO binary is required for device orchestration");
+        return -1;
+    }
+
+    out->orch_so_data = orch_so_binary;
+    out->orch_so_size = orch_so_size;
+    out->func_name = callable->func_name();
+    out->config_name = callable->config_name();
+    LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size);
+    return 0;
+}
+
+/**
+ * Per-run binding: build device-side argument storage (tensor copy-out, GM
+ * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
+ * callable-side state (kernel binaries, orch SO bytes, func/config names)
+ * is already populated by prepare_callable_impl.
+ *
+ * Splitting this from prepare_callable_impl matches the per-callable_id
+ * design: register/run_prepared invokes this every call, while the prep
+ * half runs only once per callable_id.
+ *
+ * @param runtime             Pointer to pre-constructed Runtime (host_api populated)
+ * @param orch_args           Separated tensor/scalar arguments for this run
+ * @param host_orch_func_ptr  Must be nullptr; a non-null value is rejected
+ * @param signature           Optional per-argument directions, indexed by tensor index
+ * @param sig_count           Number of entries in signature
+ * @param ring_task_window, ring_heap, ring_dep_pool     Uniform ring overrides (0 = not set)
+ * @param ring_task_windows, ring_heaps, ring_dep_pools  Optional per-ring overrides (0 entry = not set)
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int bind_callable_to_runtime_impl(
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature,
+    int sig_count, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool,
+    const uint64_t *ring_task_windows, const uint64_t *ring_heaps, const uint64_t *ring_dep_pools
+) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+    if (orch_args == nullptr) {
+        LOG_ERROR("orch_args pointer is null");
+        return -1;
+    }
+    // trb runs orchestration on the device — there is no host-side orch
+    // function pointer to invoke. The c_api signature accepts one for
+    // symmetry with hbg; assert the trb-side invariant here.
+    if (host_orch_func_ptr != nullptr) {
+        LOG_ERROR("bind_callable_to_runtime_impl: trb does not accept a host_orch_func_ptr");
+        return -1;
+    }
+
+    int tensor_count = orch_args->tensor_count();
+    int scalar_count = orch_args->scalar_count();
+    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
+
+    int64_t t_total_start = _now_ms();
+
+    uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    if (!resolve_ring_config(
+            ring_task_window, ring_heap, ring_dep_pool, ring_task_windows, ring_heaps, ring_dep_pools,
+            eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities
+        )) {
+        return -1;
+    }
+    const std::string task_window_log = format_ring_array(eff_task_window_sizes);
+    const std::string heap_log = format_ring_array(eff_heap_sizes);
+    const std::string dep_pool_log = format_ring_array(eff_dep_pool_capacities);
+    LOG_INFO_V0(
+        "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(),
+        dep_pool_log.c_str()
+    );
+
+    // Build device args: copy from input, replace host tensor pointers with device pointers
+    ChipStorageTaskArgs device_args;
+
+    int64_t t_args_start = _now_ms();
+    for (int i = 0; i < tensor_count; i++) {
+        Tensor t = orch_args->tensor(i);
+
+        if (t.is_child_memory()) {
+            LOG_INFO_V0("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr);
+            device_args.add_tensor(t);
+            continue;
+        }
+
+        void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.buffer.addr));
+        size_t size = static_cast<size_t>(t.nbytes());
+
+        void *dev_ptr = runtime->host_api.device_malloc(size);
+        if (dev_ptr == nullptr) {
+            LOG_ERROR("Failed to allocate device memory for tensor %d", i);
+            return -1;
+        }
+
+        // Pure write-only OUTPUT buffers carry no meaningful host content, so
+        // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
+        // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
+        // rather than pooled-allocator garbage. INOUT (read-before-write)
+        // and IN keep the H2D copy. Falls back to copy_to_device if a backend
+        // did not wire device_memset.
+        bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
+        int rc;
+        if (is_pure_output && runtime->host_api.device_memset != nullptr) {
+            rc = runtime->host_api.device_memset(dev_ptr, 0, size);
+        } else {
+            rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
+        }
+        if (rc != 0) {
+            LOG_ERROR("Failed to stage tensor %d to device", i);
+            runtime->host_api.device_free(dev_ptr);
+            return -1;
+        }
+        // Read-only INPUT tensors are never written by the kernel, so there is
+        // no point copying them back D2H at the end. Index the signature
+        // by the orch tensor index `i` (child_memory tensors are skipped above
+        // but do not consume a separate signature slot — scalars follow the
+        // tensor entries). Anything not provably IN keeps the safe default of
+        // copying back.
+        bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
+        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
+        LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
+
+        t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
+        device_args.add_tensor(t);
+    }
+    for (int i = 0; i < scalar_count; i++) {
+        device_args.add_scalar(orch_args->scalar(i));
+    }
+    int64_t t_args_end = _now_ms();
+
+    // Read orchestrator-to-scheduler transition flag from environment
+    {
+        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
+        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
+            runtime->orch_to_sched = true;
+        }
+        LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
+    }
+
+    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
+    // and the prebuilt runtime arena all live in a single backing allocation;
+    // setup_static_arena reserves the three regions and commits in one shot.
+    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
+    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
+    // determined by replaying the reserve sequence on a host-side arena.
+    uint64_t total_heap_size = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (eff_heap_sizes[r] > std::numeric_limits<uint64_t>::max() - total_heap_size) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return -1;
+        }
+        total_heap_size += eff_heap_sizes[r];
+    }
+    uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(eff_task_window_sizes);
+
+    int64_t t_prebuilt_start = _now_ms();
+    DeviceArena host_arena;  // libc malloc backend by default
+    PTO2RuntimeArenaLayout layout =
+        runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
+    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return -1;
+    }
+
+    int64_t t_setup_start = _now_ms();
+    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
+        LOG_ERROR("Failed to setup pooled static arena");
+        return -1;
+    }
+    int64_t t_setup_end = _now_ms();
+
+    int64_t t_heap_start = _now_ms();
+    void *gm_heap = runtime->host_api.acquire_pooled_gm_heap();
+    int64_t t_heap_end = _now_ms();
+    if (gm_heap == nullptr) {
+        LOG_ERROR("Failed to acquire pooled GM heap");
+        return -1;
+    }
+    runtime->set_gm_heap(gm_heap);
+
+    int64_t t_sm_start = _now_ms();
+    void *sm_ptr = runtime->host_api.acquire_pooled_gm_sm();
+    int64_t t_sm_end = _now_ms();
+    if (sm_ptr == nullptr) {
+        LOG_ERROR("Failed to acquire pooled PTO2 shared memory");
+        return -1;
+    }
+    runtime->set_gm_sm_ptr(sm_ptr);
+
+    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return -1;
+    }
+
+    // Set up device orchestration state
+    runtime->set_orch_args(device_args);
+
+    // -------------------------------------------------------------------------
+    // Build the prebuilt runtime-arena image on host.
+    //
+    // We pre-compute every byte the AICPU's runtime arena would otherwise have
+    // to write at boot: layout offsets, sub-structure init data, and pointers
+    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
+    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
+    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
+    // reset) + a handful of device-only field fixups.
+    // -------------------------------------------------------------------------
+    PTO2Runtime *rt =
+        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_sizes);
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return -1;
+    }
+    runtime_wire_arena_pointers(host_arena, layout, rt);
+
+    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
+    // every arena-internal offset after rtMemcpy. The runtime arena's device
+    // base does NOT travel in this image — it's on the host Runtime
+    // (set_prebuilt_arena below), since the AICPU needs that pointer
+    // *before* it can dereference the image.
+    rt->prebuilt_layout = layout;
+
+    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return -1;
+    }
+    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+    int64_t t_prebuilt_end = _now_ms();
+
+    LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
+
+    int64_t t_total_end = _now_ms();
+    LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
+    LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
+    LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
+    LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
+    LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
+
+    return 0;
+}
+
+/**
+ * Validate runtime results and cleanup.
+ *
+ * This function:
+ * 1. Copies recorded tensors from device back to host
+ * 2. Frees device memory for recorded tensors
+ * 3. Clears tensor pair state
+ *
+ * @param runtime  Pointer to Runtime
+ * @return 0 on success; otherwise the last failing copy_from_device code, or
+ *         the PTO2 runtime status when the device reported an error
+ */
+extern "C" int validate_runtime_impl(Runtime *runtime) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+
+    int rc = 0;
+
+    LOG_INFO_V0("=== Copying Results Back to Host ===");
+
+    // Copy all recorded tensors from device back to host
+    TensorPair *tensor_pairs = runtime->tensor_pairs_.data();
+    int tensor_pair_count = static_cast<int>(runtime->tensor_pairs_.size());
+
+    LOG_INFO_V0("Tensor pairs to process: %d", tensor_pair_count);
+
+    // PTO2 (device orchestration): graph output may be in packed buffer
+    uint64_t graph_out_ptr = 0;
+    uint64_t graph_out_size = 0;
+    bool skip_tensor_copy_back = false;
+    int32_t runtime_status = 0;
+    PTO2SharedMemoryHeader host_header;
+    memset(&host_header, 0, sizeof(host_header));
+
+    runtime_status = pto2_read_runtime_status(runtime, &host_header);
+    if (runtime_status != 0) {
+        int32_t orch_error_code = host_header.orch_error_code.load(std::memory_order_relaxed);
+        int32_t sched_error_code = host_header.sched_error_code.load(std::memory_order_relaxed);
+        LOG_ERROR(
+            "PTO2 runtime failed: orch_error_code=%d sched_error_code=%d runtime_status=%d", orch_error_code,
+            sched_error_code, runtime_status
+        );
+        skip_tensor_copy_back = true;
+    } else {
+        graph_out_ptr = host_header.graph_output_ptr;
+        graph_out_size = host_header.graph_output_size;
+        if (graph_out_ptr != 0) {
+            LOG_INFO_V0("Graph output buffer: ptr=0x%" PRIx64 ", size=%" PRIu64, graph_out_ptr, graph_out_size);
+        }
+    }
+
+    if (skip_tensor_copy_back) {
+        LOG_WARN("Skipping tensor copy-back because PTO2 runtime reported fatal status");
+    } else {
+        bool first_output_tensor = true;
+        for (int i = 0; i < tensor_pair_count; i++) {
+            const TensorPair &pair = tensor_pairs[i];
+
+            // Skip if device pointer is null
+            if (pair.dev_ptr == nullptr) {
+                LOG_WARN("Tensor %d has null device pointer, skipping", i);
+                continue;
+            }
+
+            // If host pointer is null, this is a device-only allocation (no copy-back)
+            if (pair.host_ptr == nullptr) {
+                LOG_INFO_V0("Tensor %d: device-only allocation (no copy-back)", i);
+                continue;
+            }
+
+            // Read-only INPUT tensors were uploaded H2D but the kernel never
+            // wrote them — copying them back (potentially ~GB) is pure waste.
+            // They are still device_free'd in the cleanup loop below.
+            if (!pair.needs_copy_back) {
+                LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i);
+                continue;
+            }
+
+            void *src_ptr = pair.dev_ptr;
+            size_t copy_size = pair.size;
+
+            // Use graph_output_ptr for the first output tensor if available
+            if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) {
+                src_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(graph_out_ptr));
+                // The host buffer was recorded as pair.size bytes; never copy more than
+                // that, or a larger packed output would overrun it.
+                if (graph_out_size > pair.size) {
+                    LOG_WARN(
+                        "Packed output (%" PRIu64 " bytes) exceeds tensor %d host buffer (%zu bytes), truncating",
+                        graph_out_size, i, pair.size
+                    );
+                } else {
+                    copy_size = static_cast<size_t>(graph_out_size);
+                }
+                LOG_INFO_V0("Using packed output buffer for tensor %d", i);
+                first_output_tensor = false;
+            }
+
+            int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size);
+            if (copy_rc != 0) {
+                LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc);
+                rc = copy_rc;
+            } else {
+                // Report the bytes actually transferred, which differ from pair.size for packed output.
+                LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, copy_size);
+            }
+        }
+    }
+
+    // Cleanup device tensors
+    LOG_INFO_V0("=== Cleaning Up ===");
+    for (int i = 0; i < tensor_pair_count; i++) {
+        if (tensor_pairs[i].dev_ptr != nullptr) {
+            runtime->host_api.device_free(tensor_pairs[i].dev_ptr);
+        }
+    }
+    LOG_INFO_V0("Freed %d device allocations", tensor_pair_count);
+
+    // Clear the per-run dispatch-table entries staged by prepare_callable_impl.
+    // The underlying chip-callable device buffer is pool-managed by
+    // DeviceRunner (keyed by content hash) and bulk-freed in
+    // DeviceRunner::finalize(); re-running the same callable repeatedly
+    // should not re-upload.
+    int kernel_count = runtime->get_registered_kernel_count();
+    for (int i = 0; i < kernel_count; i++) {
+        int func_id = runtime->get_registered_kernel_func_id(i);
+        runtime->set_function_bin_addr(func_id, 0);
+    }
+    if (kernel_count > 0) {
+        LOG_INFO_V0("Cleared %d kernel dispatch-table entries", kernel_count);
+    }
+    runtime->clear_registered_kernels();
+
+    // Clear tensor pairs
+    runtime->tensor_pairs_.clear();
+
+    LOG_INFO_V0("=== Finalize Complete ===");
+
+    if (rc == 0 && runtime_status != 0) {
+        rc = runtime_status;
+    }
+
+    return rc;
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/orchestration/common.cpp b/src/a2a3/runtime/fully_distributed_within_core/orchestration/common.cpp
new file mode 100644
index 000000000..c4878a1c2
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/orchestration/common.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "common.h"
+
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <stdio.h>
+
+#include <array>
+#include <cstdlib>
+#include <vector>
+#endif
+
+struct PTO2Runtime;
+
+// Unified-log error sink. Forward-declared here rather than pulled via
+// common/unified_log.h: that header lives under common/log/include, which is
+// not on the orchestration .so build's include path. The symbol resolves at
+// link time for the runtime targets, and at dlopen time for the orchestration
+// .so (against the executor's unified_log_device), so onboard diagnostics still
+// reach the CANN device log.
+extern "C" void unified_log_error(const char *func, const char *fmt, ...);
+
+namespace {
+// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution
+// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd
+// between execution rounds. All orchestrator threads bind the same rt
+// value, so per-thread storage is unnecessary.
+PTO2Runtime *g_current_runtime = nullptr;
+}  // namespace
+
+extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) {
+    g_current_runtime = rt;
+}
+
+// Keep framework_current_runtime local to this .so so orchestration helpers do not
+// accidentally bind to the AICPU binary's same-named symbol.
+extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; }
+
+/**
+ * Use addr2line to convert an address to file:line information.
+ * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
+ * If inlining is present, also returns the outer call chain via inline_chain.
+ */
+#ifdef __linux__
+static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
+    if (executable == nullptr) return "";
+
+    // The command is run through the shell by popen, so single-quote the path
+    // (escaping embedded quotes) to keep spaces or metacharacters in it from
+    // splitting or altering the command. Building it in a std::string also
+    // removes the silent truncation a fixed-size buffer imposed on long paths.
+    std::string quoted_path = "'";
+    for (const char *p = executable; *p != '\0'; ++p) {
+        if (*p == '\'') {
+            quoted_path += "'\\''";
+        } else {
+            quoted_path += *p;
+        }
+    }
+    quoted_path += "'";
+
+    char addr_text[32];
+    snprintf(addr_text, sizeof(addr_text), "%p", addr);
+    const std::string cmd = "addr2line -e " + quoted_path + " -f -C -p -i " + addr_text + " 2>/dev/null";
+
+    std::array<char, 256> buffer;
+    std::string raw_output;
+
+    FILE *pipe = popen(cmd.c_str(), "r");
+    if (pipe) {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
+            raw_output += buffer.data();
+        }
+        pclose(pipe);
+    }
+
+    // An empty result or any "??" placeholder means addr2line could not resolve the address.
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
+        return "";
+    }
+
+    // Split by lines
+    std::vector<std::string> lines;
+    size_t pos = 0;
+    while (pos < raw_output.size()) {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r')
+            line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+
+    if (lines.empty()) return "";
+
+    // First line is the innermost actual code location; subsequent lines are outer inline callers
+    if (inline_chain && lines.size() > 1) {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) {
+            *inline_chain += "    [inlined by] " + lines[j] + "\n";
+        }
+    }
+
+    return lines.front();
+}
+#endif
+
+/**
+ * Get current stack trace information (including file paths and line numbers).
+ * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
+ */
+std::string get_stacktrace(int skip_frames) {
+    (void)skip_frames;  // May be unused on non-Linux platforms
+    std::string result;
+#ifdef __linux__
+    // A negative skip count would index before the start of buffer[] / symbols[].
+    if (skip_frames < 0) skip_frames = 0;
+
+    const int max_frames = 64;
+    void *buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char **symbols = backtrace_symbols(buffer, nframes);
+
+    if (symbols) {
+        result = "Stack trace:\n";
+        for (int i = skip_frames; i < nframes; i++) {
+            std::string frame_info;
+
+            // Step back one byte from the recorded address before the lookup.
+            void *addr = (void *)((char *)buffer[i] - 1);
+
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
+                // Try the module-relative address first, then the absolute one.
+                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+
+                if (addr2line_result.empty()) {
+                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+                }
+
+                if (!addr2line_result.empty()) {
+                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+                }
+            }
+
+            // Fallback: demangle the name between '(' and '+' in the backtrace_symbols text.
+            if (frame_info.empty()) {
+                std::string frame(symbols[i]);
+
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos) {
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled) {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) {
+                result += inline_chain;
+            }
+        }
+        free(symbols);
+    }
+#else
+    result = "(Stack trace is only available on Linux)\n";
+#endif
+    return result;
+}
+
+// Builds the what() text for AssertionError: condition, location, stack trace
+static std::string build_assert_message(const char *condition, const char *file, int line) {
+    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
+    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
+    msg += get_stacktrace(3);
+    return msg;
+}
+
+AssertionError::AssertionError(const char *condition, const char *file, int line) :
+    std::runtime_error(build_assert_message(condition, file, line)),
+    condition_(condition),
+    file_(file),
+    line_(line) {}
+
+[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
+    // Use unified_log_error directly rather than the LOG_ERROR macro: that macro
+    // lives in pto_orchestration_api.h and expands to
+    // current_runtime()->ops->log_error, but the ops table's definition pulls in
+    // pto_types.h (Arg → __aicore__-only to_u64), which the AICore build of this
+    // TU cannot compile. unified_log_error reaches the same sink without that
+    // dependency.
+    unified_log_error(__FUNCTION__, "\n========================================");
+    unified_log_error(__FUNCTION__, "Assertion failed: %s", condition);
+    unified_log_error(__FUNCTION__, "Location: %s:%d", file, line);
+    unified_log_error(__FUNCTION__, "%s", get_stacktrace(2).c_str());
+    unified_log_error(__FUNCTION__, "========================================\n");
+
+    throw AssertionError(condition, file, line);
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h
new file mode 100644
index 000000000..863bed92d
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Convenience layer over Arg: bundles a fixed-capacity dependency buffer with + * an Arg and exposes an incremental add_dep(...) API on top of the runtime + * primitive L0TaskArgs::set_dependencies(ptr, count). + * + * Layering: + * - Primitive: Arg + set_dependencies(ptr, count) in pto_types.h. + * No cap, caller owns the deps buffer. + * - Convenience: L0TaskArgsWithDeps in this header. Owns a stack-sized dep + * buffer of capacity N (default 16); provides add_dep(). + * Submitted via the rt_submit_*_task overloads below, which + * forward the bundled deps into the underlying Arg. + * + * This file is auto-included at the bottom of pto_orchestration_api.h so + * orchestration sources see L0TaskArgsWithDeps after a single `#include + * "pto_orchestration_api.h"`. The split is purely organizational — + * orchestration code should not include this header directly. Code generated + * from pypto can ignore the convenience layer entirely and target Arg + + * set_dependencies(ptr, count) directly. + * + * L0TaskArgsWithDeps uses private inheritance from Arg so that set_dependencies and + * the explicit_dep* accessors are NOT reachable on a wrapper instance — users + * who pick the convenience layer cannot accidentally mix it with the + * primitive layer's dep API on the same object. 
+ */ + +#pragma once + +#include +#include + +#include + +#include "pto_orchestration_api.h" // Arg, MixedKernels, rt_submit_* primitives + +template +class L0TaskArgsWithDeps : private L0TaskArgs { +public: + // Tensor / scalar setters — forward to Arg + using L0TaskArgs::add_inout; + using L0TaskArgs::add_input; + using L0TaskArgs::add_no_dep; + using L0TaskArgs::add_output; + using L0TaskArgs::add_scalar; + using L0TaskArgs::add_scalars; + using L0TaskArgs::add_scalars_i32; + using L0TaskArgs::allow_early_resolve; // speculative early-dispatch hint (getter) + using L0TaskArgs::copy_scalars_from; + using L0TaskArgs::set_allow_early_resolve; // speculative early-dispatch hint (setter) + + // Error / status — forward to Arg + using L0TaskArgs::error_msg; + using L0TaskArgs::has_error; + using L0TaskArgs::launch_spec; + using L0TaskArgs::set_error; + + // NOT exposed: set_dependencies, explicit_dep_count, explicit_dep, + // explicit_deps_data — these are the primitive-layer dep API. Users of + // the convenience layer reach dependencies only through add_dep() below. + + /** + * Append one or more dependencies to the bundled buffer. May be called + * multiple times; deps accumulate. Variadic accepts any non-zero number + * of PTO2TaskId arguments. + * + * Overflow (more than MAX_DEP_COUNT total) records an error on the + * underlying Arg; the error surfaces at submit time. The capacity check + * runs before any store, so an overflowing call appends none of its ids + * (not even those that would still have fit). + */ + template + void add_dep(Ids... ids) { + static_assert(sizeof...(Ids) >= 1, "add_dep: at least one task id is required"); + static_assert( + (std::is_same_v, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId" + ); + if (count_ + sizeof...(Ids) > MAX_DEP_COUNT) { + L0TaskArgs::set_error( + "L0TaskArgsWithDeps::add_dep: dep count exceeds MAX_DEP_COUNT (bump the template arg)" + ); + return; + } + ((deps_[count_++] = ids), ...); + } + + /** + * Clear the bundled dep buffer and reset the underlying Arg. + * Use this to recycle an L0TaskArgsWithDeps across loop iterations.
+ */ + void reset() { + L0TaskArgs::reset(); + count_ = 0; + } + + /** + * Submit-only hook: bind the bundled deps onto the underlying Arg and + * return it as Arg&. Called by the rt_submit_*_task overloads below; + * orchestration code does not invoke this directly. + * + * Idempotent: explicitly clears any prior dep binding before re-setting, + * so a wrapper can be re-finalized (e.g. resubmitted) without tripping + * the primitive layer's single-shot check. + */ + L0TaskArgs &finalize_for_submit() { + L0TaskArgs::set_dependencies(nullptr, 0); + L0TaskArgs::set_dependencies(deps_, count_); + return *this; + } + +private: + PTO2TaskId deps_[MAX_DEP_COUNT]; + uint32_t count_ = 0; +}; + +// ============================================================================= +// Submit overloads — accept L0TaskArgsWithDeps transparently +// The wrapper is taken by non-const reference because finalize_for_submit() +// rebinds the dependency pointer on the wrapped L0TaskArgs. +// NOTE(review): there is no matching overload for rt_submit_dummy_task, and +// the private base means a wrapper cannot be passed to the primitive one — +// confirm whether that omission is intentional. +// ============================================================================= + +template +static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, L0TaskArgsWithDeps &awd) { + return rt_submit_task(mixed_kernels, awd.finalize_for_submit()); +} + +template +static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, L0TaskArgsWithDeps &awd) { + return rt_submit_aic_task(kernel_id, awd.finalize_for_submit()); +} + +template +static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, L0TaskArgsWithDeps &awd) { + return rt_submit_aiv_task(kernel_id, awd.finalize_for_submit()); +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h new file mode 100644 index 000000000..b07c94926 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h @@ -0,0 +1,385 @@ +/* + * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Orchestration API - Slim header for orchestration .so files + * + * This header provides everything an orchestration source needs without + * pulling in runtime implementation headers. The orchestration .so has + * zero link dependencies on runtime .cpp files; all runtime calls go + * through the PTO2RuntimeOps function-pointer table embedded in + * PTO2Runtime. + * + * Orchestration sources include ONLY this header: + * #include "pto_orchestration_api.h" + * + * Runtime sources continue to use pto_runtime2.h (which defines the + * full PTO2Runtime struct with all internal fields). 
+ */ + +#pragma once + +#include +#include +#include + +#include + +// Type headers needed by orchestration +#include "common.h" // framework_bind_runtime / framework_current_runtime +#include "pto_runtime2_types.h" // PTO2_ERROR_* +#include "pto_submit_types.h" // MixedKernels, INVALID_KERNEL_ID, subtask slots +#include "pto_types.h" // Arg, TaskOutputTensors, TensorArgType +#include "task_args.h" // ChipStorageTaskArgs, Tensor +#include "tensor.h" // Tensor, TensorCreateInfo + +// ============================================================================= +// Tensor Factory Helpers +// ============================================================================= + +// make_tensor_external(...) — canonical factory for pre-allocated external +// memory — is defined in the unified tensor.h (common), so host and runtime +// build Tensors through the same controlled path. + +// ============================================================================= +// Ops Table and Opaque Runtime +// ============================================================================= + +/** + * Forward declaration — the orchestration sees PTO2Runtime as a partial + * struct whose first field is the ops pointer. The full definition + * lives in pto_runtime2.h (used only by runtime .cpp files). + */ +typedef struct PTO2Runtime PTO2Runtime; + +/** + * Function-pointer table for runtime operations. + * Populated by the runtime; called by orchestration through inline wrappers. 
+ */ +typedef struct PTO2RuntimeOps { + TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args); + void (*scope_begin)(PTO2Runtime *rt); + void (*scope_end)(PTO2Runtime *rt); + void (*orchestration_done)(PTO2Runtime *rt); + bool (*is_fatal)(PTO2Runtime *rt); + void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...); + + // Logging (populated by runtime, called by orchestration) + void (*log_error)(const char *func, const char *fmt, ...); + void (*log_warn)(const char *func, const char *fmt, ...); + void (*log_debug)(const char *func, const char *fmt, ...); + // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside). + void (*log_info_v)(const char *func, int v, const char *fmt, ...); + + // Cross-layer data access (orchestration reads/writes tensor values via runtime) + // Placed after logging to avoid shifting hot-path field offsets. + uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]); + void (*set_tensor_data)( + PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value + ); + TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args); + TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args); + + // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats] + // collector can log it. Always present to keep ops-table layout stable + // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0, so + // callers must null-check this entry before invoking it. + void (*scope_set_site)(const char *file, int line); +} PTO2RuntimeOps; + +/** + * Partial PTO2Runtime definition for orchestration. + * + * Exposes the ops pointer (for runtime calls) and pending_scope_mode + * (written directly by the inline scope wrappers just before they call + * scope_begin). The real struct (in + * pto_runtime2.h) has the same first fields, so accessing them through + * this definition is well-defined (C struct layout guarantee).
+ */ +struct PTO2Runtime { + const PTO2RuntimeOps *ops; + PTO2ScopeMode pending_scope_mode; +}; + +// ============================================================================= +// Inline Convenience Wrappers (call through ops table) +// ============================================================================= + +// Thin alias for framework_current_runtime(); every wrapper below dereferences +// the result without a null check. +static inline PTO2Runtime *current_runtime() { return framework_current_runtime(); } + +/** + * Allocate the outputs described by `args` through ops->alloc_tensors. + * Returns an empty TaskOutputTensors without calling the runtime once + * is_fatal() reports true. + */ +static inline TaskOutputTensors alloc_tensors(const L0TaskArgs &args) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + return rt->ops->alloc_tensors(rt, args); +} + +/** + * Array form: builds an output-only L0TaskArgs from `count` create-infos and + * forwards to the overload above. If building the args records an error it + * is escalated via report_fatal(PTO2_ERROR_INVALID_ARGS) and an empty result + * is returned. + */ +static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + L0TaskArgs args; + for (uint32_t i = 0; i < count; i++) { + args.add_output(create_infos[i]); + } + if (args.has_error) { + rt->ops->report_fatal( + rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", + args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg" + ); + return TaskOutputTensors{}; + } + return alloc_tensors(args); +} + +/** + * Variadic form of the array overload: one add_output per argument, with the + * same error escalation. Non-TensorCreateInfo arguments and an empty pack are + * rejected at compile time. + */ +template +static inline TaskOutputTensors alloc_tensors(const CIs &...cis) { + static_assert(sizeof...(cis) > 0, "alloc_tensors requires at least one TensorCreateInfo"); + static_assert( + (std::is_same_v, TensorCreateInfo> && ...), + "alloc_tensors only accepts TensorCreateInfo arguments" + ); + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + L0TaskArgs args; + (args.add_output(cis), ...); + if (args.has_error) { + rt->ops->report_fatal( + rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", + args.error_msg ? 
args.error_msg : "alloc_tensors failed to construct output-only Arg" + ); + return TaskOutputTensors{}; + } + return alloc_tensors(args); +} + +/** + * Submit a task through ops->submit_task. Returns an empty TaskOutputTensors + * without calling the runtime once is_fatal() reports true. + */ +static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + return rt->ops->submit_task(rt, mixed_kernels, args); +} + +/** + * Convenience wrapper: submit an AIC-only task. + */ +static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const L0TaskArgs &args) { + MixedKernels mk; + mk.aic_kernel_id = kernel_id; + return rt_submit_task(mk, args); +} + +/** + * Convenience wrapper: submit an AIV-only task (uses AIV0 slot). + */ +static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const L0TaskArgs &args) { + MixedKernels mk; + mk.aiv0_kernel_id = kernel_id; + return rt_submit_task(mk, args); +} + +/** + * Submit a dependency-only task. Accepts the same Arg shape as rt_submit_task + * (inputs, outputs, inouts, explicit_deps, scalars) but does not run any + * AICore kernel. The task still participates in the dependency graph: it + * waits on its fanin and notifies its fanout. Useful as a synchronization + * barrier or as a placeholder producer for tests / dep-graph wiring.
+ */ +static inline TaskOutputTensors rt_submit_dummy_task(const L0TaskArgs &args) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + return rt->ops->submit_dummy_task(rt, args); +} + +/** + * Open a scope: stores `mode` in pending_scope_mode, then calls + * ops->scope_begin. Skipped entirely once the runtime is fatal. + */ +static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return; + } + rt->pending_scope_mode = mode; + rt->ops->scope_begin(rt); +} + +/** + * Forward to ops->scope_end; skipped once the runtime is fatal. + */ +static inline void rt_scope_end() { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return; + } + rt->ops->scope_end(rt); +} + +/** + * Forward to ops->orchestration_done. Unlike the wrappers above, this call + * is not gated on is_fatal(). + */ +static inline void rt_orchestration_done() { + PTO2Runtime *rt = current_runtime(); + rt->ops->orchestration_done(rt); +} + +/** + * Query ops->is_fatal for the current runtime. + */ +static inline bool rt_is_fatal() { + PTO2Runtime *rt = current_runtime(); + return rt->ops->is_fatal(rt); +} + +#define rt_report_fatal(code, fmt, ...) \ + do { \ + PTO2Runtime *_rt = current_runtime(); \ + _rt->ops->report_fatal(_rt, (code), __FUNCTION__, (fmt), ##__VA_ARGS__); \ + } while (0) + +// ============================================================================= +// Logging Macros for Orchestration (call through ops table) +// ============================================================================= + +#define LOG_ERROR(fmt, ...) current_runtime()->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__) +#define LOG_WARN(fmt, ...) current_runtime()->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__) +#define LOG_DEBUG(fmt, ...) current_runtime()->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__) + +// INFO verbosity tiers. v=0 most verbose, v=9 must-see, v=5 default. +#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__) +#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__) +#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__) +#define LOG_INFO_V3(fmt, ...) 
current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__) +#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__) +#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__) +#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__) +#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__) +#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__) +#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__) + +// ============================================================================= +// Cross-Layer Data Access +// ============================================================================= + +/** + * Read a value from a tensor at the given multi-dimensional indices. + * + * Default T = uint64_t preserves old behavior (raw bits). + * Specify T to get automatic type conversion: + * + * uint64_t raw = get_tensor_data(tensor, 1, idx); // old usage unchanged (T defaults to uint64_t) + * float val = get_tensor_data(tensor, 1, idx); // typed read: name float as the explicit template argument T + * + * If the tensor has a producer in TensorMap, spin-waits until the producer + * task completes before reading. External tensors (make_tensor_external) + * are read immediately without waiting. + * + * Once the runtime is fatal the runtime is not called and the conversion of + * a zero word (from_u64 of 0) is returned instead. + */ +template +static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return from_u64(0); + } + return from_u64(rt->ops->get_tensor_data(rt, tensor, ndims, indices)); +} + +/** + * Write a value to a tensor at the given multi-dimensional indices. 
+ * + * Type is deduced from value argument; uint64_t by default: + * + * set_tensor_data(tensor, 1, idx, raw_u64); // old usage unchanged + * set_tensor_data(tensor, 1, idx, 42.0f); // typed write (T = float) + * + * If the tensor has a producer in TensorMap, spin-waits until the producer + * and all its consumers complete before writing (WAW + WAR safety). + * External tensors (make_tensor_external) with no TensorMap entry are + * written immediately without waiting. + * + * Limitation: TensorMap only tracks producers (OUTPUT/INOUT), not consumers + * that used the tensor as INPUT. If a kernel reads this tensor as INPUT + * (not INOUT) and the tensor has no TensorMap producer entry, set_tensor_data + * cannot detect the reader and may cause a data race. + * + * To ensure WAR safety for all access patterns, use add_inout() instead of + * add_input() for kernel parameters that may later be written via + * set_tensor_data. INOUT creates a TensorMap entry that enables automatic + * consumer tracking via fanout_refcount. + * + * The tensor must already have an allocated buffer (addr != 0). + * For runtime-created outputs, call this only on the Tensor returned by + * add_output(TensorCreateInfo) after submit returns. 
+ */ +template +static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return; + } + rt->ops->set_tensor_data(rt, tensor, ndims, indices, to_u64(value)); +} + +// ============================================================================= +// C++ Scope Guards and Macros +// ============================================================================= + +/** + * RAII Scope Guard (calls through ops table) + * + * Construction stores the requested mode, reports the call-site when the + * optional scope_set_site hook is present, and calls scope_begin; + * destruction calls scope_end. Each step is skipped while the runtime + * reports fatal. + * + * Non-copyable and non-assignable: a copy would run scope_end a second time + * for a single scope_begin. + */ +class PTO2ScopeGuard { +public: + explicit PTO2ScopeGuard( + PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE() + ) : + rt_(current_runtime()) { + if (!rt_->ops->is_fatal(rt_)) { + rt_->pending_scope_mode = mode; + if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line); + rt_->ops->scope_begin(rt_); + } + } + ~PTO2ScopeGuard() { + if (!rt_->ops->is_fatal(rt_)) { + rt_->ops->scope_end(rt_); + } + } + + PTO2ScopeGuard(const PTO2ScopeGuard &) = delete; + PTO2ScopeGuard &operator=(const PTO2ScopeGuard &) = delete; + +private: + PTO2Runtime *rt_; +}; + +#define _PTO2_CONCATENATE_IMPL(x, y) x##y +#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y) + +#define PTO2_SCOPE_GUARD() [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__) + +/** + * Scoped block macro: + * PTO2_SCOPE() { + * rt_submit_task(...); + * } + */ +#define PTO2_SCOPE(...) if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true) + +// ============================================================================= +// Orchestration Config +// ============================================================================= + +/** + * Configuration exported by orchestration .so via aicpu_orchestration_config(). + * The executor reads these values to set up shared memory and runtime. + * + * This struct is defined identically in pto_runtime2.h (with an include + * guard) so the executor can use the same type without including this header.
+ */ +#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED +#define PTO2_ORCHESTRATION_CONFIG_DEFINED +struct PTO2OrchestrationConfig { + int expected_arg_count; +}; +#endif + +// Convenience layer (L0TaskArgsWithDeps + matching rt_submit_*_task overloads). +// Pulled in at the bottom so the wrapper sees L0TaskArgs, MixedKernels, and the +// rt_submit_*_task primitives defined above. Orchestration sources include +// only this single header to access both the primitive and convenience APIs. +#include "pto_arg_with_deps.h" // NOLINT(build/include_subdir) diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h new file mode 100644 index 000000000..0f73a043a --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A2A3_RUNTIME_FULLY_DISTRIBUTED_WITHIN_CORE_RUNTIME_AICORE_COMPLETION_MAILBOX_H_ +#define SRC_A2A3_RUNTIME_FULLY_DISTRIBUTED_WITHIN_CORE_RUNTIME_AICORE_COMPLETION_MAILBOX_H_ + +#include +#include + +#include "aicore_completion_mailbox_types.h" +#include "pto_constants.h" +#include "pto_task_id.h" + +// AICPU-only MPSC ring used to convey deferred-completion observations from +// FIN-handling scheduler threads to the dispatch thread. Producers push under +// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList:: +// busy) drains in seq order. Kernel-side code never touches this struct — +// AICore writes go into DeferredCompletionSlab (see +// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens +// into messages here, and forwards. + +#define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u +#define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u) + +static_assert( + (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0, + "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two" +); + +// Mailbox message discriminator. CONDITION carries one deferred-completion +// observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE +// carries the slot_state pointer in `addr` so the consumer can finalize the +// AsyncWaitEntry.slot_state binding for tasks whose conditions arrived +// before the FIN thread saw task_complete. New kinds may be added in future +// without growing the message — the `_pad[5]` slack is reserved for +// kind-specific payload extension. +#define MSG_KIND_CONDITION 0u +#define MSG_KIND_TASK_NORMAL_DONE 1u + +struct AICoreCompletionMailboxMessage { + // Per-slot ready flag. After filling the rest of the slot, the producer + // release-stores (claimed head ticket + 1) here; the consumer acquire-loads + // it and accepts the slot only when it equals its own tail + 1. 
The release-acquire pair publishes all + // other fields below as a side effect, so they stay plain. + std::atomic seq; + PTO2TaskId task_token; + // CONDITION: completion observation addr (counter / SDMA event record). + // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer + // so it can finalize the AsyncWaitEntry.slot_state binding. + uint64_t addr; + uint32_t expected_value; + uint32_t engine; + int32_t completion_type; + uint32_t kind; + uint32_t _pad[5]; +}; + +static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift"); +static_assert( + sizeof(std::atomic) == sizeof(uint64_t), + "std::atomic must be layout-compatible with uint64_t for the message slot layout to hold" +); +static_assert( + std::atomic::is_always_lock_free, + "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target" +); + +// POD view of a drained message. `seq` is the ring's publication flag, not +// payload, so try_pop copies out only the fields below (and seq is not even +// copyable — it is a std::atomic). +struct AICoreCompletionMsgView { + PTO2TaskId task_token{PTO2TaskId::invalid()}; + uint64_t addr{0}; + uint32_t expected_value{0}; + uint32_t engine{0}; + int32_t completion_type{0}; + uint32_t kind{0}; +}; + +struct AICoreCompletionMailbox { + // head and tail live on their own cache lines so producer CAS contention + // on head can't false-share with the consumer's tail updates. + alignas(PTO2_ALIGN_SIZE) std::atomic head; + uint8_t _head_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)]; + alignas(PTO2_ALIGN_SIZE) std::atomic tail; + uint8_t _tail_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)]; + alignas(PTO2_ALIGN_SIZE) AICoreCompletionMailboxMessage entries[AICORE_COMPLETION_MAILBOX_CAPACITY]; + + // Cheap, lock-free pending hint. Callers may invoke this outside the + // consumer lock; a stale answer only over/under-triggers a drain attempt. + // Read-only (two atomic loads), hence const-qualified. 
+ bool has_pending() const { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); } + + // MPSC push for a CONDITION message. Returns false when the ring is full + // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry. + // Lock-free: CAS the shared head to claim a slot, write the fields, then + // release-store seq so the single consumer observes the publication. + // + // The head CAS is relaxed: head is a pure ticket counter and carries no + // data to the consumer — publication is solely the seq release-store, and + // slot-reuse safety rests on the acquire load of tail. The relaxed failure + // order is likewise sufficient since a lost CAS just re-reads head and + // retries. compare_exchange_weak is used because this loop already re-reads + // head and re-checks fullness, so masking LL/SC spurious failures (what + // _strong adds on aarch64) would only be a redundant inner retry. + // + // Safe to call concurrently from any number of producers; structurally + // independent of the AsyncWaitList::busy lock. + // + // Note: head is sampled before tail, so a stale head can make `h - t` wrap + // and report a spurious "full". That is harmless only because callers + // retry on false as described above. + bool try_push_condition( + PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type + ) { + while (true) { + uint64_t h = head.load(std::memory_order_relaxed); + uint64_t t = tail.load(std::memory_order_acquire); + if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false; + uint64_t new_head = h + 1; + if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) { + AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK]; + slot->task_token.raw = task_token.raw; + slot->addr = addr; + slot->expected_value = expected_value; + slot->engine = engine; + slot->completion_type = completion_type; + slot->kind = MSG_KIND_CONDITION; + slot->seq.store(new_head, std::memory_order_release); + return true; + } + // CAS lost: another producer claimed the slot, retry with refreshed head. 
+ } + } + + // MPSC push for a TASK_NORMAL_DONE sentinel. Carries the PTO2TaskSlotState + // pointer in the `addr` field so the consumer can finish binding the + // AsyncWaitEntry.slot_state without going back to the FIN-handling thread. + // Same claim/publish protocol and full-ring contract as try_push_condition. + bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) { + while (true) { + uint64_t h = head.load(std::memory_order_relaxed); + uint64_t t = tail.load(std::memory_order_acquire); + if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false; + uint64_t new_head = h + 1; + if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) { + AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK]; + slot->task_token.raw = task_token.raw; + slot->addr = slot_state_addr; + slot->expected_value = 0; + slot->engine = 0; + slot->completion_type = 0; + slot->kind = MSG_KIND_TASK_NORMAL_DONE; + slot->seq.store(new_head, std::memory_order_release); + return true; + } + } + } + + // Single-consumer transport-level dequeue (caller holds the consumer lock). + // Returns false at the first not-yet-published slot (gap) or when empty; + // otherwise copies the next message in tail order into `out`, advances + // tail, and returns true. tail is consumer-only-written (relaxed read); + // head bounds the scan (relaxed); the seq acquire is the real publication + // gate; the tail release publishes "slot free" to reusing producers. 
+ bool try_pop(AICoreCompletionMsgView &out) { + uint64_t t = tail.load(std::memory_order_relaxed); + uint64_t h = head.load(std::memory_order_relaxed); + if (t >= h) return false; + AICoreCompletionMailboxMessage *slot = &entries[t & AICORE_COMPLETION_MAILBOX_MASK]; + if (slot->seq.load(std::memory_order_acquire) != t + 1) return false; + out.task_token.raw = slot->task_token.raw; + out.addr = slot->addr; + out.expected_value = slot->expected_value; + out.engine = slot->engine; + out.completion_type = slot->completion_type; + out.kind = slot->kind; + tail.store(t + 1, std::memory_order_release); + return true; + } +}; + +static_assert( + sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned" +); + +#endif // SRC_A2A3_RUNTIME_FULLY_DISTRIBUTED_WITHIN_CORE_RUNTIME_AICORE_COMPLETION_MAILBOX_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h new file mode 100644 index 000000000..da0d89ad7 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A2A3_RUNTIME_FULLY_DISTRIBUTED_WITHIN_CORE_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_ +#define SRC_A2A3_RUNTIME_FULLY_DISTRIBUTED_WITHIN_CORE_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_ + +#include + +#include "pto_constants.h" + +// Types shared across the AICore↔AICPU boundary. +// +// This header is reachable from AICore-side translation units (via +// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h) +// and must stay parseable by every AICore toolchain configuration: no +// std::atomic header, no __atomic_* intrinsics, no MPSC ring buffer struct. +// +// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in +// aicore_completion_mailbox.h, which is AICPU-only. + +inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64; + +#define COMPLETION_ENGINE_SDMA 0u +#define COMPLETION_ENGINE_ROCE 1u +#define COMPLETION_ENGINE_URMA 2u +#define COMPLETION_ENGINE_CCU 3u + +#define COMPLETION_TYPE_COUNTER 0 +#define COMPLETION_TYPE_SDMA_EVENT_RECORD 1 + +// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch +// area that AICore writes into to record "this completion has to be observed +// before the task can retire." The FIN-handling scheduler thread reads the +// slab, flattens entries into AICoreCompletionMailbox messages, and forwards +// them to the dispatch thread. `volatile` on count / error_code is +// load-bearing: writers live on AICore and readers on AICPU, so the qualifier +// pins the compiler against caching those two fields on either side. +// NOTE(review): entries[] itself is not volatile-qualified, so the qualifier +// does not order the entry stores against the count update — confirm the +// writer fences entries[] before bumping count. 
+struct DeferredCompletionEntry { + uint64_t addr; + uint32_t expected_value; + uint32_t engine; + int32_t completion_type; + uint32_t _pad; +}; + +static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift"); + +struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab { + volatile uint32_t count; + volatile int32_t error_code; + DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK]; +}; + +static_assert( + sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0, + "DeferredCompletionSlab size must preserve array element cache-line boundaries" +); + +#endif // SRC_A2A3_RUNTIME_FULLY_DISTRIBUTED_WITHIN_CORE_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h new file mode 100644 index 000000000..49ee7cc11 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#pragma once + +#include + +#include +#include + +#include "pto_async_kernel_api.h" +#include "aicore_completion_mailbox_types.h" +#include "pto_runtime_status.h" + +#ifndef __aicore__ +#define __aicore__ +#endif +#ifndef __gm__ +#define __gm__ +#endif + +// Re-exposed PTO-ISA constant so examples / callers don't need to include +// just to spell their scratch tile. +inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE; + +enum class SdmaOp : uint8_t { + TGET = 0, + TPUT = 1, +}; + +// SdmaRequestDescriptor bundles everything send_request_entry needs to drive +// one SDMA transfer + completion registration. It is a template because the +// destination / source / scratch types carry tensor shape & stride at compile +// time; the SdmaTget() / SdmaTput() helpers below let callers skip the +// template arguments. +// +// sync_id selects which event-record slot inside the workspace the engine +// writes into. Concurrent dispatches must use distinct sync_ids; today every +// caller submits one request per kernel invocation so passing 0 is safe. +// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2) +// will fold sync_id allocation into the adapter. 
+template +struct SdmaRequestDescriptor { + SdmaOp op; + DstTensor dst; + SrcTensor src; + ScratchTileT scratch; + __gm__ uint8_t *workspace; + uint32_t sync_id; +}; + +template +inline __aicore__ SdmaRequestDescriptor SdmaTget( + const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, + uint32_t sync_id = 0 +) { + return SdmaRequestDescriptor{SdmaOp::TGET, dst, src, + scratch, workspace, sync_id}; +} + +template +inline __aicore__ SdmaRequestDescriptor SdmaTput( + const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, + uint32_t sync_id = 0 +) { + return SdmaRequestDescriptor{SdmaOp::TPUT, dst, src, + scratch, workspace, sync_id}; +} + +namespace pto2::detail { + +inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) { + CompletionToken token{ + reinterpret_cast(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0 + }; + (void)register_completion_condition(ctx, token); +} + +template +inline __aicore__ void +register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) { + if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) { + (void)event.Wait(session); + return; + } + if (event.handle == 0) { + return; + } + + const uint32_t engine = static_cast(event.engine); + if (engine != static_cast(::pto::comm::DmaEngine::SDMA)) { + defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID); + return; + } + + ::pto::comm::sdma::detail::UbTmpBuf tmp_buf; + uint32_t sync_id = 0; + __gm__ uint8_t *recv_workspace = nullptr; + uint32_t queue_num = 0; + if (!::pto::comm::sdma::detail::PrepareEventCheck( + session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num + )) { + defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID); + return; + } + for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) { + 
register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id)); + } +} + +} // namespace pto2::detail + +// SDMA overload of the runtime's send_request_entry. Submits the descriptor +// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the +// AsyncCtx deferred-wait slab and flushes. Returns false on submit/session +// failure (also records the error in ctx.completion_error_code). +template +inline __aicore__ bool +send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor desc) { + pto::comm::AsyncSession session; + if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) { + pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID); + return false; + } + + pto::comm::AsyncEvent event; + if (desc.op == SdmaOp::TGET) { + event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session); + } else { + event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session); + } + pto2::detail::register_pto_async_event(ctx, event, session); + pto2::detail::defer_flush(ctx); + return true; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h new file mode 100644 index 000000000..689219c35 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#pragma once + +#include +#include + +#include "aicpu/platform_regs.h" +#include "aicore_completion_mailbox.h" +#include "pto_completion_token.h" +#include "pto_runtime_status.h" + +// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only +// allowed holder of this ABI knowledge; the generic scheduler dispatches into +// the helpers below through the completion ops table. +struct SdmaEventRecord { + uint32_t flag; + uint32_t sq_tail; + uint64_t channel_info; +}; + +static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift"); +static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift"); + +inline uintptr_t sdma_completion_cache_line(const volatile void *addr) { + return reinterpret_cast(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); +} + +inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) { + if (record_addr == 0) { + return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID}; + } + volatile SdmaEventRecord *record = + reinterpret_cast(static_cast(record_addr)); + cache_invalidate_range(reinterpret_cast(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE); + uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE); + return {flag != 0 ? 
CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE}; +} + +inline void retire_sdma_event_record(uint64_t record_addr) { + if (record_addr == 0) return; + volatile SdmaEventRecord *record = + reinterpret_cast(static_cast(record_addr)); + cache_invalidate_range(reinterpret_cast(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE); + uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE); + uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE); + + volatile uint64_t *record_head = reinterpret_cast(record); + __atomic_store_n(record_head, 0ULL, __ATOMIC_RELEASE); + cache_flush_range(const_cast(reinterpret_cast(record_head)), sizeof(uint64_t)); + + if (channel_info_addr == 0) return; + uint64_t packed = (static_cast(completed_tail) << 32) | static_cast(completed_tail); + volatile uint64_t *channel_info = reinterpret_cast(static_cast(channel_info_addr)); + __atomic_store_n(channel_info, packed, __ATOMIC_RELEASE); + cache_flush_range(const_cast(reinterpret_cast(channel_info)), sizeof(uint64_t)); +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/common.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/common.h new file mode 100644 index 000000000..9dcf438ed --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/common.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#pragma once + +#include +#include + +// Assertion macros (always_assert / debug_assert), AssertionError, and the +// MAYBE_UNINITIALIZED diagnostics live in the shared header so the unified +// Tensor (src/common/task_interface/tensor.h) can use them without depending +// on this runtime-specific header. assert_impl / get_stacktrace are defined in +// orchestration/common.cpp for runtime targets. +#include "assert_compat.h" + +// Framework-internal TLS bridge. The executor binds the current thread's +// runtime before invoking the orchestration entry, so orchestration helpers can +// fetch the current PTO2Runtime without explicit parameter threading. Declared +// here (rather than in pto_orchestration_api.h) so framework TUs the AICore +// build also compiles — notably orchestration/common.cpp — see these symbols +// without pulling in pto_types.h, whose Arg::add_scalar → to_u64 path is +// __aicore__-only and would break the ccec build. +#ifdef __cplusplus +extern "C" { +#endif +struct PTO2Runtime; +PTO2Runtime *framework_current_runtime(void); +void framework_bind_runtime(PTO2Runtime *rt); +#ifdef __cplusplus +} +#endif diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.cpp new file mode 100644 index 000000000..6525ef7ba --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.cpp @@ -0,0 +1,1195 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * fully_distributed_within_core engine. + * + * SPMD orchestration + scheduling + execution on the AI cores. See + * docs/fully_distributed_within_core.md for the authoritative design and + * src/.../docs/RUNTIME_LOGIC.md for the local overview. + * + * Each AICore worker thread runs dist_core_main(), which: + * 1. replays the full orchestration submit stream (every core builds an + * identical per-core TensorMap and computes identical deterministic GM + * output-heap addresses; only ownership differs); + * 2. on each rt_submit_*, races to claim the task on one of two global + * cursors (cube for AIC-anchored, vector for AIV-only). The winner is + * owner = builder = executor and builds the task into its private ring; + * 3. runs a run-ahead loop: orchestrate until the private ring is full + * (back-pressure drains ready tasks inline), then after orchestration + * returns, drains the ring to completion. A task is ready once all its + * fan-in producers have set their entry in the global completion-flag + * ring; on completion the owner sets its own flag (release). + * + * This file is compiled into the AICPU .so (build_config aicore source_dirs do + * not include runtime/), but dist_core_main runs ON the AICore worker threads + * (invoked through a function pointer), so kernels execute on AICore threads + * with their sim TLS in place. + * + * M2 scope: single-core tasks (1C / 1V) only — sufficient for benchmark_bgemm. + * Multi-core co-ownership (MIX / 2V, block.won) is M3; GM heap reclamation is + * M4. 
A MIX task encountered in M2 raises a fatal error. + */ + +#include "dist_engine.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "callable.h" +#include "common/core_type.h" +#include "intrinsic.h" +#include "pto2_dispatch_payload.h" +#include "pto_constants.h" +#include "pto_runtime2.h" +#include "pto_submit_types.h" +#include "pto_types.h" +#include "runtime.h" +#include "spin_hint.h" +#include "tensor.h" +#include "tensor_create_info.h" + +namespace { + +// ----------------------------------------------------------------------------- +// Tunables. The completion-flag ring is sized to hold an entire run without +// wrap (>= total tasks); the GM output heap is a BOUNDED RING reclaimed by the +// completion frontier (M4, §9.5/§11.4) rather than a run-sized bump. +// ----------------------------------------------------------------------------- +constexpr int32_t kPrivateSlots = 8; // PRIVATE_TASK_SLOT_NUM (back-pressure cap) +// Ring slots a core reserves for draining block.won deposits addressed to its +// lane. Self-claimed tasks (consumers / single-core / own anchor subtask) may +// only occupy kPrivateSlots - kWonReserve slots, so a follower can ALWAYS pull +// and run an (immediately-ready) deposit even when its ring is otherwise full of +// not-yet-ready consumers — breaking the consumer<->deposit priority inversion. +constexpr int32_t kWonReserve = 2; +constexpr int32_t kMaxFanin = 16; // max distinct producers a task waits on +constexpr int32_t kOutPoolSlots = 1024; // per-core ring of materialized output Tensors +constexpr int32_t kMapCap = 16384; // per-core producer-map capacity (distinct regions) +constexpr int32_t kFlagCap = 1 << 16; // global completion-flag ring (>= total tasks) + +// M4 GM-heap reclamation (§9.5/§11.4). +// kHeapRingDefault — bounded physical heap ring (env PTO_DIST_HEAP_MB overrides, +// in MiB). 
The deterministic virtual bump is unbounded; physical address is +// (virtual_offset mod ring). A region is reused only after its previous +// occupant's task id <= R (the reclaim frontier), enforced by back-pressure. +// kHDefault — dependency-span bound H (env PTO_DIST_H overrides): every consumer +// of task N has id <= N + H. R = F - H. Must be >= the graph's true heap span +// or a producer region could be recycled while a late consumer still reads it +// (run-time-checked → fatal "heap span exceeded"). +constexpr size_t kHeapRingDefault = 64ull << 20; +constexpr int32_t kHDefault = 64; + +// ----------------------------------------------------------------------------- +// Per-core producer map (the "full per-core duplicate TensorMap"). +// +// A faithful, compact stand-in for PTO2TensorMap: keyed by GM byte range, it +// records the most recent producer task id of each written region. INPUT/INOUT +// fan-in resolves to the producer(s) whose region overlaps. Exact-region writes +// (e.g. an INOUT accumulation chain) replace in place; new regions append. +// Every core builds an identical map by replaying the same submit stream. +// ----------------------------------------------------------------------------- +struct MapEntry { + uint64_t buf_addr; // Tensor.buffer.addr (GM buffer base, bytes) + uint64_t lo; // byte offset of view origin within buffer + uint64_t hi; // byte offset one-past the view extent + int32_t producer; // task id that last wrote this region +}; + +struct DistTensorMap { + MapEntry entries[kMapCap]; + int32_t count; + + void reset() { count = 0; } + + static void byte_range(const Tensor &t, uint64_t &addr, uint64_t &lo, uint64_t &hi) { + const uint64_t esz = get_element_size(t.dtype); + addr = t.buffer.addr; + lo = t.start_offset * esz; + hi = (t.start_offset + t.extent_elem()) * esz; + } + + // Record `producer` as the writer of `t`'s region (replace exact match). 
+ void insert(const Tensor &t, int32_t producer) { + uint64_t addr, lo, hi; + byte_range(t, addr, lo, hi); + for (int32_t i = 0; i < count; i++) { + MapEntry &e = entries[i]; + if (e.buf_addr == addr && e.lo == lo && e.hi == hi) { + e.producer = producer; + return; + } + } + if (count < kMapCap) { + entries[count++] = MapEntry{addr, lo, hi, producer}; + } + } + + // Most-recent producer whose region overlaps `t`, or -1 if none. + int32_t lookup(const Tensor &t) const { + uint64_t addr, lo, hi; + byte_range(t, addr, lo, hi); + int32_t best = -1; + for (int32_t i = 0; i < count; i++) { + const MapEntry &e = entries[i]; + if (e.buf_addr == addr && lo < e.hi && e.lo < hi) { + if (e.producer > best) best = e.producer; + } + } + return best; + } +}; + +// ----------------------------------------------------------------------------- +// A private-ring slot: a fully materialized, self-contained task this core owns +// and will execute itself. Holds its own copy of the argument Tensors so it can +// be executed at any later point (deferred past further orchestration). +// ----------------------------------------------------------------------------- +struct RingSlot { + bool occupied; + // A slot can be reserved (occupied=true) before it is fully populated: the + // submit winner grabs a slot up front so concurrent drains do not reuse it, + // then may spin in block.won back-pressure (which itself drains Phase B) + // before calling build_ring_slot. `built` gates execution so Phase B never + // (re)runs a reserved-but-unbuilt slot still holding a prior occupant's + // task_id/fanin/won linkage. build_ring_slot sets it; execute_slot clears it. 
+ bool built; + int32_t task_id; + uint64_t function_bin_addr; + + int32_t tensor_count; + int32_t scalar_count; + Tensor tensors[MAX_TENSOR_ARGS]; + uint64_t scalars[MAX_SCALAR_ARGS]; + + uint64_t args[PTO2_DISPATCH_MAX_ARGS]; + LocalContext local_ctx; + GlobalContext global_ctx; + + int32_t fanin[kMaxFanin]; + int32_t fanin_count; + + // Multi-core (MIX / 2V) linkage. When is_multicore, the completion flag for + // task_id is owned jointly: each co-owner decrements block.won[won_slot].remaining + // after executing its own subtask, and the one driving it to zero publishes + // the single global task_completed_flag. Single-core tasks set the flag directly. + bool is_multicore; + int32_t won_block; + int32_t won_slot; +}; + +// ----------------------------------------------------------------------------- +// block.won — the id-keyed anchor→follower deposit table (block-shared, §3.1). +// One BlockWon per physical block (1 AIC + 2 AIV). The anchor that wins a +// multi-core task builds its OWN physical-lane subtask into its private ring and +// deposits the remaining active-lane subtasks here; followers asynchronously +// drain the entry addressed to their physical lane (no blocking, no per-walk +// wait). Keyed by task id via per-slot task_id so concurrent multi-core tasks of +// one block never alias. `remaining` = popcount(active_mask) drives the single +// completion flag (§3.1). Lane index uses PTO2SubtaskSlot (AIC=0/AIV0=1/AIV1=2). +// ----------------------------------------------------------------------------- +struct BuiltSubtask { + bool present; + uint64_t function_bin_addr; + int32_t tensor_count; + int32_t scalar_count; + Tensor tensors[MAX_TENSOR_ARGS]; + uint64_t scalars[MAX_SCALAR_ARGS]; + int32_t fanin[kMaxFanin]; + int32_t fanin_count; + int32_t sub_block_id; +}; + +struct WonSlot { + std::atomic state; // 0=free, 1=published, 2=reserving + int32_t task_id; + std::atomic remaining; // co-owners (incl. 
anchor) left to finish + std::atomic drained[PTO2_SUBTASK_SLOT_COUNT]; // 0/1 per follower lane + BuiltSubtask lane[PTO2_SUBTASK_SLOT_COUNT]; // deposited follower subtasks +}; + +struct BlockWon { + WonSlot slots[kPrivateSlots]; +}; + +enum LaneId : int32_t { LANE_AIC = 0, LANE_AIV0 = 1, LANE_AIV1 = 2, LANE_NONE = -1 }; + +struct CoreLayout { + int32_t block_id; // physical block index + int32_t lane; // LaneId of this core within its block +}; + +// ----------------------------------------------------------------------------- +// Per-core engine state (the SPMD worker context). +// ----------------------------------------------------------------------------- +struct DistCore { + CoreType role; + int32_t block_id; // physical block this core belongs to + int32_t lane; // LaneId within the block (AIC / AIV0 / AIV1) + int32_t sub_block_id; + int32_t local_index; // next task id this core will see (== tasks replayed) + uint64_t heap_next; // deterministic GM output-heap bump cursor (bytes) + + DistTensorMap map; + + RingSlot slots[kPrivateSlots]; + int32_t occupied_count; + int32_t owned_total; // tasks this core claimed+executed (debug) + + Tensor outpool[kOutPoolSlots]; + int32_t outpool_head; + + void reset(CoreType r, int32_t block, int32_t lane_id) { + role = r; + block_id = block; + lane = lane_id; + sub_block_id = (lane_id == LANE_AIV1) ? 1 : 0; + local_index = 0; + heap_next = 0; + map.reset(); + occupied_count = 0; + owned_total = 0; + outpool_head = 0; + for (int32_t i = 0; i < kPrivateSlots; i++) { + slots[i].occupied = false; + slots[i].built = false; + } + } +}; + +// ----------------------------------------------------------------------------- +// Global engine state (shared by all worker threads in this process). Cursors + +// flags live here rather than in GM because in sim every core is a host thread +// in one address space; the GM output heap below is a real shared buffer. 
+// ----------------------------------------------------------------------------- +struct DistGlobal { + std::atomic cube_cursor; // highest claimed AIC-anchored task id + std::atomic vector_cursor; // highest claimed AIV-only task id + std::atomic flags[kFlagCap]; // completion-flag ring (1 == task done) + + // M4 reclamation (§9.5/§11.4). `frontier` (F) is the global continuous + // completion frontier — the largest prefix s.t. every task id <= F is done; + // advanced cooperatively (CAS) by whichever core sets the flag that extends + // the prefix. `R = frontier - H` is the reclaim frontier. `vend[N]` is the + // cumulative virtual heap bytes through task N (deterministic & identical on + // every core), so any core can compute the live byte window [vend[R], top). + std::atomic frontier; + int32_t H; + std::atomic vend[kFlagCap]; + + uint8_t *heap_base; + size_t heap_size; // == bounded ring size + + DistOrchFunc orch_func; + const L2TaskArgs *orch_args; + PTO2Runtime *rt; + Runtime *runtime; // outer Runtime (for kernel-address resolution + done_count) + + std::atomic fatal; + + // Physical-block topology (1 AIC + 2 AIV per block), derived once at register + // time from Runtime::workers[].core_type, identical to the centralized + // scheduler's cluster discovery (AIC core b pairs with the 2b-th / (2b+1)-th + // AIV cores in worker-index order). + int32_t num_workers; + int32_t num_blocks; + CoreLayout layout[RUNTIME_MAX_WORKER]; + BlockWon blocks[RUNTIME_MAX_WORKER]; // indexed by block_id (<= num AIC) + + // Global "all cores finished orchestration replay" counter. A follower must + // not conclude "no more pushes are coming for my lane" until every core has + // finished replaying the submit stream (§7 tail-idle). + std::atomic replay_done; + + DistCore cores[RUNTIME_MAX_WORKER]; +}; + +DistGlobal g_dist; +thread_local DistCore *g_self = nullptr; + +// Opt-in per-core tracing (set PTO_DIST_TRACE=1). 
Off by default so a passing +// run is quiet; fatal/error/heap-exhaustion diagnostics are always emitted. +inline bool dist_trace() { + static const bool on = (getenv("PTO_DIST_TRACE") != nullptr); + return on; +} + +// ----------------------------------------------------------------------------- +// Fatal / claim / execution helpers +// ----------------------------------------------------------------------------- +inline bool fatal_set() { return g_dist.fatal.load(std::memory_order_acquire) != 0; } +inline void set_fatal() { g_dist.fatal.store(1, std::memory_order_release); } + +void dist_dump_state(int); // defined below; dumps full engine state for hangs + +// Env-gated stall watchdog (set PTO_DIST_WATCHDOG=, default off). Called +// from inside the engine's spin loops on a worker thread (so fprintf is safe, +// unlike a signal handler). On the first call it records a start time; if a loop +// keeps spinning past the budget the engine is presumed deadlocked, so it dumps +// the full state once and sets fatal to unwind every core for a fast, diagnosed +// failure instead of an indefinite hang. +inline void watchdog(uint64_t &start_ns) { + static const long budget_s = []() -> long { + const char *e = getenv("PTO_DIST_WATCHDOG"); + return e ? atol(e) : 0; + }(); + if (budget_s <= 0) return; + const uint64_t now = static_cast( + std::chrono::duration_cast(std::chrono::steady_clock::now().time_since_epoch()) + .count()); + if (start_ns == 0) { + start_ns = now; + return; + } + if (now - start_ns > static_cast(budget_s) * 1000000000ull) { + static std::atomic dumped{0}; + int32_t exp = 0; + if (dumped.compare_exchange_strong(exp, 1, std::memory_order_acq_rel)) { + fprintf(stderr, "[dist_engine] WATCHDOG fired after %lds — presumed deadlock, dumping state\n", budget_s); + dist_dump_state(0); + } + set_fatal(); + } +} + +// CAS-loop fetch_max (§11.1): returns true (WON) iff this core advanced the +// cursor to N. 
No hardware fetch_max on the target, so this is the equivalent +// acq-rel CAS retry. Monotonic: each task id is claimed by exactly one core and +// no id is skipped within a cursor's subsequence. +bool claim(std::atomic &cursor, int32_t N) { + int32_t c = cursor.load(std::memory_order_acquire); + while (true) { + if (N <= c) return false; + if (cursor.compare_exchange_weak(c, N, std::memory_order_acq_rel, std::memory_order_acquire)) return true; + } +} + +// Cooperatively advance the global completion frontier F (§11.4): after any core +// publishes flag(N), the contiguous-done prefix may have grown, so any core walks +// F forward while flag(F+1) is set. Lock-free; the CAS makes exactly one core win +// each step and the cost is amortized across all cores. +void advance_frontier() { + int32_t f = g_dist.frontier.load(std::memory_order_acquire); + while (true) { + const int32_t next = f + 1; + if (next >= kFlagCap) break; + if (g_dist.flags[next & (kFlagCap - 1)].load(std::memory_order_acquire) == 0) break; + if (g_dist.frontier.compare_exchange_weak(f, next, std::memory_order_acq_rel, std::memory_order_acquire)) { + f = next; + } + // On CAS failure f was reloaded with the current value; retry. + } +} + +// Resolve a kernel id to its executable address (CoreCallable::resolved_addr()). +uint64_t resolve_kernel_addr(Runtime *runtime, int32_t kernel_id) { + if (kernel_id == INVALID_KERNEL_ID) return 0; + uint64_t callable_addr = runtime->get_function_bin_addr(kernel_id); + if (callable_addr == 0) return 0; + const CoreCallable *callable = reinterpret_cast(callable_addr); + return callable->resolved_addr(); +} + +// Execute one owned task, then publish its completion flag (release). In sim all +// cores share the address space, so the release/acquire pair is the visibility +// barrier between the kernel's output writes and a consumer's input reads. 
+void execute_slot(RingSlot &s) { + typedef void (*KernelFn)(int64_t *); + if (s.function_bin_addr != 0) { + KernelFn fn = reinterpret_cast(s.function_bin_addr); + fn(reinterpret_cast(s.args)); + } + if (s.is_multicore) { + // Joint ownership: the co-owner that drives remaining to zero (the last + // subtask to finish) publishes the single global completion flag (§3.1), + // then frees the block.won entry for reuse. + WonSlot &w = g_dist.blocks[s.won_block].slots[s.won_slot]; + if (w.remaining.fetch_sub(1, std::memory_order_acq_rel) == 1) { + g_dist.flags[s.task_id & (kFlagCap - 1)].store(1, std::memory_order_release); + w.state.store(0, std::memory_order_release); // recycle the id-keyed slot + advance_frontier(); + } + } else { + g_dist.flags[s.task_id & (kFlagCap - 1)].store(1, std::memory_order_release); + advance_frontier(); + } + s.built = false; + s.occupied = false; +} + +// Phase B: execute every ready owned task in the private ring. A task is ready +// once all its fan-in producers have set their completion flag (acquire). +// Returns the number of slots freed this pass. +int32_t drain_phase_b(DistCore *self) { + int32_t freed = 0; + for (int32_t i = 0; i < kPrivateSlots; i++) { + RingSlot &s = self->slots[i]; + if (!s.occupied || !s.built) continue; // skip reserved-but-unbuilt slots + bool ready = true; + for (int32_t f = 0; f < s.fanin_count; f++) { + if (g_dist.flags[s.fanin[f] & (kFlagCap - 1)].load(std::memory_order_acquire) == 0) { + ready = false; + break; + } + } + if (!ready) continue; + execute_slot(s); + self->occupied_count--; + freed++; + } + return freed; +} + +int32_t alloc_ring_slot(DistCore *self) { + for (int32_t i = 0; i < kPrivateSlots; i++) { + if (!self->slots[i].occupied) return i; + } + return -1; +} + +// Kernel id for a physical lane (AIC/AIV0/AIV1) of a MixedKernels. 
+inline int32_t kernel_id_for_lane(const MixedKernels &mixed, int32_t lane) { + switch (lane) { + case LANE_AIC: return mixed.aic_kernel_id; + case LANE_AIV0: return mixed.aiv0_kernel_id; + case LANE_AIV1: return mixed.aiv1_kernel_id; + default: return INVALID_KERNEL_ID; + } +} + +inline bool lane_active(const ActiveMask &M, int32_t lane) { + return M.subtask_active(static_cast(lane)); +} + +// Materialize a private-ring slot from already-resolved components (shared by the +// owner build path and the follower drain path). `tensors`/`scalars` are copied +// in; args[] is (re)built to point at this slot's own copies so the slot is +// self-contained and executable at any later time. +void build_ring_slot( + RingSlot &s, int32_t task_id, uint64_t fn_addr, const Tensor *tensors, int32_t tc, const uint64_t *scalars, + int32_t sc, const int32_t *fanin, int32_t fc, int32_t sub_block_id, bool is_multicore, int32_t won_block, + int32_t won_slot +) { + s.occupied = true; + s.task_id = task_id; + s.function_bin_addr = fn_addr; + s.built = true; // fully populated below — now safe for Phase B to execute + s.tensor_count = tc; + s.scalar_count = sc; + for (int32_t i = 0; i < tc; i++) s.tensors[i].copy(tensors[i]); + for (int32_t j = 0; j < sc; j++) s.scalars[j] = scalars[j]; + int32_t n = 0; + for (int32_t i = 0; i < tc; i++) s.args[n++] = reinterpret_cast(&s.tensors[i]); + for (int32_t j = 0; j < sc; j++) s.args[n++] = s.scalars[j]; + s.local_ctx.block_idx = 0; + s.local_ctx.block_num = 1; + s.local_ctx.async_ctx = AsyncCtx{}; + s.global_ctx.sub_block_id = sub_block_id; + s.args[SPMD_LOCAL_CONTEXT_INDEX] = reinterpret_cast(&s.local_ctx); + s.args[SPMD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast(&s.global_ctx); + s.fanin_count = fc; + for (int32_t k = 0; k < fc; k++) s.fanin[k] = fanin[k]; + s.is_multicore = is_multicore; + s.won_block = won_block; + s.won_slot = won_slot; +} + +// Reserve a free block.won slot in `block`. Returns slot index or -1 if full. 
+// 2V allows either AIV of the block to be an anchor, so allocation must be atomic.
+int32_t alloc_won_slot(int32_t block) {
+    BlockWon &bw = g_dist.blocks[block];  // caller must pass a valid block index (no bounds check here)
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        int32_t exp = 0;
+        if (bw.slots[i].state.compare_exchange_strong(exp, 2, std::memory_order_acq_rel, std::memory_order_relaxed)) {  // 0 = free -> 2 = reserved while the anchor fills it
+            return i;
+        }
+    }
+    return -1;
+}
+
+// True if a published block.won deposit for this core's lane has not yet been
+// taken — used by the termination check to avoid finishing before draining.
+bool has_pending_won(DistCore *self) {
+    if (self->lane == LANE_AIC || self->lane == LANE_NONE) return false;  // such cores never act as followers
+    BlockWon &bw = g_dist.blocks[self->block_id];
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        WonSlot &w = bw.slots[i];
+        if (w.state.load(std::memory_order_acquire) != 1) continue;  // only state 1 (published) slots count
+        if (w.lane[self->lane].present && w.drained[self->lane].load(std::memory_order_acquire) == 0) return true;
+    }
+    return false;
+}
+
+// Follower drain (§3.1, §6): pull every published block.won subtask addressed to
+// this core's physical lane that we have not yet taken, building each into a free
+// private-ring slot (back-pressure: stop when the ring is full). Non-blocking —
+// if nothing is addressed to us we simply return.
+void drain_block_won(DistCore *self) {
+    if (self->lane == LANE_AIC || self->lane == LANE_NONE) return;  // AIC is never a follower
+    BlockWon &bw = g_dist.blocks[self->block_id];
+    for (int32_t i = 0; i < kPrivateSlots; i++) {
+        WonSlot &w = bw.slots[i];
+        if (w.state.load(std::memory_order_acquire) != 1) continue;
+        if (!w.lane[self->lane].present) continue;
+        int32_t exp = 0;
+        if (!w.drained[self->lane].compare_exchange_strong(exp, 1, std::memory_order_acq_rel, std::memory_order_relaxed))
+            continue;  // already taken by us on a prior pass
+        int32_t si = alloc_ring_slot(self);
+        if (si < 0) {
+            // Ring full: hand the deposit back and let Phase B free a slot first.
+            w.drained[self->lane].store(0, std::memory_order_release);
+            return;
+        }
+        const BuiltSubtask &b = w.lane[self->lane];
+        build_ring_slot(
+            self->slots[si], w.task_id, b.function_bin_addr, b.tensors, b.tensor_count, b.scalars, b.scalar_count,
+            b.fanin, b.fanin_count, b.sub_block_id, /*is_multicore=*/true, self->block_id, i
+        );
+        self->occupied_count++;
+        self->owned_total++;
+    }
+}
+
+// -----------------------------------------------------------------------------
+// Distributed submit op (replaces the centralized orchestrator submit).
+//
+// Every core runs this for every task (identical replay): materialize outputs
+// at deterministic heap addresses, maintain the per-core producer map, then
+// race to claim ownership. Only the winner builds the task into its private
+// ring; losers return with map + outputs updated so downstream get_ref() and
+// fan-in resolution stay consistent across cores.
+// -----------------------------------------------------------------------------
+TaskOutputTensors dist_submit_impl(PTO2Runtime *, const MixedKernels &mixed, const L0TaskArgs &args) {
+    DistCore *self = g_self;
+    if (self == nullptr) return TaskOutputTensors{};
+    Runtime *runtime = g_dist.runtime;
+
+    const int32_t N = self->local_index++;
+    const ActiveMask M = mixed.to_active_mask();
+    const int32_t tc = args.tensor_count();
+    if (N >= kFlagCap) {  // flag ring + vend[] are non-windowed; cap total tasks
+        set_fatal();
+        fprintf(stderr, "[dist_engine] task id %d exceeds kFlagCap %d (enlarge or window the flag/vend rings)\n", N,
+                kFlagCap);
+        return TaskOutputTensors{};
+    }
+
+    // (a) Deterministic GM output-heap allocation + materialization (§9.3, §11.4).
+    // The virtual bump `heap_next` is unbounded and identical on every core; the
+    // PHYSICAL address is (virtual mod ring). First sum this task's aligned output
+    // bytes so we can keep the whole task within one ring lap: if it would straddle
+    // the ring end, pad the virtual base up to the next ring boundary (deterministic
+    // → every core agrees). A single task larger than the ring is unsatisfiable.
+    const size_t ring = g_dist.heap_size;
+    uint64_t total = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        total += PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+    }
+    uint64_t task_base = PTO2_ALIGN_UP(self->heap_next, PTO2_PACKED_OUTPUT_ALIGN);
+    if (total > 0 && g_dist.heap_base != nullptr) {
+        if (total > ring) {
+            set_fatal();
+            fprintf(stderr, "[dist_engine] task %d outputs %llu B exceed heap ring %zu B (enlarge PTO_DIST_HEAP_MB)\n",
+                    N, (unsigned long long)total, ring);
+            return TaskOutputTensors{};
+        }
+        if ((task_base % ring) + total > ring) {
+            task_base = ((task_base / ring) + 1) * ring;  // skip the ring tail; start next lap
+        }
+    }
+    uint64_t off = 0;
+    TaskOutputTensors result;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        const TensorCreateInfo &ci = args.tensor(i).create_info();
+        const uint64_t logical = ci.buffer_size_bytes();
+        const uint64_t sz = PTO2_ALIGN_UP(logical, PTO2_PACKED_OUTPUT_ALIGN);
+        if (g_dist.heap_base == nullptr) {
+            set_fatal();
+            fprintf(stderr, "[dist_engine] GM output heap not allocated at task %d\n", N);
+            return result;
+        }
+        const uint64_t phys = (task_base + off) % ring;  // straddle-pad guarantees phys+logical <= ring
+        Tensor &slot_t = self->outpool[self->outpool_head];
+        self->outpool_head = (self->outpool_head + 1) % kOutPoolSlots;
+        init_tensor_from_create_info(slot_t, ci, g_dist.heap_base + phys, logical);
+        result.materialize_output(slot_t);
+        off += sz;
+    }
+    self->heap_next = task_base + off;
+    // Publish cumulative virtual bytes through task N so any core can derive the
+    // live window [vend[R], heap_next) for reclaim back-pressure. Deterministic, so
+    // all cores store the same value (this core later reads back its own vend[] stores in the reclaim test).
+    if (N >= 0 && N < kFlagCap) g_dist.vend[N].store(self->heap_next, std::memory_order_relaxed);  // restored: this statement had been fused into the comment above; same form as in dist_alloc_tensors
+
+    // Once fatal, stop claiming/executing but keep replaying the deterministic
+    // allocation above so this task's `result` carries valid (materialized) output
+    // refs — the orchestration may still call get_ref() on them. This degrades a
+    // fatal (e.g. heap-too-small) into a clean wrong-answer failure + diagnostic
+    // rather than an assertion crash mid-replay.
+    if (fatal_set()) return result;
+
+    // (b) Fan-in resolution: look up producers of INPUT/INOUT regions BEFORE
+    // this task registers its own writes.
+    int32_t fanin[kMaxFanin];
+    int32_t fc = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        const TensorArgType tag = args.tag(i);
+        if (tag != TensorArgType::INPUT && tag != TensorArgType::INOUT) continue;
+        const Tensor &t = args.tensor(i).ref();
+        if (t.manual_dep) continue;
+        const int32_t p = self->map.lookup(t);
+        if (p < 0) continue;
+        bool dup = false;
+        for (int32_t k = 0; k < fc; k++)
+            if (fanin[k] == p) {
+                dup = true;
+                break;
+            }
+        if (!dup && fc < kMaxFanin) fanin[fc++] = p;  // NOTE(review): distinct producers beyond kMaxFanin are dropped with no diagnostic
+    }
+
+    // (c) Register this task as the producer of its OUTPUT / INOUT / existing
+    // outputs (every core, so all maps stay identical).
+    uint32_t out_idx = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        const TensorArgType tag = args.tag(i);
+        if (tag == TensorArgType::OUTPUT) {
+            self->map.insert(result.get_ref(out_idx), N);
+            out_idx++;
+        } else if (tag == TensorArgType::INOUT || tag == TensorArgType::OUTPUT_EXISTING) {
+            self->map.insert(args.tensor(i).ref(), N);
+        }
+    }
+
+    // (d) Assemble the shared argument Tensors once (identical for every active
+    // lane of a multi-core task — they operate on the same task tensors, each
+    // lane writing its designated output(s) per the kernels). Inputs are copied
+    // from the args; outputs are the materialized heap-addressed descriptors.
+    Tensor built[MAX_TENSOR_ARGS];
+    {
+        uint32_t bo = 0;
+        for (int32_t i = 0; i < tc; i++) {
+            if (args.tag(i) == TensorArgType::OUTPUT) {
+                built[i].copy(result.get_ref(bo));
+                bo++;
+            } else {
+                built[i].copy(args.tensor(i).ref());
+            }
+        }
+    }
+    const uint64_t *scalars = args.scalars();
+    const int32_t sc = args.scalar_count();
+    const uint8_t cmask = M.core_mask();
+    const int32_t pc = __builtin_popcount(cmask);  // number of active lanes; > 1 means a multi-core task
+    const bool has_aic = (cmask & PTO2_SUBTASK_MASK_AIC) != 0;
+
+    // (e) Claim race. Competition is by anchor TYPE (§2/§3.1): cube tasks (any
+    // task with an AIC subtask) are contested by AIC cores; vector tasks (AIV-only,
+    // incl. 2V) by AIV cores (AIV0 and AIV1 equally).
+    const bool anchor_is_cube = has_aic;
+    const bool type_match =
+        anchor_is_cube ? (self->role == CoreType::AIC) : (self->role == CoreType::AIV);
+    if (!type_match || (pc > 1 && self->lane == LANE_NONE)) return result;  // wrong type (only TensorMap was updated), or a block-less core that cannot anchor a multi-core task (it would index g_dist.blocks[-1] below)
+
+    std::atomic<int32_t> &cursor = anchor_is_cube ? g_dist.cube_cursor : g_dist.vector_cursor;  // template argument restored (stripped in this text); inferred from the -1 stores and %d prints — confirm against g_dist's declaration
+    if (!claim(cursor, N)) return result;  // lost the race (another core of this type owns N)
+
+    // ---- Winner = owner (single-core) / anchor (multi-core). ----
+    // Back-pressure for self-claimed work: wait until the ring has a non-reserved
+    // slot free, draining block.won deposits + ready tasks meanwhile. The reserve
+    // guarantees a follower can still pull its (ready) deposits when the rest of
+    // the ring is full of not-yet-ready consumers (no priority inversion).
+    uint64_t wd_self = 0;
+    while (self->occupied_count >= kPrivateSlots - kWonReserve && !fatal_set()) {
+        drain_block_won(self);
+        if (drain_phase_b(self) == 0) {
+            SPIN_WAIT_HINT();
+            watchdog(wd_self);
+        }
+    }
+    if (fatal_set()) return result;
+
+    // Heap reclaim back-pressure (§9.5/§11.4): this owner is about to build (and
+    // later write) task N's outputs at deterministic physical offsets. Recycling a
+    // ring region is safe only once its previous occupant's task id <= R = F - H
+    // (all that occupant's consumers, which have id <= occupant+H, are done). The
+    // equivalent global-derivable test is: the live virtual window (heap_next minus
+    // vend[R]) must fit in the ring. Spin (draining + advancing F) until it does.
+    if (g_dist.heap_base != nullptr) {
+        const size_t ring = g_dist.heap_size;  // shadows the function-scope `ring`; same value
+        uint64_t wd_heap = 0;
+        while (!fatal_set()) {
+            const int32_t f = g_dist.frontier.load(std::memory_order_acquire);
+            const int32_t R = f - g_dist.H;
+            const uint64_t vstart_live =
+                (R < 0) ? 0 : g_dist.vend[R].load(std::memory_order_relaxed);
+            if (self->heap_next - vstart_live <= ring) break;  // window fits — region free
+            if (f >= N - 1) {  // every predecessor done yet H-window still overflows the ring
+                set_fatal();
+                fprintf(stderr,
+                        "[dist_engine] heap ring %zu B too small for H=%d window at task %d (live=%llu B); "
+                        "enlarge PTO_DIST_HEAP_MB or reduce PTO_DIST_H\n",
+                        ring, g_dist.H, N, (unsigned long long)(self->heap_next - vstart_live));
+                return result;
+            }
+            drain_block_won(self);
+            if (drain_phase_b(self) == 0) {
+                SPIN_WAIT_HINT();
+                watchdog(wd_heap);
+            }
+        }
+        if (fatal_set()) return result;
+    }
+
+    int32_t si = alloc_ring_slot(self);
+    if (si < 0) {  // should not happen given the back-pressure gate above
+        set_fatal();
+        fprintf(stderr, "[dist_engine] no free private-ring slot after back-pressure at task %d\n", N);
+        return result;
+    }
+    // Reserve so concurrent drains (including the block.won back-pressure loop
+    // below, which calls drain_phase_b) do not reuse this slot. Mark it unbuilt
+    // so Phase B skips it until build_ring_slot populates it (avoids re-executing
+    // the prior occupant's stale task_id/fanin/won linkage).
+    self->slots[si].occupied = true;
+    self->slots[si].built = false;
+
+    int32_t own_lane;
+    int32_t won_block = -1;
+    int32_t won_slot = -1;
+    bool is_multicore = (pc > 1);
+
+    if (!is_multicore) {
+        // Single core (1C / 1V): the one active lane is the only subtask. For 1V
+        // the winner may be physically AIV0 or AIV1, but the active lane/kernel is
+        // AIV0 (rt_submit_aiv fills aiv0). Find the single active lane.
+        own_lane = has_aic ? LANE_AIC : LANE_AIV0;
+    } else {
+        // Multi-core (MIX / 2V): we are the anchor. Our own physical lane subtask
+        // goes to our private ring; the remaining active lanes are deposited into
+        // block.won for our same-block followers to drain (§3.1).
+        own_lane = self->lane;
+        won_block = self->block_id;  // valid here: block-less cores abstained before the claim
+        won_slot = alloc_won_slot(won_block);
+        uint64_t wd_won = 0;
+        while (won_slot < 0 && !fatal_set()) {  // block.won full → back-pressure (drain, then retry)
+            drain_block_won(self);
+            if (drain_phase_b(self) == 0) {
+                SPIN_WAIT_HINT();
+                watchdog(wd_won);
+            }
+            won_slot = alloc_won_slot(won_block);
+        }
+        if (fatal_set()) return result;
+        WonSlot &w = g_dist.blocks[won_block].slots[won_slot];
+        w.task_id = N;
+        w.remaining.store(pc, std::memory_order_relaxed);
+        for (int32_t L = 0; L < PTO2_SUBTASK_SLOT_COUNT; L++) {
+            w.drained[L].store(0, std::memory_order_relaxed);
+            w.lane[L].present = false;
+        }
+        for (int32_t L = 0; L < PTO2_SUBTASK_SLOT_COUNT; L++) {
+            if (L == own_lane || !lane_active(M, L)) continue;
+            BuiltSubtask &b = w.lane[L];
+            b.present = true;
+            b.function_bin_addr = resolve_kernel_addr(runtime, kernel_id_for_lane(mixed, L));
+            b.tensor_count = tc;
+            b.scalar_count = sc;
+            for (int32_t i = 0; i < tc; i++) b.tensors[i].copy(built[i]);
+            for (int32_t j = 0; j < sc; j++) b.scalars[j] = scalars[j];
+            b.fanin_count = fc;
+            for (int32_t k = 0; k < fc; k++) b.fanin[k] = fanin[k];
+            b.sub_block_id = (L == LANE_AIV1) ? 1 : 0;
+        }
+        std::atomic_thread_fence(std::memory_order_release);
+        w.state.store(1, std::memory_order_release);  // publish the deposits to followers
+    }
+
+    const int32_t own_sub_block = (own_lane == LANE_AIV1) ? 1 : 0;
+    build_ring_slot(
+        self->slots[si], N, resolve_kernel_addr(runtime, kernel_id_for_lane(mixed, own_lane)), built, tc, scalars, sc,
+        fanin, fc, own_sub_block, is_multicore, won_block, won_slot
+    );
+    self->occupied_count++;
+    self->owned_total++;
+
+    return result;
+}
+
+// -----------------------------------------------------------------------------
+// Remaining ops — minimal stubs (bgemm exercises submit/scope/log only).
+// -----------------------------------------------------------------------------
+void dist_scope_begin(PTO2Runtime *) {}
+void dist_scope_end(PTO2Runtime *) {}
+void dist_orchestration_done(PTO2Runtime *) {}
+bool dist_is_fatal(PTO2Runtime *) { return fatal_set(); }
+
+void dist_report_fatal(PTO2Runtime *, int32_t code, const char *func, const char *fmt, ...) {  // raises the fatal flag, then prints one formatted line to stderr
+    set_fatal();
+    va_list ap;
+    va_start(ap, fmt);
+    fprintf(stderr, "[dist_engine][FATAL][%s] code=%d: ", func ? func : "?", code);
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, "\n");
+    va_end(ap);
+}
+
+void dist_log_error(const char *func, const char *fmt, ...) {  // prints one formatted line to stderr; does not touch the fatal flag
+    va_list ap;
+    va_start(ap, fmt);
+    fprintf(stderr, "[dist_engine][E][%s] ", func ? func : "?");
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, "\n");
+    va_end(ap);
+}
+void dist_log_warn(const char *, const char *, ...) {}
+void dist_log_debug(const char *, const char *, ...) {}
+void dist_log_info_v(const char *, int, const char *, ...) {}
+
+// Orchestration-side tensor data access (get/set_tensor_data). Replay runs on the
+// AICore worker and reads/writes real GM, so these are genuine memory accesses.
+// The only subtlety is read-after-write across tasks: if the region has a producer
+// in this core's map, wait until that producer's completion flag is set (draining
+// this core's own ring meanwhile so an owned producer actually runs). External
+// tensors (no producer) are accessed immediately. Consumer (WAR) tracking is not
+// modeled, mirroring the centralized runtime's documented INPUT-reader limitation.
+void wait_producer_ready(DistCore *self, const Tensor &t) {
+    const int32_t p = self->map.lookup(t);
+    if (p < 0) return;  // no producer recorded in this core's map: nothing to wait for
+    uint64_t wd = 0;
+    while (!fatal_set()) {  // a fatal elsewhere also ends the wait, without the flag being set
+        if (g_dist.flags[p & (kFlagCap - 1)].load(std::memory_order_acquire) != 0) break;  // index mask presumes kFlagCap is a power of two — TODO confirm
+        drain_block_won(self);
+        if (drain_phase_b(self) == 0) {
+            SPIN_WAIT_HINT();
+            watchdog(wd);
+        }
+    }
+}
+
+uint64_t dist_get_tensor_data(PTO2Runtime *, const Tensor &tensor, uint32_t ndims, const uint32_t *indices) {  // returns 0 when the tensor has no buffer address
+    if (tensor.buffer.addr == 0) return 0;
+    DistCore *self = g_self;
+    if (self != nullptr) wait_producer_ready(self, tensor);
+    const uint64_t flat = tensor.compute_flat_offset(indices, ndims);
+    const uint64_t esz = get_element_size(tensor.dtype);
+    uint64_t result = 0;
+    memcpy(&result, reinterpret_cast(tensor.buffer.addr + flat * esz), esz);  // copies esz bytes into the zeroed result; assumes esz <= 8 — TODO confirm. NOTE(review): cast target type absent in this text
+    return result;
+}
+
+void dist_set_tensor_data(PTO2Runtime *, const Tensor &tensor, uint32_t ndims, const uint32_t *indices, uint64_t value) {  // silently does nothing when the tensor has no buffer address
+    if (tensor.buffer.addr == 0) return;
+    DistCore *self = g_self;
+    if (self != nullptr) wait_producer_ready(self, tensor);
+    const uint64_t flat = tensor.compute_flat_offset(indices, ndims);
+    const uint64_t esz = get_element_size(tensor.dtype);
+    memcpy(reinterpret_cast(tensor.buffer.addr + flat * esz), &value, esz);  // writes the first esz bytes of `value`; NOTE(review): cast target type absent in this text
+}
+
+// alloc_tensors — a kernel-less "hidden task" that only reserves GM output
+// buffers (no compute). It consumes one task id, allocates its outputs on the
+// deterministic heap exactly like dist_submit_impl step (a), registers itself as
+// their producer, and completes INLINE (sets its own flag immediately) since no
+// kernel runs. A later writer (INOUT / OUTPUT_EXISTING) becomes the new producer
+// of the region, so real consumers depend on the writer, not on this alloc. Every
+// core replays it identically, keeping heap addresses + maps consistent.
+TaskOutputTensors dist_alloc_tensors(PTO2Runtime *, const L0TaskArgs &args) {
+    DistCore *self = g_self;
+    if (self == nullptr) return TaskOutputTensors{};
+    const int32_t N = self->local_index++;
+    const int32_t tc = args.tensor_count();
+    if (N >= kFlagCap) {
+        set_fatal();
+        fprintf(stderr, "[dist_engine] alloc task id %d exceeds kFlagCap %d\n", N, kFlagCap);
+        return TaskOutputTensors{};
+    }
+
+    // Deterministic GM heap allocation + straddle-padding (identical to submit (a)).
+    const size_t ring = g_dist.heap_size;
+    uint64_t total = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        total += PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+    }
+    uint64_t task_base = PTO2_ALIGN_UP(self->heap_next, PTO2_PACKED_OUTPUT_ALIGN);
+    if (total > 0 && g_dist.heap_base != nullptr) {
+        if (total > ring) {
+            set_fatal();
+            fprintf(stderr, "[dist_engine] alloc task %d outputs %llu B exceed heap ring %zu B\n", N,
+                    (unsigned long long)total, ring);
+            return TaskOutputTensors{};
+        }
+        if ((task_base % ring) + total > ring) task_base = ((task_base / ring) + 1) * ring;
+    }
+
+    // Heap reclaim back-pressure (same window test as submit). An alloc bumps the
+    // heap like any output; drain this core's ring while the live window overflows.
+    if (total > 0 && g_dist.heap_base != nullptr) {
+        const uint64_t want_next = task_base + total;  // the value heap_next will take once this alloc is materialized
+        uint64_t wd_heap = 0;
+        while (!fatal_set()) {
+            const int32_t f = g_dist.frontier.load(std::memory_order_acquire);
+            const int32_t R = f - g_dist.H;
+            const uint64_t vstart_live = (R < 0) ? 0 : g_dist.vend[R].load(std::memory_order_relaxed);
+            if (want_next - vstart_live <= ring) break;
+            if (f >= N - 1) {
+                set_fatal();
+                fprintf(stderr,
+                        "[dist_engine] heap ring %zu B too small for H=%d window at alloc %d (live=%llu B)\n",
+                        ring, g_dist.H, N, (unsigned long long)(want_next - vstart_live));
+                return TaskOutputTensors{};
+            }
+            drain_block_won(self);
+            if (drain_phase_b(self) == 0) {
+                SPIN_WAIT_HINT();
+                watchdog(wd_heap);
+            }
+        }
+        if (fatal_set()) return TaskOutputTensors{};  // NOTE(review): a fatal at this point returns no materialized outputs, because the wait precedes materialization
+    }
+
+    uint64_t off = 0;
+    TaskOutputTensors result;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        const TensorCreateInfo &ci = args.tensor(i).create_info();
+        const uint64_t logical = ci.buffer_size_bytes();
+        const uint64_t sz = PTO2_ALIGN_UP(logical, PTO2_PACKED_OUTPUT_ALIGN);
+        if (g_dist.heap_base == nullptr) {
+            set_fatal();
+            fprintf(stderr, "[dist_engine] GM output heap not allocated at alloc %d\n", N);
+            return result;
+        }
+        const uint64_t phys = (task_base + off) % ring;
+        Tensor &slot_t = self->outpool[self->outpool_head];
+        self->outpool_head = (self->outpool_head + 1) % kOutPoolSlots;
+        init_tensor_from_create_info(slot_t, ci, g_dist.heap_base + phys, logical);
+        result.materialize_output(slot_t);
+        off += sz;
+    }
+    self->heap_next = task_base + off;
+    if (N >= 0 && N < kFlagCap) g_dist.vend[N].store(self->heap_next, std::memory_order_relaxed);
+    if (fatal_set()) return result;
+
+    // Register producer for each allocated output, then complete inline (no kernel).
+    uint32_t out_idx = 0;
+    for (int32_t i = 0; i < tc; i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        self->map.insert(result.get_ref(out_idx), N);
+        out_idx++;
+    }
+    g_dist.flags[N & (kFlagCap - 1)].store(1, std::memory_order_release);  // executed by every replaying core; each stores the same value 1
+    advance_frontier();
+    return result;
+}
+
+TaskOutputTensors dist_submit_dummy(PTO2Runtime *, const L0TaskArgs &) { return TaskOutputTensors{}; }
+void dist_scope_set_site(const char *, int) {}
+
+const PTO2RuntimeOps g_dist_ops = {
+    dist_submit_impl, dist_scope_begin, dist_scope_end, dist_orchestration_done,
+    dist_is_fatal, dist_report_fatal, dist_log_error, dist_log_warn,
+    dist_log_debug, dist_log_info_v, dist_get_tensor_data, dist_set_tensor_data,
+    dist_alloc_tensors, dist_submit_dummy, dist_scope_set_site,
+};
+
+// -----------------------------------------------------------------------------
+// Deadlock diagnostics: dump the full engine state on SIGUSR1. Sim runs every
+// core as a pthread in one process, so a single handler can walk g_dist. Used to
+// debug hangs (kill -USR1 <pid>); compiled in but inert unless signalled.
+// -----------------------------------------------------------------------------
+void dist_dump_state(int) {  // NOTE(review): calls fprintf, which POSIX does not list as async-signal-safe; acceptable only as a debug aid
+    fprintf(stderr, "\n===== DIST STATE DUMP =====\n");
+    fprintf(stderr, "cube_cursor=%d vector_cursor=%d frontier=%d H=%d ring=%zuB replay_done=%d/%d num_blocks=%d fatal=%d\n",
+            g_dist.cube_cursor.load(), g_dist.vector_cursor.load(), g_dist.frontier.load(), g_dist.H,
+            g_dist.heap_size, g_dist.replay_done.load(), g_dist.num_workers, g_dist.num_blocks, g_dist.fatal.load());
+    for (int32_t c = 0; c < g_dist.num_workers && c < RUNTIME_MAX_WORKER; c++) {
+        DistCore &co = g_dist.cores[c];
+        fprintf(stderr, "core %d role=%d blk=%d lane=%d replayed=%d occ=%d owned=%d\n", c,
+                static_cast(co.role), co.block_id, co.lane, co.local_index, co.occupied_count, co.owned_total);
+        for (int32_t i = 0; i < kPrivateSlots; i++) {
+            RingSlot &s = co.slots[i];
+            if (!s.occupied) continue;
+            int32_t unmet = -1;  // first fan-in task id whose completion flag is still 0; stays -1 when all are set
+            for (int32_t f = 0; f < s.fanin_count; f++)
+                if (g_dist.flags[s.fanin[f] & (kFlagCap - 1)].load() == 0) { unmet = s.fanin[f]; break; }
+            fprintf(stderr, " slot%d tid=%d built=%d mc=%d won=(%d,%d) fanin=%d unmet=%d\n", i, s.task_id,
+                    s.built, s.is_multicore, s.won_block, s.won_slot, s.fanin_count, unmet);
+        }
+    }
+    for (int32_t b = 0; b < g_dist.num_blocks; b++) {
+        for (int32_t i = 0; i < kPrivateSlots; i++) {
+            WonSlot &w = g_dist.blocks[b].slots[i];
+            int32_t st = w.state.load();
+            if (st == 0) continue;  // state 0 slots are free and not printed
+            fprintf(stderr, " won blk%d slot%d state=%d tid=%d remaining=%d drained=[%d,%d,%d] present=[%d,%d,%d]\n",
+                    b, i, st, w.task_id, w.remaining.load(), w.drained[0].load(), w.drained[1].load(),
+                    w.drained[2].load(), w.lane[0].present, w.lane[1].present, w.lane[2].present);  // lanes 0..2 are hard-coded; presumes PTO2_SUBTASK_SLOT_COUNT == 3 — TODO confirm
+        }
+    }
+    fprintf(stderr, "===== END DUMP =====\n");
+}
+
+// -----------------------------------------------------------------------------
+// Per-core entry point invoked by each AICore worker thread.
+// -----------------------------------------------------------------------------
+void dist_core_main(void *runtime_v, int core_idx, int core_type_int) {
+    if (core_idx < 0 || core_idx >= RUNTIME_MAX_WORKER) return;  // NOTE(review): returns before replay_done / done_count are incremented — confirm no waiter counts this core
+    Runtime *runtime = reinterpret_cast(runtime_v);  // NOTE(review): cast target type absent in this text
+    DistCore *self = &g_dist.cores[core_idx];
+    const CoreType role = static_cast(core_type_int);  // NOTE(review): cast target type absent in this text
+
+    // sub_block lane: only meaningful for AIV in MIX tasks (M3). bgemm's 1V add
+    // ignores it, so 0 is correct for the M2 single-core scope.
+    const CoreLayout lay = g_dist.layout[core_idx];
+    self->reset(role, lay.block_id, lay.lane);
+    g_self = self;
+    if (dist_trace()) fprintf(stderr, "[dist] core %d role=%d block=%d lane=%d START\n", core_idx, core_type_int,
+                              lay.block_id, lay.lane);
+
+    // Replay the full orchestration submit stream: build the per-core map and
+    // claim/build owned tasks into the private ring (back-pressure inline). MIX
+    // anchors deposit follower subtasks into block.won during this replay.
+    if (g_dist.orch_func != nullptr && g_dist.orch_args != nullptr && !fatal_set()) {
+        g_dist.orch_func(*g_dist.orch_args);
+    }
+
+    // Publish "my replay is done" so followers can eventually conclude that no
+    // further block.won deposits will arrive for them (§7 tail-idle).
+    g_dist.replay_done.fetch_add(1, std::memory_order_acq_rel);
+
+    // Drain to completion: pull any follower deposits addressed to my lane, run
+    // ready tasks, and only finish once every core has finished replay (no more
+    // pushes), my private ring is empty, and there is no undrained deposit left
+    // for my lane.
+    uint64_t wd_drain = 0;
+    while (!fatal_set()) {
+        drain_block_won(self);
+        int32_t freed = drain_phase_b(self);
+        const bool all_replayed = g_dist.replay_done.load(std::memory_order_acquire) >= g_dist.num_workers;
+        const bool ring_empty = (self->occupied_count == 0);
+        const bool pending = has_pending_won(self);
+        if (all_replayed && ring_empty && !pending) break;
+        if (freed == 0) {  // nothing completed this pass: back off and feed the watchdog
+            SPIN_WAIT_HINT();
+            watchdog(wd_drain);
+        }
+    }
+
+    if (dist_trace() || fatal_set()) {
+        fprintf(stderr, "[dist] core %d role=%d DONE replayed=%d owned=%d fatal=%d\n", core_idx, core_type_int,
+                self->local_index, self->owned_total, fatal_set() ? 1 : 0);
+    }
+    g_self = nullptr;
+    __atomic_add_fetch(&runtime->dist.done_count, 1, __ATOMIC_ACQ_REL);  // reached on both the normal and the fatal exit from the loop above
+}
+
+}  // namespace
+
+void *dist_engine_register(
+    PTO2Runtime *rt, DistOrchFunc orch_func, const L2TaskArgs *orch_args, int num_workers, Runtime *runtime
+) {
+    // GM output heap: a BOUNDED ring reclaimed by the completion frontier (M4).
+    // Size from PTO_DIST_HEAP_MB (MiB) else kHeapRingDefault. Allocated once per
+    // process; if a later run needs a different size, free + realloc.
+    {
+        size_t want = kHeapRingDefault;
+        if (const char *e = getenv("PTO_DIST_HEAP_MB")) {
+            const long mb = atol(e);
+            if (mb > 0) want = static_cast(mb) << 20;  // non-positive or unparsable values keep the default; NOTE(review): cast target type absent in this text
+        }
+        if (g_dist.heap_base != nullptr && g_dist.heap_size != want) {
+            free(g_dist.heap_base);
+            g_dist.heap_base = nullptr;
+        }
+        if (g_dist.heap_base == nullptr) {
+            g_dist.heap_base = static_cast(malloc(want));  // NOTE(review): cast target type absent in this text
+            g_dist.heap_size = (g_dist.heap_base != nullptr) ? want : 0;  // malloc failure leaves a null heap of size 0; submit reports it as fatal on the first OUTPUT
+        }
+        // Zero the heap each run so freshly-allocated output regions read as 0,
+        // matching the centralized runtime's zero-initialized GM. Kernels that
+        // read a padded tile (e.g. softmax/PV where valid_len < tile width) rely
+        // on the unwritten remainder being zero; an uninitialized (malloc) heap
+        // would otherwise yield nondeterministic results. NOTE(review): regions recycled within one run are not re-zeroed here.
+        if (g_dist.heap_base != nullptr) memset(g_dist.heap_base, 0, g_dist.heap_size);
+    }
+    // Dependency-span bound H (R = F - H). Env override for graphs with longer
+    // heap spans; default kHDefault.
+    g_dist.H = kHDefault;
+    if (const char *e = getenv("PTO_DIST_H")) {
+        const long h = atol(e);
+        if (h >= 0) g_dist.H = static_cast(h);  // NOTE(review): cast target type absent in this text
+    }
+    g_dist.cube_cursor.store(-1, std::memory_order_relaxed);
+    g_dist.vector_cursor.store(-1, std::memory_order_relaxed);
+    g_dist.frontier.store(-1, std::memory_order_relaxed);
+    for (int32_t i = 0; i < kFlagCap; i++) g_dist.flags[i].store(0, std::memory_order_relaxed);
+    g_dist.fatal.store(0, std::memory_order_relaxed);
+    g_dist.replay_done.store(0, std::memory_order_relaxed);
+    g_dist.orch_func = orch_func;
+    g_dist.orch_args = orch_args;
+    g_dist.rt = rt;
+    g_dist.runtime = runtime;
+
+    // Derive the physical-block topology (1 AIC + 2 AIV per block) the same way
+    // the centralized scheduler discovers clusters: AIC/AIV cores in worker-index
+    // order, AIC[b] paired with AIV[2b] (AIV0) and AIV[2b+1] (AIV1). Followers and
+    // anchors use this to address block.won deposits. See §3.1.
+    g_dist.num_workers = num_workers;  // NOTE(review): stored unclamped, while the loops below stop at RUNTIME_MAX_WORKER
+    int32_t aic_ids[RUNTIME_MAX_WORKER];
+    int32_t aiv_ids[RUNTIME_MAX_WORKER];
+    int32_t naic = 0, naiv = 0;
+    for (int32_t i = 0; i < num_workers && i < RUNTIME_MAX_WORKER; i++) {
+        g_dist.layout[i].block_id = -1;  // default: in no block; AIVs beyond index 2*naic-1 keep this
+        g_dist.layout[i].lane = LANE_NONE;
+        if (runtime->workers[i].core_type == CoreType::AIC) {
+            aic_ids[naic++] = i;
+        } else {
+            aiv_ids[naiv++] = i;
+        }
+    }
+    g_dist.num_blocks = naic;
+    for (int32_t b = 0; b < naic; b++) {
+        g_dist.layout[aic_ids[b]] = CoreLayout{b, LANE_AIC};
+        if (2 * b < naiv) g_dist.layout[aiv_ids[2 * b]] = CoreLayout{b, LANE_AIV0};
+        if (2 * b + 1 < naiv) g_dist.layout[aiv_ids[2 * b + 1]] = CoreLayout{b, LANE_AIV1};
+        for (int32_t s = 0; s < kPrivateSlots; s++) {
+            g_dist.blocks[b].slots[s].state.store(0, std::memory_order_relaxed);  // only `state` is reset here; the anchor rewrites the other fields when it reserves a slot
+        }
+    }
+
+    if (dist_trace()) {
+        fprintf(stderr, "[dist] register: num_workers=%d heap_base=%p heap_size=%zu\n", num_workers,
+                (void *)g_dist.heap_base, g_dist.heap_size);
+    }
+
+    // Install the SIGUSR1 deadlock dumper once, but only when diagnostics are
+    // opted in (PTO_DIST_WATCHDOG set) — default runs install no signal handler.
+    static bool handler_installed = false;
+    if (!handler_installed && getenv("PTO_DIST_WATCHDOG") != nullptr) {
+        signal(SIGUSR1, dist_dump_state);
+        handler_installed = true;
+    }
+
+    // Publish all of the above before any worker observes Runtime::dist.go.
+    std::atomic_thread_fence(std::memory_order_release);
+    rt->ops = &g_dist_ops;  // NOTE(review): this store is issued after the release fence above — confirm the later dist.go publication covers it
+    return reinterpret_cast(&dist_core_main);  // NOTE(review): cast target type absent in this text
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.h
new file mode 100644
index 000000000..a137c5ec2
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/dist_engine.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * fully_distributed_within_core engine — public wiring entry. + * + * The distributed runtime moves orchestration + scheduling + execution onto the + * AI cores in SPMD fashion (see docs/fully_distributed_within_core.md). The + * engine itself (per-core TensorMap, claim race over global cursors, private + * task ring, run-ahead loop, completion-flag ring, deterministic GM output + * heap) lives in dist_engine.cpp and is compiled into the AICPU .so so it can + * reuse the full submit-side type set (TensorMap, MixedKernels, L0TaskArgs, + * kernel-address resolution). + * + * The AICPU "stub" thread does dlopen + arena setup, then calls + * dist_engine_register() once and publishes the returned per-core entry pointer + * via Runtime::dist.core_main_fn. Each AICore worker thread invokes that entry, + * which runs the orchestration entry (replaying the full submit stream) and + * executes the tasks it wins. + */ + +#pragma once + +struct PTO2Runtime; +struct L2TaskArgs; +class Runtime; + +// Orchestration entry signature (matches DeviceOrchestrationFunc in the AICPU +// executor): the dlopen'd user orchestration function the cores replay. +typedef void (*DistOrchFunc)(const L2TaskArgs &); + +/** + * Wire the distributed engine for one run. 
+ * + * Resets the global claim cursors + completion-flag ring, (re)acquires the GM + * output heap, stores the orchestration entry / args / PTO2Runtime, and points + * rt->ops at the distributed ops table so the cores route rt_submit_* into the + * distributed submit path. Must be called once on the AICPU orchestrator thread + * before publishing Runtime::dist.go. + * + * Returns the address of the per-core entry function + * (signature: void(void *runtime, int core_idx, int core_type)) to store into + * Runtime::dist.core_main_fn. Returned as void* to keep this header light. + */ +void *dist_engine_register( + PTO2Runtime *rt, DistOrchFunc orch_func, const L2TaskArgs *orch_args, int num_workers, Runtime *runtime +); diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h new file mode 100644 index 000000000..e1bb3465e --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file pto2_dispatch_payload.h + * @brief Per-core dispatch payload for AICore kernel execution + * + * PTO2DispatchPayload holds the kernel function address, a per-core args[] + * array, and embedded SPMD context (LocalContext + GlobalContext). AICPU + * maintains a static array of these (one per core). + * + * GlobalContext (sub_block_id) is initialized once at runtime startup via + * init_global_context() and never modified afterwards. + * + * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload() + * before each dispatch. Both context struct pointers are written into the + * args[] suffix on every dispatch (since args[] is rebuilt entirely each time). + * + * AICore caches a pointer to its per-core slot at startup and reads from + * it on each dispatch. The struct is cache-line aligned to avoid false + * sharing across concurrently dispatched cores. + * + * The DATA_MAIN_BASE register protocol is unchanged from the base runtime: + * a monotonically increasing reg_task_id signals new work to AICore. + */ + +#pragma once + +#include + +#include "arg_direction.h" +#include "intrinsic.h" + +/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */ +#ifndef PTO2_DISPATCH_MAX_ARGS +#define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT) +#endif + +#ifndef PTO2_ALIGN_UP +#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1)) +#endif + +// Verify hardcoded indices in intrinsic.h match the computed values. 
+static_assert( + (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h" +); +static_assert( + (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX, + "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h" +); + +/** + * Per-core dispatch payload: function address + args[] + SPMD context. + * + * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER]. + * AICore caches a pointer to its per-core slot at startup (via Handshake.task) + * and reads from it on each dispatch. + * + * The struct is cache-line aligned to prevent false sharing across + * concurrently dispatched cores. + */ +struct alignas(64) PTO2DispatchPayload { + uint64_t function_bin_addr; /**< Kernel entry address in GM (set by Scheduler) */ + uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */ + + /** Per-dispatch context: block_idx and block_num. + * Written by build_payload() before each dispatch. + * args[SPMD_LOCAL_CONTEXT_INDEX] points here. */ + LocalContext local_context; + + /** Per-core global context: sub_block_id (AIV lane identity). + * Initialized once by init_global_context() at runtime startup. + * args[SPMD_GLOBAL_CONTEXT_INDEX] points here. */ + GlobalContext global_context; + + /** Speculative early-dispatch gate. 0 = ready: AICore executes on pickup. + * 1 = not-ready: AICore waits until AICPU rings the doorbell + * (DATA_MAIN_BASE high 32 == this dispatch's reg_task_id) before executing. 
*/ + volatile uint32_t not_ready; + uint8_t reserved_payload_abi_pad[4]; + + static_assert(sizeof(args[0]) == 8); + static_assert( + PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) == + (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]) + ); +}; + +static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift"); diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h new file mode 100644 index 000000000..cf6eb4790 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef PTO_ASYNC_KERNEL_API_H +#define PTO_ASYNC_KERNEL_API_H + +#include + +#include +#include + +#include "intrinsic.h" +#include "aicore_completion_mailbox_types.h" +#include "pto_completion_token.h" +#include "pto_runtime_status.h" + +#ifndef __aicore__ +#define __aicore__ +#endif +#ifndef __gm__ +#define __gm__ +#endif + +// Public surface: get_async_ctx, async_ctx_is_deferred, +// register_completion_condition, send_notification, +// save_expected_notification_counter. 
Everything else lives in +// pto2::detail and is reserved for backend adapters / internal use. +namespace pto2::detail { + +inline __aicore__ void defer_load_slab(AsyncCtx &ctx) { // Device builds dcci the PTO2_ALIGN_SIZE-aligned line holding completion_count; other builds only emit a compiler barrier. + if (ctx.completion_count == nullptr) return; +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uintptr_t line = reinterpret_cast<uintptr_t>(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); + dcci((__gm__ int32_t *)line, SINGLE_CACHE_LINE); +#else + __asm__ __volatile__("" ::: "memory"); +#endif +} + +inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) { // Stores error_code only when the token is valid and an error slot exists. + if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) { + *ctx.completion_error_code = error_code; + } +} + +inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) { // Device builds dcci(CACHELINE_OUT) each PTO2_ALIGN_SIZE line overlapping [addr, addr + size_bytes). + if (addr == nullptr || size_bytes == 0) return; +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); + uintptr_t end = + (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); + for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) { + dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT); + } +#else + (void)addr; + (void)size_bytes; +#endif +} + +inline __aicore__ void defer_flush(AsyncCtx &ctx) { + if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return; +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uint32_t count = *ctx.completion_count; + if (count > ctx.completion_capacity) { + count = ctx.completion_capacity; + } + uint32_t flush_bytes = static_cast<uint32_t>(sizeof(*ctx.completion_count)); // NOTE(review): summing sizes assumes count, error code and entries are contiguous from completion_count — confirm against the slab layout. + if (ctx.completion_error_code != nullptr) { + flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code)); + } + if (ctx.completion_entries != nullptr) { + flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry)); + } +
defer_flush_range(ctx.completion_count, flush_bytes); +#if defined(__CPU_SIM) + dsb(0); +#else + dsb(DSB_DDR); +#endif + pipe_barrier(PIPE_ALL); +#else + (void)ctx; + __asm__ __volatile__("" ::: "memory"); +#endif +} + +} // namespace pto2::detail + +inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) { // Copies the async fields out of the LocalContext that args[PAYLOAD_LOCAL_CONTEXT_INDEX] points to. + __gm__ LocalContext *lc = + reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX])); + AsyncCtx ctx{}; + ctx.completion_count = lc->async_ctx.completion_count; + ctx.completion_error_code = lc->async_ctx.completion_error_code; + ctx.completion_entries = lc->async_ctx.completion_entries; + ctx.completion_capacity = lc->async_ctx.completion_capacity; + ctx.task_token.raw = lc->async_ctx.task_token.raw; + pto2::detail::defer_load_slab(ctx); + return ctx; +} + +inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); } + +// Canonical writer: backend submit handlers build a CompletionToken and pass +// it here. Writes one DeferredCompletionEntry to the AsyncCtx slab and +// bumps completion_count. Returns false on overflow (also stores +// PTO2_ERROR_ASYNC_WAIT_OVERFLOW in ctx.completion_error_code) or when ctx is +// not currently a deferred context.
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) { + if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) { + return false; + } + + uint32_t idx = *ctx.completion_count; + if (idx >= ctx.completion_capacity) { + if (ctx.completion_error_code != nullptr) { + *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW; + } + return false; + } + + volatile __gm__ DeferredCompletionEntry *slot = &ctx.completion_entries[idx]; + slot->addr = token.addr; + slot->expected_value = token.expected_value; + slot->engine = token.engine; + slot->completion_type = token.completion_type; + slot->_pad = 0; + *ctx.completion_count = idx + 1; // bumped last, after the entry fields above are written + return true; +} + +inline __aicore__ void +send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) { + __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr)); + pto::comm::Signal signal(counter); + pto::comm::TNOTIFY(signal, value, notify_op); +} + +inline __aicore__ void +save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) { + CompletionToken token{ + reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0 + }; + (void)register_completion_condition(ctx, token); // result discarded; defer_flush() below is called either way + pto2::detail::defer_flush(ctx); +} + +#endif // PTO_ASYNC_KERNEL_API_H diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_wait.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_wait.h new file mode 100644 index 000000000..65608ad2f --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_async_wait.h @@ -0,0 +1,303 @@ +/* + * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef PTO_ASYNC_WAIT_H +#define PTO_ASYNC_WAIT_H + +#include +#include +#include + +#include "aicpu/platform_regs.h" +#include "backend/sdma/sdma_completion_scheduler.h" +#include "intrinsic.h" +#include "aicore_completion_mailbox.h" +#include "pto_completion_token.h" +#include "pto_runtime2_types.h" + +struct PTO2SchedulerState; +struct PTO2LocalReadyBuffer; +struct CompletionStats; + +inline constexpr int32_t MAX_ASYNC_WAITS = 64; + +// The mailbox transport (has_pending / try_push_condition / +// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member +// functions in aicore_completion_mailbox.h. This file only holds the +// application layer: translating drained messages into wait-list state. 
+ +inline uintptr_t mailbox_cache_line(const volatile void *addr) { + return reinterpret_cast(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); +} + +struct CompletionCondition; + +using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &); +using CompletionRetireFn = void (*)(CompletionCondition &); + +struct CompletionBackendOps { + CompletionPollFn poll; + CompletionRetireFn retire; +}; + +struct CompletionCondition { + AsyncEngine engine{ASYNC_ENGINE_SDMA}; + int32_t completion_type{COMPLETION_TYPE_COUNTER}; + bool satisfied{false}; + bool retired{false}; + volatile uint32_t *counter_addr{nullptr}; + uint64_t addr{0}; + uint32_t expected_value{0}; + + CompletionPollResult test() const; + void retire(); +}; + +// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in +// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin +// glue mapping CompletionCondition.addr into the backend's raw-addr helpers. +inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) { + if (cond.counter_addr == nullptr) { + return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID}; + } + return { + *cond.counter_addr >= cond.expected_value ? 
CompletionPollState::READY : CompletionPollState::PENDING, + PTO2_ERROR_NONE + }; +} + +inline void counter_retire_op(CompletionCondition & /*cond*/) {} + +inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) { + return poll_sdma_event_record(cond.addr); +} + +inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); } + +inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) { + static const CompletionBackendOps kOps[] = { + {counter_poll_op, counter_retire_op}, // COMPLETION_TYPE_COUNTER = 0 + {sdma_event_record_poll_op, sdma_event_record_retire_op}, // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1 + }; + constexpr int kOpsCount = static_cast(sizeof(kOps) / sizeof(kOps[0])); + if (completion_type < 0 || completion_type >= kOpsCount) return nullptr; + return &kOps[completion_type]; +} + +inline CompletionPollResult CompletionCondition::test() const { + if (satisfied) { + return {CompletionPollState::READY, PTO2_ERROR_NONE}; + } + const CompletionBackendOps *ops = completion_backend_ops_for(completion_type); + if (ops == nullptr || ops->poll == nullptr) { + return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID}; + } + return ops->poll(*this); +} + +inline void CompletionCondition::retire() { + if (retired) return; + const CompletionBackendOps *ops = completion_backend_ops_for(completion_type); + if (ops != nullptr && ops->retire != nullptr) { + ops->retire(*this); + } + retired = true; +} + +struct AsyncWaitEntry { + PTO2TaskSlotState *slot_state{nullptr}; + PTO2TaskId task_token{PTO2TaskId::invalid()}; + CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK]; + int32_t condition_count{0}; + int32_t waiting_completion_count{0}; + bool normal_done{false}; +}; + +struct AsyncPollResult { + int32_t completed{0}; + int32_t error_code{PTO2_ERROR_NONE}; + PTO2TaskSlotState *failed_slot_state{nullptr}; +}; + +inline const char 
*async_engine_name(AsyncEngine engine) { + switch (engine) { + case ASYNC_ENGINE_SDMA: + return "SDMA"; + case ASYNC_ENGINE_ROCE: + return "ROCE"; + case ASYNC_ENGINE_URMA: + return "URMA"; + case ASYNC_ENGINE_CCU: + return "CCU"; + default: + return "UNKNOWN"; + } +} + +struct AsyncWaitList { + std::atomic busy{0}; + AsyncWaitEntry entries[MAX_ASYNC_WAITS]; + int32_t count{0}; + // Diagnostic: counts every FIN-side try_push that hit a full mailbox. + // Expected to stay zero on real workloads (ring is 4096 entries); a + // non-zero value means consumers are too slow or the ring is undersized. + // Read by scheduler shutdown / l2 perf summary; not on the hot path. + std::atomic mpsc_skipped_count{0}; + + bool try_lock() { + int32_t expected = 0; + return busy.compare_exchange_strong(expected, 1, std::memory_order_acquire, std::memory_order_relaxed); + } + + void unlock() { busy.store(0, std::memory_order_release); } + + AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) { + for (int32_t i = 0; i < count; i++) { + if (entries[i].task_token == token) return &entries[i]; + } + return nullptr; + } + + // Captures the side-channel a scheduler-aware drain needs to complete + // NotDeferred tasks inline (without storing a transient entry in + // entries[]). + struct DrainCompletionSink { + PTO2SchedulerState *sched{nullptr}; + PTO2LocalReadyBuffer *local_bufs{nullptr}; + PTO2TaskSlotState **deferred_release_slot_states{nullptr}; + int32_t *deferred_release_count{nullptr}; + int32_t deferred_release_capacity{0}; + int32_t inline_completed{0}; +#if PTO2_SCHED_PROFILING + int32_t thread_idx{0}; +#endif + + bool can_inline_complete() const { return sched != nullptr; } + }; + + // Inline-complete a NotDeferred task during drain. Returns false on + // deferred_release_slot_states overflow. 
+ bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state); + + // Single-consumer drain: pop each published message in tail order and + // translate it into wait-list state. An empty sink (sched == nullptr) just + // materializes entries; a sched-aware sink additionally inline-completes + // lonely NotDeferred NORMAL_DONEs without ever growing entries[]. + int32_t drain_aicore_completion_mailbox_locked( + AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code + ) { + error_code = PTO2_ERROR_NONE; + if (aicore_mailbox == nullptr) return 0; + + int32_t drained = 0; + AICoreCompletionMsgView msg; + // try_pop is the transport layer (seq-gated, in-order dequeue); this + // loop is the application layer (translate each message into wait-list + // state). try_pop returns false at the first gap or when empty. + while (aicore_mailbox->try_pop(msg)) { + drained++; + if (msg.kind == MSG_KIND_CONDITION) { + AsyncWaitEntry *entry = find_entry_by_token(msg.task_token); + if (entry == nullptr) { + // First message for this task — materialize the entry here. + // slot_state stays null until the matching TASK_NORMAL_DONE + // sentinel arrives. 
+ if (count >= MAX_ASYNC_WAITS) { + error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW; + return drained; + } + entry = &entries[count++]; + entry->task_token = msg.task_token; + entry->slot_state = nullptr; + entry->condition_count = 0; + entry->waiting_completion_count = 0; + entry->normal_done = false; + } + if (!append_condition_locked( + *entry, msg.addr, msg.expected_value, static_cast(msg.engine), msg.completion_type, + error_code + )) { + return drained; + } + } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) { + PTO2TaskSlotState *slot_state_ptr = + reinterpret_cast(static_cast(msg.addr)); + AsyncWaitEntry *entry = find_entry_by_token(msg.task_token); + if (entry == nullptr) { + // Producers strictly order: all CONDITIONs for token T are + // pushed before the matching NORMAL_DONE (the acq_rel on + // on_subtask_complete enforces this across producers). So + // observing NORMAL_DONE first => the task registered no + // conditions => NotDeferred. Complete it inline when the + // sink allows; otherwise fall back to the entry-store path. 
+ if (sink.can_inline_complete()) { + (void)try_inline_complete_locked(sink, *slot_state_ptr); + continue; + } + if (count >= MAX_ASYNC_WAITS) { + error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW; + return drained; + } + entry = &entries[count++]; + entry->task_token = msg.task_token; + entry->slot_state = slot_state_ptr; + entry->condition_count = 0; + entry->waiting_completion_count = 0; + entry->normal_done = true; + } else { + if (entry->slot_state == nullptr) { + entry->slot_state = slot_state_ptr; + } + entry->normal_done = true; + } + } else { + error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED; + return drained; + } + } + return drained; + } + + bool append_condition_locked( + AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type, + int32_t &error_code + ) { + if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) { + error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED; + return false; + } + CompletionCondition &cond = entry.conditions[entry.condition_count++]; + cond.engine = engine; + cond.completion_type = completion_type; + cond.satisfied = false; + cond.retired = false; + cond.addr = addr; + cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ? 
+ reinterpret_cast(static_cast(addr)) : + nullptr; + cond.expected_value = expected_value; + entry.waiting_completion_count++; + return true; + } + + template + AsyncPollResult poll_and_complete( + AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs, + PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, + int32_t deferred_release_capacity +#if PTO2_SCHED_PROFILING + , + int thread_idx +#endif + ); +}; + +#endif // PTO_ASYNC_WAIT_H diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_completion_token.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_completion_token.h new file mode 100644 index 000000000..c5a8c345f --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_completion_token.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_ + +#include + +#include "aicore_completion_mailbox_types.h" +#include "pto_runtime_status.h" + +// CompletionToken is the runtime-internal POD that backend submit handlers +// produce and the generic register_completion_condition() consumes. It is the +// ABI contract for "this is one completion to wait on" — independent of which +// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's +// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by +// completion_type. +struct CompletionToken { + uint64_t addr; + uint32_t expected_value; + uint32_t engine; + int32_t completion_type; + uint64_t backend_cookie; +}; + +enum class CompletionPollState : uint8_t { + PENDING = 0, + READY = 1, + FAILED = 2, +}; + +struct CompletionPollResult { + CompletionPollState state{CompletionPollState::PENDING}; + int32_t error_code{PTO2_ERROR_NONE}; +}; + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_constants.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_constants.h new file mode 100644 index 000000000..07251cc39 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_constants.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_ + +#define PTO2_ALIGN_SIZE 64 // Cache line alignment +#define PTO2_PACKED_OUTPUT_ALIGN 1024 // Each output in packed buffer aligned to 1024B; gap is padding +#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h new file mode 100644 index 000000000..cf68a2617 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file pto_dep_compute.h + * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay. + * + * Two header-only template entry points: + * + * compute_task_fanin — STEP 3 in submit_task: per-tensor creator retention (Step A) + * + tensormap.lookup for INPUT/INOUT (Step B). Calls back into + * user-supplied `emit` for each producer it identifies. + * + * register_task_outputs — STEP 4 in submit_task: tensormap.insert for INOUT and + * OUTPUT_EXISTING tensors. No callbacks. + * + * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its + * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the + * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would + * require two emit semantics or a marginal behavior change in transients — not worth + * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own. + * + * The Emit callback contract: + * bool emit(PTO2TaskId producer); + * - return true to continue (whether or not the producer was actually recorded — + * producer-not-alive / dedup-hit / etc. all return true silently) + * - return false to signal fatal (e.g. fanin spill overflow); caller bails + * + * Performance: Emit is a template parameter, not std::function. Both runtime + * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge + * vector) instantiate at the call site and inline through. Do NOT replace with + * std::function — it would break the inlining and add ~5 ns/call to the orch hot path. 
+ */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_ + +#include <stdint.h> + +#include "pto_task_id.h" +#include "pto_tensormap.h" +#include "pto_types.h" // TensorRef +#include "tensor.h" + +/** + * View struct for inputs to compute_task_fanin / register_task_outputs. + * + * Both runtime and replay assemble one of these from their own data sources + * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All + * pointer arrays must remain valid for the duration of the call. + */ +struct DepInputs { + int32_t tensor_count; + const TensorRef *tensors; // length = tensor_count (union; OUTPUT slots' .ptr is unused) + const TensorArgType *arg_types; // length = tensor_count + int32_t explicit_dep_count; + const PTO2TaskId *explicit_deps; // length = explicit_dep_count (validity checked by caller) +}; + +/** + * Compute fanin for a task being submitted (STEP 3: Step A creator retention + + * Step B tensormap modifier lookup). + * + * For each non-OUTPUT tensor: + * - If owner_task_id is valid, emit(owner) + * - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit + * each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry). + * + * @return true on success (producer skipped silently, or the in_manual_scope early return); false if emit + * signaled fatal — caller should propagate (after any fatal bookkeeping done by emit). + */ +template <typename Emit> +[[nodiscard]] inline bool +compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) { + if (in_manual_scope) { + return true; + } + + for (int32_t i = 0; i < inputs.tensor_count; i++) { + TensorArgType ptype = inputs.arg_types[i]; + if (ptype == TensorArgType::OUTPUT) { + // Runtime-created OUTPUT tensors are not looked up in the TensorMap since + // they have no dependencies.
+ continue; + } + + const Tensor *tensor = &inputs.tensors[i].ref(); + + // Step A: creator retention — all existing tensors extend their creator lifetime. + PTO2TaskId owner = tensor->owner_task_id; + if (owner.is_valid()) { + if (!emit(owner)) { + return false; + } + } + + // Step B: only INPUT/INOUT need modifier dependency lookup. + if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) { + continue; + } + if (tensor->manual_dep) { + continue; + } + + bool fatal = false; + tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool { + if (!emit(entry.producer_task_id)) { + fatal = true; + return false; // stop iteration + } + if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) { + tensor_map.remove_entry(entry); + } + return true; + }); + if (fatal) { + return false; + } + } + return true; +} + +/** + * Register a task's outputs in the tensormap (STEP 4 in submit_task). + * + * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the + * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer. + * + * No-op when in_manual_scope.
+ */ +inline void +register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) { + if (in_manual_scope) { + return; + } + for (int32_t i = 0; i < inputs.tensor_count; i++) { + TensorArgType ptype = inputs.arg_types[i]; + if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) { + const Tensor *tensor = &inputs.tensors[i].ref(); + if (!tensor->manual_dep) { + tensor_map.insert(*tensor, task_id); + } + } + } +} + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp new file mode 100644 index 000000000..2043c116b --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp @@ -0,0 +1,972 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Orchestrator Implementation + * + * Implements orchestrator state management, scope handling, and task submission. 
+ * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_orchestrator.h" + +#include +#include +#include +#include +#include +#include + +#include "aicpu/dep_gen_collector_aicpu.h" +#include "common/dep_gen.h" +#include "common/unified_log.h" +#include "pto_dep_compute.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "pto_types.h" +#include "tensor.h" + +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" +#endif + +// Verify the captured Tensor blob size in DepGenRecord matches the runtime +// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without +// including runtime/tensor.h, so this check lives at the orch callsite. +static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)"); +// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime +// imposes no hard cap on explicit dep count. If a submit exceeds this cap, +// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is +// unaffected, only the captured replay record is truncated. + +// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in +// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay) +// link these no-op stubs so the runtime translation unit is self-contained. +// Visibility is hidden so the HOST .so doesn't export them into the global +// dynamic symbol table where they'd shadow the AICPU .so's strong symbols +// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below). 
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; } +__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit( + uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, const int32_t[3] +) {} + +// Scope_stats enable gate, queried via the same predicate idiom as +// is_dep_gen_enabled above. The AICPU collector links the strong definition; +// host builds fall back to this weak `false`. Gating here still skips the +// cross-agent occupancy reads that feed the sample when scope_stats is disabled. +extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; } + +// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each +// wrap. Strong definition lives in the AICPU collector; host builds fall back to +// this weak no-op so the runtime translation unit stays self-contained. +extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {} + +// ============================================================================= +// Orchestrator Profiling (compile-time toggle) +// ============================================================================= +#if PTO2_ORCH_PROFILING +#include "aicpu/device_time.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" +// Weak fallback for builds that don't link device_time.cpp (e.g. host). +// The strong symbol from platform/.../device_time.cpp wins in the AICPU build. +// +// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from +// exporting this weak fallback into the global dynamic symbol table via +// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry +// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's +// weak definition first (already in global table) and uses it — returning 0. 
+// With hidden visibility, the HOST .so does not export this symbol globally, +// so the AICPU .so's PLT resolves to its own strong definition from +// device_time.cpp. +__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } +// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp. +// The strong symbol from the AICPU build wins when profiling is available. +// Also hidden to prevent HOST .so from polluting the global symbol table. +__attribute__((weak, visibility("hidden"))) void +l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {} +// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export) +static uint64_t g_orch_sync_cycle = 0; // tensormap sync +static uint64_t g_orch_alloc_cycle = 0; // unified task+heap alloc +static uint64_t g_orch_args_cycle = 0; // param copy +static uint64_t g_orch_lookup_cycle = 0; // tensormap lookup + dep building +static uint64_t g_orch_insert_cycle = 0; // tensormap insert +static uint64_t g_orch_fanin_cycle = 0; // fanin list + early-return check +static uint64_t g_orch_scope_end_cycle = 0; // scope_end overhead +static int64_t g_orch_submit_count = 0; +static uint32_t g_orch_submit_idx = 0; +uint64_t g_orch_alloc_wait_cycle = 0; +uint64_t g_orch_fanin_wait_cycle = 0; +uint64_t g_orch_alloc_atomic_count = 0; +uint64_t g_orch_args_atomic_count = 0; +uint64_t g_orch_scope_end_atomic_count = 0; +// Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what +// the flag is for) and feeds the per-sub-step `g_orch_*_cycle` cumulatives +// printed in the cold-path log. +// +// Per-submit ORCH_SUBMIT record is the only swim-lane emit on the orch +// path — one record per submit_task() / alloc_tensors() call spanning +// the entire [start, end] window. 
Per-sub-step phase records were dropped +// in favour of the cumulatives + per-submit envelope; the dispatcher +// already inserts one record at the end of each submit path via +// CYCLE_COUNT_ORCH_SUBMIT_RECORD. +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ + uint64_t _t0 = get_sys_cnt_aicpu(), _t1; \ + uint64_t _submit_start_ts = _t0 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ + do { \ + if (_prof_active) { \ + l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \ + } \ + } while (0) +#elif PTO2_PROFILING +#include "aicpu/device_time.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" +__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } +__attribute__((weak, visibility("hidden"))) void +l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {} +// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) +static uint32_t g_orch_submit_idx = 0; +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ + uint64_t _t0 = _prof_active ? 
get_sys_cnt_aicpu() : 0, _t1 = 0; \ + uint64_t _submit_start_ts = _t0 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + } while (0) +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ + do { \ + if (_prof_active) { \ + _t1 = get_sys_cnt_aicpu(); \ + l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \ + } \ + } while (0) +#else +#define CYCLE_COUNT_START() +#define CYCLE_COUNT_LAP(acc) +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) +#endif + +static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) { + always_assert(orch != nullptr); + orch->fatal = true; + if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) { + return PTO2_ERROR_NONE; + } + + int32_t expected = PTO2_ERROR_NONE; + std::atomic &orch_error_code = orch->sm_header->orch_error_code; + if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) { + return error_code; + } + return expected; +} + +static void +orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) { + int32_t latched_code = orch_mark_fatal(orch, error_code); + +#if PTO2_PROFILING + // Flush the current scope's peaks BEFORE the FATAL log line, so the + // diagnostic context (which pool/window filled up) appears right next to + // the failure reason. on_fatal is latched, so duplicate fatals from + // different layers don't print multiple stats lines. 
+ scope_stats_on_fatal(); +#endif + + if (fmt == nullptr || fmt[0] == '\0') { + if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) { + unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code); + } else { + unified_log_error(func, "FATAL(code=%d)", error_code); + } + return; + } + + char message[1024]; + vsnprintf(message, sizeof(message), fmt, args); + if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) { + unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message); + return; + } + unified_log_error(func, "FATAL(code=%d): %s", error_code, message); +} + +void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) { + auto *orch = this; + va_list args; + va_start(args, fmt); + orch_report_fatal_v(orch, error_code, func, fmt, args); + va_end(args); +} + +static uint32_t next_fanin_seen_epoch(PTO2OrchestratorState *orch) { + uint32_t next = orch->fanin_seen_current_epoch + 1; + if (next == 0) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + memset( + orch->fanin_seen_epoch[r], 0, + static_cast(orch->sm_header->rings[r].task_window_size) * sizeof(uint32_t) + ); + } + next = 1; + } + orch->fanin_seen_current_epoch = next; + return next; +} + +struct PTO2FaninBuilder { + PTO2FaninBuilder(PTO2OrchestratorState *orch, PTO2FaninPool &spill_pool, uint32_t seen_epoch) : + count(0), + spill_start(0), + orch(orch), + seen_epoch(seen_epoch), + spill_pool(spill_pool) {} + int32_t count{0}; + int32_t spill_start{0}; + PTO2OrchestratorState *orch{nullptr}; + uint32_t seen_epoch{0}; + PTO2FaninPool &spill_pool; + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP]; + + template + PTO2FaninForEachReturn for_each(Fn &&fn) const { + return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast(fn)); + } + + bool mark_seen(uint8_t prod_ring, int32_t prod_slot) { + if (prod_ring >= PTO2_MAX_RING_DEPTH || prod_slot < 0) { + 
return false; + } + uint32_t *seen = orch->fanin_seen_epoch[prod_ring]; + uint32_t slot = static_cast(prod_slot); + if (seen[slot] == seen_epoch) { + return true; + } + seen[slot] = seen_epoch; + return false; + } +}; + +static bool append_fanin_or_fail( + PTO2OrchestratorState *orch, uint8_t prod_ring, int32_t prod_slot, PTO2TaskSlotState *prod_state, + PTO2FaninBuilder *fanin_builder, uint8_t ring_id +) { + if (fanin_builder->mark_seen(prod_ring, prod_slot)) { + return true; + } + + if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) { + fanin_builder->inline_slots[fanin_builder->count++] = prod_state; + return true; + } + + PTO2FaninPool &fanin_pool = fanin_builder->spill_pool; + if (!fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1)) { + orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW); + return false; + } + int32_t spill_idx = fanin_pool.top; + PTO2FaninSpillEntry *entry = fanin_pool.alloc(); + if (entry == nullptr) { + orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW); + return false; + } + if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) { + fanin_builder->spill_start = spill_idx; + } + entry->slot_state = prod_state; + fanin_builder->count++; + return true; +} + +static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state); + +struct PTO2PreparedTask { + PTO2TaskId task_id = PTO2TaskId::invalid(); + PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr}; + PTO2TaskDescriptor *task = nullptr; + PTO2TaskPayload *payload = nullptr; + PTO2TaskSlotState *slot_state = nullptr; +}; + +static PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args) { + PTO2OutputLayout layout; + for (int32_t i = 0; i < args.tensor_count(); i++) { + if (args.tag(i) != TensorArgType::OUTPUT) { + continue; + } + layout.offsets[i] = layout.total_output_size; + layout.buffer_sizes[i] = + PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN); + layout.total_output_size += 
layout.buffer_sizes[i]; + } + return layout; +} + +static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) { + always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); + + int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top]; + if (scope_task_count < allocator.window_size() - 1) { + return true; + } + + int32_t active_count = allocator.active_count(); + + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id); + LOG_ERROR("========================================"); + LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size()); + LOG_ERROR(" scope_depth: %d", orch->scope_stack_top + 1); + LOG_ERROR(" ring_id: %d", ring_id); + LOG_ERROR(" scope_task_count: %d", scope_task_count); + LOG_ERROR(" active_tasks: %d / %d", active_count, allocator.window_size()); + LOG_ERROR("Root Cause:"); + LOG_ERROR(" Tasks within a scope hold a fanout_count reference that is only"); + LOG_ERROR(" released at scope_end. When scope task count >= window_size,"); + LOG_ERROR(" no slots can be reclaimed -> deadlock."); + LOG_ERROR("Solution:"); + LOG_ERROR(" 1. Reduce tasks per scope (use batching/unroll)"); + LOG_ERROR(" 2. Increase task window (current: %d)", allocator.window_size()); + LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW="); + LOG_ERROR(" 3. 
Split work across multiple scopes"); + LOG_ERROR("========================================"); + orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK); + return false; +} + +static bool prepare_task( + PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask, + PTO2PreparedTask *out +) { + uint8_t ring_id = orch->current_ring_id(); + auto &allocator = orch->rings[ring_id].task_allocator; + + if (!check_scope_can_accept_task(orch, allocator, ring_id)) { + return false; + } + + out->alloc_result = allocator.alloc(total_output_size); + if (out->alloc_result.failed()) { + orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK); + return false; + } + + out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); + out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot); + out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot]; + out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot]; + + out->payload->prefetch(args.tensor_count(), args.scalar_count()); + + // Re-bind payload/task pointers each submit. Value is per-slot constant + // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing + // here lets RingSchedState::init() skip the O(window_size) bind loop. + // Both writes hit the same 64B slot_state cache line we're about to + // dirty below, so the extra cost is two stores on an already-hot line. + // Must precede the scheduler wiring.queue.push at the end of + // submit_task_common — that push is the first read of slot_state->task / + // slot_state->payload by another thread. + out->slot_state->bind_buffers(out->payload, out->task); + + // prepare_task does NO payload writes: all payload content (tensors/scalars + + // early-dispatch spec fields) is initialized in PTO2TaskPayload::init, the + // single payload-init point, which runs before the scheduler wiring push. 
+ + // Fields already reset by advance_ring_pointers (eager reset after CONSUMED): + // fanout_lock=0, fanout_count=1, fanout_head=nullptr, + // fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0 + // Fields immutable after RingSchedState::init(): + // ring_id + // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor + // observers); set to PENDING here when orchestrator actually reuses the slot. + out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + int16_t block_num = args.launch_spec.block_num(); + out->slot_state->total_required_subtasks = + static_cast(block_num * __builtin_popcount(active_mask.core_mask())); + out->slot_state->logical_block_num = block_num; + out->slot_state->active_mask = active_mask; + // fanin_count is set by scheduler during wiring + scope_tasks_push(orch, out->slot_state); + + return true; +} + +// ============================================================================= +// Scope Management +// ============================================================================= + +static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) { + if (orch->scope_tasks_size >= orch->scope_tasks_capacity) { + // scope_tasks lives in the per-Worker arena (single backing allocation), + // so realloc is not legal. Capacity == PTO2_SCOPE_TASKS_CAP == + // PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH, the total in-flight slot + // budget — hitting it means every ring is saturated, so no further push + // could succeed regardless of buffer growth. 
+ orch->report_fatal( + PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__, + "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity + ); + return; + } + orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state; +} + +void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) { + auto *orch = this; + if (orch->fatal) { + return; + } + assert(orch->scope_stack_top < static_cast(orch->scope_stack_capacity - 1) && "Scope stack overflow"); + if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) { + report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported"); + return; + } + + bool already_in_manual_scope = orch->in_manual_scope(); + ++orch->scope_stack_top; + orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size; + if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) { + orch->manual_begin_depth = orch->scope_stack_top; + } +#if PTO2_PROFILING + // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the + // collector call: when disabled we pay nothing. Sample the current ring's + // task/heap start-end and tensormap usage at the scope boundary. 
+ if (is_scope_stats_enabled()) { + uint8_t ring_id = orch->current_ring_id(); + auto &alloc = orch->rings[ring_id].task_allocator; + int32_t dep_pool_tail = 0; + int32_t dep_pool_top = 0; + if (orch->scheduler) { + orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top); + } + scope_stats_begin( + ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail, + dep_pool_top, orch->tensor_map.current_used() + ); + } +#endif +} + +void PTO2OrchestratorState::end_scope() { + auto *orch = this; + if (orch->fatal) { + return; + } + assert(orch->scope_stack_top >= 0 && "Scope stack underflow"); + + // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks + // via scheduler->on_scope_end, so the end record reflects the scope's + // occupancy at close, not the residual after teardown. +#if PTO2_PROFILING + // Gate via is_scope_stats_enabled() (see begin_scope). One collector call + // emits the end-boundary record and tears down bookkeeping. 
+ if (is_scope_stats_enabled()) { + uint8_t ring_id = orch->current_ring_id(); + auto &alloc = orch->rings[ring_id].task_allocator; + int32_t dep_pool_tail = 0; + int32_t dep_pool_top = 0; + if (orch->scheduler) { + orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top); + } + scope_stats_end( + ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail, + dep_pool_top, orch->tensor_map.current_used() + ); + } +#endif + +#if PTO2_ORCH_PROFILING + uint64_t _se0 = get_sys_cnt_aicpu(); +#endif + + bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth; + int32_t begin = orch->scope_begins[orch->scope_stack_top--]; + int32_t count = orch->scope_tasks_size - begin; + if (ending_manual_scope) { + orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; + } + + if (orch->scheduler && count > 0) { + orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count); + } + + // Rewind the task buffer — these entries are no longer needed + orch->scope_tasks_size = begin; + +#if PTO2_ORCH_PROFILING + uint64_t _se1 = get_sys_cnt_aicpu(); + g_orch_scope_end_cycle += (_se1 - _se0); +#endif +} + +// ============================================================================= +// Task Submission +// ============================================================================= + +// Shared body for submit_task / submit_dummy_task. Caller has already validated +// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot +// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin +// computation (explicit_deps + auto), output registration, slot init, and pushes +// to the scheduler wiring queue. 
+static TaskOutputTensors submit_task_common( + PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id, + int32_t aiv0_kernel_id, int32_t aiv1_kernel_id +) { + CYCLE_COUNT_START(); + TaskOutputTensors result; + PTO2OutputLayout layout = calculate_output_layout(args); + PTO2PreparedTask prepared; + if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) { + return result; + } + uint8_t ring_id = prepared.task_id.ring(); + PTO2SchedulerState *sched = orch->scheduler; + PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc; + PTO2TaskId task_id = prepared.task_id; + PTO2TaskSlotState &cur_slot_state = *prepared.slot_state; + PTO2TaskDescriptor &task = *prepared.task; + PTO2TaskPayload &payload = *prepared.payload; + result.set_task_id(task_id); + + // dep_gen capture point: snapshot the orch submit_task inputs while the + // tensormap is still in its pre-lookup state for this task. Replay reads + // these records offline to reconstruct the complete dep graph — the sole + // source of truth for fanout now that the swimlane hot path no longer + // records it. + if (is_dep_gen_enabled()) { + const void *tensor_ptrs[MAX_TENSOR_ARGS]; + // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record + // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow + // each tag here rather than letting the AICPU writer reinterpret a + // 4×-wider array as bytes — that path silently lost two of every three + // tags on little-endian and synthesized phantom self-edges in replay. + uint8_t arg_types_u8[MAX_TENSOR_ARGS]; + // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at + // MAX_TENSOR_ARGS: defensive against any future builder bypass / + // shared-memory bit-flip that could otherwise overrun the two + // MAX_TENSOR_ARGS-sized stack buffers above. + const int tc_raw = args.tensor_count(); + const int tc = tc_raw > MAX_TENSOR_ARGS ? 
MAX_TENSOR_ARGS : tc_raw; + for (int i = 0; i < tc; i++) { + // OUTPUT slots carry create_info (not yet a Tensor); skip them — + // they have no producer to look up and replay's per-tensor loop + // also skips OUTPUT. + tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : &args.tensor(i).ref(); + arg_types_u8[i] = static_cast(args.tag(i)); + } + const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id}; + dep_gen_aicpu_record_submit( + task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8, + static_cast(args.explicit_dep_count()), reinterpret_cast(args.explicit_deps_data()), + kernel_ids_capture + ); + } + + PTO2FaninBuilder fanin_builder(orch, orch->rings[ring_id].fanin_pool, next_fanin_seen_epoch(orch)); + + CYCLE_COUNT_LAP(g_orch_alloc_cycle); + +#if PTO2_PROFILING + if (layout.total_output_size > 0) { + orch->buffers_allocated++; + orch->bytes_allocated += layout.total_output_size; + } +#endif + + // === STEP 2: Sync TensorMap validity and optional cleanup === + // Read current last_task_alive from shared memory for this ring + int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire); + + orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive); + + CYCLE_COUNT_LAP(g_orch_sync_cycle); + + for (uint32_t i = 0; i < args.explicit_dep_count(); i++) { + PTO2TaskId dep_task_id = args.explicit_dep(i); + if (!dep_task_id.is_valid()) { + orch->report_fatal( + PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) 
requires valid task ids" + ); + return result; + } + uint8_t dep_ring_id = dep_task_id.ring(); + PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_ring_id]; + int32_t dep_local_task_id = static_cast(dep_task_id.local()); + int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire); + if (dep_local_task_id < dep_last_task_alive) { + continue; + } + int32_t dep_slot = dep_ring.get_slot_by_task_id(dep_local_task_id); + PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_slot(dep_slot); + if (!append_fanin_or_fail(orch, dep_ring_id, dep_slot, producer_slot_state, &fanin_builder, ring_id)) { + return result; + } + } + + // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) === + DepInputs dep_inputs{ + args.tensor_count(), args.tensor_data(), args.tag_data(), static_cast(args.explicit_dep_count()), + args.explicit_deps_data(), + }; + + auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool { + uint8_t prod_ring = producer_task_id.ring(); + PTO2SharedMemoryRingHeader &producer_ring = orch->sm_header->rings[prod_ring]; + int32_t prod_slot = producer_ring.get_slot_by_task_id(static_cast(producer_task_id.local())); + PTO2TaskSlotState *prod_state = &producer_ring.get_slot_state_by_slot(prod_slot); + return append_fanin_or_fail(orch, prod_ring, prod_slot, prod_state, &fanin_builder, ring_id); + }; + + if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) { + return result; + } + + CYCLE_COUNT_LAP(g_orch_lookup_cycle); + + // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) === + register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope()); + + CYCLE_COUNT_LAP(g_orch_insert_cycle); + + // === STEP 5: Batch-write to GM (single cache line burst) === + // Deferred from allocation phase to avoid scattered GM writes that get + // evicted by TensorMap lookup/insert cache pressure. 
+ __builtin_prefetch(&task, 1, 1); + task.task_id = task_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = aic_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id; + task.packed_buffer_base = prepared.alloc_result.packed_base; + task.packed_buffer_end = prepared.alloc_result.packed_end; + + // Increment fanout_count on each producer (no lock — only orch writes this field). + // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count. + for_each_fanin_storage( + fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool, + [](PTO2TaskSlotState *producer) { + producer->fanout_count++; + } + ); + + int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP); + // Store fanin metadata in payload for scheduler to iterate + payload.fanin_actual_count = fanin_builder.count; + payload.fanin_spill_start = fanin_builder.spill_start; + payload.fanin_spill_pool = &fanin_builder.spill_pool; + for (int i = 0; i < inline_count; i++) { + payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; + } + + payload.init(args, result, prepared.alloc_result, layout); +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + if (args.scalar_count() > 0) { + set_dump_args_task_scalar_dtypes( + task_id.raw, static_cast(args.scalar_count()), args.scalar_dtypes() + ); + } + // Selective vs full dump is latched at dump_args_init from DumpDataHeader + // (host-decided before any dispatch), so it is race-free regardless of + // submission order. Here we only record each marked task's arg mask and + // metadata flags, which selective collection consults. 
+ if (args.dump_arg_mask() != 0) { + set_dump_args_task_mask(task_id.raw, args.dump_arg_mask(), args.dump_arg_index_ambiguous_mask()); + } + } +#endif + + CYCLE_COUNT_LAP(g_orch_args_cycle); +#if PTO2_ORCH_PROFILING + g_orch_args_atomic_count += 2; // fanout_lock.store + fanout_count.store +#endif + + // === STEP 6: push to wiring queue === + // Deferred wiring: orchestrator only stores dependency metadata and increments + // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished) + // is handled asynchronously by scheduler thread 0 via the wiring queue. + // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness + while (!sched->wiring.queue.push(&cur_slot_state)) { + SPIN_WAIT_HINT(); + } + + CYCLE_COUNT_LAP(g_orch_fanin_cycle); + CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw); + +#if PTO2_PROFILING + orch->tasks_submitted++; +#if PTO2_ORCH_PROFILING + g_orch_submit_count++; +#endif + g_orch_submit_idx++; +#endif + return result; +} + +TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) { + auto *orch = this; + + // Orchestration API should short-circuit after fatal, but keep this entry + // robust as a no-op in case a caller reaches it directly. + if (orch->fatal) { + return TaskOutputTensors{}; + } + + // Validate Arg construction (errors recorded by add_input/add_output/etc.) + if (args.has_error) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Invalid Arg Detected!"); + LOG_ERROR("========================================"); + LOG_ERROR("Error: %s", args.error_msg ? 
args.error_msg : "(unknown)"); + LOG_ERROR(" tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count()); + LOG_ERROR("This is a bug in the orchestration code."); + LOG_ERROR("========================================"); + orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS); + return TaskOutputTensors{}; + } + always_assert(orch->scheduler != nullptr); + // === Validate submit inputs === + ActiveMask active_mask = mixed_kernels.to_active_mask(); + always_assert(static_cast(active_mask) && "MixedKernels must have at least one active slot"); + + int16_t block_num = args.launch_spec.block_num(); + always_assert(block_num >= 1 && "block_num must be >= 1"); + + // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move + // it to the aiv0 slot. This guarantees the dispatch path can always use + // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask. + // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct + // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time. + MixedKernels normalized = mixed_kernels; + bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC); + bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0); + bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1); + if (!has_aic && has_aiv1 && !has_aiv0) { + normalized.aiv0_kernel_id = normalized.aiv1_kernel_id; + normalized.aiv1_kernel_id = INVALID_KERNEL_ID; + active_mask = normalized.to_active_mask(); + } + + // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1) + if (block_num > 1 && args.launch_spec.require_sync_start()) { + // Deadlock check: block_num >= total available slots of the required type. + // For MIX/AIC: limit is total_cluster_count (one AIC per cluster). + // For AIV: limit is total_aiv_count. + PTO2ResourceShape shape = active_mask.to_shape(); + int32_t limit = (shape == PTO2ResourceShape::AIV) ? 
orch->total_aiv_count : orch->total_cluster_count; + if (limit > 0 && block_num > limit) { + report_fatal( + PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__, + "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit + ); + return TaskOutputTensors{}; + } + active_mask.set_sync_start(); + } + + return submit_task_common( + orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id + ); +} + +// Submit a dependency-only task: full dependency graph participation +// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no +// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready +// bucket; dispatch loop short-circuits to completion. Accepts the same Arg +// shape as submit_task; scalars are permitted but never consumed. +TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const L0TaskArgs &args) { + auto *orch = this; + + if (orch->fatal) { + return TaskOutputTensors{}; + } + + if (args.has_error) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!"); + LOG_ERROR("========================================"); + LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)"); + LOG_ERROR(" tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count()); + LOG_ERROR("========================================"); + orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS); + return TaskOutputTensors{}; + } + always_assert(orch->scheduler != nullptr); + + return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID); +} + +TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const L0TaskArgs &args) { + auto *orch = this; + // Orchestration API should short-circuit after fatal, but keep this entry + // robust as a no-op in case a caller reaches it directly. 
+ if (orch->fatal) { + return TaskOutputTensors{}; + } + + if (args.tensor_count() <= 0) { + report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo"); + return TaskOutputTensors{}; + } + if (args.scalar_count() != 0) { + report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"); + return TaskOutputTensors{}; + } + for (int32_t i = 0; i < args.tensor_count(); i++) { + if (args.tag(i) != TensorArgType::OUTPUT) { + report_fatal( + PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args" + ); + return TaskOutputTensors{}; + } + } + + CYCLE_COUNT_START(); + + if (args.has_error) { + report_fatal( + PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", + args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg" + ); + return TaskOutputTensors{}; + } + + PTO2OutputLayout layout = calculate_output_layout(args); + PTO2PreparedTask prepared; + if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) { + return TaskOutputTensors{}; + } + + PTO2TaskDescriptor &task = *prepared.task; + PTO2TaskPayload &payload = *prepared.payload; + + CYCLE_COUNT_LAP(g_orch_alloc_cycle); + +#if PTO2_PROFILING + if (layout.total_output_size > 0) { + orch->buffers_allocated++; + orch->bytes_allocated += layout.total_output_size; + } +#endif + + task.task_id = prepared.task_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID; + task.packed_buffer_base = prepared.alloc_result.packed_base; + task.packed_buffer_end = prepared.alloc_result.packed_end; + + TaskOutputTensors outputs; + outputs.set_task_id(prepared.task_id); + payload.init(args, outputs, prepared.alloc_result, layout); + payload.fanin_actual_count = 0; + payload.fanin_spill_start = 0; 
+ payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool; + CYCLE_COUNT_LAP(g_orch_args_cycle); + + if (prepared.slot_state != nullptr) { + // Hidden alloc tasks complete inline in the orchestrator before any + // consumer can exist, so they have no fanout to notify and no worker + // subtasks to retire. Running the full on_task_complete path + // would only pay unnecessary fanout_lock / traversal overhead here. + // The generic slot initialization done in prepare_task() is still + // required so scope_end can release the producer-side reference and + // drive the slot to CONSUMED, but worker dispatch fields are never + // observed for hidden alloc tasks. + prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release); + } + orch->inline_completed_tasks++; + + CYCLE_COUNT_LAP(g_orch_fanin_cycle); + CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw); + +#if PTO2_PROFILING + orch->tasks_submitted++; +#if PTO2_ORCH_PROFILING + g_orch_submit_count++; +#endif + g_orch_submit_idx++; +#endif + + return outputs; +} + +// ============================================================================= +// Flow Control +// ============================================================================= + +void PTO2OrchestratorState::mark_done() { + auto *orch = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t total_tasks = orch->rings[r].task_allocator.active_count(); + if (total_tasks > 0) { + LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks); + } + auto &fanin_pool = orch->rings[r].fanin_pool; + if (fanin_pool.top > 1) { + LOG_INFO_V0( + "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top, + fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity + ); + } + } + orch->sm_header->orchestrator_done.store(1, std::memory_order_release); + orch->scope_tasks_size = 0; + orch->scope_stack_top = -1; + 
orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; +#if !PTO2_ORCH_PROFILING && PTO2_PROFILING + g_orch_submit_idx = 0; +#endif +} + +#if PTO2_ORCH_PROFILING +PTO2OrchProfilingData orchestrator_get_profiling() { + PTO2OrchProfilingData d; + d.sync_cycle = g_orch_sync_cycle; + d.alloc_cycle = g_orch_alloc_cycle; + d.args_cycle = g_orch_args_cycle; + d.lookup_cycle = g_orch_lookup_cycle; + d.insert_cycle = g_orch_insert_cycle; + d.fanin_cycle = g_orch_fanin_cycle; + d.scope_end_cycle = g_orch_scope_end_cycle; + d.submit_count = g_orch_submit_count; + d.alloc_wait_cycle = g_orch_alloc_wait_cycle; + d.fanin_wait_cycle = g_orch_fanin_wait_cycle; + d.alloc_atomic_count = g_orch_alloc_atomic_count; + d.args_atomic_count = g_orch_args_atomic_count; + d.scope_end_atomic_count = g_orch_scope_end_atomic_count; + + // Reset + g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0; + g_orch_lookup_cycle = g_orch_insert_cycle = 0; + g_orch_fanin_cycle = g_orch_scope_end_cycle = 0; + g_orch_submit_count = 0; + g_orch_submit_idx = 0; + g_orch_alloc_wait_cycle = 0; + g_orch_fanin_wait_cycle = 0; + g_orch_alloc_atomic_count = 0; + g_orch_args_atomic_count = 0; + g_orch_scope_end_atomic_count = 0; + return d; +} +#endif diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h new file mode 100644 index 000000000..8ffe39b31 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Orchestrator Interface + * + * The Orchestrator is responsible for: + * 1. Executing the orchestration function (Turing-complete control flow) + * 2. Allocating intermediate buffers from the heap + * 3. Submitting tasks via async InCore function calls + * 4. Building the dependency graph using TensorMap + * 5. Managing buffer scopes for lifecycle control + * + * The Orchestrator can run on either: + * - Host CPU (lower latency for complex control, easier debugging) + * - Device AI_CPU (lower latency for task submission) + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#ifndef PTO_ORCHESTRATOR_H +#define PTO_ORCHESTRATOR_H + +#include "common/l2_swimlane_profiling.h" +#include "utils/device_arena.h" +#include "pto_ring_buffer.h" +#include "pto_runtime2_types.h" +#include "pto_submit_types.h" +#include "scheduler/pto_scheduler.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "pto_types.h" + +/** + * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds + * arena offsets for every sub-region the orchestrator owns (per-ring fanin + * pools, scope arrays, plus the nested PTO2TensorMap layout). 
+ */ +struct PTO2OrchestratorLayout { + size_t off_fanin_pool[PTO2_MAX_RING_DEPTH]; + size_t off_fanin_seen_epoch[PTO2_MAX_RING_DEPTH]; + size_t off_scope_tasks; + size_t off_scope_begins; + PTO2TensorMapLayout tensor_map; + int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; + int32_t scope_tasks_cap; + uint64_t scope_stack_capacity; +}; + +// ============================================================================= +// Orchestrator State +// ============================================================================= + +/** + * Orchestrator state structure (private to Orchestrator) + * + * Contains all state needed for task graph construction and buffer management. + */ +struct PTO2OrchestratorState { + // === SHARED MEMORY ACCESS === + PTO2SharedMemoryHeader *sm_header; + + // === PER-RING RESOURCES === + PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; + uint32_t *fanin_seen_epoch[PTO2_MAX_RING_DEPTH]; + uint32_t fanin_seen_current_epoch{1}; + + // === TENSOR MAP (Private) === + PTO2TensorMap tensor_map; // Producer lookup + + // === SCOPE STACK (Private) === + // Single contiguous buffer of task IDs, partitioned by scope level. + // scope_begins[i] is the index into scope_tasks where scope i starts. + // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size). 
+ PTO2TaskSlotState **scope_tasks; // Flat buffer of taskSlotState (all scopes concatenated) + int32_t scope_tasks_size; // Number of task IDs currently in the buffer + int32_t scope_tasks_capacity; // Allocated capacity of scope_tasks + int32_t *scope_begins; // scope_begins[i] = start index of scope i in scope_tasks + int32_t scope_stack_top; // Current top of stack (-1 = no scope open) + uint64_t scope_stack_capacity; // Max nesting depth (PTO2_MAX_SCOPE_DEPTH) + int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH}; + + // === SCHEDULER REFERENCE === + // Note: In simulated mode, orchestrator and scheduler share address space + // In real mode, they communicate via shared memory only + PTO2SchedulerState *scheduler; // For simulated mode only + + // Total core counts set once at executor init; used for submit-time deadlock detection. + int32_t total_cluster_count{0}; // AIC cores = MIX clusters + int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) +#if PTO2_PROFILING + // L2 swimlane_level copied from get_l2_swimlane_level(). + L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED}; +#endif + + // === GM HEAP (for output buffers) === + void *gm_heap_base; // Base address of GM heap + uint64_t gm_heap_size; // Total size of GM heap (all rings) + + // === FATAL ERROR === + // Fatal error flag (single-thread access by orchestrator, no atomic needed) + // Cross-thread notification uses shared memory orch_error_code (atomic) + bool fatal; + + // Hidden alloc tasks complete synchronously inside the orchestrator and + // therefore bypass the executor's normal worker-completion counter path. + // The executor adds this count into its completed_tasks_ progress counter + // after orchestration finishes so shutdown/profiling totals remain closed. 
+ int64_t inline_completed_tasks{0}; + + // === STATISTICS === +#if PTO2_PROFILING + int64_t tasks_submitted; + int64_t buffers_allocated; + int64_t bytes_allocated; +#endif + + /** + * Get current ring index from scope depth. + * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) + */ + uint8_t current_ring_id() const { + int32_t depth = scope_stack_top; + if (depth < 0) depth = 0; + return depth < PTO2_MAX_RING_DEPTH ? static_cast(depth) : PTO2_MAX_RING_DEPTH - 1; + } + + bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; } + + // === Cold-path API (defined in pto_orchestrator.cpp) === + + // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays, + // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds + // the nested tensor_map layout. Returned layout is consumed by + // init_from_layout. + static PTO2OrchestratorLayout reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], + int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE + ); + static PTO2OrchestratorLayout reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH] + ); + + // Phase 3a: write everything *except* arena-internal pointer fields. + // sm_dev_base is the SM device address (only stored, never dereferenced); + // task_window_size feeds the per-ring SM address arithmetic. Safe to call + // on a host arena that holds the prebuilt image. 
+ bool init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size + ); + bool init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH] + ); + + // Phase 3b: write the arena-internal pointer fields (scope_tasks, + // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool, + // free_entry_list,task_entry_heads}, scheduler reference). + // Idempotent — host runs once on the image, AICPU runs once after attach. + void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler); + + // Forget pointers; arena owns the backing buffers. + void destroy(); + void set_scheduler(PTO2SchedulerState *scheduler); + void report_fatal(int32_t error_code, const char *func, const char *fmt, ...); + void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO); + void end_scope(); + TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args); + TaskOutputTensors submit_dummy_task(const L0TaskArgs &args); + TaskOutputTensors alloc_tensors(const L0TaskArgs &args); + void mark_done(); +}; + +// ============================================================================= +// Orchestrator Profiling Data +// ============================================================================= + +#if PTO2_ORCH_PROFILING +struct PTO2OrchProfilingData { + uint64_t sync_cycle; + uint64_t alloc_cycle; // Combined task slot + heap allocation + uint64_t args_cycle; + uint64_t lookup_cycle; + uint64_t insert_cycle; + uint64_t fanin_cycle; + uint64_t scope_end_cycle; + int64_t submit_count; + // Wait time tracking for blocking phases + uint64_t alloc_wait_cycle; // Cycles spent waiting in unified alloc + uint64_t fanin_wait_cycle; // Cycles spent 
waiting in fanout_lock + // Atomic operation counts per phase + uint64_t alloc_atomic_count; + uint64_t args_atomic_count; + uint64_t scope_end_atomic_count; +}; + +PTO2OrchProfilingData orchestrator_get_profiling(); +#endif + +#endif // PTO_ORCHESTRATOR_H diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp new file mode 100644 index 000000000..f6009dc57 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Ring Buffer Implementation + * + * Implements DepListPool ring buffer for zero-overhead dependency management. + * TaskAllocator methods are defined inline in pto_ring_buffer.h. 
+ * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_ring_buffer.h" +#include +#include +#include "common/unified_log.h" +#include "scheduler/pto_scheduler.h" + +static void latch_pool_error(std::atomic *error_code_ptr, int32_t error_code) { + if (error_code_ptr == nullptr) { + return; + } + int32_t expected = PTO2_ERROR_NONE; + error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel); +} + +// ============================================================================= +// Fanin Spill Pool Implementation +// ============================================================================= +void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) { + if (sm_last_task_alive <= reclaim_task_cursor) return; + + int32_t scan_end = sm_last_task_alive; + for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) { + PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id); + if (payload.fanin_spill_pool != this) { + continue; + } + + int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP); + int32_t spill_edge_count = payload.fanin_actual_count - inline_count; + if (spill_edge_count > 0) { + advance_tail(payload.fanin_spill_start + spill_edge_count); + } + } + reclaim_task_cursor = scan_end; +} + +bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) { + if (available() >= needed) return true; + + int spin_count = 0; + int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); + while (available() < needed) { + reclaim(ring, prev_last_alive); + if (available() >= needed) return true; + + spin_count++; + + int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); + if (cur_last_alive > prev_last_alive) { + spin_count = 0; + prev_last_alive = cur_last_alive; + } + + if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { + int32_t current = 
ring.fc.current_task_index.load(std::memory_order_acquire); + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!"); + LOG_ERROR("========================================"); + LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count); + LOG_ERROR( + " - Pool used: %d / %d (%.1f%%)", used(), capacity, + (capacity > 0) ? (100.0 * used() / capacity) : 0.0 + ); + LOG_ERROR(" - Pool top: %d (linear)", top); + LOG_ERROR(" - Pool tail: %d (linear)", tail); + LOG_ERROR(" - High water: %d", high_water); + LOG_ERROR(" - Needed: %d entries", needed); + LOG_ERROR(" - last_task_alive: %d (stuck here)", cur_last_alive); + LOG_ERROR(" - current_task: %d", current); + LOG_ERROR(" - In-flight tasks: %d", current - cur_last_alive); + LOG_ERROR("Diagnosis:"); + LOG_ERROR(" last_task_alive is not advancing, so fanin spill pool tail"); + LOG_ERROR(" cannot reclaim. Check TaskRing diagnostics for root cause."); + LOG_ERROR("Solution:"); + LOG_ERROR(" Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2); + LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", high_water * 2); + LOG_ERROR("========================================"); + latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW); + return false; + } + SPIN_WAIT_HINT(); + } + return true; +} + +// ============================================================================= +// Dependency List Pool Implementation +// ============================================================================= +void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) { + if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) { + int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark; + if (mark > 0) { + advance_tail(mark); + 
} + last_reclaimed = sm_last_task_alive; + } +} + +bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) { + if (available() >= needed) return true; + + int spin_count = 0; + int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); + while (available() < needed) { + reclaim(ring, prev_last_alive); + if (available() >= needed) return true; + + spin_count++; + + // Progress detection: reset spin counter if last_task_alive advances + int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire); + if (cur_last_alive > prev_last_alive) { + spin_count = 0; + prev_last_alive = cur_last_alive; + } + + if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) { + int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire); + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!"); + LOG_ERROR("========================================"); + LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count); + LOG_ERROR( + " - Pool used: %d / %d (%.1f%%)", used(), capacity, + (capacity > 0) ? (100.0 * used() / capacity) : 0.0 + ); + LOG_ERROR(" - Pool top: %d (linear)", top); + LOG_ERROR(" - Pool tail: %d (linear)", tail); + LOG_ERROR(" - High water: %d", high_water); + LOG_ERROR(" - Needed: %d entries", needed); + LOG_ERROR(" - last_task_alive: %d (stuck here)", cur_last_alive); + LOG_ERROR(" - current_task: %d", current); + LOG_ERROR(" - In-flight tasks: %d", current - cur_last_alive); + LOG_ERROR("Diagnosis:"); + LOG_ERROR(" last_task_alive is not advancing, so dep pool tail"); + LOG_ERROR(" cannot reclaim. 
Check TaskRing diagnostics for root cause."); + LOG_ERROR("Solution:"); + LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2); + LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", high_water * 2); + LOG_ERROR("========================================"); + latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW); + return false; + } + SPIN_WAIT_HINT(); + } + return true; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h new file mode 100644 index 000000000..ea39c8b4c --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h @@ -0,0 +1,693 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Ring Buffer Data Structures + * + * Implements ring buffer designs for zero-overhead memory management: + * + * 1. 
TaskAllocator - Unified task slot + output buffer allocation + * - Combines task ring (slot allocation) and heap ring (output buffer allocation) + * - Single spin-wait loop with unified back-pressure and deadlock detection + * - O(1) bump allocation for both task slots and heap buffers + * + * 2. FaninPool - Fanin spill entry allocation + * - Ring buffer for spilled fanin entries + * - O(1) append allocation + * - Implicit reclamation with task ring + * + * 3. DepListPool - Dependency list entry allocation + * - Ring buffer for linked list entries + * - O(1) prepend operation + * - Implicit reclamation with task ring + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#ifndef PTO_RING_BUFFER_H +#define PTO_RING_BUFFER_H + +#include +#include +#include + +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" +#include "common/unified_log.h" + +#if PTO2_PROFILING +// Heap-ring wrap reporting — the allocator is the only place each individual +// wrap is observable, so it notifies the scope_stats collector here. Gated: +// pays nothing (no include, no call) when profiling is compiled out. +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + +// Block notification interval (in spin counts) +#define PTO2_BLOCK_NOTIFY_INTERVAL 10000 +// Alloc spin limit - after this, report deadlock and exit +#define PTO2_ALLOC_SPIN_LIMIT 100000 + +// Dep pool spin limit - if exceeded, dep pool capacity too small for workload +#define PTO2_DEP_POOL_SPIN_LIMIT 100000 + +// ============================================================================= +// Task Allocator (unified task slot + heap buffer allocation) +// ============================================================================= + +/** + * Unified task slot + heap buffer allocator. 
+ * + * Since task and heap are always allocated together and the orchestrator is + * single-threaded, both pointers (task index, heap top) are tracked locally + * and published to shared memory via plain store — no fetch_add or CAS needed. + * + * The alloc() method checks both resources BEFORE committing to either, + * eliminating the need for rollback on partial failure. + */ +class PTO2TaskAllocator { +public: + /** + * Initialize the allocator with task ring and heap ring resources. + * + * All pointer arguments are device addresses (live in SM / GM heap); this + * function only stores them, no dereferences, so it is safe to invoke + * from host code that constructs a prebuilt arena image. + * + * Production callers leave `initial_local_task_id` at 0: the SM ring + * flow-control counters that current_index_ptr / last_alive_ptr point at + * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM + * reset), so we keep local_task_id_ aligned with that without reading the + * SM. Tests that drive SM state directly may pass a non-zero seed to + * exercise corner cases like task IDs near INT32_MAX. + */ + void init( + PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic *current_index_ptr, + std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr, + int32_t initial_local_task_id = 0 + ) { + descriptors_ = descriptors; + window_size_ = window_size; + window_mask_ = window_size - 1; + current_index_ptr_ = current_index_ptr; + last_alive_ptr_ = last_alive_ptr; + heap_base_ = heap_base; + heap_size_ = heap_size; + error_code_ptr_ = error_code_ptr; + local_task_id_ = initial_local_task_id; + heap_top_ = 0; + heap_tail_ = 0; + last_alive_seen_ = 0; + } + + /** + * Allocate a task slot and its associated output buffer in one call. + * + * Both task index and heap top are maintained as local counters and + * published to shared memory only on success. 
Since the orchestrator is + * single-threaded, no CAS or fetch_add is needed — just check-then-commit. + * + * @param output_size Total packed output size in bytes (0 = no heap needed) + * @return Allocation result; check failed() for errors + */ + PTO2TaskAllocResult alloc(int32_t output_size) { + uint64_t aligned_size = + output_size > 0 ? PTO2_ALIGN_UP(static_cast(output_size), PTO2_ALIGN_SIZE) : 0; + + int spin_count = 0; + int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire); + int32_t last_alive = prev_last_alive; + update_heap_tail(last_alive); + bool blocked_on_heap = false; +#if PTO2_ORCH_PROFILING + uint64_t wait_start = 0; + bool waiting = false; +#endif + + while (true) { + // Check both resources; commit only if both available + if (local_task_id_ - last_alive + 1 < window_size_) { + void *heap_ptr = try_bump_heap(aligned_size); + if (heap_ptr) { + int32_t task_id = commit_task(); +#if PTO2_ORCH_PROFILING + record_wait(spin_count, wait_start, waiting); +#endif + return {task_id, task_id & window_mask_, heap_ptr, static_cast(heap_ptr) + aligned_size}; + } + blocked_on_heap = true; + } else { + blocked_on_heap = false; + } + + // Spin: wait for scheduler to advance last_task_alive + spin_count++; +#if PTO2_ORCH_PROFILING + if (!waiting) { + wait_start = get_sys_cnt_aicpu(); + waiting = true; + } +#endif + last_alive = last_alive_ptr_->load(std::memory_order_acquire); + update_heap_tail(last_alive); + if (last_alive > prev_last_alive) { + spin_count = 0; + prev_last_alive = last_alive; + } else { + if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0) { + LOG_WARN( + "[TaskAllocator] BLOCKED: tasks=%d/%d, heap=%" PRIu64 "/%" PRIu64 ", on=%s, spins=%d", + local_task_id_ - last_alive, window_size_, heap_top_, heap_size_, + blocked_on_heap ? 
"heap" : "task", spin_count + ); + } + if (spin_count >= PTO2_ALLOC_SPIN_LIMIT) { + report_deadlock(output_size, blocked_on_heap); + return {-1, -1, nullptr, nullptr}; + } + } + SPIN_WAIT_HINT(); + } + } + + // ========================================================================= + // State queries + // ========================================================================= + + int32_t active_count() const { + int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire); + return local_task_id_ - last_alive; + } + + // Task ring start/end: tail = oldest live task (last_task_alive), head = + // next task id to allocate. head - tail == active_count(). + int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); } + int32_t task_head() const { return local_task_id_; } + + int32_t window_size() const { return window_size_; } + + uint64_t heap_available() const { + uint64_t tail = heap_tail_; + if (heap_top_ >= tail) { + uint64_t at_end = heap_size_ - heap_top_; + uint64_t at_begin = tail; + return at_end > at_begin ? at_end : at_begin; + } + return tail - heap_top_; + } + + uint64_t heap_top() const { return heap_top_; } + // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is + // the end (next allocation). heap_top - heap_tail == heap_used_bytes(). 
+ uint64_t heap_tail() const { return heap_tail_; } + uint64_t heap_capacity() const { return heap_size_; } + uint64_t heap_used_bytes() const { + if (heap_size_ == 0) return 0; + return (heap_top_ + heap_size_ - heap_tail_) % heap_size_; + } + +private: + // --- Task Ring --- + PTO2TaskDescriptor *descriptors_ = nullptr; + int32_t window_size_ = 0; + int32_t window_mask_ = 0; + std::atomic *current_index_ptr_ = nullptr; + std::atomic *last_alive_ptr_ = nullptr; + + // --- Heap --- + void *heap_base_ = nullptr; + uint64_t heap_size_ = 0; + + // --- Local state (single-writer, no atomics needed) --- + int32_t local_task_id_ = 0; // Next task ID to allocate + uint64_t heap_top_ = 0; // Current heap allocation pointer + uint64_t heap_tail_ = 0; // Heap reclamation pointer (derived from consumed tasks) + int32_t last_alive_seen_ = 0; // last_task_alive at last heap_tail derivation + + // --- Shared --- + std::atomic *error_code_ptr_ = nullptr; + + // ========================================================================= + // Internal helpers + // ========================================================================= + + /** + * Commit a task slot: bump local counter and publish to shared memory. + * Must only be called after space check has passed. + */ + int32_t commit_task() { + int32_t task_id = local_task_id_++; + current_index_ptr_->store(local_task_id_, std::memory_order_release); + return task_id; + } + + /** + * Derive heap_tail_ from the last consumed task's packed_buffer_end. + * + * Every task has a valid packed_buffer_end (equal to packed_buffer_base + * for zero-size allocations), so the last consumed task always determines + * the correct heap_tail — no backward scan needed. 
+ */ + void update_heap_tail(int32_t last_alive) { + if (last_alive <= last_alive_seen_) return; + last_alive_seen_ = last_alive; + + PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_]; + uint64_t old_tail = heap_tail_; + heap_tail_ = + static_cast(static_cast(desc.packed_buffer_end) - static_cast(heap_base_)); +#if PTO2_PROFILING + // Reclaim pointer moves forward monotonically in ring order; a decrease + // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at + // most one wrap per call). Report it so scope_stats can unroll. + if (is_scope_stats_enabled() && heap_tail_ < old_tail) { + scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM); + } +#else + (void)old_tail; +#endif + } + + /** + * Bump the heap pointer for the given allocation size. + * Returns the allocated pointer, or nullptr if insufficient space. + * When alloc_size == 0, returns current position without advancing. + */ + void *try_bump_heap(uint64_t alloc_size) { + uint64_t top = heap_top_; + if (alloc_size == 0) { + return static_cast(heap_base_) + top; + } + uint64_t tail = heap_tail_; + void *result; + + if (top >= tail) { + uint64_t space_at_end = heap_size_ - top; + if (space_at_end >= alloc_size) { + result = static_cast(heap_base_) + top; + heap_top_ = top + alloc_size; + } else if (tail > alloc_size) { + LOG_DEBUG( + "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail, + alloc_size + ); + result = heap_base_; + heap_top_ = alloc_size; +#if PTO2_PROFILING + // Allocation pointer just wrapped past heap_size_; report it so + // scope_stats can unroll the wrapping offset into a monotonic value. + // The collector attributes the wrap to the current scope's ring. 
+ if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC); +#endif + } else { + LOG_DEBUG( + "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64 + ", heap_size=%" PRIu64, + top, tail, alloc_size, heap_size_ + ); + return nullptr; + } + } else { + if (tail - top > alloc_size) { + result = static_cast(heap_base_) + top; + heap_top_ = top + alloc_size; + } else { + LOG_DEBUG( + "try_bump_heap failed (topload(std::memory_order_acquire); + int32_t active_tasks = local_task_id_ - last_alive; + uint64_t htail = heap_tail_; + + LOG_ERROR("========================================"); + if (heap_blocked) { + LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted!"); + } else { + LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full!"); + } + LOG_ERROR("========================================"); + LOG_ERROR("No progress after %d spins.", PTO2_ALLOC_SPIN_LIMIT); + LOG_ERROR( + " Task ring: current=%d, last_alive=%d, active=%d/%d (%.1f%%)", local_task_id_, last_alive, active_tasks, + window_size_, 100.0 * active_tasks / window_size_ + ); + LOG_ERROR( + " Heap ring: top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", available=%" PRIu64, heap_top_, htail, + heap_size_, heap_available() + ); + if (heap_blocked) { + LOG_ERROR(" Requested: %d bytes", requested_output_size); + } + LOG_ERROR("Diagnosis:"); + LOG_ERROR(" last_task_alive is stuck at %d, meaning task %d", last_alive, last_alive); + LOG_ERROR(" cannot transition to CONSUMED. Possible causes:"); + LOG_ERROR(" 1. Task %d still executing (subtasks not complete)", last_alive); + LOG_ERROR(" 2. Task %d fanout not fully released (downstream not done)", last_alive); + LOG_ERROR(" 3. Scope reference not released (scope_end not called)"); + LOG_ERROR(" 4. 
Orchestrator blocked here -> can't call scope_end -> circular wait"); + LOG_ERROR("Solution:"); + if (heap_blocked) { + LOG_ERROR( + " Increase heap size (current: %" PRIu64 ", recommended: %" PRIu64 ")", heap_size_, heap_size_ * 2 + ); + LOG_ERROR(" Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_HEAP= (e.g. %" PRIu64 ")", heap_size_ * 2); + } else { + LOG_ERROR(" Increase task window size (current: %d, recommended: %d)", window_size_, active_tasks * 2); + LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW= (e.g. %d)", active_tasks * 2); + } + LOG_ERROR("========================================"); + if (error_code_ptr_) { + int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK; + error_code_ptr_->store(code, std::memory_order_release); + } + } +}; + +// ============================================================================= +// Fanin Spill Pool +// ============================================================================= + +/** + * Fanin spill pool structure + * + * True ring buffer for allocating spilled fanin entries. + * Entries are reclaimed when their consumer tasks become CONSUMED. + * + * Linear counters (top, tail) grow monotonically; the physical index + * is obtained via modulo: base[linear_index % capacity]. 
+ */ +struct PTO2FaninPool { + PTO2FaninSpillEntry *base; // Pool base address + int32_t capacity; // Total number of entries + int32_t top; // Linear next-allocation counter (starts from 1) + int32_t tail; // Linear first-alive counter (entries before this are dead) + int32_t high_water; // Peak concurrent usage (top - tail) + int32_t reclaim_task_cursor{0}; // Last task id scanned for reclaim on this pool + + std::atomic *error_code_ptr = nullptr; + + void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic *in_error_code_ptr) { + base = in_base; + capacity = in_capacity; + top = 1; + tail = 1; + high_water = 0; + reclaim_task_cursor = 0; + base[0].slot_state = nullptr; + error_code_ptr = in_error_code_ptr; + } + + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); + + bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); + + PTO2FaninSpillEntry *alloc() { + int32_t used = top - tail; + if (used >= capacity) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Fanin Spill Pool Overflow!"); + LOG_ERROR("========================================"); + LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity); + LOG_ERROR(" - Pool top: %d (linear)", top); + LOG_ERROR(" - Pool tail: %d (linear)", tail); + LOG_ERROR(" - High water: %d", high_water); + LOG_ERROR("Solution:"); + LOG_ERROR(" Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2); + LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", capacity * 2); + LOG_ERROR("========================================"); + if (error_code_ptr) { + error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release); + } + return nullptr; + } + int32_t idx = top % capacity; + top++; + used++; + if (used > high_water) high_water = used; + return &base[idx]; + } + + void advance_tail(int32_t 
new_tail) { + if (new_tail > tail) { + tail = new_tail; + } + } + + int32_t used() const { return top - tail; } + + int32_t available() const { return capacity - used(); } +}; + +template +using PTO2FaninCallbackResult = std::invoke_result_t; + +template +using PTO2FaninForEachReturn = std::conditional_t, void>, void, bool>; + +template +inline PTO2FaninForEachReturn for_each_fanin_storage( + InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn +) { + using FaninCallbackResult = PTO2FaninCallbackResult; + static_assert( + std::is_same_v || std::is_same_v, + "fanin callback must return void or bool" + ); + + if constexpr (std::is_void_v) { + int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); + for (int32_t i = 0; i < inline_count; i++) { + fn(inline_slot_states[i]); + } + + int32_t spill_count = fanin_count - inline_count; + if (spill_count <= 0) { + return; + } + + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; + for (int32_t i = 0; i < first_count; i++) { + fn(first[i].slot_state); + } + + int32_t second_count = spill_count - first_count; + for (int32_t i = 0; i < second_count; i++) { + fn(spill_pool.base[i].slot_state); + } + return; + } else { + int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); + for (int32_t i = 0; i < inline_count; i++) { + if (!fn(inline_slot_states[i])) { + return false; + } + } + + int32_t spill_count = fanin_count - inline_count; + if (spill_count <= 0) { + return true; + } + + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; + for (int32_t i = 0; i < first_count; i++) { + if (!fn(first[i].slot_state)) { + return false; + } + } + + int32_t second_count = 
spill_count - first_count; + for (int32_t i = 0; i < second_count; i++) { + if (!fn(spill_pool.base[i].slot_state)) { + return false; + } + } + return true; + } +} + +template +inline PTO2FaninForEachReturn for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) { + return for_each_fanin_storage( + payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start, + *payload.fanin_spill_pool, static_cast(fn) + ); +} + +// ============================================================================= +// Dependency List Pool +// ============================================================================= + +/** + * Dependency list pool structure + * + * True ring buffer for allocating linked list entries. + * Entries are reclaimed when their producer tasks become CONSUMED, + * as tracked by the orchestrator via dep_pool_mark per task. + * + * Linear counters (top, tail) grow monotonically; the physical index + * is obtained via modulo: base[linear_index % capacity]. 
+ */ +struct PTO2DepListPool { + PTO2DepListEntry *base; // Pool base address + int32_t capacity; // Total number of entries + int32_t top; // Linear next-allocation counter (starts from 1) + int32_t tail; // Linear first-alive counter (entries before this are dead) + int32_t high_water; // Peak concurrent usage (top - tail) + int32_t last_reclaimed{0}; // last_task_alive at last successful reclamation + + // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) + std::atomic *error_code_ptr = nullptr; + + /** + * Initialize dependency list pool + * @param in_base Pool base address from shared memory + * @param in_capacity Total number of entries + * @param in_error_code_ptr Where alloc() stores the error code on pool overflow (may be nullptr) + */ + void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic *in_error_code_ptr) { + base = in_base; + capacity = in_capacity; + top = 1; // Start from 1, 0 means NULL/empty + tail = 1; // Match initial top (no reclaimable entries yet) + high_water = 0; + last_reclaimed = 0; + + // Initialize entry 0 as NULL marker + base[0].slot_state = nullptr; + base[0].next = nullptr; + + error_code_ptr = in_error_code_ptr; + } + + /** + * Reclaim dead entries based on scheduler's slot state dep_pool_mark. + * Safe to call multiple times — only advances tail forward. + * + * @param ring Ring header (for reading slot dep_pool_mark) + * @param sm_last_task_alive Current last_task_alive from shared memory + */ + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); + + /** + * Ensure dep pool for a specific ring has at least `needed` entries available. + * Spin-waits for reclamation if under pressure. Detects deadlock if no progress. 
+ */ + bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); + + /** + * Allocate a single entry from the pool (single-thread per pool instance) + * + * @return Pointer to allocated entry, or nullptr on fatal error + */ + PTO2DepListEntry *alloc() { + int32_t used = top - tail; + if (used >= capacity) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Dependency Pool Overflow!"); + LOG_ERROR("========================================"); + LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity); + LOG_ERROR(" - Pool top: %d (linear)", top); + LOG_ERROR(" - Pool tail: %d (linear)", tail); + LOG_ERROR(" - High water: %d", high_water); + LOG_ERROR("Solution:"); + LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2); + LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", capacity * 2); + LOG_ERROR("========================================"); + if (error_code_ptr) { + error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release); + } + return nullptr; + } + int32_t idx = top % capacity; + top++; + used++; + if (used > high_water) high_water = used; + return &base[idx]; + } + + /** + * Advance the tail pointer, reclaiming dead entries. + * Called by the orchestrator based on last_task_alive advancement. + */ + void advance_tail(int32_t new_tail) { + if (new_tail > tail) { + tail = new_tail; + } + } + + /** + * Prepend a task ID to a dependency list + * + * O(1) operation: allocates new entry and links to current head. 
+ * + * @param cur Current list head entry (becomes the new entry's next) + * @param slot_state Task slot state to store in the new entry + * @return New list head entry, or nullptr if alloc() fails + */ + PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) { + PTO2DepListEntry *new_entry = alloc(); + if (!new_entry) return nullptr; + new_entry->slot_state = slot_state; + new_entry->next = cur; + return new_entry; + } + + int32_t used() const { return top - tail; } + + int32_t available() const { return capacity - used(); } +}; + +// ============================================================================= +// Ring Set (per-depth aggregate) +// ============================================================================= + +/** + * Groups a TaskAllocator and a FaninPool into one per-depth unit. + * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth. + */ +struct PTO2RingSet { + PTO2TaskAllocator task_allocator; + PTO2FaninPool fanin_pool; +}; + +#endif // PTO_RING_BUFFER_H diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp new file mode 100644 index 000000000..263adec8d --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp @@ -0,0 +1,287 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Main Implementation + * + * Implements the unified runtime API that combines orchestrator and scheduler. + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_runtime2.h" + +#include +#include +#include +#include + +#include + +#include "aicpu/device_time.h" +#include "common/unified_log.h" +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + +// Weak fallback for HOST .so builds (never called, but satisfies linker). +// The AICPU build links the strong symbol from platform/.../device_time.cpp. +// Hidden visibility prevents HOST .so from polluting global symbol table. +__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } + +// ============================================================================= +// Orchestration Ops Table (function-pointer dispatch for orchestration .so) +// ============================================================================= + +static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args) { + return rt->orchestrator.submit_task(mixed_kernels, args); +} + +static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const L0TaskArgs &args) { + return rt->orchestrator.alloc_tensors(args); +} + +static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const L0TaskArgs &args) { + return rt->orchestrator.submit_dummy_task(args); +} + +void rt_scope_begin(PTO2Runtime *rt) { + PTO2ScopeMode mode = rt->pending_scope_mode; + rt->pending_scope_mode = PTO2ScopeMode::AUTO; + rt->orchestrator.begin_scope(mode); +} + +void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); } + +void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); } + +static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; } + +void 
rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + if (fmt == nullptr || fmt[0] == '\0') { + rt->orchestrator.report_fatal(error_code, func, nullptr); + } else { + char message[1024]; + vsnprintf(message, sizeof(message), fmt, args); + rt->orchestrator.report_fatal(error_code, func, "%s", message); + } + va_end(args); +} + +// Wait for all producers of this tensor to be safe for data access. +// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers). +// For reads: wait until each producer COMPLETED (done writing). +// For writes: also wait until all consumers done reading +// (fanout_refcount >= fanout_count - 1, excluding scope reference). +// Uses cycle-based timeout (checked every 1024 spins). +// Returns false on timeout (sets orch.fatal). +MAYBE_UNINITIALIZED_BEGIN +static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) { + PTO2TaskId owner = tensor.owner_task_id; + PTO2OrchestratorState &orch = rt->orchestrator; + + // Segmented wait: collect up to kSegmentCap producer slots, then flush by + // spinning on each. When the segment fills, we wait for the accumulated + // batch before continuing to gather more. Dedup is per-segment only; a + // producer that appears in two segments is waited on twice, which is + // idempotent (task_state is monotonic) and only adds one atomic load on + // the second encounter. 
+ constexpr int kSegmentCap = 64; + const PTO2TaskSlotState *seg[kSegmentCap]; + int seg_count = 0; + bool signaled = false; + bool failed = false; + + auto wait_one_producer = [&](const PTO2TaskSlotState &slot) { + uint8_t ring_id = slot.ring_id; + int32_t local_id = static_cast(slot.task->task_id.local()); + uint64_t t0 = get_sys_cnt_aicpu(); + int32_t spin_count = 0; + while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) { + SPIN_WAIT_HINT(); + if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) { + orch.report_fatal( + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, + "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed", + (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id + ); + failed = true; + return; + } + } + }; + + auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) { + uint8_t ring_id = slot.ring_id; + int32_t local_id = slot.task->task_id.local(); + uint64_t t0 = get_sys_cnt_aicpu(); + int32_t spin_count = 0; + while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1) { + SPIN_WAIT_HINT(); + if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) { + orch.report_fatal( + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, + "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done", + (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id + ); + failed = true; + return; + } + } + }; + + auto flush_segment = [&]() { + for (int i = 0; i < seg_count; i++) { + wait_one_producer(*seg[i]); + if (failed) return; + if (!wait_for_consumers) continue; + wait_one_consumers(*seg[i]); + if (failed) return; + } + seg_count = 0; + }; + + auto try_push = [&](const PTO2TaskSlotState &s) { + for (int j = 0; j < seg_count; j++) { + if (seg[j] == &s) return; // per-segment dedup + } + if (seg_count == kSegmentCap) { + flush_segment(); + if (failed) return; + } + seg[seg_count++] = &s; + 
if (!signaled) { + orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release); + signaled = true; + } + }; + + auto do_wait = [&]() { + // Step A: creator retention — read owner directly from tensor metadata + if (owner.is_valid()) { + auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); + try_push(s); + if (failed) return; + } + + // Step B: modifier writer lookup (OverlapMap), direct callback + orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool { + PTO2TaskId pid = entry.producer_task_id; + auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local()); + try_push(s); + return !failed; + }); + if (failed) return; + flush_segment(); + }; + + do_wait(); + if (signaled) { + orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release); + } + return !failed; +} +MAYBE_UNINITIALIZED_END + +uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) { + if (tensor.buffer.addr == 0) { + unified_log_error( + __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). " + "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns." + ); + return 0; + } + + if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) { + return 0; + } + + uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims); + uint64_t elem_size = get_element_size(tensor.dtype); + const void *ptr = reinterpret_cast(tensor.buffer.addr + flat_offset * elem_size); + uint64_t result = 0; + memcpy(&result, ptr, elem_size); + return result; +} + +void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) { + if (tensor.buffer.addr == 0) { + unified_log_error( + __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). " + "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns." 
+ ); + return; + } + + // Wait for producer + all consumers before writing (WAW + WAR safety) + if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) { + return; + } + + uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims); + uint64_t elem_size = get_element_size(tensor.dtype); + void *ptr = reinterpret_cast(tensor.buffer.addr + flat_offset * elem_size); + memcpy(ptr, &value, elem_size); +} + +// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the +// [ScopeStats] collector. The slot is always present in the struct to keep +// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration +// .so's null-check skips it. +#if PTO2_PROFILING +static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); } +#endif + +static const PTO2RuntimeOps s_runtime_ops = { + .submit_task = submit_task_impl, + .scope_begin = rt_scope_begin, + .scope_end = rt_scope_end, + .orchestration_done = rt_orchestration_done, + .is_fatal = is_fatal_impl, + .report_fatal = rt_report_fatal, + .log_error = unified_log_error, + .log_warn = unified_log_warn, + .log_debug = unified_log_debug, + .log_info_v = unified_log_info_v, + .get_tensor_data = get_tensor_data, + .set_tensor_data = set_tensor_data, + .alloc_tensors = alloc_tensors_impl, + .submit_dummy_task = submit_dummy_task_impl, +#if PTO2_PROFILING + .scope_set_site = scope_set_site_impl, +#else + .scope_set_site = nullptr, +#endif +}; + +// ============================================================================= +// Runtime Lifecycle (AICPU-only fixup) +// ============================================================================= +// +// Layout / init_data / wire / destroy live in +// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the +// prebuilt arena image. 
The pieces below — wiring the ops table and the +// SPMD core counts — depend on the device-side s_runtime_ops global and the +// AICPU SchedulerContext respectively, so they remain in the AICPU build. + +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) { + rt->ops = &s_runtime_ops; + rt->orchestrator.total_cluster_count = aic_count; + rt->orchestrator.total_aiv_count = aiv_count; +} + +void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { + if (rt) { + rt->mode = mode; + } +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.h new file mode 100644 index 000000000..85680d8c3 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Main Interface + * + * This is the main header for the PTO Runtime2 system. + * It provides a unified API for task graph construction and execution. 
+ * + * Key Features: + * - Ring buffer based memory management (zero allocation overhead) + * - Lazy invalidation TensorMap for dependency discovery + * - Scope-based buffer lifecycle management + * - Per-task spinlocks for concurrent fanout updates + * - Orchestrator-Scheduler decoupling via shared memory + * + * Usage: + * 1. Create runtime: runtime_reserve_layout() / runtime_init_data_from_layout() / runtime_wire_arena_pointers() + * 2. Build task graph in orchestration function: + * - begin_scope() / end_scope() + * - submit_task() + * 3. Mark orchestration complete: mark_done() + * 4. Destroy runtime: runtime_destroy() + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include "utils/device_arena.h" +#include "pto_runtime2_types.h" +#include "pto_submit_types.h" +#include "pto_shared_memory.h" +#include "pto_ring_buffer.h" +#include "pto_tensormap.h" +#include "scheduler/pto_scheduler.h" +#include "pto_orchestrator.h" +#include "aicore_completion_mailbox.h" + +// ============================================================================= +// Runtime Context +// ============================================================================= + +/** + * Runtime execution mode + */ +enum PTO2RuntimeMode { + PTO2_MODE_EXECUTE = 0, // Execute tasks on workers + PTO2_MODE_SIMULATE = 1, // Simulate task execution with cycle counting + PTO2_MODE_GRAPH_ONLY = 2 // Build graph only, no execution +}; + +/** + * Function-pointer ops table for runtime operations. + * + * The orchestration .so calls runtime functions through this table + * (via pto_orchestration_api.h inline wrappers), so it has zero link + * dependencies on runtime .cpp files. 
+ */ +typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures + +struct PTO2RuntimeOps { + TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args); + void (*scope_begin)(PTO2Runtime *rt); + void (*scope_end)(PTO2Runtime *rt); + void (*orchestration_done)(PTO2Runtime *rt); + bool (*is_fatal)(PTO2Runtime *rt); + void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...); + + // Logging (populated by runtime, called by orchestration) + void (*log_error)(const char *func, const char *fmt, ...); + void (*log_warn)(const char *func, const char *fmt, ...); + void (*log_debug)(const char *func, const char *fmt, ...); + // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside). + void (*log_info_v)(const char *func, int v, const char *fmt, ...); + + // Cross-layer data access (orchestration reads/writes tensor values via runtime) + // Placed after logging to avoid shifting hot-path field offsets. + uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]); + void (*set_tensor_data)( + PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value + ); + TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args); + TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args); + // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats] + // collector. Always present in the struct to keep ops-table layout stable + // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0. + void (*scope_set_site)(const char *file, int line); +}; + +/** + * Layout descriptor for the prebuilt runtime arena. Holds all sub-region + * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header / + * AICore mailbox) plus the layout-defining capacities. 
Produced once on the + * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout + * and runtime_wire_arena_pointers. + */ +struct PTO2RuntimeArenaLayout { + size_t off_sm_handle{0}; + PTO2OrchestratorLayout orch; + PTO2SchedulerLayout sched; + size_t off_runtime{0}; + size_t off_mailbox{0}; + + // Cached parameters (re-used by init_data + wire stages). + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{}; + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{}; + int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{}; + + // Total arena byte size post-commit. Used by host to size the prebuilt + // image buffer and as the rtMemcpy length. + size_t arena_size{0}; +}; + +/** + * PTO Runtime2 context + * + * Contains all state for orchestration and scheduling. + * In simulated mode, runs in single process with shared address space. + */ +struct PTO2Runtime { + // Ops table (first field — used by orchestration .so via function pointers) + const PTO2RuntimeOps *ops; + PTO2ScopeMode pending_scope_mode; + + // Components + PTO2SharedMemoryHandle *sm_handle; + PTO2OrchestratorState orchestrator; + PTO2SchedulerState scheduler; + AICoreCompletionMailbox *aicore_mailbox; + + // GM Heap for output buffers + void *gm_heap; + uint64_t gm_heap_size; + bool gm_heap_owned; // True if we allocated it + + // Mode + PTO2RuntimeMode mode; + + // Statistics + int64_t total_cycles; + + // Prebuilt-arena fast path metadata. Carries every offset + // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct + // all arena-internal pointer fields without re-running init_data. The + // device base of the runtime arena travels separately on the host-side + // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it + // *before* dereferencing this image. Populated on host by + // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by + // aicpu_executor.cpp. 
+ PTO2RuntimeArenaLayout prebuilt_layout; +}; + +// ============================================================================= +// Runtime Lifecycle API +// ============================================================================= + +/** + * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator / + * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied + * arena. Pure arithmetic; does not touch device memory and may run on host. + * Returns the layout descriptor; caller commits/attaches the arena before + * Phase 2/3. + */ +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE +); +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH] +); + +/** + * Phase 2 — write the data half of the runtime arena: standalone fields, + * memset'd arena regions, sub-structure initializers, and SM-side device + * pointers. The arena must already be committed (or attached); writes go + * into arena.base() + sub-region offsets. + * + * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store + * them (never dereference). Safe to run on a host arena that owns a host + * mirror of the runtime image — the resulting buffer is rtMemcpy-ready. + * + * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena. + * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the + * AICore-side count fields are left untouched and must be filled by the + * AICPU at boot. 
+ */ +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, uint64_t heap_size +); +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] +); + +/** + * Phase 3 — wire every arena-internal pointer field (rt->sm_handle, + * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler, + * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool, + * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on + * both host (writing host-mirror addresses) and AICPU (writing device + * addresses) sides. + */ +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt); + +/** + * AICPU-only Phase 4 — fill in the few fields the host could not know at + * prebuilt-image build time: the ops table (s_runtime_ops is a device-side + * file-local global, host cannot resolve its device address) and the + * orchestrator's core counts (depend on the executor's scheduler context). + * Call once per boot after runtime_wire_arena_pointers. + */ +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count); + +/** + * Destroy runtime. With the prebuilt-arena fast path the arena buffer is + * pooled across runs by DeviceRunner, so we never call arena.release() + * here — the destructor only forgets sub-structure pointers (idempotent + * cleanup). 
+ */ +void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena); + +/** + * Set execution mode + */ +void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode); + +// ============================================================================= +// Orchestration API (called by orchestration function) +// ============================================================================= + +/** + * Begin a new scope + * + * All tasks submitted within this scope will have their lifetime + * bounded by the scope. When scope_end() is called, the scope + * releases its reference to all enclosed tasks. + */ +void rt_scope_begin(PTO2Runtime *rt); + +/** + * End current scope + * + * Releases scope reference for all tasks submitted since scope_begin(). + * Tasks whose refcount reaches zero will have their buffers released. + */ +void rt_scope_end(PTO2Runtime *rt); + +/** + * Mark orchestration as complete + * + * Signals that no more tasks will be submitted. + */ +void rt_orchestration_done(PTO2Runtime *rt); + +/** + * Enter fatal state explicitly from orchestration. + */ +void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...); + +/** + * Cross-layer data access: read a tensor value by waiting for its producer. + */ +uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]); + +/** + * Cross-layer data access: write a value to a tensor at given indices. + * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap. + * See set_tensor_data in pto_orchestration_api.h for full documentation. + */ +void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value); + +/** + * Slim config struct exported by orchestration .so via aicpu_orchestration_config(). + * Shared definition with pto_orchestration_api.h (same layout, guarded). 
+ */ +#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED +#define PTO2_ORCHESTRATION_CONFIG_DEFINED +struct PTO2OrchestrationConfig { + int expected_arg_count; +}; +#endif diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h new file mode 100644 index 000000000..e4135a366 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h @@ -0,0 +1,524 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Core Type Definitions + * + * This header defines all fundamental types used by the PTO Runtime2 system: + * - Configuration constants + * - Worker types and task states + * - Tensor regions and task parameters + * - Task descriptors with fanin/fanout tracking + * - Dependency list entries + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_ + +#include +#include +#include + +#include + +#include "profiling_config.h" +#include "pto_constants.h" +#include "pto_runtime_status.h" +#include "pto2_dispatch_payload.h" +#include "aicore_completion_mailbox.h" +#include "pto_submit_types.h" +#include "pto_task_id.h" +#include "pto_types.h" + +// Spin-wait hint for AICPU threads. On real hardware the AICPU has dedicated +// ARM A55 cores — no OS yield is needed, so the hint is a no-op. In simulation +// all threads share host CPU cores, so we yield to prevent starvation. +// This header is also compiled into the Host .so (for struct definitions only), +// where the hint is never called — the fallback no-op keeps Host builds clean. +#if __has_include("spin_hint.h") +#include "spin_hint.h" +#else +#define SPIN_WAIT_HINT() ((void)0) +#endif + +#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING +#include "aicpu/device_time.h" +#endif + +// ============================================================================= +// Configuration Constants +// ============================================================================= + +// Task management +// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. +// Actual window size is passed at runtime to runtime_create_from_sm(). +// Use pto2_task_slot(sched, task_id) for slot calculation. 
+#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) + +// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer) +// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) +#define PTO2_MAX_RING_DEPTH 4 + +// Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH) +#define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) +#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries +#define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool +#define PTO2_TENSORMAP_NUM_BUCKETS 4096 // Power of 2 for fast hash (4096×8B=32KB fits L1) + +// Scope management +#define PTO2_MAX_SCOPE_DEPTH 64 // Maximum nesting depth +// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot +// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot +// is in flight, no more tasks can ever be pushed regardless of buffer size. +// scope_tasks_push fatals on overflow rather than growing the arena-owned +// buffer (which would be UB on the arena's malloc'd backing). +#define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH) + +// Ready queue +#define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size + +// Cross-thread early-dispatch work queue (power of two) +#define PTO2_EARLY_DISPATCH_QUEUE_SIZE 64 + +// Wiring queue +#define PTO2_WRIRING_QUEUE_SIZE 1024 // Per-shape queue size + +// Fanin storage +#define PTO2_FANIN_INLINE_CAP 64 + +// TensorMap cleanup interval +#define PTO2_TENSORMAP_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks +#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks + +// get_tensor_data/set_tensor_data spin wait timeout in cycles. +// ~10s on hardware (1.5 GHz counter), ~10s on simulation (chrono-based). 
+constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL; + +// ============================================================================= +// Task States +// ============================================================================= + +/** + * Task state enumeration + * + * State transitions: + * PENDING -> COMPLETED -> CONSUMED + * + * The slot stays in PENDING from submit through "ready in queue" and "running + * on a worker"; readiness and running-vs-idle are derived from fanin_refcount + * and per-core running_slot_state respectively, not from task_state itself. + * + * Conditions: + * PENDING->COMPLETED: all subtasks finish (set by scheduler) or task is a + * hidden alloc completed inline by the orchestrator + * COMPLETED->CONSUMED: fanout_refcount == fanout_count && state == COMPLETED + */ +typedef enum { + PTO2_TASK_PENDING = 0, // Submitted; awaiting fanin, queued, or dispatched + PTO2_TASK_COMPLETED = 1, // Execution finished, output may still be in use + PTO2_TASK_CONSUMED = 2 // Output fully consumed, buffers can be released +} PTO2TaskState; + +/** + * Result of a unified task allocation. + */ +struct PTO2TaskAllocResult { + int32_t task_id; // Absolute task ID (not wrapped) + int32_t slot; // task_id & (window_size - 1) + void *packed_base; // Heap allocation result (nullptr if failure) + void *packed_end; // packed_base + aligned output_size + + bool failed() const { return task_id < 0; } +}; + +struct PTO2OutputLayout { + uint64_t offsets[MAX_TENSOR_ARGS] = {}; + uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {}; + int32_t total_output_size = 0; +}; + +// ============================================================================= +// Dependency List Entry +// ============================================================================= + +/** + * Fanin spill entry + * Stored in the dedicated fanin spill ring buffer. 
+ */ +struct PTO2TaskSlotState; // Forward declaration +struct PTO2FaninPool; // Forward declaration +struct PTO2FaninSpillEntry { + PTO2TaskSlotState *slot_state; +}; +static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(uintptr_t)); + +/** + * Dependency list entry (singly-linked list node) + * Stored in DepListPool ring buffer. + */ +struct PTO2DepListEntry { + PTO2TaskSlotState *slot_state; // Consumer slot state (direct pointer) + PTO2DepListEntry *next; // next entry +}; + +// ============================================================================= +// Task Descriptor +// ============================================================================= + +/** + * Task descriptor structure (shared memory) + * + * Stored in the TaskDescriptor ring buffer in shared memory. + * Contains static identification and buffer pointers only. + * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState. + * + * Fields set by Orchestrator at submission, read by Scheduler for dispatch. + */ +struct PTO2TaskDescriptor { + // Mixed-task identification (encodes ring_id in upper 32 bits) + PTO2TaskId task_id; // raw: (ring_id << 32) | local_id + + // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive) + int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT]; + + // Packed output buffer (all outputs packed into single contiguous buffer) + void *packed_buffer_base; // Start of packed buffer in GM Heap + void *packed_buffer_end; // End of packed buffer (for heap reclamation) +}; + +// ============================================================================= +// Per-Slot Scheduling State +// ============================================================================= + +/** + * Task payload data (cold path - only accessed during orchestration and dispatch) + * + * Layout: metadata + inline fanin packed in the first 9 cache lines, followed + * by bulk tensor and scalar data. Small fanins stay fully inline; larger + * fanins spill into a per-ring ring buffer slice. 
+ */ +// Speculative early-dispatch claim states for PTO2TaskPayload::spec_state. +enum PTO2SpecState : uint8_t { + PTO2_SPEC_NONE = 0, // not pre-staged + PTO2_SPEC_STAGING = 1, // Hook 1 claimed it; staging in progress + PTO2_SPEC_STAGED = 2, // staged on a core, gated; staged_* fields valid + PTO2_SPEC_DISPATCHED = 3 // routed via the normal dispatch path (no pre-stage) +}; + +// A pre-staged consumer occupies one core per gated subtask block. WHICH cores +// it occupies is recorded as a bitmask (staged_core_mask, 1 bit per global +// core_id); the completion-path release iterates the set bits and rings each +// core's doorbell from the scheduler's per-core doorbell table. Bounded by the +// chip's core count (RUNTIME_MAX_WORKER = 72; no two-level pre-dispatch means +// gated cores in flight <= core count), NOT by block_num — so a wide SPMD +// consumer can pre-stage all its idle cores. 2 words = 128 bits >= 72. +inline constexpr int PTO2_SPEC_CORE_MASK_WORDS = 2; + +struct PTO2TaskPayload { + // === Cache lines 0-8 (576B) — metadata + inline fanin === + int32_t tensor_count{0}; + int32_t scalar_count{0}; + int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundance) + int32_t fanin_spill_start{0}; // Linear start index in fanin spill pool (0 = no spill) + PTO2FaninPool *fanin_spill_pool{nullptr}; + PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP]; + // Speculative early-dispatch metadata (AICPU-side only). Ordered by descending + // alignment (8B mask, 4B fanin, then 1B flags) so the block packs with no + // internal padding. Kept here after the fanin array (not moved up front): on + // cache line 8 it shares only with the rarely-touched fanin tail, whereas in + // line 0 the spec atomics (written during staging) would false-share with + // tensor_count/scalar_count (read by build_payload at dispatch). 
Fits in the 40B + // between the fanin array (offset 536) and the 64B-aligned tensors[] (offset + // 576), so sizeof and tensors[] are unchanged. + // + // Bitmask of global core_ids this consumer is pre-staged (gated) on. Set with + // atomic fetch_or by concurrent stagers; read by release. (Re)initialized in + // PTO2TaskPayload::init before the slot can be staged again. + std::atomic staged_core_mask[PTO2_SPEC_CORE_MASK_WORDS]{}; + // Early-dispatch CANDIDATE detection (event-driven, dual of fanin_refcount): + // seeded at wiring with producers already complete, then a flagged producer's + // DISPATCH bumps each consumer's dispatch_fanin. dispatch_fanin == + // fanin_actual_count <=> every producer is flagged-and-dispatched or was + // pre-completed => this task is an early-dispatch candidate (push early_dispatch_queue). + std::atomic dispatch_fanin{0}; // CONSUMER side: flagged-dispatched + pre-completed producers + bool allow_early_resolve{false}; // codegen hint copied from Arg in PTO2TaskPayload::init + // Lock-free claim state shared by the stagers (Hook 1, possibly several AICPU + // threads concurrently) and the completion-path release: 0=NONE, 1=STAGING, + // 3=DISPATCHED (2=STAGED is unused now). STAGING is the STABLE gated state — + // many threads stage blocks concurrently while it holds, each claiming a block + // via the atomic next_block_idx and OR-ing its cores into staged_core_mask. + // Release does STAGING->DISPATCHED then rings the mask; a thread that stages a + // block AFTER release flipped DISPATCHED rings that block's doorbell itself + // (self-ring), so no doorbell is ever missed. 
+ std::atomic spec_state{0}; + std::atomic dispatch_propagated{0}; // PRODUCER side: once-guard for fanout propagation + std::atomic spec_chain_active{0}; // inherited early-dispatch flag (auto-chain past codegen flag) + uint8_t spec_chain_depth{0}; // auto-chain depth; inherited = parent+1, capped + // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) === + Tensor tensors[MAX_TENSOR_ARGS]; + // === Cache lines 73-74 (128B) — scalars === + uint64_t scalars[MAX_SCALAR_ARGS]; + + // Layout verification (size checks that don't need offsetof). + static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines"); + static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)"); + + /** + * Prefetch (for write) the regions init() is about to fill so the stores land + * in warm cache. tensor_count/scalar_count come from the Arg — the payload's + * own counts are not set until init(). Warms the early-dispatch spec block at + * offset 536 (cache line 8) too. A member fn lowers to the same prefetch + * instructions as a free function (`this` is just a register), no cache impact. + */ + void prefetch(int32_t tensor_count, int32_t scalar_count) const { + for (int32_t i = 0; i < tensor_count; i++) { + __builtin_prefetch(&tensors[i], 1, 3); + __builtin_prefetch(reinterpret_cast(&tensors[i]) + 64, 1, 3); + } + for (int32_t i = 0; i < scalar_count; i += 8) { + __builtin_prefetch(&scalars[i], 1, 3); + } + __builtin_prefetch(this, 1, 3); + __builtin_prefetch(reinterpret_cast(this) + 64, 1, 3); + __builtin_prefetch(reinterpret_cast(this) + 128, 1, 3); + __builtin_prefetch(reinterpret_cast(this) + 512, 1, 3); // spec fields (cache line 8) + } + + /** + * Initialize payload: copy tensors, store scalars. 
+ * + * For each param slot, the tensor source is determined by TensorArgType: + * - OUTPUT -> built in the packed buffer from args.tensor(i).create_info(), then recorded in result + * - INPUT / INOUT -> copied from args.tensor(i).ref() + * + * @param args Task arguments (tensors + scalars) + * @param result Materialized output tensors (from TensorCreateInfo path) + */ + void init( + const L0TaskArgs &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout + ) { + tensor_count = args.tensor_count(); + scalar_count = args.scalar_count(); + + // int32_t out_idx = 0; + for (int32_t i = 0; i < args.tensor_count(); i++) { + if (args.tag(i) != TensorArgType::OUTPUT) { + tensors[i].copy(args.tensor(i).ref()); + } else { + init_tensor_from_create_info( + tensors[i], args.tensor(i).create_info(), + reinterpret_cast(reinterpret_cast(alloc_result.packed_base) + layout.offsets[i]), + layout.buffer_sizes[i] + ); + tensors[i].owner_task_id = result.task_id(); + result.materialize_output(tensors[i]); + } + } + // Round up to cache line boundary. Both arrays are 128B so no overrun. + // Eliminates branches; extra bytes within the same CL have zero additional cost. + memcpy(scalars, args.scalars(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64)); + + // Speculative early-dispatch metadata — the single init point for these + // fields. reset_for_reuse MUST NOT touch the payload (it runs on the + // scheduler's advance-ring path and would pull this cold cache line across + // structures); prepare_task only allocates/binds. prefetch() warms this + // line (offset 512) so these writes land in warm cache. + // + // spec_state / staged_core_mask / dispatch_fanin / spec_chain_* are all + // CONSUMER-side: a task with allow_early_resolve == false still has them + // touched when one of ITS producers is flagged (propagate_dispatch_fanin + // bumps dispatch_fanin and may CAS spec_state / set the auto-chain flag on + // any consumer, independent of the consumer's own hint). 
So they MUST be + // zeroed here unconditionally — no per-task allow_early_resolve gating. + allow_early_resolve = args.allow_early_resolve(); + spec_state.store(PTO2_SPEC_NONE, std::memory_order_relaxed); + for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) + staged_core_mask[w].store(0, std::memory_order_relaxed); + dispatch_fanin.store(0, std::memory_order_relaxed); + dispatch_propagated.store(0, std::memory_order_relaxed); + spec_chain_active.store(0, std::memory_order_relaxed); + spec_chain_depth = 0; + } +}; + +// PTO2TaskPayload layout verification (offsetof requires complete type). +static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift"); +static_assert( + offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata" +); +static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)"); +static_assert( + offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor), + "scalars must immediately follow tensors" +); +static_assert( + sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t), + "PTO2TaskPayload size must stay on the baseline cache-line footprint" +); + +/** + * Per-task slot scheduling state (scheduler-private, NOT in shared memory) + * + * Consolidates all hot-path scheduling fields into a single cache-friendly + * structure (64 bytes = one cache line). Accessing any field of a task's + * slot state brings all related fields into the same cache line. 
+ * + * Concurrency notes: + * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock) + * - fanin_count set once at submission, read-only after (hot path for ready check) + * - task_state, fanin_refcount, fanout_refcount updated atomically + */ +struct alignas(64) PTO2TaskSlotState { + // Fanout lock + list (accessed together under lock in on_task_complete) + std::atomic fanout_lock; // Per-task spinlock (0=unlocked, 1=locked) + int32_t fanout_count; // 1 (owning scope) + number of consumers + + PTO2DepListEntry *fanout_head; // Pointer to first fanout entry (nullptr = empty) + + // Task state (completion, consumed check, ready check) + std::atomic task_state; // PENDING/COMPLETED/CONSUMED + + // Fanin (accessed together in release_fanin_and_check_ready) + std::atomic fanin_refcount; // Dynamic: counts completed producers + int32_t fanin_count; // Number of producer dependencies (set once by wiring) + + // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) + std::atomic fanout_refcount; // Dynamic: counts released references + + // --- Per-slot constant, re-bound by orch::prepare_task each submit --- + // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]), + // but written here per-submit instead of in an O(window_size) init loop — + // these are the only "scale-dependent" pointers in this struct, so moving + // them out of init makes startup cost independent of task_window_size. + PTO2TaskPayload *payload; + PTO2TaskDescriptor *task; + + // --- Set per-submit (depend on task inputs) --- + ActiveMask active_mask; // Bitmask of active subtask slots (set once) + uint8_t ring_id; // Ring layer (immutable after init) + // Set by any subtask FIN that pushed deferred-completion CONDITIONs to + // the runtime mailbox; read by the last subtask FIN to decide whether + // the task needs MPSC-deferred completion or can complete inline on this + // thread. 
Carved out of the otherwise-padding byte between ring_id and + // dep_pool_mark to keep PTO2TaskSlotState at 64 bytes. The write is + // sequenced before on_subtask_complete's acq_rel fetch_add and the read + // after, so all earlier subtasks' writes are visible to the last subtask. + std::atomic any_subtask_deferred{false}; + uint8_t _async_pad{0}; + int32_t dep_pool_mark{0}; // Dep pool top after wiring (thread-0-only) + + std::atomic completed_subtasks{0}; // Each core completion increments by 1 + int16_t total_required_subtasks{0}; // = logical_block_num * popcount(active_mask) + int16_t logical_block_num{1}; // Total logical blocks (set by orchestrator) + // Next block to dispatch. Atomic so concurrent speculative stagers can each + // claim a distinct block via CAS; normal dispatch (ready-queue serialized) + // uses plain relaxed load/store. The two phases never overlap in time (staging + // happens before release; normal dispatch of the remainder happens after). + std::atomic next_block_idx{0}; + + /** + * Bind the slot-invariant ring id. Called once per slot during + * RingSchedState::init(); ring_id never changes across reuses. + */ + void bind_ring(uint8_t rid) { ring_id = rid; } + + /** + * Re-bind the per-slot payload/task pointers. Called by + * orch::prepare_task on every submit. Value is constant for a given + * slot, but we pay the cheap re-write each submit (both fields land on + * the same 64B slot_state cache line that prepare_task is already + * dirtying) to avoid the init-time per-slot loop. + */ + void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) { + payload = p; + task = t; + } + + /** + * Reset dynamic scheduling fields for slot reuse. + * Called by advance_ring_pointers() after a slot transitions to CONSUMED + * and last_task_alive advances past it, but before sync_to_sm() publishes + * the new last_task_alive to the orchestrator. + * + * Skips payload/task (re-bound by orch::prepare_task on every submit) and ring_id (bound once at init). 
+ * Skips task_state: left as CONSUMED so that wait_for_tensor_ready() + * callers holding stale owner_task_id still observe a completed state. + * task_state is set to PENDING by the orchestrator when it reuses the slot. + */ + void reset_for_reuse() { + fanout_lock.store(0, std::memory_order_relaxed); + fanout_count = 1; + fanout_head = nullptr; + fanin_refcount.store(0, std::memory_order_relaxed); + fanout_refcount.store(0, std::memory_order_relaxed); + completed_subtasks.store(0, std::memory_order_relaxed); + next_block_idx.store(0, std::memory_order_relaxed); + any_subtask_deferred.store(false, std::memory_order_relaxed); + // Note: payload spec fields (spec_state / staged_core_mask / dispatch_fanin / + // spec_chain_*) are NOT reset here — this method skips the payload by + // contract. They are (re)initialized in PTO2TaskPayload::init on every + // submit, before the slot becomes visible to the scheduler. + } + + // === Per-task fanout spinlock === + // + // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST + // be held whenever reading or writing fanout_head / fanout_count, because + // the orchestrator adds consumers concurrently with the scheduler + // traversing the list after task completion. 
+ +#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING + void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) { + uint64_t t0 = get_sys_cnt_aicpu(); + bool contended = false; + uint32_t atomic_ops = 0; + + for (;;) { + while (fanout_lock.load(std::memory_order_acquire) != 0) { + contended = true; + atomic_ops++; + SPIN_WAIT_HINT(); + } + int32_t expected = 0; + if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) { + atomic_ops++; + atomic_count += atomic_ops; + if (contended) { + wait_cycle += (get_sys_cnt_aicpu() - t0); + } + return; + } + contended = true; + atomic_ops++; + } + } +#endif + + void lock_fanout() { + for (;;) { + while (fanout_lock.load(std::memory_order_acquire) != 0) { + SPIN_WAIT_HINT(); + } + int32_t expected = 0; + if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) { + return; + } + } + } + + void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); } +}; + +static_assert(sizeof(PTO2TaskSlotState) == 64); + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h new file mode 100644 index 000000000..cad5cec36 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h @@ -0,0 +1,270 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Shared Memory Layout + * + * Defines the shared memory structure for Orchestrator-Scheduler communication. + * + * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1): + * +---------------------------+ + * | SharedMemoryHeader | (per-ring flow control + sync) + * +---------------------------+ + * | Ring 0: TaskDescriptor[] | + * | Ring 0: TaskPayload[] | + * | Ring 0: TaskSlotState[] | + * +---------------------------+ + * | Ring 1: TaskDescriptor[] | + * | Ring 1: TaskPayload[] | + * | Ring 1: TaskSlotState[] | + * +---------------------------+ + * | ... | + * +---------------------------+ + * + * Design principles: + * - Only data needed for Orchestrator<->Scheduler communication is here + * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory + * - Flow control via atomic counters/flags (no locks needed for single-word R/W) + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include "utils/device_arena.h" +#include "pto_runtime2_types.h" + +// ============================================================================= +// Shared Memory Header +// ============================================================================= + +struct PTO2SharedMemoryHandle; + +/** + * Per-ring flow control state in shared memory. + * Written/read by Orchestrator and Scheduler for synchronization. 
+ */ +struct alignas(64) PTO2RingFlowControl { + // === Cache Line 0: Written by Orchestrator, Read by Scheduler === + alignas(64) std::atomic current_task_index; // Task ring head (next to allocate) + + // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) === + alignas(64) std::atomic last_task_alive; // Task ring tail (oldest active task) + + // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private + // local_task_id_ from initial_local_task_id (default 0 in production) + // *without* dereferencing current_task_index — it relies on this reset + // running on every AICPU boot so 0 stays in sync. If you ever change + // the initial fc value or the boot ordering, update the default in + // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or + // submit IDs will be off by the divergence. + void init() { + current_task_index.store(0, std::memory_order_relaxed); + last_task_alive.store(0, std::memory_order_relaxed); + } + + bool validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const; +}; + +static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)"); + +/** + * Per-ring shared memory header section. + * + * Groups flow-control, layout info, and per-ring data pointers for a single ring. + * Pointers are host-side only (set by setup_pointers, invalid on device). 
+ */ +struct alignas(64) PTO2SharedMemoryRingHeader { + PTO2RingFlowControl fc; + + // Layout metadata (set once at init) + uint64_t task_window_size; + int32_t task_window_mask; + uint64_t heap_size; + uint64_t task_descriptors_offset; // Offset from SM base, in bytes + + // Per-ring data pointers (host-side, set by setup_pointers) + PTO2TaskDescriptor *task_descriptors; + PTO2TaskPayload *task_payloads; + PTO2TaskSlotState *slot_states; + + int32_t get_slot_by_task_id(int32_t local_task_id) { return local_task_id & task_window_mask; } + + PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; } + + PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) { + return task_descriptors[get_slot_by_task_id(local_id)]; + } + + PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; } + + PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[get_slot_by_task_id(local_id)]; } + + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } + + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { + return slot_states[get_slot_by_task_id(local_id)]; + } +}; + +/** + * Shared memory header structure + * + * Contains per-ring flow control and global layout information. 
+ */ +struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { + // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) === + PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH]; + + // === GLOBAL FIELDS === + std::atomic orchestrator_done; // Flag: orchestration complete + + // Total shared memory size (for validation) + uint64_t total_size; + + // Graph output for copy-back (set by orchestrator when using packed buffer) + // Host finalize copies from this address instead of dev_ptr when non-zero + std::atomic graph_output_ptr; // Address where final output was written (packed buffer) + std::atomic graph_output_size; // Size in bytes + + // === ERROR REPORTING === + + // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host) + // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host. + std::atomic orch_error_code; + + // Scheduler error state (Scheduler → Host, independent of orchestrator) + // Written by scheduler threads on timeout; read by orchestrator and host. + std::atomic sched_error_bitmap; // Bit X set = thread X had error + std::atomic sched_error_code; // Last scheduler error code (last-writer-wins) + std::atomic sched_error_thread; // Thread index of last error writer +}; + +static_assert( + (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096), + "PTO2SharedMemoryHeader should be reasonably sized" +); + +// ============================================================================= +// Shared Memory Handle +// ============================================================================= + +/** + * Handle for shared memory lifecycle management (create/destroy). + * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly. 
 */
struct PTO2SharedMemoryHandle {
    void *sm_base;     // Base address of shared memory
    uint64_t sm_size;  // Total size of shared memory

    // Typed pointer to the shared-memory header.
    // NOTE(review): presumably equals sm_base (the pto2_sm_layout helpers add
    // offsetof(PTO2SharedMemoryHeader, ...) directly to the SM base) — the
    // assignment happens in setup_pointers*, which is not visible here; confirm.
    PTO2SharedMemoryHeader *header;

    // Ownership flag
    bool is_owner;  // True if this handle allocated the memory

    // === Static helpers ===

    // Size computation for a shared-memory region, either with a single task
    // window size or with one window size per ring.
    // NOTE(review): the unit of the result (bytes) and whether the single-size
    // overload applies the same size to every ring are not visible in this
    // header — confirm against the definitions.
    static uint64_t calculate_size(uint64_t task_window_size);
    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);

    // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init
    // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the
    // arena is otherwise empty (the call performs the single commit). All
    // memory is owned by the arena — caller must not call destroy().
    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena);

    // === Instance methods ===

    // In-place init for caller-provided wrapper storage (e.g. a region carved
    // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and
    // init_header. Returns false when `sm_size` is too small for the requested
    // `task_window_size`.
    //
    // Note that the `sm_base` / `sm_size` parameters shadow the members of the
    // same name.
    bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size);

    // Per-ring variant of init(): takes one task window size and one heap size
    // per ring instead of a single value.
    // NOTE(review): presumably the same contract as init() (is_owner = false,
    // false on undersized `sm_size`) — confirm against the definition.
    bool init_per_ring(
        void *sm_base, uint64_t sm_size, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
    );

    // Lifecycle / diagnostics. Definitions are not visible in this header.
    // NOTE(review): destroy() presumably releases memory only when is_owner is
    // set (arena-backed handles must not call it, see create_and_init_default)
    // — confirm.
    void destroy();
    void print_layout();
    bool validate();

private:
    // Header initialisation, single-size and per-ring flavours (called by the
    // init* entry points above).
    void init_header(uint64_t task_window_size, uint64_t heap_size);
    void init_header_per_ring(
        const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
    );
    // Pointer wiring into the shared-memory region, single-size and per-ring
    // flavours. pto2_sm_layout::ring_task_descriptors_addr repeats the per-ring
    // arithmetic so host-side code can compute the same addresses.
    void setup_pointers(uint64_t task_window_size);
    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
};

// =============================================================================
// SM Device Layout Helpers
// =============================================================================
//
// When the host pre-builds a runtime-arena image, it needs the device-side
// addresses of several SM sub-fields (ring flow-control counters,
// task_descriptors arrays, orch_error_code) so it can wire them into the
// orchestrator / scheduler init_data path without dereferencing the SM —
// the SM lives in device memory and cannot be touched from host.
//
// These helpers compute those addresses by offset arithmetic on the SM
// device base. Pure pointer math, no loads/stores; safe to call from host.
// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
// own setup_pointers), so values are guaranteed consistent across sides.
+namespace pto2_sm_layout { + +inline std::atomic *orch_error_code_addr(void *sm_dev_base) noexcept { + return reinterpret_cast *>( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code) + ); +} + +inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + + static_cast(ring_id) * sizeof(PTO2SharedMemoryRingHeader) + ); +} + +inline std::atomic *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, current_task_index) + ); +} + +inline std::atomic *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, last_task_alive) + ); +} + +// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring) +// to compute ring `ring_id`'s task_descriptors device address. Accepts a +// per-ring window-size array so the helper's signature mirrors +// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently +// disagree with the SM layout when (hypothetically) ring sizes diverge. 
+inline PTO2TaskDescriptor *ring_task_descriptors_addr( + void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id +) noexcept { + assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range"); + char *p = static_cast(sm_dev_base); + p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < ring_id; r++) { + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + return reinterpret_cast(p); +} + +} // namespace pto2_sm_layout diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_submit_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_submit_types.h new file mode 100644 index 000000000..21c77fce2 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_submit_types.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Submit Types - Shared submit-contract definitions + * + * Header-only definitions shared by orchestration-facing and runtime-facing + * headers. 
Keeps orchestration slim (no dependency on pto_runtime2_types.h). + */ + +#pragma once + +#include + +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +/** + * Subtask slot count: AIC, AIV0, AIV1 + */ +inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3; + +/** + * Subtask slot indices + */ +enum class PTO2SubtaskSlot : uint8_t { + AIC = 0, + AIV0 = 1, + AIV1 = 2, +}; + +/** + * Subtask mask bits (for ActiveMask) + */ +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 +inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3); // 0x8: all blocks must launch atomically + +/** + * Resource shape — classifies a MixedKernels into one of 3 scheduling buckets. + * + * Multi-subtask tasks (2+ active slots) are all scheduled as MIX. Dispatch + * chooses one cluster, then uses active_mask to decide which cores in that + * cluster must be placed together: all used cores idle -> running placement; + * all used cores already running with free pending slots -> pending placement; + * mixed used-core state is rejected and retried later. + * + * DUMMY is a synthetic shape for dep-only tasks (no AICore dispatch). Tasks + * with an empty core_mask route to a dedicated DUMMY ready queue and are + * completed inline by the scheduler dispatch loop, bypassing core allocation. + */ +enum class PTO2ResourceShape : uint8_t { + AIC = 0, // Single AIC + AIV = 1, // Single AIV + MIX = 2, // Full cluster (dispatch uses active_mask) + DUMMY = 3, // Dependency-only (no AICore dispatch) +}; + +// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not +// allocate a per-shape ready_queue entry / local buffer — it lives in a +// dedicated queue inside PTO2SchedulerState. +inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3; + +/** + * Bitmask of active subtask slots + flags, sizeof == 1. 
+ */ +class ActiveMask { +public: + constexpr ActiveMask() = default; + constexpr explicit ActiveMask(uint8_t raw) : + raw_(raw) {} + + uint8_t raw() const { return raw_; } + + bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast(slot))) != 0; } + + uint8_t core_mask() const { return raw_ & 0x07u; } + + bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; } + + PTO2ResourceShape to_shape() const { + uint8_t cmask = core_mask(); + if (cmask == 0) return PTO2ResourceShape::DUMMY; + int bit_count = __builtin_popcount(cmask); + if (bit_count >= 2) return PTO2ResourceShape::MIX; + if (cmask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC; + return PTO2ResourceShape::AIV; + } + + void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; } + + bool operator==(ActiveMask other) const { return raw_ == other.raw_; } + bool operator!=(ActiveMask other) const { return raw_ != other.raw_; } + + ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); } + ActiveMask &operator|=(ActiveMask other) { + raw_ |= other.raw_; + return *this; + } + + ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); } + + bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; } + + explicit operator bool() const { return raw_ != 0; } + +private: + uint8_t raw_{0}; +}; + +static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte"); + +/** + * Mixed-task submit contract. + * + * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive). + * At least one slot must be valid. 
+ */ +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; + + ActiveMask to_active_mask() const { + uint8_t mask = 0; + if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC; + if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0; + if (aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1; + return ActiveMask(mask); + } +}; + +/** + * SPMD launch parameters carried inside Arg. + * + * Controls how many logical blocks (SPMD dimension) a single task + * is expanded into at dispatch time. Each block receives a unique + * block_idx in [0, block_num) via the per-dispatch LocalContext. + */ +class PTO2LaunchSpec { +public: + constexpr PTO2LaunchSpec() = default; + + int16_t block_num() const { return block_num_; } + void set_block_num(int16_t n) { block_num_ = n; } + + bool require_sync_start() const { return require_sync_start_; } + void set_require_sync_start(bool v) { require_sync_start_ = v; } + +private: + int16_t block_num_{1}; + bool require_sync_start_{false}; +}; diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_tensormap.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_tensormap.h new file mode 100644 index 000000000..30017fadd --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_tensormap.h @@ -0,0 +1,723 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - TensorMap Interface + * + * TensorMap provides producer lookup for dependency discovery: + * - Maps Tensor -> producer task ID + * - Used by pto_submit_task() to find dependencies + * + * Key design features: + * 1. Ring buffer pool for entries (no malloc/free) + * 2. Lazy invalidation (entries become stale when producer retires) + * 3. Per-task per-ring entry tracking for efficient cleanup + * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions + * + * Hash table with chaining: + * - buckets[] array of head offsets + * - Entries linked via next_in_bucket + * - Insert at head (newest first) for sorted chains + * + * CRITICAL: Hash only by base_ptr + * ============================== + * For overlap detection to work, ALL sub-regions of the same base tensor + * MUST be in the SAME hash bucket. This allows lookup to compare all + * potentially overlapping regions. + * + * Overlap detection: Two regions create a dependency if: + * 1. Same base_ptr (raw tensor pointer) + * 2. Byte ranges [offset, offset+size) intersect + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include "common.h" +#include "profiling_config.h" +#include "utils/device_arena.h" +#include "pto_runtime2_types.h" +#include "tensor.h" + +// Overlap geometry types. Relocated here from tensor.h: they are used only by +// the runtime's overlap-detection / dependency machinery, not by the +// wire/host-facing Tensor definition. 
// Result of PTO2TensorMapEntry::check_overlap(input).
enum class OverlapStatus {
    NO_OVERLAP,  // The two regions are disjoint
    COVERED,     // `input` completely contains the entry in every dimension
    OTHER,       // Any other overlap, including cases classified conservatively
};

// Half-open interval [begin, end) used by the overlap checks.
struct Segment {
    uint64_t begin;
    uint64_t end;

    // True when the two intervals share at least one position; intervals that
    // merely touch (end == other.begin) do not intersect.
    bool line_segment_intersection(const Segment &other) const { return end > other.begin && other.end > begin; }
    // True when `other` lies entirely within this interval.
    bool contains(const Segment &other) const { return begin <= other.begin && other.end <= end; }
};

/**
 * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
 * region offsets returned by DeviceArena::reserve() so that
 * init_data_from_layout() / wire_arena_pointers() can fetch the matching
 * pointers after the arena is committed.
 *
 * All offsets are relative to the arena's base.
 */
struct PTO2TensorMapLayout {
    size_t off_buckets;                                // Offset of the bucket-head array
    size_t off_entry_pool;                             // Offset of the entry pool
    size_t off_free_entry_list;                        // Offset of the free-entry list
    size_t off_task_entry_heads[PTO2_MAX_RING_DEPTH];  // Per-ring offset of the task_entry_heads array
    int32_t num_buckets;                               // Bucket count the layout was reserved for
    int32_t pool_size;                                 // Entry-pool capacity the layout was reserved for
    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];    // Per-ring task window size
};

// TensorMap Lookup Profiling (must precede inline lookup/insert methods).
// Counters are updated by PTO2TensorMap::lookup() and link_entry().
#if PTO2_TENSORMAP_PROFILING
extern uint64_t g_lookup_chain_total;     // Sum of bucket-chain lengths walked by lookup()
extern uint64_t g_lookup_count;           // Number of lookup() calls
extern int32_t g_lookup_chain_max;        // Longest chain walked by a single lookup()
extern uint64_t g_lookup_overlap_checks;  // check_overlap() invocations
extern uint64_t g_lookup_overlap_hits;    // check_overlap() results other than NO_OVERLAP
extern uint64_t g_insert_count;           // Entries linked via link_entry()
#endif

// =============================================================================
// TensorMap Structure
// =============================================================================

/**
 * TensorMap entry structure — cache-line optimized for lookup
 *
 * Cache line 1 (64B, lookup hot path) mirrors Tensor cache line 1 byte-for-byte
 * from byte 16 onward, so that `memcpy(this, &tensor, 64)` populates everything
 * we need for overlap checks. Bytes [0, 16) carry entry-only fields (hash
 * bucket head + chain pointer) that overlap Tensor::buffer (addr in [0, 8) is
 * the hash key, size in [8, 16) is unused by the entry — we repurpose it for
 * `next_in_bucket`).
+ * + * buffer_addr / next_in_bucket / producer_task_id — chain traversal + match + * start_offset — overlap byte range begin + * version, ndims, dtype, manual_dep, is_contiguous — overlap fast path + * shapes[5] — overlap comparison (line 1) + * + * Cache line 2 (64B, slow-path / non-contiguous overlap): + * prev_in_bucket / next_in_task / prev_in_task — chain manipulation + * bucket_index — bookkeeping + * extent_elem_cache — overlap byte range end + * strides[5] — reserved for L2 overlap (PR-2) + * + * When both entry & probe are `is_contiguous && start_offset == 0`, the overlap + * check derives `extent_elem = prod(shapes)` from cache line 1 alone. + * + * Entry size: 128B (2 cache lines), matches Tensor. + */ +struct alignas(64) PTO2TensorMapEntry { + // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 === + uint64_t buffer_addr; // 8B [0, 8): tensor base address (hash key, mirrors Tensor::buffer.addr) + PTO2TensorMapEntry *next_in_bucket; // 8B [8, 16): next entry in hash bucket chain (overlays Tensor::buffer.size) + PTO2TaskId producer_task_id; // 8B [16,24): mirrors Tensor::owner_task_id slot + uint64_t start_offset; // 8B [24,32): mirrors Tensor::start_offset (element offset) + int32_t version; // 4B [32,36): mirrors Tensor::version + uint32_t ndims; // 4B [36,40): mirrors Tensor::ndims + DataType dtype; // 1B [40,41): mirrors Tensor::dtype + bool manual_dep; // 1B [41,42): mirrors Tensor::manual_dep + bool is_contiguous; // 1B [42,43): mirrors Tensor::is_contiguous + uint8_t __padding1__; // 1B [43,44): mirrors Tensor padding + uint32_t shapes[MAX_TENSOR_DIMS]; // 20B [44,64): mirrors Tensor::shapes + + // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data === + PTO2TensorMapEntry *prev_in_bucket; // 8B [64, 72) + PTO2TensorMapEntry *next_in_task; // 8B [72, 80) + PTO2TensorMapEntry *prev_in_task; // 8B [80, 88) + int32_t bucket_index; // 4B [88, 92): -1 when unlinked + uint32_t __padding2__; // 4B [92, 
96) + uint64_t extent_elem_cache; // 8B [96,104): non-contiguous extent (mirrors Tensor) + uint32_t strides[MAX_TENSOR_DIMS]; // 20B [104,124): element strides, mirrors Tensor::strides + uint8_t __padding3__[4]; // 4B [124,128) + + /** + * Copy overlap-relevant fields from a Tensor into this entry. + * + * 64B memcpy of Tensor cache line 1 populates buffer_addr (byte [0,8)), + * producer_task_id, start_offset, version, ndims, dtype, manual_dep, + * is_contiguous and shapes[]. Byte [8,16) holds Tensor::buffer.size in + * the source and gets written into next_in_bucket; that's harmless + * because link_entry() overwrites next_in_bucket immediately after. + * + * Cache line 2 (stride / extent_elem_cache) is derived from line 1 when + * the source is canonically contiguous (is_contiguous && start_offset==0), + * so the producer Tensor's cache line 2 stays cold during insert. Only + * non-contiguous producers pay one extra line 2 read. + */ + void copy_from_tensor(const Tensor &tensor) { + memcpy(this, &tensor, 64); + if (tensor.is_contiguous && tensor.start_offset == 0) { + uint64_t numel = 1; + for (uint32_t i = 0; i < tensor.ndims; i++) + numel *= tensor.shapes[i]; + extent_elem_cache = numel; + uint32_t s = 1; + for (int32_t i = static_cast(tensor.ndims) - 1; i >= 0; i--) { + strides[i] = s; + s *= tensor.shapes[i]; + } + } else { + extent_elem_cache = tensor.extent_elem_cache; + for (uint32_t i = 0; i < tensor.ndims; i++) { + strides[i] = tensor.strides[i]; + } + } + } + + void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) { + memcpy(this, &tensor_create_info, 64); + buffer_addr = addr; + // Create-info outputs are always contiguous with start_offset = 0; + // extent_elem = prod(shapes); stride is row-major. 
+ uint64_t numel = 1; + for (uint32_t i = 0; i < tensor_create_info.ndims; i++) { + numel *= tensor_create_info.shapes[i]; + } + extent_elem_cache = numel; + uint32_t s = 1; + for (int32_t i = static_cast(tensor_create_info.ndims) - 1; i >= 0; i--) { + strides[i] = s; + s *= tensor_create_info.shapes[i]; + } + } + + /** + * Effective element extent of this entry. + * Contiguous-aligned views compute it from shapes alone (line 1 hit only); + * non-contiguous views read the cached value from line 2. + */ + uint64_t effective_extent_elem() const { + if (is_contiguous) { + uint64_t n = 1; + for (uint32_t i = 0; i < ndims; i++) + n *= shapes[i]; + return n; + } + return extent_elem_cache; + } + + /** + * Check overlap between input tensor and this entry (the producer output). + * + * Three-level cascade: + * L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP. + * L2 — O(ndims) hyper-rectangle precise check, eligible only when both + * sides share the same canonical row-major axis layout (same + * dtype/ndims/strides[], stride descends as integer multiples, + * start_offset decomposes cleanly under the reference shape). + * Yields NO_OVERLAP / COVERED / OTHER per-dim. + * L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice + * with step, etc): conservative OTHER. Exact enumeration via + * contiguous-segment merge is scheduled for a follow-up. + * + * COVERED is returned when `input` completely contains `entry` per-dim + * — dep_compute uses this to retire the now-redundant entry. 
+ */ + OverlapStatus check_overlap(const Tensor &input) const { + debug_assert(input.buffer.addr == buffer_addr); + debug_assert(input.version >= version); + if (input.version > version) { + return OverlapStatus::OTHER; + } + + // -------- L1: byte-range intersection (O(1) fast reject) -------- + const uint64_t in_begin = input.start_offset; + const uint64_t in_end = input.start_offset + input.extent_elem(); + const uint64_t ent_begin = start_offset; + const uint64_t ent_end = start_offset + effective_extent_elem(); + Segment in_range_bytes{in_begin, in_end}; + Segment ent_range_bytes{ent_begin, ent_end}; + if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) { + return OverlapStatus::NO_OVERLAP; + } + + // -------- L2 prereqs: same axis layout? -------- + if (input.dtype != dtype || input.ndims != ndims || ndims == 0) { + return OverlapStatus::OTHER; + } + for (uint32_t i = 0; i < ndims; i++) { + if (input.strides[i] != strides[i]) return OverlapStatus::OTHER; + } + // strides[ndims-1] must be 1 and strides[i-1] must be an integer + // multiple of strides[i] for the row-major reference-shape derivation + // below to hold. This rejects slice-with-step (strides[d] != prev factor) + // and any view chain that scrambles the axis order. (strides is + // uint32_t with the > 0 invariant enforced at construction, so no + // sign check needed.) + if (strides[ndims - 1] != 1) return OverlapStatus::OTHER; + for (uint32_t i = 1; i < ndims; i++) { + if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER; + } + + // Derive reference shape A from stride. By construction stride is + // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So + // A[i] = strides[i-1] / strides[i] for i >= 1 + // A[0] = (buffer.size / dtype_bytes) / strides[0] + // input.buffer.size is the storage size; entry shares the same buffer + // (debug-asserted by buffer.addr equality at the top), so we read it + // from input rather than mirroring buffer.size into the entry. 
+ // + // Note on buffer padding: runtime allocators may over-allocate + // `buffer.size` (cache-line / 1024B alignment, ring-buffer slot + // rounding, etc). When that happens, `numel_storage` is larger than + // the true logical extent and `ref_shapes[0]` ends up generously over- + // sized. This is intentional: ref_shapes is only used as an *upper + // bound* in the in-bounds checks below; the actual overlap test (the + // per-dim line-segment intersection on the real start_offset / + // shapes / stride further down) is unaffected. A larger-than-truth + // ref_shapes[0] simply makes the bounds check more permissive — it + // can never cause a false NO_OVERLAP nor a false COVERED. + uint32_t ref_shapes[MAX_TENSOR_DIMS] = {}; + for (uint32_t i = 1; i < ndims; i++) { + ref_shapes[i] = strides[i - 1] / strides[i]; + } + const uint64_t elem_size = get_element_size(dtype); + if (elem_size == 0) return OverlapStatus::OTHER; + const uint64_t numel_storage = input.buffer.size / elem_size; + const uint32_t stride0 = strides[0]; // > 0 by Tensor invariant + if (numel_storage % stride0 != 0) return OverlapStatus::OTHER; + ref_shapes[0] = static_cast(numel_storage / stride0); + + // Decompose start_offset into row-major multi-dim offsets. By the same + // relation strides[i] = prod(ref_shapes[i+1..]) so dividing by strides[i] + // (no inner loop) yields each axis offset directly. + uint32_t in_offsets[MAX_TENSOR_DIMS] = {}; + uint32_t ent_offsets[MAX_TENSOR_DIMS] = {}; + uint64_t in_remain = input.start_offset; + uint64_t ent_remain = start_offset; + for (uint32_t i = 0; i < ndims; i++) { + const uint32_t s = strides[i]; + in_offsets[i] = static_cast(in_remain / s); + ent_offsets[i] = static_cast(ent_remain / s); + in_remain %= s; + ent_remain %= s; + } + if (in_remain != 0 || ent_remain != 0) return OverlapStatus::OTHER; + + // Validate that each side fits within ref_shapes (defense in depth — + // a well-formed view always satisfies this). 
+ for (uint32_t i = 0; i < ndims; i++) { + if (static_cast(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER; + if (static_cast(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER; + } + + // -------- L2 core: per-dim line-segment intersection -------- + bool input_contains_entry = true; + for (uint32_t i = 0; i < ndims; i++) { + Segment in_seg{in_offsets[i], static_cast(in_offsets[i]) + input.shapes[i]}; + Segment ent_seg{ent_offsets[i], static_cast(ent_offsets[i]) + shapes[i]}; + if (!in_seg.line_segment_intersection(ent_seg)) { + return OverlapStatus::NO_OVERLAP; + } + if (!in_seg.contains(ent_seg)) { + input_contains_entry = false; + } + } + return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER; + } +}; + +static_assert(sizeof(PTO2TensorMapEntry) == 128, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"); +static_assert(offsetof(PTO2TensorMapEntry, buffer_addr) == offsetof(Tensor, buffer.addr)); +static_assert(offsetof(PTO2TensorMapEntry, producer_task_id) == offsetof(Tensor, owner_task_id)); +static_assert(offsetof(PTO2TensorMapEntry, start_offset) == offsetof(Tensor, start_offset)); +static_assert(offsetof(PTO2TensorMapEntry, version) == offsetof(Tensor, version)); +static_assert(offsetof(PTO2TensorMapEntry, ndims) == offsetof(Tensor, ndims)); +static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype)); +static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep)); +static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous)); +static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes)); +static_assert( + offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)" +); + +// ============================================================================= +// TensorMap Lookup Chain Length Statistics (compile-time toggle) +// 
============================================================================= + +/** + * TensorMap structure + * + * Hash table with ring buffer entry pool and lazy invalidation. + */ +struct PTO2TensorMap { + // Hash table buckets (fixed size, power of 2) + PTO2TensorMapEntry **buckets; // Array of offsets into entry_pool (-1 = empty) + int32_t num_buckets; // Must be power of 2 for fast modulo + + // Entry pool as ring buffer + PTO2TensorMapEntry *entry_pool; // Ring buffer of entries + PTO2TensorMapEntry **free_entry_list; // free entry ids + int32_t pool_size; // Total pool capacity + int32_t next_entry_idx; // id when next entry insert + int32_t free_num; // free entry number in entry pool + + // Per-ring per-task entry tracking (for efficient bucket cleanup) + // Indexed by [ring_id][local_id & (task_window_sizes[ring_id] - 1)] + PTO2TensorMapEntry **task_entry_heads[PTO2_MAX_RING_DEPTH]; + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; // Per-ring task window size (for slot masking) + + // Per-ring validity threshold (for lazy invalidation) + int32_t last_task_alives[PTO2_MAX_RING_DEPTH]; // Cached from shared memory per ring + + // Per-ring cleanup progress (for periodic cleanup_retired) + int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{}; + + uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const { + return task_local_id & (task_window_sizes[ring_id] - 1); + } + + // Accessors read by scope_stats_collector. Declared unconditionally so the + // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional — + // setter symbols must export for host dlsym; the probe call sites that use + // these accessors stay gated by PTO2_PROFILING). 
+ int32_t current_used() const { return next_entry_idx - free_num; } + int32_t pool_capacity() const { return pool_size; } + + // new_entry only allocates memory, does not assign attributes + PTO2TensorMapEntry *new_entry() { + if (free_num > 0) { + PTO2TensorMapEntry *res = free_entry_list[--free_num]; + debug_assert(res->bucket_index == -1); + return res; + } + always_assert(next_entry_idx < pool_size); + PTO2TensorMapEntry *res = &entry_pool[next_entry_idx++]; + debug_assert(res->bucket_index == -1); + return res; + } + + void free_entry(PTO2TensorMapEntry &entry) { + always_assert(entry.bucket_index != -1); // must still be in a bucket + + // Update predecessor's next pointer (O(1) via prev_in_bucket) + if (entry.prev_in_bucket == nullptr) { + // Entry is the head of its bucket chain, update bucket head + // Must compute hash BEFORE clearing tensor + buckets[entry.bucket_index] = entry.next_in_bucket; + } else { + entry.prev_in_bucket->next_in_bucket = entry.next_in_bucket; + } + + // Update successor's prev pointer + if (entry.next_in_bucket != nullptr) { + entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket; + } + + free_entry_list[free_num++] = &entry; + entry.bucket_index = -1; + entry.next_in_bucket = nullptr; + entry.prev_in_bucket = nullptr; + entry.next_in_task = nullptr; + entry.prev_in_task = nullptr; + } + + // ============================================================================= + // TensorMap API + // ============================================================================= + + /** + * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring + * task_entry_heads) on the supplied arena. Records the resulting offsets in + * the returned layout descriptor. Must be called before the arena is + * committed. 
+ */ + static PTO2TensorMapLayout reserve_layout( + DeviceArena &arena, int32_t num_buckets, int32_t pool_size, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH] + ); + + /** + * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS, + * PTO2_TENSORMAP_POOL_SIZE). + */ + static PTO2TensorMapLayout + reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]); + + /** + * Phase 3a: write everything *except* arena-internal pointer fields + * (buckets, entry_pool, free_entry_list, task_entry_heads[r]). + * Uses arena.region_ptr to address the arena regions for data writes, + * but does not store those addresses in struct fields. Safe to call on + * a host arena that holds the prebuilt image. + */ + bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Phase 3b: write the arena-internal pointer fields. Idempotent; + * called once on the host arena and once on the AICPU after attach. + */ + void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Tear down state. Does not free memory — the arena owns the backing + * buffer. Pointers are set to nullptr so accidental reuse traps. + */ + void destroy(); + + /** + * Update validity threshold from shared memory + * Called periodically to refresh the lazy invalidation threshold. + * + * @param last_task_alive Current value from shared memory + */ + void sync_validity(int32_t ring_id, int32_t last_task_alive) { this->last_task_alives[ring_id] = last_task_alive; } + + /** + * Lookup producer for a tensor region + * + * Searches the hash table for matching regions and invokes the callback + * for each overlapping valid entry. + * Stale entries from different rings are skipped (not truncated). + * + * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should + * return true to continue iteration, false to stop early. 
It is safe for + * the callback to call remove_entry() on the current entry: next_in_bucket + * is latched before invocation. + * + * @param tensor Tensor to look up + * @param on_match Callback invoked for each overlapping entry + */ + template + void lookup(const Tensor &tensor, Fn &&on_match) { + uint32_t bucket_index = hash(tensor.buffer.addr); + PTO2TensorMapEntry *cur_entry = buckets[bucket_index]; + +#if PTO2_TENSORMAP_PROFILING + g_lookup_count++; + int32_t chain_len = 0; +#endif + + while (cur_entry != nullptr) { + PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket; + +#if PTO2_TENSORMAP_PROFILING + chain_len++; +#endif + // Skip stale entries (no chain truncation — entries from different + // rings can be interleaved, so a stale entry from one ring does NOT + // imply subsequent entries from other rings are also stale) + if (!entry_valid(*cur_entry)) { + cur_entry = next_entry; + continue; + } + + // Entry is valid - check if regions OVERLAP (not just exact match) + // Since we hash only by base_ptr, all entries in this bucket have + // potential to overlap. We must check actual byte-range overlap. + if (tensor.buffer.addr == cur_entry->buffer_addr) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_checks++; +#endif + auto overlap_status = cur_entry->check_overlap(tensor); + if (overlap_status != OverlapStatus::NO_OVERLAP) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_hits++; +#endif + if (!on_match(*cur_entry, overlap_status)) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_chain_total += chain_len; + if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len; +#endif + return; + } + } + } + + // Move to next entry + cur_entry = next_entry; + } +#if PTO2_TENSORMAP_PROFILING + g_lookup_chain_total += chain_len; + if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len; +#endif + } + + /** + * Insert a new entry (called when task produces output) + * + * Allocates from ring buffer pool, may overwrite stale entries. 
    /**
     * Release the entries of retired tasks on one ring.
     *
     * For every local id in [old_last_task_alive, new_last_task_alive) the
     * per-task chain hanging off task_entry_heads[ring_id][slot] is walked,
     * each entry is handed to free_entry(), and the slot head is reset.
     *
     * @param ring_id              Ring whose tasks retired
     * @param old_last_task_alive  Previous threshold (first local id to clean)
     * @param new_last_task_alive  New threshold (first local id to keep)
     */
    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) {
        // Visit each retired local id on this ring
        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) {
            // Slot index is taken by masking — assumes task_window_sizes[ring_id]
            // is a power of two (TODO confirm against the allocator).
            int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
            PTO2TensorMapEntry *cur_entry = task_entry_heads[ring_id][task_slot];

            while (cur_entry != nullptr) {
                PTO2TensorMapEntry *next_entry = cur_entry->next_in_task;  // Save before the entry is freed
                // Every entry on this chain is expected to belong to the
                // retiring task. This is asserted, not filtered: free_entry()
                // below runs unconditionally.
                // NOTE(review): if debug_assert is compiled out in release
                // builds, a mismatching entry would still be freed — confirm.
                debug_assert(
                    cur_entry->producer_task_id ==
                    PTO2TaskId::make(static_cast(ring_id), static_cast(local_id))
                );
                free_entry(*cur_entry);
                cur_entry = next_entry;
            }

            // Clear task's entry head (slot will be reused by local_id + task_window_sizes[ring_id])
            task_entry_heads[ring_id][task_slot] = nullptr;
        }
    }
Multiplication + * mixes ALL input bits into the high bits of the product, so aligned + * addresses (low bits all-zero) still distribute evenly. We extract + * the top log2(num_buckets) bits which carry the most entropy. + */ + uint32_t hash(uint64_t key) { + key *= 0x9E3779B97F4A7C15ULL; + return static_cast(key >> (64 - __builtin_ctz(num_buckets))); + } + + /** + * Link an initialized entry into bucket and task chains. + */ + void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) { +#if PTO2_TENSORMAP_PROFILING + g_insert_count++; +#endif + uint32_t bucket_index = hash(addr); + auto ring_id = producer_task_id.ring(); + auto local_id = producer_task_id.local(); + int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1); + + entry->producer_task_id = producer_task_id; + + // Insert at head of hash bucket + entry->bucket_index = bucket_index; + entry->next_in_bucket = buckets[bucket_index]; + if (entry->next_in_bucket != nullptr) { + entry->next_in_bucket->prev_in_bucket = entry; + } + buckets[bucket_index] = entry; + entry->prev_in_bucket = nullptr; + + // Link to task's entry list + entry->next_in_task = task_entry_heads[ring_id][task_slot]; + entry->prev_in_task = nullptr; + if (entry->next_in_task != nullptr) { + entry->next_in_task->prev_in_task = entry; + } + task_entry_heads[ring_id][task_slot] = entry; + } + + /** + * Check if entry is valid (producer has not retired) + */ + bool entry_valid(const PTO2TensorMapEntry &entry) const { + return static_cast(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()]; + } + + void remove_entry(PTO2TensorMapEntry &entry) { + remove_from_task(entry); + free_entry(entry); + } + + /** + * Remove entry from its task chain (O(1) with prev pointer) + * Called during pool wrap-around to unlink reused entries. 
    /**
     * Unlink an entry from its producer task's doubly linked chain in O(1).
     *
     * Called by remove_entry() before the entry is freed. Only the task chain
     * (prev_in_task / next_in_task and, for a head entry, task_entry_heads)
     * is modified here; the bucket chain is left untouched.
     *
     * @param entry  Entry to unlink; must still be linked into a bucket
     *               (bucket_index != -1), which is enforced by always_assert.
     */
    void remove_from_task(PTO2TensorMapEntry &entry) {
        always_assert(entry.bucket_index != -1);  // must still be in a bucket
        // Update predecessor's next pointer (O(1) via prev_in_task)
        if (entry.prev_in_task == nullptr) {
            // No predecessor: the entry is the head of its task chain, so the
            // head slot itself must be redirected to the successor.
            int32_t ring_id = entry.producer_task_id.ring();
            int32_t local_id = static_cast(entry.producer_task_id.local());
            int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
            task_entry_heads[ring_id][task_slot] = entry.next_in_task;
        } else {
            entry.prev_in_task->next_in_task = entry.next_in_task;
        }

        // Update successor's prev pointer
        if (entry.next_in_task != nullptr) {
            entry.next_in_task->prev_in_task = entry.prev_in_task;
        }

        // Detach the entry's own links so it no longer points into the chain.
        entry.next_in_task = nullptr;
        entry.prev_in_task = nullptr;
    }
+ */ + void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive); +}; + +#if PTO2_TENSORMAP_PROFILING +struct PTO2TensorMapProfilingData { + uint64_t lookup_chain_total; + uint64_t lookup_count; + int32_t lookup_chain_max; + uint64_t overlap_checks; + uint64_t overlap_hits; + uint64_t insert_count; +}; + +PTO2TensorMapProfilingData pto2_tensormap_get_profiling(); +#endif diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_types.h new file mode 100644 index 000000000..65d593a49 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/pto_types.h @@ -0,0 +1,614 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Orchestration Build Graph Types - Data structures for orchestration runtime extensions + * + * Standalone header defining orchestration-specific types for: + * - TaskOutputTensors: Return value from submit containing materialized output Tensors + * - Arg: Aggregated argument container for pto_submit_task API + * + * Tensor descriptor types (Tensor, PTOBufferHandle, TensorCreateInfo) are + * defined in tensor.h. 
+ * + * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h + * without type conflicts (Handshake, TensorPair, HostApi). + */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ + +#include +#include + +#include +#include +#include + +#if defined(__aarch64__) +#include +#endif + +#include "aicpu/dump_arg_selection.h" +#include "data_type.h" +#include "profiling_config.h" +#include "pto_submit_types.h" +#include "task_args.h" +#include "tensor.h" +#include "tensor_create_info.h" // runtime-only TensorCreateInfo + materialization helpers + +typedef enum { + ASYNC_ENGINE_SDMA = 0, + ASYNC_ENGINE_ROCE = 1, + ASYNC_ENGINE_URMA = 2, + ASYNC_ENGINE_CCU = 3, + NUM_ASYNC_ENGINES = 4, +} AsyncEngine; + +enum class CompletionType : int32_t { + COUNTER = 0, +}; + +// ============================================================================= +// Task Output Tensors (return value from submit) +// ============================================================================= + +enum class PTO2ScopeMode : uint8_t { + AUTO = 0, + MANUAL = 1, +}; + +/** + * TaskOutputTensors — returned by submit, holds materialized output Tensors. + * + * Only runtime-created outputs are stored here, indexed in add_output order. + * + * The underlying storage is uninitialized; only output_count elements are + * valid after submit returns. This avoids default-constructing Tensor[] + * on the hot path (2 KB of unnecessary zeroing per submit). + * + * Users must hold a named TaskOutputTensors variable and borrow via get_ref(); + * binding get_ref() on an rvalue is compile-time rejected to prevent dangling. + * + * LIFETIME — single-scope only: + * Internally this class stores pointers into the submitting task's payload + * (PTO2TaskPayload::tensors[]), which lives in a ring-buffer slot. 
After + * scope_end the slot becomes eligible for reuse, and a later submit will + * overwrite the same Tensor storage in place. Therefore the + * TaskOutputTensors instance, the const Tensor& returned by get_ref(), and + * any pointer derived from either MUST NOT outlive the PTO2_SCOPE in which + * submit was called — do not move/copy them to outer-scope variables, do + * not capture references by std::reference_wrapper or raw pointers across + * scope boundaries. + * + * This invariant is intentionally not enforced at runtime: a reused slot + * simply carries a different but valid owner_task_id, so checking + * owner_task_id cannot distinguish "still mine" from "silently aliased to + * an unrelated task". Misuse manifests as a wrong-tensor read with no + * diagnostic. + */ +class TaskOutputTensors { +public: + TaskOutputTensors() : + task_id_(PTO2TaskId::invalid()), + output_count_(0) {} + + bool empty() const { return output_count_ == 0; } + uint32_t size() const { return output_count_; } + + /// Borrow a materialized output tensor by index (lvalue only). + const Tensor &get_ref(uint32_t index) const & { + always_assert(index < output_count_); + return *tensors_[index]; + } + const Tensor &get_ref(uint32_t index) const && = delete; + + /// Runtime-internal: append one materialized output Tensor. + void materialize_output(const Tensor &tensor) { + always_assert(output_count_ < MAX_TENSOR_ARGS); + tensors_[output_count_++] = &tensor; + } + + void set_task_id(PTO2TaskId id) { task_id_ = id; } + + PTO2TaskId task_id() const { return task_id_; } + +private: + PTO2TaskId task_id_; + uint32_t output_count_; + // Upper bound: a task cannot have more outputs than total tensor args + // (every OUTPUT/OUTPUT_EXISTING slot is one of the Arg's tensor slots). 
+ const Tensor *tensors_[MAX_TENSOR_ARGS]; +}; + +using TaskSubmitResult = TaskOutputTensors; + +// ============================================================================= +// Argument Types (for pto_submit_task API) +// ============================================================================= + +// TensorArgType is defined in tensor.h (included via task_args.h above) + +/** + * Tagged reference to a single Arg slot — either a Tensor* or a + * TensorCreateInfo*. The active member is determined by the slot's + * TensorArgType tag (OUTPUT → create_info, else → tensor pointer). + * + * Minimal-permission: the union members are private; content is set only via + * operator=(ptr) and read via ref()/create_info(). Copy/move are deleted — a + * TensorRef is written in place inside an Arg's slot array, never passed by + * value. + */ +class TensorRef { + union { + const Tensor *ptr_; + const TensorCreateInfo *create_info_; + }; + +public: + TensorRef() : + ptr_(nullptr) {} + TensorRef(const TensorRef &) = delete; + TensorRef(TensorRef &&) = delete; + TensorRef &operator=(const TensorRef &) = delete; + TensorRef &operator=(TensorRef &&) = delete; + + TensorRef &operator=(const Tensor *p) { + ptr_ = p; + return *this; + } + TensorRef &operator=(const TensorCreateInfo *ci) { + create_info_ = ci; + return *this; + } + + const Tensor &ref() const { return *ptr_; } + const TensorCreateInfo &create_info() const { return *create_info_; } + bool refers_to(const Tensor *t) const { return ptr_ == t; } + bool refers_to(const TensorCreateInfo *ci) const { return create_info_ == ci; } +}; + +/** + * Aggregated argument container for pto_submit_task + * + * Inherits storage from TaskArgsTpl. + * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo) + * discriminated by the corresponding tag(). + * Tensors are dispatched first in kernel args, followed by scalars. 
+ * + * Output arguments follow two distinct ownership models: + * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer + * and materializes a new Tensor, returned via TaskOutputTensors. + * - add_inout(const Tensor&): INOUT — reuses an existing Tensor as the write target. + * + * Example: + * Tensor x = make_tensor_external(dev_a, shapes, 2); + * TensorCreateInfo ci(shapes, 2); // must outlive submit + * Arg args; + * args.add_input(x); + * args.add_output(ci); + * args.add_scalar(some_value); + * TaskOutputTensors outs = rt_submit_aic_task(kernel_id, args); + * const Tensor& y = outs.get_ref(0); + */ +template +struct Arg : TaskArgsTpl { + using Base = TaskArgsTpl; + // Make dependent-base members visible for unqualified use (two-phase lookup + // does not search a dependent base in a class template). + using Base::scalar_count_; + using Base::scalars_; + using Base::tags_; + using Base::tensor_count_; + using Base::tensors_; + + // Minimal-permission: an Arg is built in place and consumed by reference; + // it is never copied/moved (it is a large object, and its TensorRef slots + // are non-copyable by design). + Arg() = default; + Arg(const Arg &) = delete; + Arg(Arg &&) = delete; + Arg &operator=(const Arg &) = delete; + Arg &operator=(Arg &&) = delete; + + bool has_error{false}; + const char *error_msg{nullptr}; + PTO2LaunchSpec launch_spec; // SPMD launch parameters (block_num, etc.) + + // Speculative early-dispatch hint (codegen-author set, off by default). When + // true, the scheduler may stage this task on an idle core before its producer + // finishes, gating execution on the DATA_MAIN_BASE doorbell — only safe when + // the author knows the task's data dependencies allow it. Read in-process by + // the runtime; never crosses the wire format. 
+ bool allow_early_resolve_{false}; + void set_allow_early_resolve(bool v = true) { allow_early_resolve_ = v; } + bool allow_early_resolve() const { return allow_early_resolve_; } + + void clear() { + Base::clear(); +#if PTO2_PROFILING + dump_arg_selection_.clear(); +#endif + explicit_deps_ = nullptr; + explicit_dep_count_ = 0; + allow_early_resolve_ = false; + } + + void reset() { + clear(); + has_error = false; + error_msg = nullptr; + } + + void set_error(const char *msg) { + if (!has_error) { + has_error = true; + error_msg = msg; + } + } + + template + void dump(Args &&...args) { +#if PTO2_PROFILING + static_assert( + (std::is_lvalue_reference_v && ...), + "dump: temporaries are not allowed — pass tensors/scalars already added to this Arg" + ); + static_assert( + (is_supported_dump_arg_v && ...), + "dump: all arguments must be Tensor, TensorCreateInfo, or scalar lvalues" + ); + if constexpr (sizeof...(Args) == 0) { + mark_all_dump_args(); + } else { + (mark_dump_arg(args), ...); + } +#else + ((void)args, ...); +#endif + } + +#if PTO2_PROFILING + uint64_t dump_arg_mask() const { return dump_arg_selection_.dump_arg_mask(); } + uint64_t dump_arg_index_ambiguous_mask() const { return dump_arg_selection_.dump_arg_index_ambiguous_mask(); } +#else + uint64_t dump_arg_mask() const { return 0; } + uint64_t dump_arg_index_ambiguous_mask() const { return 0; } +#endif + + template + void add_input(Args &&...args) { + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) { + return; + } + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INPUT, tensor_count_++), ...); + } + + /// Batch add outputs — all Tensor or all TensorCreateInfo: + /// add_output(ci1, ci2) — runtime allocates buffers (OUTPUT) + /// add_output(t1, t2) — write-only existing tensors (OUTPUT_EXISTING) + template + void add_output(Args &&...args) { + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) return; + 
if constexpr ((std::is_same_v, TensorCreateInfo> && ...)) { + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT, tensor_count_++), ...); + } else { + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT_EXISTING, tensor_count_++), + ...); + } + } + + template + void add_inout(Args &&...args) { + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) { + return; + } + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INOUT, tensor_count_++), ...); + } + + /// No-dependency existing tensor: skips OverlapMap lookup, depends on creator only. + template + void add_no_dep(Args &&...args) { + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) return; + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::NO_DEP, tensor_count_++), ...); + } + + /** + * Attach an explicit dependency array. The Arg stores (ptr, count) without + * copying — the caller's array must outlive the submit (same lifetime rule + * as add_input/add_output, which also store pointers). + * + * count == 0 is a valid "set empty" — it clears any previously stored deps + * and returns. This lets callers that build the dep set conditionally pass + * the result through unguarded, including in the no-dep branch: + * PTO2TaskId deps[3]; + * uint32_t n = 0; + * if (have_prev) deps[n++] = prev; + * if (is_last) deps[n++] = alloc; + * args.set_dependencies(deps, n); // safe even if n == 0 + * + * For count > 0, the call is single-shot: a second non-empty call after + * deps are already set will fail with set_error(). Use count == 0 first + * if you need to re-set. 
+ */ + void set_dependencies(const PTO2TaskId *deps, uint32_t count) { + if (count == 0) { + explicit_deps_ = nullptr; + explicit_dep_count_ = 0; + return; + } + if (deps == nullptr) { + set_error("set_dependencies: deps must not be null when count > 0"); + return; + } + if (explicit_deps_ != nullptr) { + set_error("set_dependencies: may be called at most once per Arg"); + return; + } + explicit_deps_ = deps; + explicit_dep_count_ = count; + } + + uint32_t explicit_dep_count() const { return explicit_dep_count_; } + + PTO2TaskId explicit_dep(uint32_t index) const { + always_assert(index < explicit_dep_count_); + return explicit_deps_[index]; + } + + const PTO2TaskId *explicit_deps_data() const { return explicit_deps_; } + + /** + * Add scalar values. Types are deduced per argument; each value is + * bit-cast to uint64_t for storage. Mixed types are allowed: + * + * args.add_scalar(uint64_val); // single + * args.add_scalar(3.14f, int32_t(42), 7u); // mixed batch + */ + template + void add_scalar(Args &&...args) { + static_assert(sizeof...(Args) >= 1, "add_scalar: at least one argument required"); + static_assert((is_supported_scalar_arg_v && ...), "add_scalar: all types must be arithmetic or enum"); + if (scalar_count_ + sizeof...(Args) > MaxS) { + set_error(scalar_cap_msg()); + return; + } + (add_scalar_one(std::forward(args)), ...); + } + + void add_scalars(const uint64_t *values, int count) { + if (count < 0 || scalar_count_ + count > MaxS) { + set_error(scalar_cap_msg()); + return; + } + memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t)); +#if PTO2_PROFILING + dump_arg_selection_.clear_scalar_metadata(scalar_count_, count); +#endif + scalar_count_ += count; + } + + /** + * Zero-extend int32 bit patterns into uint64 scalar slots. + * Negative values are treated as their unsigned 32-bit representation + * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF). + * Uses NEON to process 4 elements per iteration on aarch64. 
+ */ + void add_scalars_i32(const int32_t *values, int count) { + if (count < 0 || scalar_count_ + count > MaxS) { + set_error(scalar_cap_msg()); + return; + } + uint64_t *dst = &scalars_[scalar_count_]; +#if defined(__aarch64__) + int i = 0; + for (; i + 4 <= count; i += 4) { + uint32x4_t v = vld1q_u32(reinterpret_cast(values + i)); + uint64x2_t lo = vmovl_u32(vget_low_u32(v)); + uint64x2_t hi = vmovl_u32(vget_high_u32(v)); + vst1q_u64(dst + i, lo); + vst1q_u64(dst + i + 2, hi); + } + for (; i < count; i++) { + dst[i] = static_cast(static_cast(values[i])); + } +#else + for (int i = 0; i < count; i++) { + dst[i] = static_cast(static_cast(values[i])); + } +#endif +#if PTO2_PROFILING + dump_arg_selection_.clear_scalar_metadata(scalar_count_, count); +#endif + scalar_count_ += count; + } + + /** + * Copy scalars from another Arg's scalar array. + * Useful when multiple tasks share the same scalar data (e.g., block indices). + */ + void copy_scalars_from(const Arg &src, int src_offset, int count) { + if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) { + set_error("Source scalar range out of bounds in copy_scalars_from"); + return; + } + if (scalar_count_ + count > MaxS) { + set_error(scalar_cap_msg()); + return; + } + memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t)); +#if PTO2_PROFILING + dump_arg_selection_.copy_scalar_dtypes_from(src.dump_arg_selection_, scalar_count_, src_offset, count); +#endif + scalar_count_ += count; + } + +#if PTO2_PROFILING + const uint8_t *scalar_dtypes() const { return dump_arg_selection_.scalar_dtypes(); } +#else + const uint8_t *scalar_dtypes() const { return nullptr; } +#endif + +private: + // Caller-owned dependency array; lifetime must extend through submit. 
+#if PTO2_PROFILING + DumpArgSelection dump_arg_selection_; +#endif + const PTO2TaskId *explicit_deps_{nullptr}; + uint32_t explicit_dep_count_{0}; +#if PTO2_PROFILING + template + static constexpr bool is_supported_dump_arg_v = + std::is_same_v, Tensor> || std::is_same_v, TensorCreateInfo> || + is_supported_scalar_arg_v; +#endif + + // Capacity-overflow messages — spell the actual limit (MaxS/MaxT, whatever + // the instantiation is) into the text via std::to_string. Built once into a + // function-local static so set_error() can hold the const char* safely. + static const char *scalar_cap_msg() { + static const std::string msg = "Too many scalar args (max " + std::to_string(MaxS) + ")"; + return msg.c_str(); + } + static const char *tensor_cap_msg() { + static const std::string msg = "Too many tensor args (max " + std::to_string(MaxT) + ")"; + return msg.c_str(); + } + + template + void add_scalar_one(T &&value) { + scalars_[scalar_count_] = to_u64(value); +#if PTO2_PROFILING + uintptr_t scalar_source_ptr = 0; + if constexpr (std::is_lvalue_reference_v) { + scalar_source_ptr = reinterpret_cast(&value); + } + dump_arg_selection_.record_scalar_source( + scalar_count_, scalar_source_ptr, dtype_of>>() + ); +#endif + scalar_count_++; + } + +#if PTO2_PROFILING + // No-arg dump(): mark every arg already added to this Arg. 
+ void mark_all_dump_args() { + if (tensor_count_ == 0 && scalar_count_ == 0) { + set_error("dump: no arguments added to this Arg"); + return; + } + dump_arg_selection_.mark_all(tensor_count_, scalar_count_); + } + + void mark_dump_arg(const Tensor &tensor) { + for (int32_t i = 0; i < tensor_count_; i++) { + if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].refers_to(&tensor)) { + dump_arg_selection_.mark_index(i); + return; + } + } + set_error("dump: tensor is not part of this Arg"); + } + + void mark_dump_arg(const TensorCreateInfo &create_info) { + for (int32_t i = 0; i < tensor_count_; i++) { + if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].refers_to(&create_info)) { + dump_arg_selection_.mark_index(i); + return; + } + } + set_error("dump: TensorCreateInfo is not part of this Arg"); + } + + template + std::enable_if_t, void> mark_dump_arg(const T &scalar) { + uintptr_t ptr = reinterpret_cast(&scalar); + if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) { + return; + } + set_error("dump: scalar is not part of this Arg"); + } +#endif + + // Compile-time validation: arg count, value category (reject temporaries — + // a stored &arg would dangle after the call), and element type. Driven + // purely by Args, with no runtime state. + template + static void assert_add_tensor_args() { + static_assert(sizeof...(Args) >= 1, "at least one argument required"); + static_assert( + (std::is_lvalue_reference_v && ...), + "temporaries are not allowed — stored pointers would dangle after the call" + ); + if constexpr (is_output) { + static_assert( + (std::is_same_v, Tensor> && ...) || + (std::is_same_v, TensorCreateInfo> && ...), + "add_output: all arguments must be the same type (all Tensor or all TensorCreateInfo)" + ); + } else { + static_assert((std::is_same_v, Tensor> && ...), "all arguments must be Tensor"); + } + } + + // Runtime validation: tensor-before-scalar ordering + slot capacity. 
Records + // an error and returns false on violation. + bool check_add_tensor_capacity(int32_t count) { + if (scalar_count_ != 0) { + set_error( + "add_input/add_output/add_inout called after add_scalar: " + "all tensors must be added before any scalars" + ); + return false; + } + if (tensor_count_ + count > static_cast(MaxT)) { + set_error(tensor_cap_msg()); + return false; + } + return true; + } +}; + +// ============================================================================= +// Task-args layer aliases +// ============================================================================= +// +// L0TaskArgs — core-level container used to build and submit tasks inside +// orchestration (small, stack-friendly). +using L0TaskArgs = Arg; + +// L2TaskArgs — chip-level entry-arg holding the orchestration entry's +// already-allocated inputs (capacity matches ChipStorageTaskArgs). +// aicpu_orchestration_entry/config receive a const L2TaskArgs&. +struct L2TaskArgs : Arg { + // Build from the executor's ChipStorageTaskArgs: each input becomes a + // TensorRef pointing at src's Tensor, so `src` must outlive this (on the + // executor path src is runtime->orch_args_storage_, alive for the whole run). + void create_from_chip_args(const ChipStorageTaskArgs &src) { + reset(); + for (int32_t i = 0; i < src.tensor_count(); ++i) { + // Entry inputs are external submit-time tensors; the entry binds them + // by const Tensor& (replacing from_tensor_arg's old version/manual_dep + // reset), so this invariant is what keeps that binding behavior-preserving. 
+ const Tensor &t = src.tensor(i); + debug_assert(!t.manual_dep && t.version == 0); + add_input(t); + } + for (int32_t i = 0; i < src.scalar_count(); ++i) { + add_scalar(src.scalar(i)); + } + } +}; + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/runtime.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/runtime.h new file mode 100644 index 000000000..3eb93f564 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/runtime.h @@ -0,0 +1,356 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Runtime Class - Device Execution and Handshake Control + * + * This class manages device-side execution through AICPU-AICore handshake + * protocol. Task graph construction is handled by PTO2Runtime; this class + * only handles: + * - Handshake buffers for AICPU-AICore communication + * - Execution parameters (block_dim, aicpu_thread_num) + * - Tensor pair management for host-device memory tracking + * - Device orchestration state (gm_sm_ptr_, orch_args_) + * - Function address mapping (func_id_to_addr_) + * + * Task dispatch uses a per-core PTO2DispatchPayload written by the scheduler. 
+ * At dispatch time, build_payload() copies tensor pointers and scalars from + * the task payload into the per-core args[], populates SPMD context, then + * signals AICore via DATA_MAIN_BASE. + */ + +#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ +#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ + +#include +#include +#include // for fprintf, printf +#include // for memset + +#include + +#include "common/core_type.h" +#include "common/l2_swimlane_profiling.h" +#include "common/platform_config.h" +#include "pto2_dispatch_payload.h" +#include "task_args.h" + +// ============================================================================= +// Configuration Macros +// ============================================================================= + +#define RUNTIME_MAX_ARGS 128 +#define RUNTIME_MAX_WORKER 72 // 24 AIC + 48 AIV cores +#define RUNTIME_MAX_FUNC_ID 1024 +#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64 + +// Default ready queue shards: one shard per worker thread (total minus orchestrator) +constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; + +// ============================================================================= +// Data Structures +// ============================================================================= + +/** + * Handshake Structure - Shared between Host, AICPU, and AICore + * + * This structure facilitates communication and synchronization between + * AICPU and AICore during task execution. + * + * Protocol State Machine: + * 1. Initialization: AICPU sets aicpu_ready=1 + * 2. Acknowledgment: AICore sets aicore_done=core_id+1 + * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core payload + * 4. Task Execution: AICore reads the cached PTO2DispatchPayload and executes + * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion + * 6. 
Shutdown: AICPU sets control=1, AICore exits + * + * Each AICore instance has its own handshake buffer to enable concurrent + * task execution across multiple cores. + */ + +/** + * Handshake buffer for AICPU-AICore communication + * + * Each AICore has its own handshake buffer for synchronization with AICPU. + * The structure is cache-line aligned (64 bytes) to prevent false sharing + * between cores and optimize cache coherency operations. + * + * Field Access Patterns: + * - aicpu_ready: Written by AICPU, read by AICore + * - aicore_done: Written by AICore, read by AICPU + * - task: Written by AICPU, read by AICore (0 = not ready, non-zero = PTO2DispatchPayload*) + * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) + */ +struct Handshake { + volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready + volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready + volatile uint64_t task; // Init: PTO2DispatchPayload* (set before aicpu_ready); runtime: unused + volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV + volatile uint32_t physical_core_id; // Physical core ID + volatile uint32_t aicpu_regs_ready; // AICPU register init done: 0=pending, 1=done + volatile uint32_t aicore_regs_ready; // AICore ID reported: 0=pending, 1=done +} __attribute__((aligned(64))); + +/** + * Tensor pair for tracking host-device memory mappings. + * Used for copy-back during finalize. + */ +struct TensorPair { + void *host_ptr; + void *dev_ptr; + size_t size; + // false for read-only INPUT tensors: they are never written by the kernel, + // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown + // keep the safe default of copying back. + bool needs_copy_back = true; +}; + +/** + * Host API function pointers for device memory operations. + * Allows runtime to use pluggable device memory backends. 
+ */
+struct HostApi {
+    // Allocate `size` units of device memory; the failure value is not
+    // documented here (presumably nullptr -- TODO confirm).
+    void *(*device_malloc)(size_t size);
+    // Release a pointer previously obtained from device_malloc.
+    void (*device_free)(void *dev_ptr);
+    // Host-to-device and device-to-host copies of `size` units. The int
+    // return convention is not visible here (presumably 0 on success, as for
+    // setup_static_arena below -- TODO confirm).
+    int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
+    int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
+    // Set a device buffer to a byte value (device-side, no PCIe). Used to
+    // zero-init pure OUTPUT buffers in lieu of an H2D copy-in. May be
+    // null on backends that don't wire it; callers must fall back to
+    // copy_to_device.
+    int (*device_memset)(void *dev_ptr, int value, size_t size);
+    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
+    // memory, trb prebuilt runtime arena) as three independent device
+    // allocations. `runtime_arena_size == 0` skips the third region (hbg
+    // path: hbg has no prebuilt runtime arena). Idempotent on identical
+    // sizes; returns 0 on success, -1 on allocation failure.
+    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
+    // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
+    // memory / prebuilt runtime arena. setup_static_arena must have already
+    // committed the relevant region; the returned pointer is owned by the
+    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
+    // to device_free or record it in `tensor_pairs_`.
+    //
+    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
+    // only committed when setup_static_arena was invoked with
+    // runtime_arena_size > 0. Calling it on the hbg path
+    // (setup_static_arena(...,0)) returns nullptr (not undefined).
+    void *(*acquire_pooled_gm_heap)();
+    void *(*acquire_pooled_gm_sm)();
+    void *(*acquire_pooled_runtime_arena)();
+    // Single-shot upload of the entire ChipCallable buffer. `callable` is a
+    // `const ChipCallable *` (declared void* to avoid pulling task_interface
+    // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
+    // total byte size, allocates device GM once, fixes up each child's
+    // resolved_addr_ in an internal host scratch (onboard: device addr; sim:
+    // dlopen function pointer), H2D's once, and returns the device-side
+    // address of the ChipCallable header. Pool-managed: identical buffer
+    // contents (FNV-1a 64-bit) hit the dedup cache; all chip buffers are
+    // bulk-freed in DeviceRunner::finalize(). Returns 0 on error or when
+    // child_count() == 0. Caller computes child addrs as
+    //   chip_dev + offsetof(ChipCallable, storage_) + child_offset(i)
+    // and stores them via runtime->set_function_bin_addr(fid, child_dev).
+    uint64_t (*upload_chip_callable_buffer)(const void *callable);
+};
+
+/**
+ * Task structure - Compatibility stub for platform layer
+ *
+ * RT2 uses PTO2DispatchPayload instead of Task for task dispatch.
+ * This stub exists only for API compatibility with device_runner.cpp.
+ * Since get_task_count() returns 0, this struct is never actually used.
+ */
+struct Task {
+    int func_id;                 // Kernel/function identifier
+    uint64_t function_bin_addr;  // Address of the function binary for func_id
+};
+
+// Per-core entry point of the fully_distributed_within_core engine. Implemented
+// in runtime/dist_engine.cpp (compiled into the AICPU .so), invoked by each
+// AICore worker thread via Runtime::dist.core_main_fn. `runtime` is Runtime*,
+// `core_type` is CoreType (cast to int to keep this typedef header-light).
+// Runtime::dist.core_main_fn stores the pointer as a uint64_t, so callers
+// cast it back to this type before invoking it.
+// See docs/fully_distributed_within_core.md.
+typedef void (*DistCoreMainFn)(void *runtime, int core_idx, int core_type);
+
+// =============================================================================
+// Runtime Class
+// =============================================================================
+
+/**
+ * Runtime class for device execution and handshake control
+ *
+ * This class manages AICPU-AICore communication through handshake buffers.
+ * Task graph construction is handled by PTO2Runtime; this class only handles + * execution control and device orchestration state. + */ +class Runtime { +public: + // Handshake buffers for AICPU-AICore communication + Handshake workers[RUNTIME_MAX_WORKER]; // Worker (AICore) handshake buffers + int worker_count; // Number of active workers + + // Execution parameters for AICPU scheduling. + // + // aicpu_thread_num is the *total* AICPU thread count launched on this run + // (= orch + schedulers). AicpuExecutor splits this into one orchestrator + // thread (highest idx, runs aicpu_orchestration_entry) and the remaining + // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore. + // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set. + int aicpu_thread_num; + int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) + + // PTO2 integration: kernel_id -> GM function_bin_addr mapping + // NOTE: Made public for direct access from aicore code + uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; + + // Orchestrator-to-scheduler transition control + // When true, orchestrator threads convert to scheduler threads after orchestration completes. + // When false (default), orchestrator threads exit after orchestration without dispatching tasks. + // Controlled via PTO2_ORCH_TO_SCHED environment variable. + bool orch_to_sched; + + // ---- fully_distributed_within_core handoff (SPMD-on-core) ---- + // The AICPU orchestrator thread does dlopen/arena setup, then hands the + // resolved orchestration entry + per-core engine off to the AICore worker + // threads through these fields instead of running orchestration/scheduling + // itself. Each AICore worker invokes core_main_fn(runtime, idx, core_type) + // once `go` is set, then increments `done_count` when finished. See + // runtime/dist_engine.* and docs/fully_distributed_within_core.md. 
+ struct DistHandoff { + volatile uint64_t core_main_fn; // DistCoreMainFn (in AICPU .so) + volatile uint32_t go; // 1 once engine wired and cores may start + volatile int32_t num_workers; // number of AICore workers participating + volatile int32_t done_count; // workers atomically increment when done + } dist; + +private: + // Kernel binary tracking for cleanup + int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; + int registered_kernel_count_; + + void *gm_sm_ptr_; // GM pointer to PTO2 shared memory (device) + void *gm_heap_ptr_; // GM heap for orchestrator output buffers (device) + void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) + ChipStorageTaskArgs orch_args_storage_; // Copy of args for device + + // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing + // Runtime to device; AICPU reads them in the boot path to skip + // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer + // (already populated by runtime_init_data_from_layout + wire on host). + void *prebuilt_arena_base_; + size_t prebuilt_runtime_offset_; + + // Device orchestration SO (for dlopen on AICPU thread 3). + // The SO bytes themselves live in a separately-allocated device buffer + // owned by DeviceRunner; only the metadata below travels inside Runtime. + uint64_t dev_orch_so_addr_; + uint64_t dev_orch_so_size_; + // Per-callable_id dispatch. AICPU dispatches via + // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_` + // signals whether the host is delivering a freshly-registered + // callable_id (write+dlopen) or reusing an already-loaded one. 
+ int32_t active_callable_id_; + bool register_new_callable_id_; + char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; + char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; + +public: + /** + * Constructor - zero-initialize all arrays + */ + Runtime(); + + // ========================================================================= + // Performance Profiling + // ========================================================================= + + // ========================================================================= + // Device orchestration (for AICPU thread 3) + // ========================================================================= + + void *get_gm_sm_ptr() const; + void *get_gm_heap_ptr() const; + const ChipStorageTaskArgs &get_orch_args() const; + void set_gm_sm_ptr(void *p); + void set_gm_heap(void *p); + void set_slot_states_ptr(void *p); + void set_orch_args(const ChipStorageTaskArgs &args); + + // Prebuilt-arena fast path (trb only). Set by host's + // bind_callable_to_runtime_impl; consumed by AICPU at boot to attach a + // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at + // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on + // first construction (Runtime() ctor zeros them) so a non-prebuilt boot + // path can still detect "no prebuilt image set" via nullptr. + void set_prebuilt_arena(void *arena_base, size_t runtime_off); + void *get_prebuilt_arena_base() const; + size_t get_prebuilt_runtime_offset() const; + + // Device orchestration SO binary (for dlopen on AICPU thread 3) + void set_dev_orch_so(uint64_t dev_addr, uint64_t size); + uint64_t get_dev_orch_so_addr() const; + uint64_t get_dev_orch_so_size() const; + // Per-callable_id dispatch. callable_id must be in + // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU + // whether to (re)load the orch SO into orch_so_table_[callable_id] or + // reuse the cached entry. 
+ void set_active_callable_id(int32_t callable_id, bool is_new); + int32_t get_active_callable_id() const; + bool register_new_callable_id() const; + void set_device_orch_func_name(const char *name); + const char *get_device_orch_func_name() const; + void set_device_orch_config_name(const char *name); + const char *get_device_orch_config_name() const; + + uint64_t get_function_bin_addr(int func_id) const; + void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. Used by + * DeviceRunner::bind_callable_to_runtime so prepared kernel + * binaries are not freed by validate_runtime_impl across runs. + */ + void replay_function_bin_addr(int func_id, uint64_t addr); + + int get_registered_kernel_count() const; + int get_registered_kernel_func_id(int index) const; + void clear_registered_kernels(); + + // ========================================================================= + // Deprecated API (for platform compatibility, always returns 0/nullptr) + // Task graph is now managed by PTO2Runtime, not Runtime + // ========================================================================= + + /** @deprecated Task count is now in PTO2 shared memory */ + int get_task_count() const { return 0; } + + /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */ + Task *get_task(int) { return nullptr; } + + // ========================================================================= + // Host API (host-only, not copied to device) + // ========================================================================= + + // Host API function pointers for device memory operations + // NOTE: Placed at end of class to avoid affecting device memory layout + HostApi host_api; + + // Host-side tensor ledger for D2H copy-back at finalize. Populated by + // runtime_maker.cpp from orch_args at bind time, then iterated in + // validate_runtime_impl. 
Not read by AICPU/AICore — the device-side + // Runtime image carries the std::vector control block as harmless + // garbage, identical to host_api above. No fixed cap — grows with the + // chip-level entry-tensor count. + std::vector tensor_pairs_; +}; + +#endif // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp new file mode 100644 index 000000000..4b7484bc9 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Scheduler Implementation + * + * Implements scheduler state management, ready queues, and task lifecycle. + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_scheduler.h" +#include +#include +#include "common/unified_log.h" + +#if PTO2_PROFILING +// Weak fallbacks for host/UT builds that don't link the scope_stats collector. 
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; } +extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {} +#endif + +// ============================================================================= +// Scheduler Profiling Counters +// ============================================================================= + +#if PTO2_SCHED_PROFILING +#include "common/platform_config.h" + +uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {}; + +PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) { + PTO2SchedProfilingData d; + d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0); + d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0); + d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0); + d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0); + d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0); + d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0); + d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0); + d.lock_atomic_count 
= std::exchange(g_sched_lock_atomic_count[thread_idx], 0); + d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0); + d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0); + d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0); + d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0); + d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0); + return d; +} +#endif + +// ============================================================================= +// Debug Utilities +// ============================================================================= + +void PTO2SchedulerState::print_stats() { + PTO2SchedulerState *sched = this; + LOG_INFO_V0("=== Scheduler Statistics ==="); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (sched->ring_sched_states[r].last_task_alive > 0) { + LOG_INFO_V0("Ring %d:", r); + LOG_INFO_V0(" last_task_alive: %d", sched->ring_sched_states[r].last_task_alive); + auto &dp = sched->ring_sched_states[r].dep_pool; + if (dp.top > 0) { + LOG_INFO_V0( + " dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail, + dp.high_water, dp.capacity + ); + } + } + } +#if PTO2_SCHED_PROFILING + LOG_INFO_V0("tasks_completed: %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed)); + LOG_INFO_V0("tasks_consumed: %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed)); +#endif + LOG_INFO_V0("============================"); +} + +void PTO2SchedulerState::print_queues() { + PTO2SchedulerState *sched = this; + LOG_INFO_V0("=== Ready Queues ==="); + + const char *shape_names[] = {"AIC", "AIV", "MIX"}; + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + LOG_INFO_V0(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); + } + LOG_INFO_V0(" DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size()); + + LOG_INFO_V0("===================="); +} diff --git 
a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h new file mode 100644 index 000000000..ca88d7a87 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h @@ -0,0 +1,1485 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Scheduler Interface + * + * The Scheduler is responsible for: + * 1. Maintaining per-resource-shape ready queues + * 2. Tracking task state (PENDING -> COMPLETED -> CONSUMED) + * 3. Managing fanin/fanout refcounts for dependency resolution + * 4. Advancing last_task_alive for heap reclamation + * 5. 
Two-stage mixed-task completion (subtask done bits → mixed-task complete) + * + * The Scheduler runs on Device AI_CPU and processes: + * - Task state transitions based on fanin_refcount + * - Buffer lifecycle based on fanout_refcount + * - Ring pointer advancement for flow control + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include + +#include "common/core_type.h" +#include "utils/device_arena.h" +#include "aicpu/platform_regs.h" // get_reg_ptr / RegId for the speculative doorbell +#include "pto_async_wait.h" +#include "pto_ring_buffer.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" + +#include "aicpu/device_time.h" // get_sys_cnt_aicpu (weak; used by spec doorbell timing too) +#if PTO2_SCHED_PROFILING +#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1 +#define PTO2_SCHED_CYCLE_LAP(acc) \ + do { \ + _st1 = get_sys_cnt_aicpu(); \ + acc += (_st1 - _st0); \ + _st0 = _st1; \ + } while (0) +#endif + +// ============================================================================= +// Ready Queue (Lock-free bounded MPMC — Vyukov design) +// ============================================================================= + +/** + * Per-slot entry: sequence counter for ABA safety + task payload + */ +struct PTO2ReadyQueueSlot { + std::atomic sequence; + PTO2TaskSlotState *slot_state; +}; + +/** + * Thread-local ready buffer for local-first dispatch optimization. + * + * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1). + * Initialized once before the scheduling loop; must be empty at + * the start of each iteration (verified by always_assert). + * + * Phase 1 fills per-CoreType buffers via on_task_complete(). + * The dispatch stage drains them local-first via get_ready_tasks_batch, + * with any remaining tasks pushed to the global ready queue. 
+ */
+// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
+static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
+
+struct PTO2LocalReadyBuffer {
+    PTO2TaskSlotState **slot_states = nullptr;  // Caller-owned backing array (not freed here)
+    int count = 0;                              // Number of entries currently stored
+    int capacity = 0;                           // Size of slot_states[]
+
+    // Bind the buffer to caller-owned storage and mark it empty.
+    void reset(PTO2TaskSlotState **buf, int cap) {
+        slot_states = buf;
+        count = 0;
+        capacity = cap;
+    }
+
+    // Append one entry. Returns false when no storage is bound or the buffer
+    // is full, leaving the buffer unchanged.
+    bool try_push(PTO2TaskSlotState *s) {
+        if (slot_states && count < capacity) {
+            slot_states[count++] = s;
+            return true;
+        }
+        return false;
+    }
+
+    // Remove and return the most recently pushed entry (LIFO), or nullptr
+    // when the buffer is empty.
+    PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; }
+};
+
+/**
+ * Lock-free bounded MPMC queue (Dmitry Vyukov design)
+ *
+ * Key properties:
+ * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing)
+ * - Per-slot sequence counter prevents ABA problem
+ * - Empty queue pop returns immediately (single atomic load, no lock)
+ * - CAS contention is split: producers only touch enqueue_pos,
+ *   consumers only touch dequeue_pos
+ *
+ * Slot protocol used by every method below: a slot at position `p` is free
+ * for a producer when sequence == p, holds data for a consumer when
+ * sequence == p + 1, and is recycled for the next lap by storing
+ * p + capacity (written as p + mask + 1).
+ */
+struct alignas(64) PTO2ReadyQueue {
+    PTO2ReadyQueueSlot *slots;
+    uint64_t capacity;
+    uint64_t mask;        // capacity - 1
+    char _pad0[64 - 24];  // Pad to own cache line
+
+    std::atomic<uint64_t> enqueue_pos;
+    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    std::atomic<uint64_t> dequeue_pos;
+    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    // Approximate element count: both positions are read with relaxed loads,
+    // and a transiently inverted pair is clamped to 0.
+    uint64_t size() {
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        return (e >= d) ? (e - d) : 0;
+    }
+
+    // Enqueue one item. Returns false when the queue is full; retries while
+    // another producer is racing for the same position.
+    bool push(PTO2TaskSlotState *slot_state) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    break;
+                }
+            } else if (diff < 0) {
+                return false;  // Queue full
+            }
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+
+    // Batch push: reserve count slots with a single CAS after confirming
+    // every target slot is available under the usual Vyukov sequence check.
+    //
+    // There is no "full" result: the call spins until `count` consecutive
+    // slots are free, so it never returns if count exceeds the capacity.
+    // A non-positive count is a no-op (a negative value would otherwise move
+    // enqueue_pos backwards through the unsigned addition below).
+    void push_batch(PTO2TaskSlotState **items, int count) {
+        if (count <= 0) return;
+
+        uint64_t pos;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            bool ready = true;
+            for (int i = 0; i < count; i++) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + i);
+                if (diff != 0) {
+                    ready = false;
+                    break;
+                }
+            }
+            if (!ready) {
+                continue;
+            }
+            if (enqueue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            slot->slot_state = items[i];
+            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
+        }
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    // Profiling variant of push(): adds the number of atomic operations
+    // performed to `atomic_count` and, when the call had to retry, the cycles
+    // spent to `wait_cycle`.
+    bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            atomic_ops += 2;  // enqueue_pos.load + sequence.load
+            if (diff == 0) {
+                if (enqueue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                // Account for the work done before giving up, matching the
+                // queue-empty path of the profiling pop().
+                atomic_count += atomic_ops;
+                return false;  // Queue full
+            } else {
+                contended = true;  // diff > 0: slot not yet released, spin
+            }
+        }
+        atomic_ops++;  // final sequence.store
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+#endif
+
+    // Dequeue one item, or return nullptr when the queue is empty.
+    PTO2TaskSlotState *pop() {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    ))
+                    break;
+            } else if (diff < 0) {
+                return nullptr;  // Queue empty
+            }
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+
+#if PTO2_SCHED_PROFILING
+    // Profiling variant of pop(): same result, plus atomic-operation and
+    // contention-cycle accounting.
+    PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        atomic_count += 2;  // dequeue_pos.load + enqueue_pos.load
+        if (d >= e) {
+            return nullptr;
+        }
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            atomic_ops += 2;  // dequeue_pos.load + sequence.load
+            if (diff == 0) {
+                if (dequeue_pos.compare_exchange_weak(
+                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
+                    )) {
+                    atomic_ops++;  // successful CAS
+                    break;
+                }
+                contended = true;
+                atomic_ops++;  // failed CAS
+            } else if (diff < 0) {
+                atomic_count += atomic_ops;
+                return nullptr;  // Queue empty
+            } else {
+                contended = true;
+            }
+        }
+        atomic_ops++;  // final sequence.store
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+#endif
+
+    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
+    // Returns actual number of items popped (may be less than max_count).
+    // The scan stops at the first slot that is not yet filled; if a slot was
+    // already taken by another consumer the scan restarts from the current
+    // dequeue position.
+    int pop_batch(PTO2TaskSlotState **out, int max_count) {
+        uint64_t pos;
+        int count;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                count = -1;
+                break;
+            }
+            if (count == 0) return 0;
+            if (count < 0) continue;
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                break;
+            }
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+        }
+        return count;
+    }
+
+#if PTO2_SCHED_PROFILING
+    // Profiling variant of pop_batch(): same result, plus atomic-operation
+    // and contention-cycle accounting.
+    int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) {
+        uint64_t pos;
+        int count;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        bool contended = false;
+        uint32_t atomic_ops = 0;
+        while (true) {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            atomic_ops++;  // dequeue_pos.load
+            count = 0;
+            while (count < max_count) {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                atomic_ops++;  // sequence.load
+                if (diff == 0) {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) {
+                    break;
+                }
+                contended = true;
+                count = -1;
+                break;
+            }
+            if (count == 0) {
+                atomic_count += atomic_ops;
+                return 0;
+            }
+            if (count < 0) {
+                continue;
+            }
+            if (dequeue_pos.compare_exchange_weak(
+                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
+                )) {
+                atomic_ops++;  // successful CAS
+                break;
+            }
+            contended = true;
+            atomic_ops++;  // failed CAS
+        }
+
+        for (int i = 0; i < count; i++) {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+            atomic_ops++;  // sequence.store
+        }
+        atomic_count += atomic_ops;
+        if (contended) {
+            wait_cycle += (get_sys_cnt_aicpu() - t0);
+        }
+        return count;
+    }
+#endif
+};
+
+// Cold-path ready queue operations. Declared as non-member so PTO2ReadyQueue
+// stays a POD-like struct with cache-line alignment. Storage is owned by the
+// caller-supplied arena.
+// NOTE(review): the pto_scheduler.cpp added by this patch does not contain
+// these definitions; confirm which translation unit provides them.
+//   ready_queue_reserve_layout:        declare the slots[] region on the arena
+//                                      (must precede commit)
+//   ready_queue_init_data_from_layout: initialize the slot array and the
+//                                      queue's scalar fields
+//   ready_queue_wire_arena_pointers:   bind queue->slots to the arena region
+//   ready_queue_destroy:               forget the slots pointer (arena owns
+//                                      the buffer)
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
+// Writes everything *except* the arena-internal `slots` pointer field
+// (sequences/positions on the slot array, capacity, mask). Uses
+// arena.region_ptr(slots_off) only to address the slot array for writes;
+// does NOT store the pointer in `queue->slots`. Call
+// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off); +void ready_queue_destroy(PTO2ReadyQueue *queue); + +// ============================================================================= +// SPSC Queue (Single-Producer Single-Consumer, wait-free) +// ============================================================================= +// +// Bounded ring buffer optimized for the wiring queue use case: +// - Producer: orchestrator thread (push) +// - Consumer: scheduler thread 0 (pop_batch) +// +// Design based on Rigtorp's cached-index technique: each side caches +// the other's index locally, avoiding cross-core cache line bouncing +// on the hot path. Only when the local cache says "full" or "empty" +// does the thread issue an acquire load on the remote index. +// +// Memory layout: 5 cache-line-aligned fields ensure zero false sharing. + +struct alignas(64) PTO2SpscQueue { + // --- Producer cache lines (orchestrator thread) --- + alignas(64) std::atomic head_{0}; + alignas(64) uint64_t tail_cached_{0}; + + // --- Consumer cache lines (scheduler thread 0) --- + alignas(64) std::atomic tail_{0}; + alignas(64) uint64_t head_cached_{0}; + + // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) --- + alignas(64) PTO2TaskSlotState **buffer_{nullptr}; + uint64_t mask_{0}; + + // Padding to exactly 5 cache lines + char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)]; + + // Reserve the backing buffer region on the supplied arena. Returns the + // region offset, to be passed to init_from_layout() after the arena is + // committed. Cache-line aligned: the buffer is shared between the + // orchestrator (push) and scheduler thread 0 (pop_batch), so its base + // must not false-share with neighboring regions. 
+ static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) { + return arena.reserve(capacity * sizeof(uintptr_t), PTO2_ALIGN_SIZE); + } + + // Writes everything except the arena-internal `buffer_` pointer field + // (zeros the slot pointer array, mask/head/tail). The host pre-builds the + // image without storing a host address in buffer_; the AICPU wires + // buffer_ at boot via wire_arena_pointers(). + bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { + if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false; + auto *buf = static_cast(arena.region_ptr(buffer_off)); + // calloc'd-equivalent: zero the slot pointers so spurious early pops + // observe nullptr. + for (uint64_t i = 0; i < capacity; i++) + buf[i] = nullptr; + mask_ = capacity - 1; + head_.store(0, std::memory_order_relaxed); + tail_.store(0, std::memory_order_relaxed); + tail_cached_ = 0; + head_cached_ = 0; + return true; + } + + // Wire the arena-internal pointer. Called by both host (with host arena) + // and AICPU (with device arena attached to the prebuilt image). + void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) { + buffer_ = static_cast(arena.region_ptr(buffer_off)); + } + + // Arena owns the buffer; here we only forget our pointer. + void destroy() { buffer_ = nullptr; } + + // Push one item (producer only). Returns false if queue is full. + // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the + // effective usable capacity is capacity-1 (one slot is wasted as a + // sentinel to distinguish full from empty). uint64_t wrapping is safe + // since head and tail are monotonically increasing and subtraction + // wraps correctly. 
+ bool push(PTO2TaskSlotState *item) { + uint64_t h = head_.load(std::memory_order_relaxed); + uint64_t next_h = h + 1; + if (next_h - tail_cached_ > mask_) { + tail_cached_ = tail_.load(std::memory_order_acquire); + if (next_h - tail_cached_ > mask_) { + return false; + } + } + buffer_[h & mask_] = item; + head_.store(next_h, std::memory_order_release); + return true; + } + + // Pop up to max_count items (consumer only). Returns actual count. + int pop_batch(PTO2TaskSlotState **out, int max_count) { + uint64_t t = tail_.load(std::memory_order_relaxed); + uint64_t avail = head_cached_ - t; + if (avail < static_cast(max_count)) { + head_cached_ = head_.load(std::memory_order_acquire); + avail = head_cached_ - t; + if (avail == 0) return 0; + } + int count = (avail < static_cast(max_count)) ? static_cast(avail) : max_count; + for (int i = 0; i < count; i++) { + out[i] = buffer_[(t + i) & mask_]; + } + tail_.store(t + count, std::memory_order_release); + return count; + } + + // Approximate size (used for backoff decisions, not exact). + uint64_t size() const { + uint64_t h = head_.load(std::memory_order_acquire); + uint64_t t = tail_.load(std::memory_order_acquire); + return h - t; + } +}; + +static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)"); +// ============================================================================= + +/** + * Statistics returned by mixed-task completion processing + */ +struct CompletionStats { + int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) + int32_t tasks_enqueued; // Number of consumers that became READY + int32_t fanin_edges; // Number of fanin edges traversed (release producers) + bool mixed_task_completed; // True only when this callback completed a mixed task +}; + +/** + * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). 
Holds + * the arena offsets of every sub-region the scheduler needs plus the + * capacities used at layout time (init_from_layout reuses them). + */ +struct PTO2SchedulerLayout { + size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES]; + size_t off_dummy_ready_queue_slots; + size_t off_early_dispatch_queue_slots; + size_t off_dep_pool_entries[PTO2_MAX_RING_DEPTH]; + size_t off_wiring_spsc_buffer; + uint64_t ready_queue_capacity; + uint64_t spsc_capacity; + int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; +}; + +/** + * Scheduler state structure + * + * Contains dynamic state updated during task execution. + * Separated from shared memory for cache efficiency. + * Hot-path methods are defined inline (implicitly inline as member functions). + */ +struct PTO2SchedulerState { + // Shared memory access + PTO2SharedMemoryHeader *sm_header; + + // Per-ring state + struct alignas(64) RingSchedState { + // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) --- + PTO2SharedMemoryRingHeader *ring; + int32_t last_task_alive; + std::atomic advance_lock; // multi-thread CAS + + // --- Cache Line 1+: Thread 0 only (wiring dep_pool) --- + alignas(64) PTO2DepListPool dep_pool; +#if PTO2_PROFILING + // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly. + alignas(64) std::atomic dep_pool_snapshot_tail; + std::atomic dep_pool_snapshot_top; +#endif + + // Initialize arena-internal data + arena-external pointers; does NOT + // store dep_pool.base (that lives in the runtime arena and is wired + // by SchedulerState::wire_arena_pointers). The `ring` field stores + // the device address of the SM ring header — computed via offset + // arithmetic, no SM dereference. 
+ bool init_data_from_layout(void *sm_dev_base, int32_t ring_id); + void destroy(); + + void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); } + +#if PTO2_PROFILING + void publish_dep_pool_snapshot() { + dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release); + dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release); + } + + void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const { + top = dep_pool_snapshot_top.load(std::memory_order_acquire); + tail = dep_pool_snapshot_tail.load(std::memory_order_acquire); + if (tail > top) tail = top; + } +#endif + + void advance_ring_pointers() { + int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire); + int32_t old_last_task_alive = last_task_alive; + + while (last_task_alive < current_task_index) { + PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive); + if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) { + break; + } + last_task_alive++; + } + + // Eager reset: prepare reclaimed slots for reuse while still hot in cache. + // Safe because last_task_alive has advanced past these slots but + // sync_to_sm has not yet published — the orchestrator cannot reuse + // them until the release store below. + // Skips payload, task, ring_id — immutable after RingSchedState::init(). + for (int32_t id = old_last_task_alive; id < last_task_alive; id++) { + ring->get_slot_state_by_task_id(id).reset_for_reuse(); + } + + sync_to_sm(); + } + } ring_sched_states[PTO2_MAX_RING_DEPTH]; + + // Ready queues remain global (scheduling is ring-agnostic) + PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES]; + + // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by + // the dispatch loop and completed inline -- never goes to AICore. + PTO2ReadyQueue dummy_ready_queue; + + // Wiring subsystem — groups all wiring-related state for cache-line isolation. 
+ // + // Three cache-line regions by writer: + // 1. batch_* / backoff — thread 0 exclusive (local batch buffer) + // 2. queue — SPSC: orchestrator push, thread 0 pop + // 3. orch_needs_drain — orchestrator write, thread 0 read + struct alignas(64) WiringState { + static constexpr uint64_t BATCH_SIZE = 30; + static constexpr int BACKOFF_LIMIT = 32; + + // --- Thread 0 exclusive: local batch buffer + backoff --- + int batch_count = 0; + int batch_index = 0; + int backoff_counter = 0; + PTO2TaskSlotState *batch[BATCH_SIZE]; + + // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) --- + PTO2SpscQueue queue; + + // --- Orchestrator write, thread 0 read --- + alignas(64) std::atomic orch_needs_drain{false}; + } wiring; + + static_assert( + offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue" + ); + static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)"); + + alignas(64) AsyncWaitList async_wait_list; + + // Statistics (cold path, isolated from hot-path fields) +#if PTO2_SCHED_PROFILING + alignas(64) std::atomic tasks_completed; + std::atomic tasks_consumed; +#endif + // ========================================================================= + // Inline hot-path methods + // ========================================================================= + + /** + * Drain wiring queue: pop submitted tasks and wire their fanout edges. + * Called by scheduler thread 0 each loop iteration. Sets fanin_count, + * acquires fanout_lock per producer, allocates dep_pool entries, and + * pushes ready tasks to the appropriate ready queue. + * + * @return Number of tasks wired this call. + */ + + int drain_wiring_queue(bool force_drain = false) { + int wired = 0; + + // Refill local batch buffer when exhausted. 
+ if (wiring.batch_index >= wiring.batch_count) { + // Backoff: defer pop when queue holds fewer than a full batch, + // unless force_drain, orch_needs_drain, or backoff limit reached. + if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) { + if (!wiring.orch_needs_drain.load(std::memory_order_acquire) && + wiring.backoff_counter < WiringState::BACKOFF_LIMIT) { + wiring.backoff_counter++; + return 0; + } + } + wiring.backoff_counter = 0; + wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE); + wiring.batch_index = 0; + if (wiring.batch_count == 0) return 0; + } + + // Process tasks from local buffer in strict FIFO order. + while (wiring.batch_index < wiring.batch_count) { + PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index]; + int ring_id = ws->ring_id; + auto &rss = ring_sched_states[ring_id]; + int32_t wfanin = ws->payload->fanin_actual_count; + + if (wfanin > 0 && rss.dep_pool.available() < wfanin) { + rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive); + if (rss.dep_pool.available() < wfanin) { +#if PTO2_PROFILING + if (is_scope_stats_enabled()) { + rss.publish_dep_pool_snapshot(); + } +#endif + break; // not enough dep_pool space — keep remainder for next call + } + } + + wiring.batch_index++; + wire_task(rss, ws, wfanin); + wired++; + } + + return wired; + } + + // Route a ready slot to the right global queue. Dummy tasks (empty + // active_mask) live in dummy_ready_queue; everything else goes to the + // per-shape ready_queues[]. Used by paths that do not have a thread-local + // ready buffer (e.g. wiring). See push_ready_routed_local for the + // dispatch-time fast path. + void push_ready_routed(PTO2TaskSlotState *slot_state) { + PTO2ResourceShape shape = slot_state->active_mask.to_shape(); + if (shape == PTO2ResourceShape::DUMMY) { + dummy_ready_queue.push(slot_state); + } else { + ready_queues[static_cast(shape)].push(slot_state); + } + } + + /** + * Wire fanout edges for a single task. 
     * Sets fanin_count, acquires each
     * producer's fanout_lock, allocates dep_pool entries for live producers,
     * pushes the task to the ready queue once its fanin refcount is satisfied.
     *
     * @param rss     Ring state whose dep_pool supplies the fanout list entries.
     * @param ws      Slot state of the task being wired (the consumer).
     * @param wfanin  Number of producer edges (payload->fanin_actual_count).
     */
    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
        PTO2TaskPayload *wp = ws->payload;
        // +1 is the wiring's own reference, added back through init_rc below.
        ws->fanin_count = wfanin + 1;

        if (wfanin != 0) {
            int32_t early_finished = 0;
            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
                producer->lock_fanout();
                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
                if (pstate >= PTO2_TASK_COMPLETED) {
                    // Producer already done: count it here instead of linking an edge.
                    early_finished++;
                } else {
                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
                }
                producer->unlock_fanout();
            });

            // Seed dispatch_fanin with producers already complete at wiring
            // time (e.g. buffer-creator tasks that finished before this
            // consumer entered the graph).  Such producers never dispatch at
            // runtime, so they can never bump dispatch_fanin via the fanout
            // walk; without this seed the candidate compare
            // (dispatch_fanin == fanin_actual_count) would be unreachable
            // whenever any producer is pre-completed.  Mirrors the
            // early_finished seed that ready_fanin gets via init_rc.
            if (early_finished != 0) {
                wp->dispatch_fanin.fetch_add(early_finished, std::memory_order_acq_rel);
            }

            int32_t init_rc = early_finished + 1;
            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
            if (new_rc >= ws->fanin_count) {
                push_ready_routed(ws);
            }
        } else {
            // No producers: the wiring reference alone makes the task ready.
            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
            push_ready_routed(ws);
        }

        ws->dep_pool_mark = rss.dep_pool.top;
#if PTO2_PROFILING
        if (is_scope_stats_enabled()) {
            rss.publish_dep_pool_snapshot();
        }
#endif
    }

    // Transition a COMPLETED task to CONSUMED once fanout_refcount equals
    // fanout_count, then opportunistically advance the owning ring's reclaim
    // frontier.  The CAS makes the transition happen at most once per task.
    void check_and_handle_consumed(PTO2TaskSlotState &slot_state) {
        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;

        PTO2TaskState expected = PTO2_TASK_COMPLETED;
        if (!slot_state.task_state.compare_exchange_strong(
                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
            )) {
            return;
        }

#if PTO2_SCHED_PROFILING
        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
#endif

        int32_t ring_id = slot_state.ring_id;
        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
        int32_t expected_lock = 0;
        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
            )) {
            ring_sched_states[ring_id].advance_ring_pointers();
            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
        }
    }

#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
    // Profiling variant of check_and_handle_consumed: same state transitions,
    // additionally adds the number of atomic operations performed to
    // `atomic_count`.
    void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
        int32_t fc = slot_state.fanout_count;
        int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire);

        atomic_count += 2;  // fanout_count.load + fanout_refcount.load

        if (rc != fc) return;

        PTO2TaskState expected = PTO2_TASK_COMPLETED;
        if (!slot_state.task_state.compare_exchange_strong(
                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
            )) {
            atomic_count += 1;  // failed CAS
            return;
        }

        atomic_count += 1;  // successful CAS

#if PTO2_SCHED_PROFILING
        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
#endif

        int32_t ring_id = slot_state.ring_id;
        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
        int32_t expected_lock = 0;
        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
            )) {
            ring_sched_states[ring_id].advance_ring_pointers();
            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
            atomic_count += 2;  // try-lock CAS + unlock store
        } else {
            atomic_count += 1;  // failed try-lock CAS
        }
    }
#endif

    // Drop one reference on a producer (bump fanout_refcount) and check
    // whether that makes it CONSUMED.
    void release_producer(PTO2TaskSlotState &slot_state) {
        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
        check_and_handle_consumed(slot_state);
    }

#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
    // Profiling variant of release_producer; accumulates atomic-op counts.
    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
        atomic_count += 1;  // fanout_refcount.fetch_add
        check_and_handle_consumed(slot_state, atomic_count);
    }
#endif

    // Speculative early-dispatch release.  If the now-ready task was pre-staged
    // (gated on a core), ring its DATA_MAIN_BASE high-32 doorbell RIGHT HERE in
    // the completion path — the moment its last producer's FIN satisfies fanin —
    // instead of routing it through the ready queue and waiting for the dispatch
    // pass to pop it.  Returns true if the task is fully handled (caller must NOT
    // push to the ready queue).  Returns false when the caller must route C
    // normally: either it was never pre-staged, OR it is a SPMD consumer only
    // PARTIALLY pre-staged — the gated blocks are released by the doorbells rung
    // here, and the remaining (next_block_idx ..
    // logical_block_num) blocks
    // dispatch normally off the ready queue.  Lock-free claim shared with Hook 1
    // (the stager): CAS NONE->DISPATCHED wins => not pre-staged; lose => STAGED,
    // so flip STAGING->DISPATCHED, read the staged core mask, then ring.
    // (try_speculative_release does not spin on STAGING.)

    // Per-core speculative doorbell table.  Hook 1 records each gated core's
    // (reg_addr, dispatch token) here at stage time; the completion-path release
    // reads it back for the cores set in the consumer's staged_core_mask.  One
    // global table indexed by core_id (not per-task): gated cores in flight are
    // bounded by the chip's core count (no two-level pre-dispatch), so this is the
    // natural capacity and removes the old per-task 3-doorbell cap.
    struct SpecDoorbell {
        uint64_t addr{0};   // register base handed to ring_one_doorbell()
        uint32_t token{0};  // dispatch token written into the doorbell
    };
    // One entry per bit of staged_core_mask (PTO2_SPEC_CORE_MASK_WORDS 64-bit words).
    SpecDoorbell spec_doorbell_table[PTO2_SPEC_CORE_MASK_WORDS * 64]{};

    // Cross-thread early-dispatch work queue (a PTO2ReadyQueue MPMC instance,
    // arena-backed — reserved/wired in pto_runtime2_init alongside the ready queues).
    // A consumer's SPMD blocks span cores owned by several AICPU threads, but only a
    // thread RUNNING the consumer's producer discovers it (via the producer's
    // fanout).  When that producer is thread-local (e.g. a 16-block AIV op filling one
    // thread's cores), the other threads never see the consumer and its blocks on
    // their cores can't pre-stage.  The first claimer pushes the partially-staged
    // consumer here; every idle thread's early_dispatch pass pops one, stages a range onto
    // ITS OWN cores (range-claim via next_block_idx), and re-pushes if blocks remain
    // — exactly mirroring how a partially-dispatched SPMD task is re-pushed to the
    // ready queue (scheduler_dispatch: pop -> claim -> re-push).  A stale/released
    // entry fails the STAGING check on pop and is dropped; a push that overflows is
    // logged and the consumer's blocks fall back to normal dispatch.
+ PTO2ReadyQueue early_dispatch_queue; + + static inline void ring_one_doorbell(uint64_t reg_addr, uint32_t token) { + volatile uint64_t *dmb = reinterpret_cast(get_reg_ptr(reg_addr, RegId::DATA_MAIN_BASE)); + uint64_t tk = static_cast(token); + *dmb = (tk << 32) | tk; // 64-bit STR: high=low=token releases the gated AICore + } + + // auto-chain depth cap: a candidate inherits the flag only while depth < this. + static constexpr uint8_t PTO2_SPEC_CHAIN_MAX = 4; + + // Event-driven candidate detection (the dual of fanin_refcount/ready). Call when a + // FLAGGED producer `p` DISPATCHES (starts running): walk its fanout and bump each + // consumer's dispatch_fanin. A consumer whose dispatch_fanin reaches + // fanin_actual_count (= every producer is either flagged-and-dispatched, or was + // already complete when the consumer was wired) is an early-dispatch candidate: + // CAS NONE->STAGING (exactly-once) and push to early_dispatch_queue for the idle drain to + // pre-stage. Once-guarded per producer so an SPMD producer's block-by-block + // dispatch propagates once. Replaces the old per-iteration pass-1 PULL scan. + void propagate_dispatch_fanin(PTO2TaskSlotState &p) { + if (!(p.payload->allow_early_resolve || p.payload->spec_chain_active.load(std::memory_order_acquire))) + return; // only flagged (codegen or inherited) producers propagate + if (p.payload->dispatch_propagated.exchange(1, std::memory_order_acq_rel) != 0) + return; // already propagated once + uint8_t child_depth = static_cast(p.payload->spec_chain_depth + 1); + p.lock_fanout(); + PTO2DepListEntry *edge = p.fanout_head; // snapshot head, walk lock-free (fanout stable by dispatch) + p.unlock_fanout(); + for (; edge != nullptr; edge = edge->next) { + PTO2TaskSlotState *c = edge->slot_state; + // Compare to fanin_actual_count (the real producer-edge count), NOT + // fanin_count: fanin_count = fanin_actual_count + 1 (a self/wiring +1 that + // ready_fanin gets but dispatch_fanin does not). 
dispatch_fanin starts at + // the wiring-time early_finished seed (producers already complete) and is + // bumped here by flagged producers; reaching fanin_actual_count means every + // producer is flagged-dispatched or was pre-completed. + int32_t nf = c->payload->dispatch_fanin.fetch_add(1, std::memory_order_acq_rel) + 1; + if (nf != c->payload->fanin_actual_count) continue; + if (c->active_mask.requires_sync_start()) continue; // sync_start can't be block-by-block pre-staged + PTO2ResourceShape shape = c->active_mask.to_shape(); + if (shape != PTO2ResourceShape::AIC && shape != PTO2ResourceShape::AIV && shape != PTO2ResourceShape::MIX) + continue; + uint8_t expect = PTO2_SPEC_NONE; // exactly-once: only the CAS winner enqueues + if (!c->payload->spec_state.compare_exchange_strong( + expect, PTO2_SPEC_STAGING, std::memory_order_seq_cst, std::memory_order_seq_cst + )) + continue; + if (child_depth < PTO2_SPEC_CHAIN_MAX) { // auto-chain: C propagates to ITS consumers + c->payload->spec_chain_depth = child_depth; + c->payload->spec_chain_active.store(1, std::memory_order_release); + } + early_dispatch_queue.push(c); + } + } + + // Collects consumers released via the speculative-doorbell path during a + // single on_task_complete fanout walk, so their dispatch_fanin + // propagation runs AFTER the walk — never between two siblings' doorbells. + struct SpecReleaseSink { + static constexpr int CAP = 32; + PTO2TaskSlotState *items[CAP]; + int n = 0; + inline bool push(PTO2TaskSlotState *s) { + if (n >= CAP) return false; + items[n++] = s; + return true; + } + }; + + inline bool try_speculative_release(PTO2TaskSlotState &slot_state, SpecReleaseSink *sink = nullptr) { + // Never staged => CAS NONE->DISPATCHED wins => dispatch normally. + uint8_t expect = PTO2_SPEC_NONE; + if (slot_state.payload->spec_state.compare_exchange_strong( + expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst + )) { + return false; + } + // Staged (STAGING). 
Flip STAGING->DISPATCHED, THEN read the mask. seq_cst + // gives a total order with the concurrent stagers, each of which OR-s its + // core into the mask and THEN loads spec_state: a stager whose bit lands + // before this CAS is read here and rung; a stager whose bit lands after + // sees DISPATCHED and rings that core itself (self-ring in + // stage_consumer_blocks). Either way every gated core's doorbell fires once + // (a double-ring is harmless — the AICore already matched). This replaces + // the old transient-STAGING spin: STAGING is now the stable gated state. + expect = PTO2_SPEC_STAGING; + slot_state.payload->spec_state.compare_exchange_strong( + expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst + ); + for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) { + uint64_t bits = slot_state.payload->staged_core_mask[w].load(std::memory_order_seq_cst); + while (bits != 0) { + int core_id = w * 64 + __builtin_ctzll(bits); + bits &= bits - 1; + ring_one_doorbell(spec_doorbell_table[core_id].addr, spec_doorbell_table[core_id].token); + } + } + // This pre-staged consumer was just released by its doorbell — it starts + // running NOW, so propagate dispatch_fanin to ITS consumers (auto-chain, + // knob A). Defer it via the sink so it runs after the whole fanout walk: + // doing it inline here would delay the doorbells of later consumers in the + // same producer's fanout. Fallback to inline if no sink / sink full. + if (sink == nullptr || !sink->push(&slot_state)) { + propagate_dispatch_fanin(slot_state); + } + // No explicit removal from the cross-thread queue: a still-queued entry for + // this consumer is now DISPATCHED and is dropped when a peer pops it. + // Fully pre-staged => skip the ready queue. Partially staged SPMD consumer => + // fall through so the caller pushes C; dispatch resumes from next_block_idx. 
+ return slot_state.next_block_idx.load(std::memory_order_seq_cst) >= slot_state.logical_block_num; + } + + bool release_fanin_and_check_ready( + PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr + ) { + // Atomically increment fanin_refcount and check if all producers are done + // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's + // init release, making fanin_count visible — plain load suffices. + int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; + + if (new_refcount == slot_state.fanin_count) { + // Speculative early-dispatch: pre-staged tasks are released by doorbell + // here, skipping the ready-queue round-trip entirely. + if (try_speculative_release(slot_state, sink)) return true; + // Local-first: try per-CoreType thread-local buffer before global queue + // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1] + // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES); + // dummy slots bypass the local fast path and go straight to dummy_ready_queue. 
+ PTO2ResourceShape shape = slot_state.active_mask.to_shape(); + if (shape == PTO2ResourceShape::DUMMY) { + dummy_ready_queue.push(&slot_state); + } else if (!local_bufs || !local_bufs[static_cast(shape)].try_push(&slot_state)) { + ready_queues[static_cast(shape)].push(&slot_state); + } + return true; + } + return false; + } + +#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING + bool release_fanin_and_check_ready( + PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait, + PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr + ) { + int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1; + atomic_count += 1; // fanin_refcount.fetch_add + + if (new_refcount == slot_state.fanin_count) { + // Speculative early-dispatch: pre-staged tasks are released by doorbell + // here, skipping the ready-queue round-trip entirely. + if (try_speculative_release(slot_state, sink)) return true; + // Local-first: try per-CoreType thread-local buffer before global queue. + // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES) + // and go straight to dummy_ready_queue; use the profiling-aware push so + // atomic_count / push_wait stay consistent with the non-dummy path. 
+ PTO2ResourceShape shape = slot_state.active_mask.to_shape(); + if (shape == PTO2ResourceShape::DUMMY) { + dummy_ready_queue.push(&slot_state, atomic_count, push_wait); + } else if (!local_bufs || !local_bufs[static_cast(shape)].try_push(&slot_state)) { + ready_queues[static_cast(shape)].push(&slot_state, atomic_count, push_wait); + } + return true; + } + return false; + } +#endif + + int get_ready_tasks_batch( + PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count + ) { + int count = 0; + while (count < max_count && local_buf.count > 0) { + out[count++] = local_buf.slot_states[--local_buf.count]; + } + int remaining = max_count - count; + if (remaining > 0) { + count += ready_queues[static_cast(shape)].pop_batch(out + count, remaining); + } + return count; + } + +#if PTO2_SCHED_PROFILING + int get_ready_tasks_batch( + PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count, + uint64_t &atomic_count, uint64_t &wait_cycle + ) { + int count = 0; + while (count < max_count && local_buf.count > 0) { + out[count++] = local_buf.slot_states[--local_buf.count]; + } + int remaining = max_count - count; + if (remaining > 0) { + count += + ready_queues[static_cast(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle); + } + return count; + } +#endif + + void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) { +#if PTO2_ORCH_PROFILING + extern uint64_t g_orch_scope_end_atomic_count; + if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0); + for (int32_t i = 0; i < count; i++) { + if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0); + release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count); + } +#else + if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0); + for (int32_t i = 0; i < count; i++) { + if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0); + release_producer(*task_slot_states[i]); + } +#endif 
+ } + + /** + * Subtask completion: atomic counter model. + * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block. + * Atomically increments completed_subtasks and checks whether all subtasks + * across all blocks are done. + * + * @return true if this was the last subtask, completing the entire task. + */ + bool on_subtask_complete(PTO2TaskSlotState &slot_state) { + int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel); + return (prev + 1) == slot_state.total_required_subtasks; + } + + /** + * Two-stage completion: second stage. + * Called exactly once when all subtasks of a task are done (i.e., + * on_subtask_complete returned true). Walks the consumer (fanout) list, + * decrements each consumer's fanin, pushes newly-ready ones, and rings + * doorbells for speculative hits. + * + * Non-PROFILING returns the consumer-walk count (= edges traversed). The + * Resolve swimlane bar reads it to label the bar with how many successors + * actually got resolved. PROFILING returns the richer CompletionStats + * whose `fanout_edges` carries the same number. 
     *
     * @param slot_state  The task that just finished all of its subtasks.
     * @param thread_idx  (PROFILING builds only) index into the per-thread
     *                    g_sched_* counter arrays.
     * @param local_bufs  Optional thread-local ready buffers, forwarded to
     *                    release_fanin_and_check_ready.
     */
#if PTO2_SCHED_PROFILING
    CompletionStats
#else
    uint32_t
#endif
    on_task_complete(
        PTO2TaskSlotState &slot_state,
#if PTO2_SCHED_PROFILING
        int thread_idx,
#endif

        PTO2LocalReadyBuffer *local_bufs = nullptr
    ) {
#if PTO2_SCHED_PROFILING
        CompletionStats stats = {0, 0, 0, true};
#else
        uint32_t consumer_walk_count = 0;
#endif
#if PTO2_SCHED_PROFILING
        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
        uint64_t lock_atomics = 0, lock_wait = 0;
        PTO2_SCHED_CYCLE_START();
#endif

        // Mark COMPLETED and snapshot the fanout head under fanout_lock.
        // wire_task reads task_state under the same lock, so a consumer wired
        // after this point counts the producer as early_finished instead of
        // linking a new edge.
#if PTO2_SCHED_PROFILING
        slot_state.lock_fanout(lock_atomics, lock_wait);
#else
        slot_state.lock_fanout();
#endif
        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
        slot_state.unlock_fanout();

#if PTO2_SCHED_PROFILING
        lock_atomics += 2;  // state.store + unlock.store
        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
#endif

        // Fanout: notify consumers.  A pre-staged consumer that becomes ready has
        // its doorbell rung INLINE the moment its node is reached,
        // not batched to after the whole walk — so a flagged consumer near the
        // front of the list starts immediately and overlaps the remaining
        // release_fanin work for the other consumers, instead of waiting for the
        // full O(fanout-degree) walk (~5us for a 50-consumer producer).
        //
        // Safe on silicon: the producer's slot is already COMPLETED here — every
        // SPMD block has FIN'd AND dcci-flushed its output to HBM before
        // on_task_complete runs — so a released consumer never reads stale
        // producer output.  (Batching used to align the released wave, but pushed
        // every doorbell to the end of the walk, defeating the whole point of
        // speculative early-dispatch: minimal producer-end -> consumer-start.)
#if PTO2_SCHED_PROFILING
        uint64_t fanout_atomics = 0, push_wait = 0;
#endif
        // Doorbells for released pre-staged consumers fire INLINE in the walk
        // below; their dispatch_fanin propagation is collected here and replayed
        // after the walk, so no consumer's doorbell waits on a sibling's propagate.
        SpecReleaseSink rel_sink;
        while (current != nullptr) {
            PTO2TaskSlotState &consumer_slot = *current->slot_state;
#if PTO2_SCHED_PROFILING
            stats.fanout_edges++;
            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs, &rel_sink)) {
                stats.tasks_enqueued++;
            }
#else
            consumer_walk_count++;
            release_fanin_and_check_ready(consumer_slot, local_bufs, &rel_sink);
#endif
            current = current->next;
        }
        // Replay the deferred propagations collected during the walk.
        for (int i = 0; i < rel_sink.n; i++) {
            propagate_dispatch_fanin(*rel_sink.items[i]);
        }

#if PTO2_SCHED_PROFILING
        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
        g_sched_push_wait_cycle[thread_idx] += push_wait;
        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
        return stats;
#else
        return consumer_walk_count;
#endif
    }

    /**
     * Cold path: release producers (fanin traversal) + check self for CONSUMED.
     * Returns fanin edge count for profiling.
+ */ + +#if PTO2_SCHED_PROFILING + int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) { + PTO2_SCHED_CYCLE_START(); + extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[]; + extern uint64_t g_sched_self_atomic_count[]; + extern uint64_t g_sched_self_consumed_cycle[]; + extern uint64_t g_sched_complete_count[]; + uint64_t fanin_atomics = 0; +#else + int32_t on_task_release(PTO2TaskSlotState &slot_state) { +#endif + PTO2TaskPayload *payload = slot_state.payload; + for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) { +#if PTO2_SCHED_PROFILING + release_producer(*producer_slot_state, fanin_atomics); +#else + release_producer(*producer_slot_state); +#endif + }); +#if PTO2_SCHED_PROFILING + g_sched_fanin_atomic_count[thread_idx] += fanin_atomics; + PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]); +#endif + + // Self consumed check +#if PTO2_SCHED_PROFILING + uint64_t self_atomics = 0; + check_and_handle_consumed(slot_state, self_atomics); + g_sched_self_atomic_count[thread_idx] += self_atomics; + PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]); + g_sched_complete_count[thread_idx]++; +#else + check_and_handle_consumed(slot_state); +#endif + return payload->fanin_actual_count; + } + + // === Cold-path API (defined in pto_scheduler.cpp) === + + // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots, + // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena. + // Capacities are baked into the returned layout; init_data_from_layout uses + // the same values. + static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE); + static PTO2SchedulerLayout + reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]); + + // Phase 3a: write everything *except* arena-internal pointer fields. 
+ // `sm_dev_base` is the device address of the SM (only stored, never + // dereferenced here). Safe to call on a host arena that holds the + // prebuilt image buffer. (The orchestrator counterpart takes + // task_window_size for ring task_descriptors address arithmetic; the + // scheduler only needs the SM header / ring header base addresses, + // both window-size-independent.) + bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base); + + // Phase 3b: write the arena-internal pointer fields + // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each + // ring, wiring.queue.buffer_). Called on both host and device sides. + void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena); + + // Forget per-region pointers; arena owns the backing memory. + void destroy(); + void print_stats(); + void print_queues(); +}; + +// Scheduler cold-path API is declared as PTO2SchedulerState member functions. +// See init()/destroy()/print_stats()/print_queues() below the struct definition. + +// try_inline_complete_locked: short-circuit NotDeferred completions seen during +// drain so they don't grow entries[]. Defined here (not in pto_async_wait.h) +// because PTO2SchedulerState's on_task_complete signature is only known +// after its full definition above. +// +// When the deferred_release_slot_states[] buffer is full, drain it via +// on_task_release before appending — mirrors the same overflow-drain idiom +// that scheduler_completion.cpp's inline NotDeferred path uses, so high task +// rates don't surface as ASYNC_WAIT_OVERFLOW errors. +inline bool +AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) { + // Return value (CompletionStats / consumer-walk count) discarded: + // async-wait drain path has no Resolve swimlane bar attached. 
+#if PTO2_SCHED_PROFILING + (void)sink.sched->on_task_complete(slot_state, sink.thread_idx, sink.local_bufs); +#else + (void)sink.sched->on_task_complete(slot_state, sink.local_bufs); +#endif + if (*sink.deferred_release_count >= sink.deferred_release_capacity) { + while (*sink.deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sink.sched->on_task_release( + *sink.deferred_release_slot_states[--(*sink.deferred_release_count)], sink.thread_idx + ); +#else + sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]); +#endif + } + } + sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state; + sink.inline_completed++; + return true; +} + +template +inline AsyncPollResult AsyncWaitList::poll_and_complete( + AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs, + PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity +#if PTO2_SCHED_PROFILING + , + int thread_idx +#endif +) { + AsyncPollResult result; + if (!try_lock()) return result; + + AsyncWaitList::DrainCompletionSink sink{}; + sink.sched = sched; + sink.local_bufs = local_bufs; + sink.deferred_release_slot_states = deferred_release_slot_states; + sink.deferred_release_count = &deferred_release_count; + sink.deferred_release_capacity = deferred_release_capacity; +#if PTO2_SCHED_PROFILING + sink.thread_idx = thread_idx; +#endif + + int32_t drain_err = PTO2_ERROR_NONE; + drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err); + if (drain_err != PTO2_ERROR_NONE) { + result.error_code = drain_err; + unlock(); + return result; + } + result.completed += sink.inline_completed; + + for (int32_t i = count - 1; i >= 0; --i) { + AsyncWaitEntry &entry = entries[i]; + uintptr_t last_invalidated_counter_line = static_cast(-1); + for (int32_t c = 0; c < entry.condition_count; c++) { + CompletionCondition &cond = 
entry.conditions[c]; + if (cond.satisfied) continue; + if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) { + uintptr_t counter_line = mailbox_cache_line(cond.counter_addr); + if (counter_line != last_invalidated_counter_line) { + cache_invalidate_range(reinterpret_cast(counter_line), sizeof(uint32_t)); + last_invalidated_counter_line = counter_line; + } + } + CompletionPollResult poll = cond.test(); + if (poll.state == CompletionPollState::FAILED) { + result.error_code = poll.error_code; + result.failed_slot_state = entry.slot_state; + unlock(); + return result; + } + if (poll.state == CompletionPollState::READY) { + cond.satisfied = true; + cond.retire(); + entry.waiting_completion_count--; + } + } + + if (entry.normal_done && entry.waiting_completion_count <= 0) { + // Return value (CompletionStats / consumer-walk count) discarded: + // deferred-completion drain has no Resolve swimlane bar attached. +#if PTO2_SCHED_PROFILING + (void)sched->on_task_complete(*entry.slot_state, thread_idx, local_bufs); +#else + (void)sched->on_task_complete(*entry.slot_state, local_bufs); +#endif + // Drain deferred_release in place when the buffer fills — same + // overflow-drain idiom used by complete_slot_task's inline path + // and by try_inline_complete_locked. Without this, large bursts + // of completable wait_list entries in a single poll surfaced as + // ASYNC_WAIT_OVERFLOW under the MPSC model. 
+ if (deferred_release_count >= deferred_release_capacity) { + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); +#else + sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + } + deferred_release_slot_states[deferred_release_count++] = entry.slot_state; + result.completed++; + + int32_t last = count - 1; + if (i != last) entries[i] = entries[last]; + count = last; + } + } + + unlock(); + return result; +} + +// ============================================================================= +// Scheduler Profiling Data +// ============================================================================= + +#if PTO2_SCHED_PROFILING +struct PTO2SchedProfilingData { + // Sub-phase cycle breakdown within on_task_complete + uint64_t lock_cycle; // lock_fanout + state store + unlock + uint64_t fanout_cycle; // fanout traversal + uint64_t fanin_cycle; // fanin traversal + uint64_t self_consumed_cycle; // self check_and_handle_consumed + + // Wait times + uint64_t lock_wait_cycle; // spin-wait in fanout_lock + uint64_t push_wait_cycle; // CAS contention in push() + uint64_t pop_wait_cycle; // CAS contention in pop() + + // Atomic counts per sub-phase + uint64_t lock_atomic_count; + uint64_t fanout_atomic_count; + uint64_t fanin_atomic_count; + uint64_t self_atomic_count; + uint64_t pop_atomic_count; + + int64_t complete_count; +}; + +/** + * Get and reset scheduler profiling data for a specific thread. + * Returns accumulated profiling data and resets counters. 
+ */ +PTO2SchedProfilingData scheduler_get_profiling(int thread_idx); +#endif diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp new file mode 100644 index 000000000..4dd0cb28d --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp @@ -0,0 +1,1093 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +#include "scheduler_context.h" + +#include +#include + +#include "common/unified_log.h" +#include "aicpu/device_time.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" +#include "aicpu/platform_regs.h" +#include "aicpu/pmu_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" +#include "common/memory_barrier.h" +#include "common/l2_swimlane_profiling.h" +#include "common/platform_config.h" +#include "pto_runtime2.h" +#include "pto_shared_memory.h" +#include "runtime.h" +#include "spin_hint.h" + +// ============================================================================= +// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache) +// ============================================================================= + +static void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) { + if (header == nullptr || error_code == PTO2_ERROR_NONE) { + return; + } + // The first error code/thread pair wins; the bitmap cumulatively records all reporting threads. + int32_t expected = PTO2_ERROR_NONE; + if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) { + header->sched_error_thread.store(thread_idx, std::memory_order_release); + } + if (thread_idx >= 0 && thread_idx < 32) { + header->sched_error_bitmap.fetch_or(1U << static_cast(thread_idx), std::memory_order_acq_rel); + } +} + +LoopAction SchedulerContext::handle_orchestrator_exit( + int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count +) { + if (completed_.load(std::memory_order_acquire)) { + return LoopAction::BREAK_LOOP; + } + int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); + if (orch_err != PTO2_ERROR_NONE) { + LOG_ERROR( + "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. 
" + "completed_tasks=%d, total_tasks=%d", + thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_ + ); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire); + if (sched_err != PTO2_ERROR_NONE) { + LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + + bool orch_done = orchestrator_done_; + if (!orch_done) return LoopAction::NONE; + + task_count = total_tasks_; + // task_count == 0 is the fully_distributed_within_core path: orchestration + + // scheduling + execution all ran on the AI cores, so nothing was submitted to + // shared memory. Once orchestration is done (checked above) an empty SM graph + // means there is no AICPU-side work left — complete immediately rather than + // spinning forever. The centralized path (task_count > 0) is unchanged. 
+ if (completed_tasks_.load(std::memory_order_relaxed) >= task_count) { + completed_.store(true, std::memory_order_release); + LOG_INFO_V0( + "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed), + task_count + ); + return LoopAction::BREAK_LOOP; + } + return LoopAction::NONE; +} + +LoopAction SchedulerContext::handle_core_transition(bool &cores_released) { + if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE; + if (!reassigned_.load(std::memory_order_acquire)) { + wait_reassign_.fetch_add(1, std::memory_order_release); + while (!reassigned_.load(std::memory_order_acquire)) { + if (completed_.load(std::memory_order_acquire)) { + return LoopAction::BREAK_LOOP; + } + SPIN_WAIT_HINT(); + } + } + cores_released = true; + return LoopAction::NONE; +} + +LoopAction +SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) { + if (completed_.load(std::memory_order_acquire)) { + return LoopAction::BREAK_LOOP; + } + int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); + if (orch_err != PTO2_ERROR_NONE) { + LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, orch_err); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire); + if (sched_err != PTO2_ERROR_NONE) { + LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + return LoopAction::NONE; +} + +// ============================================================================= +// Stall diagnostic log format. 
+// +// Every line is self-contained — when scheduler threads emit concurrently and +// device_log interleaves their output, each line still carries enough context +// to identify which thread / iteration / object it belongs to. +// +// Prefix on every line: +// [STALL thread=N idle_iterations=K] CATEGORY ... +// +// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL +// together, so lines with the same idle_iterations belong to one diagnostic +// round; grep "idle_iterations=N" groups one round's output. +// +// Categories (and which thread emits them): +// SUMMARY — completed / total counts and scan totals (thread 0 only) +// TASK — one per non-completed task scanned from shared rings (thread 0 only) +// - state=RUNNING: includes running_on=[...] cross-ref +// - state=READY: fanin satisfied but no idle core yet +// - state=WAIT: includes missing_deps=N +// CLUSTER — one per cluster owned by this thread (every thread) +// - busy slot shows kernel + task_id + cond_reg_state; +// ANOMALY suffix when COND register is fin while software +// still has the slot marked busy. +// +// Reader workflow: +// 1. grep SUMMARY -> overall completion status +// 2. grep "idle_iterations=N TASK" -> stuck RUNNING task and which +// core/thread it is on +// 3. grep "idle_iterations=N CLUSTER.*task=" -> cross-check via the +// cluster line (or just +// read running_on in step 2) +// ============================================================================= + +namespace { + +// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines. +// Layout (idle): coreN(idle) +// Layout (busy): coreN(busy kernel=K task=T cond_reg_state=ack) +// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY) +// +// Healthy busy: COND register reports ack (AICore still executing). fin means +// AICore wrote completion but AICPU hasn't recycled the running slot yet — +// either a completion-poll bug or the diagnostic raced the recycle. 
+void format_core_status( + char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond +) { + if (idle) { + snprintf(buf, buf_size, "core%d(idle)", core_id); + return; + } + int32_t kernel = -1; + int64_t task_id_raw = -1; + if (core_state && core_state->running_slot_state) { + int32_t subslot = static_cast(core_state->running_subslot); + kernel = core_state->running_slot_state->task->kernel_id[subslot]; + task_id_raw = static_cast(core_state->running_slot_state->task->task_id.raw); + } + uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND); + int32_t hw_state = EXTRACT_TASK_STATE(cond_reg); + const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin"; + if (hw_state == TASK_ACK_STATE) { + snprintf( + buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw, + cond_reg_state_str + ); + } else { + snprintf( + buf, buf_size, + "core%d(busy kernel=%d task=%" PRId64 + " cond_reg_state=%s ANOMALY cond_tok=%d running_tok=%d pending_tok=%d)", + core_id, kernel, task_id_raw, cond_reg_state_str, EXTRACT_TASK_ID(cond_reg), + core_state->running_reg_task_id, core_state->pending_reg_task_id + ); + } +} + +} // namespace + +int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const { + for (int32_t t = 0; t < aicpu_thread_num_; t++) { + const int32_t *ids = core_trackers_[t].core_ids(); + int32_t n = core_trackers_[t].core_num(); + for (int32_t i = 0; i < n; i++) { + if (ids[i] == core_id) return t; + } + } + return -1; +} + +bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const { + const int32_t *cores = core_trackers_[thread_idx].core_ids(); + int32_t core_num = core_trackers_[thread_idx].core_num(); + for (int32_t i = 0; i < core_num; i++) { + if (core_exec_states_[cores[i]].running_slot_state != nullptr) { + return true; + } + } + return false; +} + +bool SchedulerContext::no_thread_owns_running_task() const { + 
for (int32_t t = 0; t < aicpu_thread_num_; t++) { + if (self_owns_running_task(t)) return false; + } + return true; +} + +void SchedulerContext::log_stall_diagnostics( + int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count +) { + CoreTracker &tracker = core_trackers_[thread_idx]; + + // T0 owns the shared-ring scan; printing it from other threads would + // produce identical TASK lines once per scheduler thread. + if (thread_idx == 0) { + int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring; + int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed); + submitted_in_ring += ring_task_count; + for (int32_t si = 0; si < ring_task_count; si++) { + PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si); + PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed); + int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed); + int32_t fi = slot_state.fanin_count; + int32_t kid_aic = slot_state.task->kernel_id[0]; + int32_t kid_aiv0 = slot_state.task->kernel_id[1]; + int32_t kid_aiv1 = slot_state.task->kernel_id[2]; + int64_t task_id = static_cast(slot_state.task->task_id.raw); + if (st >= PTO2_TASK_COMPLETED) continue; + // task_state has no intermediate ready/running value — it + // stays PENDING until the worker stores COMPLETED. Classify + // by the ground truth instead: a slot is RUNNING iff some + // core has it as running_slot_state. A task occupies at most + // 3 cores (one cluster), all under the same owner thread by + // construction of assign_cores_to_threads. 
+ char running_on[192] = {0}; + int32_t owner = -1; + int32_t pos = 0; + bool is_running = false; + for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) { + if (core_exec_states_[cid].running_slot_state != &slot_state) continue; + is_running = true; + if (owner < 0) owner = find_core_owner_thread(cid); + const char *sname = subslot_name(core_exec_states_[cid].running_subslot); + int32_t written = snprintf( + running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname + ); + if (written > 0) pos += written; + } + + if (is_running) { + cnt_running++; + if (cnt_running > STALL_DUMP_READY_MAX) continue; + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64 + " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] " + "running_on=[owner_thread=%d cores=[%s]]", + thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on + ); + continue; + } + if (rc >= fi) { + cnt_ready++; + if (cnt_ready > STALL_DUMP_READY_MAX) continue; + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64 + " state=READY fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]", + thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1 + ); + continue; + } + cnt_waiting++; + if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue; + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64 + " state=WAIT fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d", + thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc + ); + } + } + int32_t effective_total = task_count > 0 ? 
task_count : submitted_in_ring; + int32_t c = completed_tasks_.load(std::memory_order_relaxed); + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d " + "scan_ready=%d scan_waiting=%d scan_running=%d", + thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running + ); + } + + // CLUSTER lines: one per cluster this thread owns. + // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the + // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads. + int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_; + for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) { + int32_t offset = cli * 3; + int32_t aic_id = tracker.get_aic_core_id(offset); + int32_t aiv0_id = tracker.get_aiv0_core_id(offset); + int32_t aiv1_id = tracker.get_aiv1_core_id(offset); + bool aic_idle = tracker.is_aic_core_idle(offset); + bool aiv0_idle = tracker.is_aiv0_core_idle(offset); + bool aiv1_idle = tracker.is_aiv1_core_idle(offset); + int32_t cluster_id = cli * ast + thread_idx; + char aic_buf[192], aiv0_buf[192], aiv1_buf[192]; + format_core_status( + aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr + ); + format_core_status( + aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id], + core_exec_states_[aiv0_id].reg_addr + ); + format_core_status( + aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id], + core_exec_states_[aiv1_id].reg_addr + ); + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx, + idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf + ); + } +} + +void SchedulerContext::log_shutdown_stall_snapshot( + int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count +) 
{ + LOG_WARN( + "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] " + "dumping all scheduler threads before emergency shutdown", + trigger_thread_idx, trigger_idle_iterations + ); + int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_; + if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) { + LOG_ERROR( + "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx, + thread_count, MAX_AICPU_THREADS + ); + thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS; + } + for (int32_t t = 0; t < thread_count; t++) { + log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count); + } +} + +int32_t SchedulerContext::handle_timeout_exit( + int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations, + int32_t last_progress_count +#if PTO2_PROFILING + , + uint64_t sched_start_ts +#endif +) { + LOG_ERROR( + "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d", thread_idx, idle_iterations, + idle_iterations + ); + latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count); +#if PTO2_PROFILING + // Capture the in-flight kernels' partial output before signalling the + // cores to exit, so the dump reflects the live stuck state. 
+ if (is_dump_args_enabled()) { + dump_running_task_outputs( + thread_idx, cores_total_num_, + [this](int32_t cid) { + return core_exec_states_[cid].running_slot_state; + }, + [](ActiveMask active_mask, int raw_subtask_id) { + return active_mask.subtask_active(static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif + emergency_shutdown(runtime); + } +#if PTO2_PROFILING + uint64_t sched_timeout_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9( + "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx, + static_cast(sched_start_ts), static_cast(sched_timeout_ts), + cycles_to_us(sched_timeout_ts - sched_start_ts) + ); +#endif + return -PTO2_ERROR_SCHEDULER_TIMEOUT; +} + +#if PTO2_PROFILING +void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) { + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; + uint64_t sched_end_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9( + "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx, + static_cast(l2_swimlane.sched_start_ts), static_cast(sched_end_ts), + cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts) + ); + + uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle + + l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle; + if (sched_total == 0) sched_total = 1; + +#if PTO2_SCHED_PROFILING + { + PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx); + uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; + uint64_t complete_poll = + (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ? + (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) : + 0; + uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle > + l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ? 
+ (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle - + l2_swimlane.sched_dispatch_setup_cycle) : + 0; + + LOG_INFO_V9( + "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx, + cycles_to_us(sched_total), cur_thread_completed + ); + + // fanout / fanin per-thread aggregates live in + // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges + // × core_to_thread). + LOG_INFO_V9( + "Thread %d: complete : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle), + l2_swimlane.sched_complete_cycle * 100.0 / sched_total + ); + + uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1; + uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ? + (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) : + 0; + double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ? + l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count : + 0.0; + LOG_INFO_V9( + "Thread %d: poll : %.3fus (%.1f%%) hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%", + thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent, + static_cast(l2_swimlane.complete_hit_count), static_cast(complete_miss_count), + complete_hit_rate + ); + LOG_INFO_V9( + "Thread %d: otc_lock : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent, + cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle), + static_cast(sp.lock_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: otc_fanout : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent, + cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle), + static_cast(sp.fanout_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: otc_fanin : 
%.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent, + static_cast(sp.fanin_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: otc_self : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent, + static_cast(sp.self_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: perf : %.3fus (%.1f%%)", thread_idx, + cycles_to_us(l2_swimlane.sched_complete_perf_cycle), + l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent + ); + + LOG_INFO_V9( + "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle), + l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total + ); + + uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1; + LOG_INFO_V9( + "Thread %d: poll : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll), + dispatch_poll * 100.0 / d_parent + ); + LOG_INFO_V9( + "Thread %d: pop : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent, + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), + static_cast(sp.pop_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: setup : %.3fus (%.1f%%)", thread_idx, + cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle), + l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent + ); + +#if PTO2_SCHED_PROFILING + LOG_INFO_V9( + "Thread %d: wiring : %.3fus (%.1f%%) tasks=%d", thread_idx, + cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total, + l2_swimlane.phase_wiring_count + ); +#else + LOG_INFO_V9( + "Thread %d: wiring : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle), + l2_swimlane.sched_wiring_cycle * 100.0 / sched_total + ); +#endif + + LOG_INFO_V9( + "Thread %d: 
idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle), + l2_swimlane.sched_idle_cycle * 100.0 / sched_total + ); + + if (cur_thread_completed > 0) { + LOG_INFO_V9( + "Thread %d: avg/complete : %.3fus", thread_idx, + cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed + ); + } + } +#endif + LOG_INFO_V9( + "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, + cycles_to_us(sched_total), static_cast(l2_swimlane.sched_loop_count), cur_thread_completed + ); +} +#endif + +// ============================================================================= +// Shutdown: deinit AICore regs for this thread's cores (and PMU finalize if enabled). +// Orchestrator threads have core_trackers_[thread_idx].core_num() == 0 -> no-op. +// platform_deinit_aicore_regs is idempotent; safe to call after early completion. +// ============================================================================= +int32_t SchedulerContext::shutdown(int32_t thread_idx) { + const int32_t *cores = core_trackers_[thread_idx].core_ids(); + int32_t core_num = core_trackers_[thread_idx].core_num(); + if (core_num == 0) return 0; + +#if PTO2_PROFILING + if (is_pmu_enabled()) { + pmu_aicpu_finalize(cores, core_num); + } +#endif + + LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, core_num); + int32_t rc = 0; + for (int32_t i = 0; i < core_num; i++) { + int32_t core_id = cores[i]; + uint64_t reg_addr = core_exec_states_[core_id].reg_addr; + if (reg_addr != 0) { + // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores. 
+ if (platform_deinit_aicore_regs(reg_addr) != 0) { + LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id); + rc = -1; + } + } else { + LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); + } + } + LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx); + return rc; +} + +// ============================================================================= +// Handshake with all AICore workers; discover core type and reg address. +// ============================================================================= +int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { + Handshake *all_handshakes = reinterpret_cast(runtime->workers); + cores_total_num_ = runtime->worker_count; + + // Validate cores_total_num_ before using as array index + if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) { + LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER); + return -1; + } + + aic_count_ = 0; + aiv_count_ = 0; + + LOG_INFO_V0("Handshaking with %d cores", cores_total_num_); + + // Step 1: Write per-core payload addresses and send handshake signal. + // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before + // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. 
+ for (int32_t i = 0; i < cores_total_num_; i++) { + all_handshakes[i].task = reinterpret_cast(&payload_per_core_[i][0]); + OUT_OF_ORDER_STORE_BARRIER(); + all_handshakes[i].aicpu_ready = 1; + } + OUT_OF_ORDER_STORE_BARRIER(); + + // Get platform physical cores count for validation + uint32_t max_physical_cores_count = platform_get_physical_cores_count(); + + // Step 2: Wait for all cores to respond, collect core type and register addresses + bool handshake_failed = false; + for (int32_t i = 0; i < cores_total_num_; i++) { + Handshake *hank = &all_handshakes[i]; + + while (hank->aicore_regs_ready == 0) { + SPIN_WAIT_HINT(); + } + + uint32_t physical_core_id = hank->physical_core_id; + + if (physical_core_id >= max_physical_cores_count) { + LOG_ERROR( + "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, + max_physical_cores_count + ); + handshake_failed = true; + continue; + } + + uint64_t *regs = reinterpret_cast(regs_); + uint64_t reg_addr = regs[physical_core_id]; + + // Initialize AICore registers after discovery (first round) + platform_init_aicore_regs(reg_addr); + OUT_OF_ORDER_STORE_BARRIER(); + hank->aicpu_regs_ready = 1; + + OUT_OF_ORDER_STORE_BARRIER(); + + while (hank->aicore_done == 0) { + SPIN_WAIT_HINT(); + } + + CoreType type = hank->core_type; + + core_exec_states_[i].reg_addr = reg_addr; + core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); + +#if PTO2_PROFILING + // Record physical_core_id for PMU init later (CoreExecState has no room + // for this field under PTO2_PROFILING). 
+ physical_core_ids_[i] = physical_core_id; +#endif +#if !PTO2_PROFILING + core_exec_states_[i].worker_id = i; + core_exec_states_[i].physical_core_id = physical_core_id; + core_exec_states_[i].core_type = type; +#endif + + if (type == CoreType::AIC) { + aic_worker_ids_[aic_count_++] = i; + LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); + } else { + aiv_worker_ids_[aiv_count_++] = i; + LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); + } + } + + if (handshake_failed) { + emergency_shutdown(runtime); + return -1; + } + + LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); + return 0; +} + +// ============================================================================= +// Assign discovered cores to scheduler threads (cluster-aligned round-robin). +// ============================================================================= +bool SchedulerContext::assign_cores_to_threads() { + // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_. + // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together. + active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; + int32_t cluster_count = aic_count_; + + // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_). 
+ int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_; + int32_t thread_cores_num = max_clusters_per_thread * 3; + + if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) { + LOG_ERROR("Can't assign more then 64 cores in per scheduler"); + return false; + } + + LOG_INFO_V0( + "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count, + active_sched_threads_, aic_count_, aiv_count_ + ); + + for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { + core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; + core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; + } + + // Count clusters per thread first (round-robin may distribute unevenly) + int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; + for (int32_t ci = 0; ci < cluster_count; ci++) { + clusters_per_thread[ci % active_sched_threads_]++; + } + for (int32_t i = 0; i < active_sched_threads_; i++) { + core_trackers_[i].init(clusters_per_thread[i]); + } + + int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; + + for (int32_t ci = 0; ci < cluster_count; ci++) { + int32_t t = ci % active_sched_threads_; + + int32_t aic_wid = aic_worker_ids_[ci]; + int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; + int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; + + core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid); + + LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid); + } + + for (int32_t t = 0; t < aicpu_thread_num_; t++) { + LOG_INFO_V0( + "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(), + core_trackers_[t].get_cluster_count() + ); + } + + LOG_INFO_V0( + "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num + ); + return true; +} + +// ============================================================================= +// Reassign all cores across all threads (sched + 
orchestrator) after orchestration. +// ============================================================================= +void SchedulerContext::reassign_cores_for_all_threads() { + LOG_INFO_V0( + "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_ + ); + + // Collect running worker_ids from all current trackers + bool running_cores[RUNTIME_MAX_WORKER] = {}; + for (int32_t i = 0; i < aicpu_thread_num_; i++) { + auto all_running = core_trackers_[i].get_all_running_cores(); + int32_t bp; + while ((bp = all_running.pop_first()) >= 0) { + running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true; + } + } + + // Count clusters per thread (round-robin across all threads) + int32_t cluster_count = aic_count_; + int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; + for (int32_t ci = 0; ci < cluster_count; ci++) { + clusters_per_thread[ci % aicpu_thread_num_]++; + } + + // Re-init all trackers and reset core counts + for (int32_t i = 0; i < aicpu_thread_num_; i++) { + core_trackers_[i].init(clusters_per_thread[i]); + } + + // Assign clusters round-robin and restore running state + int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; + for (int32_t ci = 0; ci < cluster_count; ci++) { + int32_t t = ci % aicpu_thread_num_; + + int32_t aic_wid = aic_worker_ids_[ci]; + int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; + int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; + + int32_t cl_idx = cluster_idx_per_thread[t]++; + core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid); + + // init() marks all idle; toggle cores that were running and restore pending_occupied + if (running_cores[aic_wid]) { + core_trackers_[t].change_core_state(cl_idx * 3); + core_trackers_[t].set_pending_occupied(cl_idx * 3); + } + if (running_cores[aiv0_wid]) { + core_trackers_[t].change_core_state(cl_idx * 3 + 1); + core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1); + } + if (running_cores[aiv1_wid]) { + 
core_trackers_[t].change_core_state(cl_idx * 3 + 2); + core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2); + } + } + + // Log final distribution + LOG_INFO_V0("Core reassignment complete:"); + for (int32_t t = 0; t < aicpu_thread_num_; t++) { + int32_t aic_running = core_trackers_[t].get_running_count(); + int32_t aiv_running = core_trackers_[t].get_running_count(); + LOG_INFO_V0( + " Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(), + core_trackers_[t].get_cluster_count(), aic_running, aiv_running + ); + } + active_sched_threads_ = aicpu_thread_num_; +} + +// ============================================================================= +// Emergency shutdown: broadcast exit signal to every handshake'd core and +// deinit their AICore register blocks. Idempotent. +// ============================================================================= +void SchedulerContext::emergency_shutdown(Runtime *runtime) { + LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores"); + Handshake *all_handshakes = reinterpret_cast(runtime->workers); + int32_t timeout_count = 0; + for (int32_t i = 0; i < cores_total_num_; i++) { + Handshake *hank = &all_handshakes[i]; + OUT_OF_ORDER_STORE_BARRIER(); + hank->aicpu_regs_ready = 1; + if (core_exec_states_[i].reg_addr != 0) { + if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) { + timeout_count++; + } + } + } + if (timeout_count > 0) { + LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", timeout_count); + } + LOG_WARN("Emergency shutdown complete"); +} + +// ============================================================================= +// Lifecycle: init / deinit +// ============================================================================= +int32_t SchedulerContext::init( + Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base +) { + always_assert(runtime != nullptr); + + // 
Zero all per-core execution state before handshake + memset(core_exec_states_, 0, sizeof(core_exec_states_)); + + // Wire thread/transition configuration that handshake/assign need to read. + aicpu_thread_num_ = aicpu_thread_num; + sched_thread_num_ = sched_thread_num; + orch_to_sched_ = orch_to_sched; + regs_ = regs_base; + +#if PTO2_PROFILING + // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory + // header — must be called BEFORE caching the level, otherwise the cached + // value would still be 0 (only the binary enable bit has been seeded by + // kernel.cpp at this point). Reset the cached level on disabled runs so a + // prior enabled launch's level can't leak into the phase-record gates in + // scheduler_dispatch. + if (is_l2_swimlane_enabled()) { + l2_swimlane_aicpu_init(runtime->worker_count); + l2_swimlane_level_ = get_l2_swimlane_level(); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + // Sched-phase pool count: matches the dump_args_init branch in + // scheduler_dispatch.cpp. sched_thread_num_ <= 0 means "use all + // AICPU threads as scheduler threads" (see assign_cores_to_threads' + // active_sched_threads_ normalization at line 689). Without this + // normalization here, init_phase would prime zero sched pools and + // all sched_phase emits would silently drop. + const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; + const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched; + // Orchestration is always single-threaded, so orch-phase is one pool + // (ordinal 0) in both modes — see record_orch_phase. + const int orch_phase_threads = 1; + l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads); + } + } else { + l2_swimlane_level_ = L2SwimlaneLevel::DISABLED; + } +#endif + + // Discover cores and assign to scheduler threads. 
+ int32_t rc = handshake_all_cores(runtime); + if (rc != 0) { + LOG_ERROR("handshake_all_cores failed"); + return rc; + } + if (!assign_cores_to_threads()) { + return -1; + } + + // Initialize task counters. Task count comes from PTO2 shared memory. + if (runtime->get_gm_sm_ptr()) { + auto *header = static_cast(runtime->get_gm_sm_ptr()); + // Read at one-time boot init, before the SM is reset for the run, so a + // ring not yet written holds uninitialized memory (0xbe... under ASAN's + // malloc-fill). Sum in int64 and only count rings whose value is a + // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold + // more than the scope cap. This rejects any garbage pattern (negative + // or positive), so uninitialized rings contribute 0 (the correct boot + // count) while valid counts still add up, with no signed overflow. + int64_t pto2_count = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire); + if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks; + } + total_tasks_ = static_cast(pto2_count); + } else { + total_tasks_ = 0; + } + completed_tasks_.store(0, std::memory_order_release); + + // Device orchestration: the orchestrator thread flips this when the graph is built. + orchestrator_done_ = false; + + // Clear per-core dispatch payloads + memset(payload_per_core_, 0, sizeof(payload_per_core_)); + memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_)); + + // Initialize per-core GlobalContext (sub_block_id) based on cluster position. + // This is done once at startup and never modified afterwards. 
+ for (int32_t t = 0; t < sched_thread_num_; t++) { + CoreTracker &tracker = core_trackers_[t]; + for (int32_t c = 0; c < tracker.get_cluster_count(); c++) { + int32_t cluster_offset = c * 3; // Each cluster = 1 AIC + 2 AIV + auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset)); + auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset)); + payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0; + payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0; + payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1; + payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1; + } + } + + func_id_to_addr_ = runtime->func_id_to_addr_; + + return 0; +} + +void SchedulerContext::deinit() { + // Reset all per-core execution state + for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { + core_exec_states_[i] = {}; + core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; + core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; + } + + // Clear per-core dispatch payloads + memset(payload_per_core_, 0, sizeof(payload_per_core_)); + memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_)); + + // Reset sync-start drain coordination — a previous run that aborted mid-drain + // would otherwise leave dirty pending/elected/ack state for the next reuse. 
+ drain_state_.sync_start_pending.store(0, std::memory_order_release); + drain_state_.drain_worker_elected.store(0, std::memory_order_release); + drain_state_.drain_ack_mask.store(0, std::memory_order_release); + drain_state_.pending_task.store(nullptr, std::memory_order_release); + + // Reset task counters and orchestrator state + completed_tasks_.store(0, std::memory_order_release); + total_tasks_ = 0; + orchestrator_done_ = false; + pto2_init_claimed_.store(false, std::memory_order_release); + pto2_init_complete_.store(false, std::memory_order_release); + + // Reset core transition state + transition_requested_.store(false, std::memory_order_release); + wait_reassign_.store(0, std::memory_order_release); + reassigned_.store(false, std::memory_order_release); + completed_.store(false, std::memory_order_release); + + // Reset core discovery and assignment state + aic_count_ = 0; + aiv_count_ = 0; + cores_total_num_ = 0; + aicpu_thread_num_ = 0; + sched_thread_num_ = 0; + orch_to_sched_ = false; + active_sched_threads_ = 0; + for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { + core_trackers_[t] = CoreTracker{}; + } + + regs_ = 0; + sched_ = nullptr; + rt_ = nullptr; + func_id_to_addr_ = nullptr; +} + +void SchedulerContext::wait_pto2_init_complete() const { + while (!pto2_init_complete_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } +} + +void SchedulerContext::bind_runtime(PTO2Runtime *rt) { + rt_ = rt; + sched_ = &rt->scheduler; +} + +// ============================================================================= +// Post-orchestration bookkeeping. Runs on the orchestrator thread once the +// build phase finishes; folds inline-completed tasks, flips orchestrator_done_, +// and drives the orchestrator → scheduler core transition (or fatal shutdown). 
+// =============================================================================
+void SchedulerContext::on_orchestration_done(
+    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
+) {
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
+        // Flush the orchestrator's orch-phase buffer (single instance, pool 0).
+        // The orchestrator has no scheduler-phase pool of its own — those belong
+        // to the scheduler threads and are flushed in scheduler_dispatch.
+        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
+    }
+#endif
+
+    // Replace the boot-time estimate computed in init() with the count reported by the caller.
+    total_tasks_ = total_tasks;
+
+    // Fold tasks completed inline during orchestration
+    int32_t inline_completed = static_cast(rt->orchestrator.inline_completed_tasks);
+    if (inline_completed > 0) {
+        completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
+#if PTO2_SCHED_PROFILING
+        rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed);
+#endif
+    }
+    // Set only after total_tasks_ and the inline fold above have been written.
+    orchestrator_done_ = true;
+
+    // Check for fatal error from orchestration; if so, shut down immediately.
+    int32_t orch_err = 0;
+    if (sched_->sm_header) {
+        orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
+    }
+    if (orch_err != PTO2_ERROR_NONE) {
+        // exchange() returns the previous value, so only the first thread to
+        // flip completed_ runs the emergency shutdown.
+        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
+            emergency_shutdown(runtime);
+        }
+    }
+
+    // Skip core transition on fatal error — cores already shut down above.
+    // completed_ may also already be true because another code path stored it
+    // (complete_slot_task does so on a deferred-completion error).
+    if (completed_.load(std::memory_order_acquire)) {
+        // Signal transition to unblock scheduler threads waiting at core transition
+        transition_requested_.store(true, std::memory_order_release);
+        reassigned_.store(true, std::memory_order_release);
+    } else if (orch_to_sched_) {
+        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
+        transition_requested_.store(true, std::memory_order_release);
+
+        // Wait for scheduler threads to acknowledge transition request
+        // NOTE(review): the target is the raw sched_thread_num_, not the normalized
+        // active_sched_threads_; with sched_thread_num_ <= 0 this loop exits only via
+        // completed_ unless wait_reassign_ happens to equal that value — confirm that
+        // orch_to_sched_ is never combined with sched_thread_num_ <= 0.
+        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
+            if (completed_.load(std::memory_order_acquire)) {
+                break;
+            }
+            SPIN_WAIT_HINT();
+        }
+        // Re-check: the wait above may have been abandoned because the run completed.
+        if (!completed_.load(std::memory_order_acquire)) {
+            reassign_cores_for_all_threads();
+            reassigned_.store(true, std::memory_order_release);
+        }
+    }
+
+#if PTO2_PROFILING
+    // Write core-to-thread mapping AFTER reassignment so the profiling data
+    // reflects the final distribution (all active_sched_threads_, including
+    // former orchestrator threads when orch_to_sched_ is enabled).
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
+        for (int32_t t = 0; t < active_sched_threads_; t++) {
+            l2_swimlane_aicpu_write_core_assignments_for_thread(
+                t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
+            );
+        }
+    }
+#endif
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp
new file mode 100644
index 000000000..774589865
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#include "scheduler_context.h" + +#include + +#include "common/unified_log.h" +#include "aicpu/device_time.h" +#include "aicpu/platform_regs.h" +#include "common/l2_swimlane_profiling.h" +#include "common/memory_barrier.h" +#include "common/platform_config.h" +#include "pto_runtime2.h" +#include "runtime.h" +#include "spin_hint.h" + +// Performance profiling headers +#include "aicpu/l2_swimlane_collector_aicpu.h" +#include "aicpu/pmu_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" + +// ============================================================================= +// Dual-slot state machine helpers +// ============================================================================= + +namespace { +inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256; +} + +// Pure function: read register result -> SlotTransition (no side effects). 
+SlotTransition SchedulerContext::decide_slot_transition(
+    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated
+) {
+    // Maps one COND register sample (task id + state) onto the actions the
+    // caller must apply to the core's running/pending slots. A default
+    // SlotTransition (returned untouched) means "nothing to do this poll".
+    SlotTransition outcome;
+    const bool has_pending = (pending_id != AICPU_TASK_INVALID);
+    const bool saw_fin = (reg_state == TASK_FIN_STATE);
+
+    // Cases 1 & 2: the register reports the PENDING task. Execution is serial,
+    // so any event from pending implies running already finished.
+    if (has_pending && reg_task_id == pending_id) {
+        outcome.matched = true;
+        outcome.running_done = true;
+        outcome.running_freed = true;
+        outcome.pending_freed = true;
+        if (saw_fin) {
+            outcome.pending_done = true;  // Case 1: pending FIN
+        }
+        // Case 2: pending ACK leaves pending_done at its default.
+        return outcome;
+    }
+
+    // The register names neither slot's task: no transition.
+    if (reg_task_id != running_id) {
+        return outcome;
+    }
+
+    // Case 4: running ACK -- only the pending slot is released (hardware has
+    // latched the running task).
+    if (!saw_fin) {
+        outcome.matched = true;
+        outcome.pending_freed = true;
+        return outcome;
+    }
+
+    // Running FIN from here on.
+    //   Case 3.2: no pending task -> the core goes idle.
+    //   Case 3.3: pending is a speculative GATED task. It spins on its doorbell
+    //             and will not ack until its producer completes, and that
+    //             completion depends on collecting THIS running FIN; waiting for
+    //             the pending ack would deadlock. Complete running now and let
+    //             the gated task promote. pending is NOT freed (it promotes, it
+    //             does not retire), which keeps the core off-limits for further
+    //             dispatch.
+    //   Case 3.1: a NON-gated pending exists -> transient state, report
+    //             nothing; Case 1/2 will complete running implicitly.
+    if (!has_pending || pending_gated) {
+        outcome.matched = true;
+        outcome.running_done = true;
+        outcome.running_freed = true;
+    }
+    return outcome;
+}
+
+// Complete one slot's task: subtask counting, mixed completion, deferred release, profiling.
+void SchedulerContext::complete_slot_task( + PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot, + int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn, + PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs +#if PTO2_PROFILING + , + uint64_t dispatch_ts, uint64_t finish_ts +#endif +) { +#if PTO2_PROFILING + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; +#else + (void)hank; +#endif + // MPSC fast-path is opt-in per task: only tasks with at least one subtask + // that registered a deferred condition route through the mailbox. Pure + // non-deferred tasks complete inline on this thread (matching pre-MPSC + // behavior — keeps the common case parallelized across scheduler threads + // instead of serializing through the single consumer). The + // any_subtask_deferred flag on slot_state is the discriminator; it's set + // (release) before on_subtask_complete and read (acquire) after, so the + // last subtask sees flag writes from any earlier subtask of the same task. + AICoreCompletionMailbox *mailbox = rt_ != nullptr ? 
rt_->aicore_mailbox : nullptr; + bool defer_completion_to_consumer = false; + + if (slot_state.payload != nullptr) { + volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1]; + int32_t slab_err = deferred_slab->error_code; + if (slab_err != PTO2_ERROR_NONE) { + int32_t expected = PTO2_ERROR_NONE; + sched_->sm_header->sched_error_code.compare_exchange_strong( + expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire + ); + completed_.store(true, std::memory_order_release); + return; + } + + uint32_t cond_count = deferred_slab->count; + if (cond_count > MAX_COMPLETIONS_PER_TASK) { + int32_t expected = PTO2_ERROR_NONE; + sched_->sm_header->sched_error_code.compare_exchange_strong( + expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire + ); + completed_.store(true, std::memory_order_release); + return; + } + + if (cond_count > 0) { + // Publish "this task is deferred" before on_subtask_complete so the + // acq_rel fetch_add inside on_subtask_complete makes the flag + // visible to whichever subtask sees task_complete=true (which may + // be this thread or a later one). + slot_state.any_subtask_deferred.store(true, std::memory_order_release); + + const PTO2TaskId token = slot_state.task->task_id; + for (uint32_t i = 0; i < cond_count; ++i) { + volatile DeferredCompletionEntry *e = &deferred_slab->entries[i]; + while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) { + sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed); + SPIN_WAIT_HINT(); + } + } + } + } + + bool task_complete = sched_->on_subtask_complete(slot_state); + +#if PTO2_PROFILING + // Sub-block retire that did not finish the slot: record it so the poll + // iteration becomes visible on the scheduler lane (the SPMD harvest tail). 
+ if (!task_complete && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane.phase_subretire_count++; + } +#endif + + if (task_complete && slot_state.payload != nullptr && + slot_state.any_subtask_deferred.load(std::memory_order_acquire)) { + // Some subtask of this task registered conditions; finish the + // registration by handing the slot_state off to the consumer. + while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast(&slot_state))) { + sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed); + SPIN_WAIT_HINT(); + } + defer_completion_to_consumer = true; + } + + if (task_complete && !defer_completion_to_consumer) { +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + dump_args_for_task( + thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION, + [](ActiveMask active_mask, int raw_subtask_id) { + return active_mask.subtask_active(static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif +#if PTO2_PROFILING + // Time Resolve (walk the consumer list, decrement each consumer's + // fanin, push the newly-ready ones, ring doorbells for speculative + // hits) so it renders as a child bar nested inside this iteration's + // Complete bar. The 1 µs floor below filters out the ~88% of tasks + // with 1-2 consumers (~500 ns Resolve) so only the long broadcast / + // reduction walks stand out on the lane. + uint64_t resolve_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; +#endif + // [[maybe_unused]] silences -Werror=unused-but-set-variable on the + // profiling-flags-smoke build path where PTO2_PROFILING is OFF and + // the Resolve emit below is excluded. 
+ [[maybe_unused]] uint32_t consumers_resolved = 0; +#if PTO2_SCHED_PROFILING + // SCHED_PROFILING variant takes thread_idx for its per-thread atomic + // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed + // by the otc_* log lines). It returns CompletionStats whose + // `fanout_edges` is the consumer-walk count. + consumers_resolved = sched_->on_task_complete(slot_state, thread_idx, local_bufs).fanout_edges; +#else + consumers_resolved = sched_->on_task_complete(slot_state, local_bufs); +#endif +#if PTO2_PROFILING + if (resolve_t0 != 0) { + uint64_t resolve_t1 = get_sys_cnt_aicpu(); + // Filter: drop Resolve bars under 1 µs so the lane shows only + // resolves that did meaningful work (high consumer counts or + // doorbells). 50 cycles @ 50 MHz = 1 µs (PLATFORM_PROF_SYS_CNT_FREQ + // is the device sys-cnt frequency). + constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000; // 1 µs + if (resolve_t1 - resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) { + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Resolve, resolve_t0, resolve_t1, l2_swimlane.sched_loop_count, + consumers_resolved + ); + } + } + l2_swimlane.phase_complete_count++; +#endif + if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) { + deferred_release_slot_states[deferred_release_count++] = &slot_state; + } else { + LOG_INFO_V9("Thread %d: release", thread_idx); + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + // SCHED_PROFILING variant takes thread_idx for the per-thread + // atomic counter side-effects. The return value is unused. 
+ (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + deferred_release_slot_states[deferred_release_count++] = &slot_state; + } + completed_this_turn++; + } + +#if PTO2_PROFILING + // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries + // {start, end, task_token_raw}, host resolves func_id/core_type from + // dep_gen / per-core mapping, and AICPU has nothing to write. Only at + // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish + // timestamps via complete_task. Bypassing here saves the per-completion + // hot-path cost (counter inc + ring lookup + record store + wmb + buffer + // rotation bookkeeping) for runs that only want AICore timing. + if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { +#if PTO2_SCHED_PROFILING + uint64_t t_perf_start = get_sys_cnt_aicpu(); +#endif + + if (l2_swimlane_aicpu_complete_task( + core_id, thread_idx, static_cast(expected_reg_task_id), dispatch_ts, finish_ts + ) != 0) { + LOG_ERROR( + "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id, + static_cast(slot_state.task->task_id.raw) + ); + } +#if PTO2_SCHED_PROFILING + l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start); +#endif + } + + if (is_pmu_enabled()) { + pmu_aicpu_record_task( + core_id, thread_idx, slot_state.task->task_id.raw, + slot_state.task->kernel_id[static_cast(subslot)], hank[core_id].core_type + ); + } +#endif +} + +// Promote pending slot data to running slot. Clears pending fields. 
+void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
+    // Copy every pending_* field onto its running_* counterpart.
+    core.running_slot_state = core.pending_slot_state;
+    core.running_reg_task_id = core.pending_reg_task_id;
+    core.running_subslot = core.pending_subslot;
+#if PTO2_PROFILING
+    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
+#endif
+    // Only the slot pointer and the register task id are reset; pending_subslot
+    // (and pending_dispatch_timestamp under profiling) keep their old values.
+    core.pending_slot_state = nullptr;
+    core.pending_reg_task_id = AICPU_TASK_INVALID;
+}
+
+// Clear running slot (core becomes idle).
+// Resets the slot pointer and register task id; running_subslot is left as-is.
+void SchedulerContext::clear_running_slot(CoreExecState &core) {
+    core.running_slot_state = nullptr;
+    core.running_reg_task_id = AICPU_TASK_INVALID;
+}
+
+void SchedulerContext::check_running_cores_for_completion(
+    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+    PTO2LocalReadyBuffer *local_bufs
+) {
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto running_core_states = tracker.get_all_running_cores();
+    while (running_core_states.has_value()) {
+        int32_t bit_pos = running_core_states.pop_first();
+        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+        CoreExecState &core = core_exec_states_[core_id];
+
+        // Skip gated speculative cores. A STAGED task is parked on this core
+        // waiting for its doorbell — it physically cannot ACK/FIN yet, so
+        // reading its COND (MMIO, and the core is hot-spinning on its own SPR)
+        // every poll is pure waste that drags out the completion phase. The
+        // doorbell (try_speculative_release) flips spec_state to DISPATCHED, at
+        // which point the core becomes pollable again and its FIN is caught.
+        // Cheap cacheable load; no MMIO. Pending slot is empty while gated.
+ { + PTO2TaskSlotState *rs = core.running_slot_state; + if (rs != nullptr && rs->payload != nullptr && + rs->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) { + continue; + } + } + + // --- Judgment phase: read register, derive transition --- + // Use the precomputed cond_ptr (resolved once in handshake) to skip + // the reg_offset switch and reg_addr addition on every poll. + uint64_t reg_val = static_cast(*core.cond_ptr); + // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the + // rmb() pins any AICore-published cacheable reads downstream of the + // FIN observation. Replaces the post-`__sync_synchronize` that the + // old read_reg() helper carried implicitly. + rmb(); + int32_t reg_task_id = EXTRACT_TASK_ID(reg_val); + int32_t reg_state = EXTRACT_TASK_STATE(reg_val); + +#if PTO2_SCHED_PROFILING + if (l2_swimlane.l2_swimlane_enabled) { + l2_swimlane.complete_probe_count++; + } +#endif + + // A pending task is "gated" when it is a speculative pre-stage still + // waiting on its doorbell (STAGED): it will not ack on the producer's FIN, + // so the Case 3.1 wait-for-pending-ack shortcut would deadlock. Detect it + // so decide_slot_transition completes the running FIN and promotes it. 
+ bool pending_gated = + (core.pending_slot_state != nullptr && core.pending_slot_state->payload != nullptr && + core.pending_slot_state->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING); + SlotTransition t = decide_slot_transition( + reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id, pending_gated + ); + if (!t.matched) continue; + +#if PTO2_SCHED_PROFILING + if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) { + l2_swimlane.complete_hit_count++; + } +#endif + +#if PTO2_PROFILING + // Capture finish_ts at the FIN observation point — right after rmb() + // above pinned the cacheable AICore reads downstream of the register + // load, and BEFORE any fanin / deferred-release work. Anything later + // (slot transition apply, complete_slot_task fanin processing) would + // charge AICPU completion-processing cost to the (end → finish) + // span, masking the actual FIN-delivery latency. + uint64_t finish_ts = 0; + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) { + finish_ts = get_sys_cnt_aicpu(); + } +#endif + + // --- Apply phase: execute actions based on transition --- + + // 1. Complete finished tasks (capture pointers before modifying core state) + if (t.pending_done) { + complete_slot_task( + *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank, + completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs +#if PTO2_PROFILING + , + core.pending_dispatch_timestamp, finish_ts +#endif + ); + cur_thread_completed++; + } + if (t.running_done) { + complete_slot_task( + *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank, + completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs +#if PTO2_PROFILING + , + core.running_dispatch_timestamp, finish_ts +#endif + ); + cur_thread_completed++; + } + + // 2. 
Update slot data + if (t.running_freed) { + if (core.pending_slot_state != nullptr && !t.pending_done) { + promote_pending_to_running(core); // Case 2 or Case 3 (with pending) + } else { + clear_running_slot(core); // Case 1 or Case 3 (no pending) + if (t.pending_done) { + // Case 1: pending FIN observed directly -- clear stale pending fields. + // Without this, pending_reg_task_id retains a stale value that blocks + // clear_pending_occupied and permanently degrades pipelining. + core.pending_slot_state = nullptr; + core.pending_reg_task_id = AICPU_TASK_INVALID; + } + } + } + + // 3. Update tracker bitmap + bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID); + if (is_idle) { + tracker.change_core_state(bit_pos); // Mark idle + tracker.clear_pending_occupied(bit_pos); // Idle safeguard: no payload to protect + } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) { + // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only + // when no pending task is currently held. Otherwise pending slot is occupied + // by a pre-loaded task and must stay protected. + tracker.clear_pending_occupied(bit_pos); + } + + // 4. Progress signal (only when running task completes) + if (t.running_done) { + made_progress = true; + } + } +} + +// ============================================================================= +// sync_start drain protocol +// ============================================================================= + +// Take ownership of slot_state and signal all threads to enter drain mode. +// Returns true if this thread won the CAS and owns the drain slot. +// Returns false if another thread already holds drain; caller must re-push slot_state. +// +// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store task and +// reset election flag, then release-store block_num. Other threads acquire-load +// sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible. 
+bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
+    int32_t expected = 0;  // 0 == no drain in progress
+    if (!drain_state_.sync_start_pending.compare_exchange_strong(
+            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed  // -1 == claimed, not yet published
+        )) {
+        return false;  // Another thread already holds the drain slot.
+    }
+    // We own the drain slot. Store the task and reset the ack mask / election flag before making it visible.
+    drain_state_.pending_task.store(slot_state, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    // Release store: all stores above are now visible to any thread that
+    // acquire-loads sync_start_pending and sees block_num > 0.
+    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);  // NOTE(review): assumes block_num > 0
+    return true;
+}
+
+// Count total available resources across all scheduler threads for a given shape (core_mask is used for MIX only).
+int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape, uint8_t core_mask) {
+    int32_t total = 0;
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        if (shape == PTO2ResourceShape::MIX) {
+            total += core_trackers_[t].count_mix_running_clusters(core_mask);
+        } else {
+            total += core_trackers_[t].get_idle_core_offset_states(shape).count();  // non-MIX: idle cores of `shape`
+        }
+    }
+    return total;
+}
+
+// Drain worker: dispatch all blocks in one pass across all threads' trackers.
+// Called only when global resources >= block_num, so one pass always suffices.
+// All other threads are spinning -- the drain worker has exclusive tracker access.
+void SchedulerContext::drain_worker_dispatch(int32_t block_num) {
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (!slot_state) {  // nothing to dispatch: just clear sync_start_pending so spinning threads resume
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    uint8_t core_mask = slot_state->active_mask.core_mask();
+
+    for (int32_t t = 0;
+         t < active_sched_threads_ && slot_state->next_block_idx.load(std::memory_order_relaxed) < block_num; t++) {
+        auto valid = (shape == PTO2ResourceShape::MIX) ?
+                         core_trackers_[t].get_mix_running_cluster_offset_states(core_mask) :
+                         core_trackers_[t].get_idle_core_offset_states(shape);
+        int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
+        int32_t remaining = slot_state->logical_block_num - start;  // NOTE(review): loop bound uses block_num; confirm equal
+        int32_t claim = std::min(valid.count(), remaining);
+        slot_state->next_block_idx.store(static_cast(start + claim), std::memory_order_relaxed);
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];  // NOTE(review): assumes claim <= MAX_CLUSTERS (<= 3 handles each)
+        int handle_count = 0;
+        for (int32_t b = 0; b < claim; b++) {
+            auto core_offset = valid.pop_first();
+            handle_count += prepare_block_for_dispatch(
+                t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]
+            );
+        }
+        wmb();  // barrier between the prepare batch above and the register publishes below
+        uint64_t dispatch_ts = 0;  // one timestamp shared by every handle of this batch; 0 when not recorded
+#if PTO2_PROFILING
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+            dispatch_ts = get_sys_cnt_aicpu();
+        }
+#endif
+        for (int i = 0; i < handle_count; i++) {
+            publish_subtask_to_core(handles[i], dispatch_ts);
+        }
+    }
+
+    // All blocks dispatched -- clear drain state.
+    // Release fence ensures tracker mutations are visible to threads that
+    // acquire-load sync_start_pending == 0 and resume normal operation.
+    std::atomic_thread_fence(std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);  // stored last: this is what waiters spin on
+}
+
+// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
+//
+// Protocol (single-stage ack barrier):
+//   1. Ack barrier: all threads signal they've stopped dispatch, then spin
+//      until all ack bits are set.
+//      If this thread's bit gets cleared while waiting, a reset occurred -- return.
+//   2. Election: one thread wins the CAS and becomes the drain worker.
+//      If resources are insufficient, reset ack/election fields and return --
+//      all threads resume completion polling to free running cores, then retry.
+//   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
+//      Non-elected threads spin-wait until sync_start_pending == 0.
+//      During dispatch the elected thread has exclusive tracker access.
+void SchedulerContext::handle_drain_mode(int32_t thread_idx) {
+    // Every spin in this function honors is_completed(): once the run latches
+    // completed_ (all tasks done, or a fatal error raised elsewhere), peers leave
+    // the dispatch loop and stop participating in the drain. A thread parked in a
+    // drain spin would then wait forever for acks / a gate-open that can no longer
+    // arrive -- the AICPU watchdog never fires here because these spins live
+    // outside the dispatch loop's wall-clock budget, so the hang escalates straight
+    // to the 3 s STARS op-exec timeout (507018) and poisons the device. Bailing on
+    // completed_ is always safe: any pending sync_start task is either already
+    // dispatched (a stale re-popped slot) or moot under teardown, and deinit()
+    // resets drain_state_ before the next run, so leaving it dirty is harmless.
+    // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
+    int32_t block_num;
+    do {
+        if (is_completed()) return;
+        block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+    } while (block_num < 0);
+    if (block_num == 0) return;  // drain already finished (or never published): nothing to join
+
+    uint32_t all_acked = (1u << active_sched_threads_) - 1;  // NOTE(review): requires active_sched_threads_ < 32
+
+    // Ack barrier -- signal this thread has stopped dispatch.
+    drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+    // Spin until all threads have acked.
+    // If our bit is cleared while waiting, elected reset due to insufficient resources.
+    while (true) {
+        if (is_completed()) return;
+        uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
+        if ((ack & all_acked) == all_acked) break;
+        if ((ack & (1u << thread_idx)) == 0) return;
+        SPIN_WAIT_HINT();
+    }
+
+    // Election -- exactly one thread wins the CAS.
+    int32_t expected = 0;
+    drain_state_.drain_worker_elected.compare_exchange_strong(
+        expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed  // +1 so 0 means "nobody elected"
+    );
+
+    if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
+        // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+        while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            if (is_completed()) return;
+            if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+            SPIN_WAIT_HINT();
+        }
+        return;
+    }
+
+    // Elected: check if global resources are sufficient.
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (slot_state == nullptr) {
+        // pending_task is observed null only when a concurrent drain completion
+        // already cleared it (drain_worker_dispatch nulls it before reopening the
+        // gate). That drain is done and this is a stale-elected thread, so just
+        // release the election lock and return. Do NOT clear drain_ack_mask or
+        // sync_start_pending: a *new* drain run may already be active and
+        // accumulating acks, and zeroing them would corrupt it into a hang.
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    int32_t available = count_global_available(shape, slot_state->active_mask.core_mask());
+
+    if (available < block_num) {
+        // Insufficient resources -- reset drain fields so threads can resume
+        // completion polling to free running cores, then retry.
+        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        return;  // sync_start_pending stays non-zero, so the drain is re-entered later
+    }
+
+    // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
+    drain_worker_dispatch(block_num);
+}
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h
new file mode 100644
index 000000000..88bcff170
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_CONTEXT_H
+#define SCHEDULER_CONTEXT_H
+
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/unified_log.h"
+#include "scheduler_types.h"
+
+#include "scheduler/pto_scheduler.h"
+
+#include "aicore_completion_mailbox.h"
+#include "pto2_dispatch_payload.h"
+
+// These macros are defined in runtime.h, but we cannot include it here
+// (it pulls in Handshake which we only forward-declare). Mirror the
+// authoritative values so the class layout compiles standalone.
+#ifndef RUNTIME_MAX_WORKER
+#define RUNTIME_MAX_WORKER 72
+#endif
+#ifndef RUNTIME_MAX_FUNC_ID
+#define RUNTIME_MAX_FUNC_ID 1024
+#endif
+
+// Forward declarations — avoid pulling in full headers for pointer/reference params.
+class Runtime;
+struct Handshake;
+struct PTO2Runtime;
+
+/**
+ * SchedulerContext: owns all scheduler-side state and methods.
+ *
+ * Held as a member of AicpuExecutor (sched_ctx_). The per-thread scheduling entry
+ * point is resolve_and_dispatch(), called once per scheduler thread.
+ *
+ * All dispatch/completion/drain/cold-path logic is implemented as private
+ * member methods, split across three .cpp files by responsibility:
+ *   - scheduler_completion.cpp (completion polling, drain protocol)
+ *   - scheduler_cold_path.cpp (exit checks, stall diagnostics, profiling)
+ *   - scheduler_dispatch.cpp (task dispatch loop and helpers)
+ */
+class SchedulerContext {
+public:
+    // =========================================================================
+    // Lifecycle
+    // =========================================================================
+
+    // Initialize scheduler state from the given runtime and thread layout.
+    //   - Discovers cores via handshake_all_cores()
+    //   - Assigns cores to scheduler threads
+    //   - Resets task counters, payloads, per-core GlobalContext
+    //   - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
+    //   - Captures AICore-register base (consumed by handshake_all_cores())
+    // Returns 0 on success, negative on failure (handshake / assignment error).
+    int32_t
+    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
+
+    // Reset all SchedulerContext-owned state to its post-construction defaults.
+    // Called by AicpuExecutor::deinit() during per-run teardown.
+    void deinit();
+
+    // =========================================================================
+    // Per-thread execution entry points (called by AicpuExecutor::run)
+    // =========================================================================
+
+    // Main scheduler thread entry: poll completion + dispatch ready tasks.
+    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx);
+
+    // Shutdown AICore registers for this thread's assigned cores.
+    // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled.
+    // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op.
+    int32_t shutdown(int32_t thread_idx);
+
+    // Run all post-orchestration scheduler bookkeeping:
+    //   - publishes core assignments to the perf collector (PTO2_PROFILING)
+    //   - latches submitted task count from PTO2 shared memory
+    //   - folds inline_completed_tasks into completed_tasks_
+    //   - flips orchestrator_done_ and triggers core transition
+    //     (skipped on fatal error — emergency_shutdown runs instead)
+    // Callers must invoke rt_orchestration_done(rt) before this — that
+    // step belongs to the orchestrator lifecycle, not the scheduler.
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks);
+
+    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
+    // mode where rt is created by the orchestrator thread after init().
+    void bind_runtime(PTO2Runtime *rt);
+
+    // =========================================================================
+    // State queries / external synchronization points
+    // =========================================================================
+
+    int32_t aic_count() const { return aic_count_; }
+    int32_t aiv_count() const { return aiv_count_; }
+    bool is_completed() const { return completed_.load(std::memory_order_acquire); }
+    int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); }
+
+    // Block until the first scheduler thread has finished one-time PTO2 init.
+    // Called by the orchestrator thread in device-orch mode.
+    void wait_pto2_init_complete() const;
+
+private:
+    // =========================================================================
+    // State
+    // =========================================================================
+
+    // --- Scheduler binding & per-core runtime state ---
+    alignas(64) PTO2SchedulerState *sched_{nullptr};
+    PTO2Runtime *rt_{nullptr};
+
+    // Per-core execution state, indexed by core_id (= worker_id)
+    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
+
+    // Cluster-ordered core trackers, one per scheduler thread
+    CoreTracker core_trackers_[MAX_AICPU_THREADS];
+
+    // Per-core dispatch payload storage: dual-buffer for pipelining.
+    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
+    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // Per-core deferred-completion software registration storage. This has
+    // the same runtime lifetime as payload_per_core_, but is kept out of the
+    // dispatch payload so normal task dispatch layout and cache footprint stay
+    // unchanged.
+    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // sync_start drain coordination
+    SyncStartDrainState drain_state_;
+
+#if PTO2_PROFILING
+    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
+    // Cached once at init() from get_l2_swimlane_level(), AFTER
+    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
+    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // --- Task-execution tracking ---
+    std::atomic completed_tasks_{0};
+    int32_t total_tasks_{0};
+    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
+    // volatile prevents the compiler from hoisting the load out of spin loops.
+    volatile bool orchestrator_done_{false};  // NOTE(review): volatile gives no atomicity/ordering guarantee
+    std::atomic completed_{false};
+    uint64_t *func_id_to_addr_{nullptr};
+
+    // --- Core-transition coordination ---
+    std::atomic transition_requested_{false};
+    std::atomic wait_reassign_{0};
+    std::atomic reassigned_{false};
+
+    // --- Thread/core configuration ---
+    int32_t active_sched_threads_{0};
+    int32_t sched_thread_num_{0};
+    bool orch_to_sched_{false};
+    int32_t aicpu_thread_num_{0};
+    int32_t cores_total_num_{0};
+
+    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
+    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aic_count_{0};
+    int32_t aiv_count_{0};
+
+    // Platform AICore-register base array (set by AicpuExecutor before init()).
+    uint64_t regs_{0};  // NOTE(review): init() also takes regs_base — presumably the same value; confirm
+
+#if PTO2_PROFILING
+    // PMU profiling: physical core IDs for PMU MMIO base resolution.
+    // Separate storage because CoreExecState's 64-byte budget has no room for
+    // physical_core_id when PTO2_PROFILING=1.
+    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{};
+#endif
+
+    // --- One-time init coordination ---
+    std::atomic pto2_init_claimed_{false};
+    std::atomic pto2_init_complete_{false};
+
+    // =========================================================================
+    // Core management (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
+    int32_t handshake_all_cores(Runtime *runtime);
+
+    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
+    bool assign_cores_to_threads();
+
+    // Re-distribute all cores across all threads after orchestration completes.
+    void reassign_cores_for_all_threads();
+
+    // Emergency shutdown: broadcast exit signal to every handshake'd core and
+    // deinit their AICore register blocks. Idempotent.
+    void emergency_shutdown(Runtime *runtime);
+
+    // =========================================================================
+    // Dispatch (scheduler_dispatch.cpp)
+    // =========================================================================
+
+    static const char *shape_name(PTO2ResourceShape shape);
+
+    // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs.
+    // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field
+    // convention already established in the stall log family.
+    static inline const char *subslot_name(PTO2SubtaskSlot s) {
+        switch (s) {
+        case PTO2SubtaskSlot::AIC:
+            return "aic";
+        case PTO2SubtaskSlot::AIV0:
+            return "aiv0";
+        case PTO2SubtaskSlot::AIV1:
+            return "aiv1";
+        }
+        return "?";  // reached only for a value outside the three enumerators handled above
+    }
+
+    int pop_ready_tasks_batch(
+        PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out,
+        int max_count
+    );
+
+    void build_payload(
+        PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+        const AsyncCtx &async_ctx, int32_t block_idx
+    );
+
+    // Batched-dispatch primitives. prepare_* builds the payload and per-core
+    // state; publish_* issues the MMIO register write. Callers must wmb()
+    // between the prepare batch and the publish batch, then sample
+    // get_sys_cnt_aicpu() once and pass it to publish_* for every handle.
+    //
+    // dispatch_timestamp_slot points to the CoreExecState slot
+    // (pending_dispatch_timestamp / running_dispatch_timestamp) selected at
+    // prepare time, or nullptr when L2 swimlane is below AICPU_TIMING and no
+    // dispatch timestamp is being recorded.
+    struct PublishHandle {
+        uint64_t reg_addr;     // first argument handed to write_reg() at publish time
+        uint32_t reg_task_id;  // value written to RegId::DATA_MAIN_BASE at publish time
+        int32_t core_offset;
+        uint64_t *dispatch_timestamp_slot;
+    };
+
+    PublishHandle prepare_subtask_to_core(
+        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+        bool to_pending, int32_t block_idx
+    );
+
+    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts) {
+        if (h.dispatch_timestamp_slot != nullptr) {
+            *h.dispatch_timestamp_slot = dispatch_ts;  // recorded before the register write below
+        }
+        write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast(h.reg_task_id));
+    }
+
+    // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
+    // caller-supplied handles buffer. Returns the number of handles written.
+    int prepare_block_for_dispatch(
+        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape,
+        bool to_pending, int32_t block_idx, PublishHandle *out_handles
+    );
+
+    void dispatch_shape(
+        int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
+        CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
+    );
+
+    // Speculative early-dispatch (Hook 1). After normal dispatch leaves idle
+    // cores spare, pre-stage the consumers of any RUNNING flagged producer onto
+    // those cores with not_ready=1 (gated). Touches no dependency state — the
+    // task is released by the doorbell at its normal ready-pop (Hook 2).
+    int32_t try_speculative_early_dispatch(int32_t thread_idx);
+
+    // Stage the already-claimed range [start, start+count) of consumer `c` onto
+    // thread_idx's idle (RUNNING slot) then pending (gated-pending, promote-on-FIN)
+    // cores from the provided free-core sets. The caller advances next_block_idx and
+    // re-pushes `c` BEFORE calling, so this expensive prepare+publish runs
+    // concurrently with peers (mirrors the normal SPMD dispatch path). Returns the
+    // number of blocks staged.
+    int32_t stage_consumer_blocks(
+        int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
+        CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
+    );
+
+    // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch
+    // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then
+    // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly
+    // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are
+    // skipped for the whole pass but MIX-PENDING still runs.
+    //
+    // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the
+    // current pass only. The next loop iteration re-evaluates after Phase 1
+    // completion polling and the global MIX queue draining (here or on any
+    // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput,
+    // not unbounded — once mix completes on at least one cluster, the next
+    // pass either drains the residual or admits AIC/AIV.
+    void dispatch_ready_tasks(
+        int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
+        bool pmu_active, bool &made_progress, bool &try_pushed
+    );
+
+    // Returns true if any *other* scheduler thread currently has an idle core
+    // matching `shape`. Used as a scheduling hint on the PENDING dispatch path
+    // — see the implementation in scheduler_dispatch.cpp for the hint-semantics
+    // rationale and the safety argument against the drain worker.
+    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const;
+
+    // True if mix tasks remain anywhere this thread could see them: the caller's
+    // MIX local LIFO stack or the global MIX ready queue. Approximate —
+    // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue
+    // positions with std::memory_order_relaxed and may interleave with concurrent
+    // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire
+    // loads — that one isn't on this path. A stale read here causes at most one
+    // extra/missed AIC/AIV skip and self-corrects on the next loop iteration.
+    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const {
+        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast(PTO2ResourceShape::MIX)].size() > 0;
+    }
+
+    // =========================================================================
+    // Completion & drain (scheduler_completion.cpp)
+    // =========================================================================
+
+    static SlotTransition decide_slot_transition(
+        int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated = false
+    );
+
+    void complete_slot_task(
+        PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx,
+        int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+        PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+        ,
+        uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+    );
+
+    static void promote_pending_to_running(CoreExecState &core);
+    static void clear_running_slot(CoreExecState &core);
+
+    void check_running_cores_for_completion(
+        int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+        bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+        PTO2LocalReadyBuffer *local_bufs
+    );
+
+    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num);
+    int32_t count_global_available(PTO2ResourceShape shape, uint8_t core_mask);
+    void drain_worker_dispatch(int32_t block_num);
+    void handle_drain_mode(int32_t thread_idx);
+
+    // =========================================================================
+    // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp)
+    // =========================================================================
+
+    __attribute__((noinline, cold)) LoopAction
+    handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
+
+    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
+
+    __attribute__((noinline, cold)) LoopAction
+    check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
+
+    __attribute__((noinline, cold)) void
+    log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count);
+
+    __attribute__((noinline, cold)) void log_shutdown_stall_snapshot(
+        int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
+    );
+
+    // Reverse lookup: given a global core_id, find which scheduler thread's
+    // tracker owns it. Returns -1 if not found. Linear scan — only used on
+    // the cold diagnostic path.
+    int32_t find_core_owner_thread(int32_t core_id) const;
+
+    // Does this thread own any core with a RUNNING task (running_slot_state set)?
+    // Gates the scheduler timeout fatal latch: a thread without an owned
+    // RUNNING task has no first-hand evidence of a stuck dispatch and must
+    // not declare global fatal on its own idle observation. The thread that
+    // does own the stuck task will reach the budget on its own polls and
+    // latch with valid evidence (or recover when the COND register flips).
+    bool self_owns_running_task(int32_t thread_idx) const;
+
+    // Does *any* scheduler thread own a RUNNING task? Used as the second
+    // fatal-latch condition: if the wall-clock budget elapsed AND no thread
+    // owns RUNNING work AND tasks remain incomplete, the system is in a
+    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
+    // ownerless idle threads are the only observers — let one of them latch.
+    bool no_thread_owns_running_task() const;  // NOTE(review): name implies true == nobody owns RUNNING work; confirm
+
+    __attribute__((noinline, cold)) int32_t handle_timeout_exit(
+        int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
+        int32_t last_progress_count
+#if PTO2_PROFILING
+        ,
+        uint64_t sched_start_ts
+#endif
+    );
+
+#if PTO2_PROFILING
+    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
+#endif
+
+    // =========================================================================
+    // Small inline helpers (get_function_bin_addr logs and returns 0 on a null map or out-of-range func_id)
+    // =========================================================================
+
+    uint64_t get_function_bin_addr(int func_id) const {
+        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
+            LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID);
+            return 0;
+        }
+        return func_id_to_addr_[func_id];
+    }
+};
+
+#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp
new file mode 100644
index 000000000..c727ff16c
--- /dev/null
+++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp
@@ -0,0 +1,1501 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#include "scheduler_context.h"
+
+#include
+#include
+#include
+
+#include "common.h"  // debug_assert
+
+#include "common/unified_log.h"
+#include "aicpu/device_time.h"
+#include "aicpu/platform_regs.h"
+#include "callable.h"
+#include "common/l2_swimlane_profiling.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "pto_runtime2.h"
+#include "runtime.h"
+#include "spin_hint.h"
+
+// Performance profiling headers
+#include "aicpu/l2_swimlane_collector_aicpu.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+// =============================================================================
+// Dispatch helpers
+// =============================================================================
+
+namespace {
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
+}
+
+// The speculative core bitmask (PTO2_SPEC_CORE_MASK_WORDS * 64 bits) must cover
+// every global core_id, and the per-core doorbell table is sized to match.
+static_assert(
+    RUNTIME_MAX_WORKER <= PTO2_SPEC_CORE_MASK_WORDS * 64, "staged_core_mask too small for RUNTIME_MAX_WORKER cores"
+);
+
+// Return a static, human-readable name for `shape` (used in log output).
+// Any value not listed in the switch yields "UNKNOWN".
+const char *SchedulerContext::shape_name(PTO2ResourceShape shape) {
+    switch (shape) {
+    case PTO2ResourceShape::AIC:
+        return "AIC";
+    case PTO2ResourceShape::AIV:
+        return "AIV";
+    case PTO2ResourceShape::MIX:
+        return "MIX";
+    case PTO2ResourceShape::DUMMY:
+        return "DUMMY";
+    }
+    return "UNKNOWN";
+}
+
+// Return true when any scheduler thread other than `self_thread_idx` currently
+// reports at least one idle core for `shape`; false otherwise.
+bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const {
+    // Cross-thread read of peer trackers without explicit synchronization. The
+    // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees
+    // single-copy atomicity for an 8-byte aligned load, so no torn read. The
+    // value is consumed only as a scheduling *hint* — a stale read at worst
+    // causes one missed/extra pending dispatch, corrected on the next iteration.
+    // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack
+    // barrier (all peers spin out of the dispatch path before any tracker
+    // mutation), so this routine is never racing the drain worker.
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        if (t == self_thread_idx) continue;
+        if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Pop up to `max_count` ready slot states of `shape` into `out` through
+// sched_->get_ready_tasks_batch and return how many were obtained.
+// With PTO2_PROFILING the per-thread pop hit/miss counters are updated (only at
+// swimlane level SCHED_PHASES or above); with PTO2_SCHED_PROFILING the pop is
+// additionally timed and the atomic/wait counters are passed to the queue.
+int SchedulerContext::pop_ready_tasks_batch(
+    PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+) {
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#if PTO2_SCHED_PROFILING
+    extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
+    uint64_t t_pop_start = get_sys_cnt_aicpu();
+    int count = sched_->get_ready_tasks_batch(
+        shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]
+    );
+    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
+#else
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
+        if (count > 0) {
+            l2_swimlane.pop_hit += count;
+        } else {
+            l2_swimlane.pop_miss++;
+        }
+    }
+#else
+    (void)thread_idx;
+    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+#endif
+    return count;
+}
+
+// Fill `dispatch_payload` for one subtask of `slot_state`:
+//   - resolve the kernel entry address for `subslot`,
+//   - pack tensor pointers first, then scalars, into args[],
+//   - fill the local context (block_idx, block_num, async_ctx) and store the
+//     local/global context pointers at their fixed arg indices,
+//   - set the not_ready gate from the task's spec_state.
+// NOTE(review): the cast expressions in this file appear without template
+// arguments (likely lost in extraction) — confirm against the real source.
+void SchedulerContext::build_payload(
+    PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
+    const AsyncCtx &async_ctx, int32_t block_idx
+) {
+    int32_t slot_idx = static_cast(subslot);
+    uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+    // NOTE(review): get_function_bin_addr returns 0 for an invalid func_id; the
+    // resulting null `callable` is dereferenced below without a check.
+    const CoreCallable *callable = reinterpret_cast(callable_addr);
+    dispatch_payload.function_bin_addr = callable->resolved_addr();
+    auto &payload = *slot_state.payload;
+    int n = 0;
+    for (int32_t i = 0; i < payload.tensor_count; i++) {
+        dispatch_payload.args[n++] = reinterpret_cast(&payload.tensors[i]);
+    }
+    for (int32_t i = 0; i < payload.scalar_count; i++) {
+        dispatch_payload.args[n++] = payload.scalars[i];
+    }
+    dispatch_payload.local_context.block_idx = block_idx;
+    dispatch_payload.local_context.block_num = slot_state.logical_block_num;
+    dispatch_payload.local_context.async_ctx = async_ctx;
+    dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast(&dispatch_payload.local_context);
+    dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast(&dispatch_payload.global_context);
+    // Speculative early-dispatch: a task being staged (Hook 1 set spec_state to
+    // STAGING before this call) is gated — the AICore must wait for the
+    // DATA_MAIN_BASE high-32 doorbell. All other dispatches run on pickup.
+    dispatch_payload.not_ready =
+        (slot_state.payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) ? 1 : 0;
+}
+
+// Prepare (but do not yet publish) one subtask on the core at `core_offset` of
+// this thread's tracker. Allocates the next register task id for the core,
+// builds the payload into the per-core buffer selected by the id's low bit,
+// records the slot in the core's pending (`to_pending`) or running state, and
+// marks the core pending-occupied. Returns a PublishHandle carrying the
+// register address, the task id, the core offset and (when AICPU timing is
+// profiled) the slot that should receive the dispatch timestamp.
+SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core(
+    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending,
+    int32_t block_idx
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto core_id = tracker.get_core_id_by_offset(core_offset);
+    CoreExecState &core_exec_state = core_exec_states_[core_id];
+
+    core_exec_state.dispatch_seq++;
+    uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+    static_assert(
+        (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity"
+    );
+    // Ids at or above AICORE_EXIT_SIGNAL are not handed out: advance the
+    // sequence past them and recompute the masked id.
+    if (reg_task_id >= AICORE_EXIT_SIGNAL) {
+        core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
+        reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+    }
+
+    // The id's parity selects which of the two per-core payload/slab buffers is used.
+    uint32_t buf_idx = reg_task_id & 1u;
+    PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
+    DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
+    deferred_slab->count = 0;
+    deferred_slab->error_code = PTO2_ERROR_NONE;
+    AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
+    build_payload(payload, slot_state, subslot, async_ctx, block_idx);
+
+    if (to_pending) {
+        core_exec_state.pending_subslot = subslot;
+        core_exec_state.pending_slot_state = &slot_state;
+        core_exec_state.pending_reg_task_id = static_cast(reg_task_id);
+    } else {
+        core_exec_state.running_subslot = subslot;
+        core_exec_state.running_slot_state = &slot_state;
+        core_exec_state.running_reg_task_id = static_cast(reg_task_id);
+        tracker.change_core_state(core_offset);
+    }
+    tracker.set_pending_occupied(core_offset);
+
+    LOG_DEBUG(
+        "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to"
+        " core_offset=%d core_id=%d reg_task_id=%u",
+        thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot),
+        static_cast(slot_state.task->task_id.raw), slot_state.task->kernel_id[0],
+        slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num,
+        core_offset, core_id, reg_task_id
+    );
+
+    // AICore buffer rotation lives on the dispatch path: count this dispatch
+    // and rotate before write_reg when we're about to cross a BUFFER_SIZE
+    // boundary. The completion-before-dispatch invariant makes this race-free
+    // (all prior tasks on this core have FIN'd, so AICore has dcci'd their
+    // records out of the old buffer). Gated on the same enable bit as flush
+    // so level=1 (AICORE_TIMING-only) participates without needing complete_task.
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) {
+        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
+    }
+#endif
+
+    uint64_t *dispatch_timestamp_slot = nullptr;
+#if PTO2_PROFILING
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+        dispatch_timestamp_slot =
+            to_pending ? &core_exec_state.pending_dispatch_timestamp : &core_exec_state.running_dispatch_timestamp;
+    }
+#endif
+
+    return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
+}
+
+// Prepare all subtasks of logical block `block_idx` of `slot_state`.
+// For MIX, `core_offset` is mapped through the tracker's get_aic/aiv0/aiv1
+// helpers and one handle is produced per core type set in the task's core
+// mask; for the other shapes exactly one handle is produced on `core_offset`.
+// Handles are written to `out_handles`; the return value is their count.
+int SchedulerContext::prepare_block_for_dispatch(
+    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending,
+    int32_t block_idx, PublishHandle *out_handles
+) {
+#if PTO2_PROFILING
+    if (is_dump_args_enabled()) {
+        dump_args_for_task(
+            thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
+            [](ActiveMask active_mask, int raw_subtask_id) {
+                return active_mask.subtask_active(static_cast(raw_subtask_id));
+            },
+            [this](int32_t func_id) {
+                return get_function_bin_addr(func_id);
+            }
+        );
+    }
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    if (shape == PTO2ResourceShape::MIX) {
+        uint8_t cmask = slot_state.active_mask.core_mask();
+        int n = 0;
+        if (cmask & PTO2_SUBTASK_MASK_AIC) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, to_pending,
+                block_idx
+            );
+        }
+        if (cmask & PTO2_SUBTASK_MASK_AIV0) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, to_pending,
+                block_idx
+            );
+        }
+        if (cmask & PTO2_SUBTASK_MASK_AIV1) {
+            out_handles[n++] = prepare_subtask_to_core(
+                thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, to_pending,
+                block_idx
+            );
+        }
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(cmask);
+#endif
+        return n;
+    } else if (shape == PTO2ResourceShape::AIC) {
+        out_handles[0] =
+            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
+#if PTO2_PROFILING
+        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
+#endif
+        return 1;
+    } else {
+        out_handles[0] =
+
prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx); +#if PTO2_PROFILING + sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1; +#endif + return 1; + } +} + +void SchedulerContext::dispatch_shape( + int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf, + CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed +) { +#if PTO2_SCHED_PROFILING + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; +#endif + if (entered_drain) return; + + bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING); + bool is_mix = (shape == PTO2ResourceShape::MIX); + auto cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase); + if (!cores.has_value()) return; + + while (cores.has_value() && !entered_drain) { + int want = cores.count(); + PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3]; + int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want); + if (got == 0) break; + + // sync_start exclusion gate. + // + // When the popped batch contains a sync_start task we MUST publish each + // prior task with its own wmb so AICore receives them with time + // separation. The drain coordinator's `count_global_available()` check + // reads the per-thread CoreTracker, and although `prepare_block_for_dispatch` + // marks cores occupied synchronously, the head-start between successive + // tasks is what lets the surrounding completion loop catch up on FINs in + // the retry window when the sync_start task hits insufficient resources. + // Bursting all prior tasks at the end of the pop (cross-task batching) + // collapses that head-start and causes spmd_sync_start_stress to time + // out via 507018 on ~40% of runs — see + // docs/investigations/2026-06-cross-task-batched-publish.md. 
+ // + // When the batch carries no sync_start task, no drain entry can happen + // in this pop, so we hoist `handles[]`, `wmb()`, and the publish loop + // out of the per-task body. One wmb amortizes across all tasks and one + // dispatch_ts is shared, which restores ~60 ns first-to-last AICore + // start span for single-block decode kernels (out_proj, q_proj, ...). + // Detection is a single mask check per task — cheap relative to even + // one register write. + bool any_sync_start = false; + for (int bi = 0; bi < got; bi++) { + if (batch[bi]->active_mask.requires_sync_start()) { + any_sync_start = true; + break; + } + } + + // handles[] is sized for the MIX worst case: total claims across the + // pop bounded by `cores.count() ≤ MAX_CLUSTERS`, and each block + // contributes ≤ 3 subtasks for MIX. + PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3]; + int handle_count = 0; + bool dispatched_any = false; + // Slots dispatched this pop whose dispatch_fanin must be propagated to + // consumers. Deferred until AFTER publish (below) so a flagged producer's + // fanout walk never sits between claiming cores and publishing its own + // blocks — doing it inline delays this thread's blocks while peer threads + // co-dispatching the same SPMD task publish immediately, misaligning the + // task's block starts. Bounded by cores.count() ≤ MAX_CLUSTERS dispatches. + PTO2TaskSlotState *prop_list[CoreTracker::MAX_CLUSTERS]; + int prop_n = 0; +#if PTO2_SCHED_PROFILING + uint64_t t_setup_start = get_sys_cnt_aicpu(); +#endif + + // Flush prepared-but-unpublished handles. Required before + // `enter_drain_mode` so the drain coordinator sees cores as occupied, + // and at the per-task boundary when `any_sync_start` is true. 
+ auto flush_publish = [&]() { + if (handle_count == 0) return; + wmb(); + uint64_t dispatch_ts = 0; +#if PTO2_PROFILING + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { + dispatch_ts = get_sys_cnt_aicpu(); + } +#endif + for (int i = 0; i < handle_count; i++) { + publish_subtask_to_core(handles[i], dispatch_ts); + } + handle_count = 0; + made_progress = true; + }; + + for (int bi = 0; bi < got; bi++) { + PTO2TaskSlotState *slot_state = batch[bi]; + CoreTracker::BitStates selected_mix_clusters(0ULL); + + if (is_mix) { + auto candidates = cores; + uint8_t cmask = slot_state->active_mask.core_mask(); + auto wanted = is_pending ? CoreTracker::MixPlacement::PENDING : CoreTracker::MixPlacement::RUNNING; + while (candidates.has_value()) { + int32_t cluster_offset = candidates.pop_first(); + if (tracker.classify_mix_cluster(cluster_offset, cmask) == wanted) { + selected_mix_clusters |= CoreTracker::BitStates(1ULL << cluster_offset); + } + } + if (!selected_mix_clusters.has_value()) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + continue; + } + } + + // (Speculative pre-staged tasks never reach this ready-pop: they are + // released by their doorbell in release_fanin_and_check_ready the + // instant their last producer completes — see try_speculative_release.) + + if (slot_state->active_mask.requires_sync_start()) { + if (is_pending) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + continue; + } + int32_t available = is_mix ? 
selected_mix_clusters.count() : cores.count(); + if (available < slot_state->logical_block_num) { + flush_publish(); + if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + } + for (int rem = bi + 1; rem < got; rem++) { + sched_->ready_queues[static_cast(shape)].push(batch[rem]); + } + entered_drain = true; + break; + } + } + + if (!cores.has_value()) { + flush_publish(); + sched_->ready_queues[static_cast(shape)].push_batch(&batch[bi], got - bi); + break; + } + + dispatched_any = true; + try_pushed = true; + // Record for deferred dispatch_fanin propagation after this pop's + // blocks are published (see after the loop). propagate's own guard + // filters non-flagged slots, so recording unconditionally is cheap. + if (prop_n < static_cast(sizeof(prop_list) / sizeof(prop_list[0]))) { + prop_list[prop_n++] = slot_state; + } + // Claim a contiguous range of blocks, hand the slot back to the + // ready queue immediately, then perform the expensive dispatches. + // This lets other schedulers concurrently claim and dispatch the + // remaining blocks of the same SPMD task instead of spinning while + // this thread fills all its own cores. Only local `start + b` is + // read after the push — `next_block_idx` may already be advanced + // by another scheduler that popped the slot. + int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed); + int32_t remaining = slot_state->logical_block_num - start; + int32_t available = is_mix ? selected_mix_clusters.count() : cores.count(); + int32_t claim = std::min(available, remaining); + slot_state->next_block_idx.store(static_cast(start + claim), std::memory_order_relaxed); + + if (start + claim < slot_state->logical_block_num) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + } + + for (int32_t b = 0; b < claim; b++) { + auto core_offset = is_mix ? 
selected_mix_clusters.pop_first() : cores.pop_first(); + if (is_mix) { + cores.clear_bit(core_offset); + } + handle_count += prepare_block_for_dispatch( + thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count] + ); + } + + // Sync_start exclusion: flush per task so prior tasks have head- + // start time before any sync_start drain check. Normal batches + // fall through and accumulate for one cross-task flush at the + // end of the pop. + if (any_sync_start) { + flush_publish(); + } + } + + flush_publish(); + // Blocks are published; now propagate dispatch_fanin for any flagged + // producers dispatched above (knob A: producer is running). Off the + // pre-publish path so it cannot delay or misalign their blocks. + for (int i = 0; i < prop_n; i++) { + sched_->propagate_dispatch_fanin(*prop_list[i]); + } +#if PTO2_SCHED_PROFILING + l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); +#endif + + if (!dispatched_any) break; + + if (!cores.has_value()) { + cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase); + } + } +} + +void SchedulerContext::dispatch_ready_tasks( + int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], + bool pmu_active, bool &made_progress, bool &try_pushed +) { + using Phase = CoreTracker::DispatchPhase; + constexpr int32_t MIX_I = static_cast(PTO2ResourceShape::MIX); + + // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle + // through this 2-elem array, with order toggled by thread parity for + // shape-level load balancing across threads. 
+ static constexpr PTO2ResourceShape kAicAivOrder[2][2] = { + {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV}, + {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC}, + }; + const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1]; + + // Spill overflow from local_bufs to the shared ready queue BEFORE we start + // dispatching. release_fanin's fast path packs all newly-ready consumers + // into the producing thread's local_bufs (zero atomic, peer-invisible). For + // batch releases (e.g. attn_fence → 50 out_proj consumers) that + // overshoots this thread's slot budget so peers are starving while we + // hoard. The cross-thread invisibility window between "complete pushes 50 + // to local" and "IDLE-AIC's mid-phase flush exposes overflow to shared" + // is what shows up in the swimlane as the multi-microsecond inter-thread + // stagger on out_proj's first wave. + // + // Gate conditions: + // (a) local count exceeds this thread's per-shape block budget — we + // can't dispatch them all even with both RUNNING+PENDING slots; + // (b) at least one peer has idle cores in this shape — they want work. + // Both must hold to avoid wasting a CAS push when we could profitably + // self-dispatch the overflow. Condition (b) reads peer CoreTracker + // (plain 8-byte load on a rarely-contended cache line, ~5 ns) — we + // deliberately avoid ready_queues[s].size() here, which is two atomic + // loads on lines pushers + poppers actively bounce. + // + // Capacity derives from how cores are partitioned across sched threads: + // per-shape budget = (PLATFORM_MAX_BLOCKDIM / active_sched_threads_) + // × cores_per_blockdim_for_that_shape + // MIX is 1 cluster per block dim, so its budget equals the block-dim + // share without multiplying. + // + // Push the trailing `excess` slot pointers — O(1) count decrement, no + // memmove. push_batch is one CAS for the whole excess; peers see the + // batch immediately and can race for them. 
+ const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_; + const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = { + /*AIC=*/bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM, + /*AIV=*/bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM, + /*MIX=*/bd_per_thread, + }; + for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) { + auto &lb = local_bufs[s]; + int32_t excess = lb.count - thread_capacity[s]; + if (excess <= 0) continue; + if (!has_idle_in_other_threads(thread_idx, static_cast(s))) continue; + sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess); + lb.count -= excess; + } + + auto flush_local_bufs = [&]() { + for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) { + auto &lb = local_bufs[s]; + if (lb.count > 0) { + sched_->ready_queues[s].push_batch(lb.slot_states, lb.count); + lb.count = 0; + } + } + }; + // Every return path below must flush; wrap in RAII so we cannot forget. + // The mid-function flush between IDLE and PENDING is still called + // explicitly — guard only covers exit. + struct FlushGuard { + decltype(flush_local_bufs) &flush_fn; + ~FlushGuard() { flush_fn(); } + } flush_guard{flush_local_bufs}; + + bool entered_drain = false; + + // ===== IDLE stage ===== + dispatch_shape( + thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress, + try_pushed + ); + if (entered_drain) return; + + // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass. + // MIX-PENDING below still runs — that is the core of "mix strict priority": + // pending slots are spent on mix before AIC/AIV get any chance. 
+ bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]); + + if (!skip_aic_aiv) { + for (int i = 0; i < 2; i++) { + PTO2ResourceShape s = aic_aiv[i]; + dispatch_shape( + thread_idx, s, Phase::IDLE, local_bufs[static_cast(s)], tracker, entered_drain, made_progress, + try_pushed + ); + if (entered_drain) return; + } + } + + // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any + // peer-thread reads see the IDLE-stage release_fanin output. + flush_local_bufs(); + + if (pmu_active) return; + + // ===== PENDING stage ===== + // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that + // peer's next IDLE-MIX iteration will pull the mix task from the global + // queue (already flushed above) at lower latency than us pre-loading a + // pending slot here. Forward progress for MIX is preserved: at least one + // thread will run MIX-IDLE next pass and consume the residual. + // + // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain + // via pending slots on this thread when no peer is idle. + if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) { + dispatch_shape( + thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain, + made_progress, try_pushed + ); + if (entered_drain) return; + } + + // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave + // it set; otherwise, escalate iff PENDING-MIX left residual. + if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) { + skip_aic_aiv = true; + } + + // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin + // during in-flight completions; flush_guard ensures these don't carry + // across to the next iteration's IDLE stage. + if (skip_aic_aiv) return; + + // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer + // will pull from the global queue on its next IDLE pass. 
+ for (int i = 0; i < 2; i++) { + PTO2ResourceShape s = aic_aiv[i]; + if (has_idle_in_other_threads(thread_idx, s)) continue; + dispatch_shape( + thread_idx, s, Phase::PENDING, local_bufs[static_cast(s)], tracker, entered_drain, made_progress, + try_pushed + ); + if (entered_drain) return; + } +} + +// Stage the ALREADY-CLAIMED range [start, start+count) of consumer `c` onto +// thread_idx's idle then pending cores. The caller (the queue drain) has advanced +// next_block_idx by `count` under pop-exclusivity AND re-pushed `c` for peers +// BEFORE calling this — so this, the expensive prepare+publish, runs CONCURRENTLY +// with peers staging other ranges of the same consumer. This mirrors the normal +// SPMD dispatch path (claim range -> store next_block_idx -> re-push -> dispatch). +// `idle`/`pend` are this thread's free-core sets, sized so idle.count+pend.count >= +// count (the caller clamped the claim to them), so all `count` blocks get a core. +// +// Rule 1: idle cores -> gated task in the RUNNING slot. Rule 2: PENDING slot of +// cores running a real task -> promoted in when that task FINs (gated-pending Case +// 3.3 in decide_slot_transition completes the running FIN + promotes instead of +// waiting for an ack the gated task never sends). Each staged core stays +// pending_occupied while gated, so no second gated block stacks on it. +// +// Self-ring: release flips STAGING->DISPATCHED then rings the mask. A block staged +// after that flip isn't in the mask release read, so this thread rings it here. The +// seq_cst order between "OR mask then load spec_state" (here) and "store DISPATCHED +// then read mask" (release) guarantees every gated core's doorbell fires. 
+// Parameters:
+//   thread_idx  scheduler thread whose tracker/cores are used.
+//   c           consumer slot being pre-staged.
+//   shape       resource shape passed through to prepare_block_for_dispatch.
+//   start       first block index of the claimed range.
+//   count       number of claimed blocks to stage.
+//   idle, pend  free-core sets, consumed (pop_first) as cores are used.
+// Returns the number of blocks actually staged.
+int32_t SchedulerContext::stage_consumer_blocks(
+    int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
+    CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
+) {
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    // Stamp the real pre-stage time (NOT 0) so the swimlane shows these blocks
+    // dispatched during the producer's run, not at trace start.
+    uint64_t early_dispatch_ts = get_sys_cnt_aicpu();
+    uint64_t my_cores[PTO2_SPEC_CORE_MASK_WORDS] = {0};  // cores this thread gated (for self-ring)
+    int32_t staged = 0;
+    int32_t block = start;
+    // Stages blocks onto the cores in `avail` until either `count` or `avail`
+    // is exhausted. `count`, `block` and `staged` are captured by reference, so
+    // the second call (pending cores) only receives what the first left over.
+    auto stage_from = [&](CoreTracker::BitStates &avail, bool to_pending) {
+        // Mirror the normal flush_publish (scheduler_dispatch.cpp wmb()+publish loop):
+        // prepare all claimed blocks' payloads, one wmb(), then publish. The wmb
+        // guarantees the not_ready gate + args are globally visible before any
+        // DATA_MAIN_BASE token — without it a gated core can pick up the token and
+        // dcci a stale payload (the doorbell/release path mirrors normal dispatch).
+        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+        int n = 0;
+        while (count > 0 && avail.has_value()) {
+            int32_t core_offset = avail.pop_first();
+            n += prepare_block_for_dispatch(thread_idx, core_offset, *c, shape, to_pending, block, &handles[n]);
+            block++;
+            count--;
+            staged++;
+        }
+        if (n == 0) return;
+        wmb();
+        for (int i = 0; i < n; i++) {
+            publish_subtask_to_core(handles[i], early_dispatch_ts);
+            // Remember where and with which token this core's doorbell must be
+            // rung, and mark the core in this thread's private bitmask.
+            int32_t cid = tracker.get_core_id_by_offset(handles[i].core_offset);
+            sched_->spec_doorbell_table[cid].addr = handles[i].reg_addr;
+            sched_->spec_doorbell_table[cid].token = handles[i].reg_task_id;
+            my_cores[cid >> 6] |= (1ULL << (cid & 63));
+        }
+    };
+    if (idle.has_value()) stage_from(idle, /*to_pending=*/false);
+    if (pend.has_value()) stage_from(pend, /*to_pending=*/true);
+    // Publish all this thread's gated cores into the shared mask in one OR per word
+    // (vs one per subtask) so release sees them; seq_cst keeps the self-ring order.
+    for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
+        if (my_cores[w] != 0) c->payload->staged_core_mask[w].fetch_or(my_cores[w], std::memory_order_seq_cst);
+
+    // If release already flipped DISPATCHED, it may have read the mask before our
+    // bits landed — ring our own cores so none is left gated forever.
+    if (staged > 0 && c->payload->spec_state.load(std::memory_order_seq_cst) == PTO2_SPEC_DISPATCHED) {
+        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
+            uint64_t bits = my_cores[w];
+            while (bits != 0) {
+                int cid = w * 64 + __builtin_ctzll(bits);  // index of the lowest set bit
+                bits &= bits - 1;                          // clear that bit
+                PTO2SchedulerState::ring_one_doorbell(
+                    sched_->spec_doorbell_table[cid].addr, sched_->spec_doorbell_table[cid].token
+                );
+            }
+        }
+    }
+    return staged;
+}
+
+// Early-dispatch drain (idle pass).
Candidates are pushed to early_dispatch_queue +// EVENT-DRIVEN by propagate_dispatch_fanin (a flagged producer's dispatch bumps its +// consumers' dispatch_fanin; reaching fanin_count enqueues the consumer) — there is +// no per-iteration PULL scan here anymore. This pass only DRAINS the queue. +// Returns the number of blocks staged this pass (for the EarlyDispatch swimlane bar). +int32_t SchedulerContext::try_speculative_early_dispatch(int32_t thread_idx) { + constexpr int PTO2_EARLY_DISPATCH_DRAIN_MAX = 8; // bounded pops per pass + CoreTracker &tracker = core_trackers_[thread_idx]; + int32_t total_staged = 0; + + // Drain the queue — mirrors the normal SPMD dispatch path. Pop a consumer, + // CLAIM a range sized to THIS thread's free cores by advancing next_block_idx with + // a CAS (atomic — next_block_idx is shared with normal dispatch, which also claims + // it if release routes the consumer to the ready queue, so a plain store could + // double-dispatch), RE-PUSH it for peers, THEN do the expensive prepare+publish. + // Re-pushing before staging lets peers claim the next range and stage CONCURRENTLY + // — a wide consumer (online_softmax, 48 blocks) is filled by all idle threads in + // parallel instead of a serial winner-then-peer daisy chain. Bounded pops/pass. + for (int n = 0; n < PTO2_EARLY_DISPATCH_DRAIN_MAX; n++) { + PTO2TaskSlotState *c = sched_->early_dispatch_queue.pop(); + if (c == nullptr) break; + if (c->payload->spec_state.load(std::memory_order_acquire) != PTO2_SPEC_STAGING) continue; // released + PTO2ResourceShape shape = c->active_mask.to_shape(); + auto idle = tracker.get_idle_core_offset_states(shape); + auto pend = tracker.get_pending_core_offset_states(shape); + int32_t freecores = (idle.has_value() ? idle.count() : 0) + (pend.has_value() ? 
pend.count() : 0); + if (freecores == 0) { // no free cores of this shape — give it back for peers and stop. NOTE(review): push() result is ignored here, while the re-push below logs on a full queue — confirm silently dropping c from the early-dispatch queue is acceptable + sched_->early_dispatch_queue.push(c); + break; + } + // CAS-claim a contiguous range [start, start+claim) sized to this thread's + // free cores; CAS keeps it atomic against peers AND normal dispatch. + int32_t start = 0, claim = 0; + while (true) { + int16_t cur = c->next_block_idx.load(std::memory_order_relaxed); + if (cur >= c->logical_block_num) break; // fully claimed + int32_t cnt = c->logical_block_num - cur; + if (cnt > freecores) cnt = freecores; + if (c->next_block_idx.compare_exchange_weak( + cur, static_cast(cur + cnt), std::memory_order_seq_cst, std::memory_order_relaxed + )) { + start = cur; + claim = cnt; + break; + } + } + if (claim == 0) continue; // nothing left to claim -> drop (no re-push) + // Re-push for concurrent peers BEFORE the expensive staging. + if (start + claim < c->logical_block_num) { + if (!sched_->early_dispatch_queue.push(c)) + LOG_INFO_V9( + "[SPEC] queue full on re-push, consumer=%" PRId64, static_cast(c->task->task_id.raw) + ); + } + total_staged += stage_consumer_blocks(thread_idx, c, shape, start, claim, idle, pend); + } + return total_staged; +} + +// ============================================================================= +// Main scheduler dispatch loop +// ============================================================================= + +int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) { + always_assert(sched_ != nullptr); + CoreTracker &tracker = core_trackers_[thread_idx]; + LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx); + + PTO2SharedMemoryHeader *header = sched_->sm_header; + if (!header) { + LOG_ERROR("PTO2 dispatch: header is null"); + return -1; + } + LOG_INFO_V0( + "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast(header), + static_cast(header->rings[0].task_descriptors_offset), 
static_cast(header->rings[0].task_window_size) + ); + + Handshake *hank = static_cast(runtime->workers); + LOG_INFO_V0( + "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast(hank), + static_cast(header->rings[0].task_window_size) + ); + + // One-time init: assign perf buffers (one thread does it; others wait) + if (!pto2_init_claimed_.exchange(true, std::memory_order_acq_rel)) { + LOG_INFO_V0("Thread %d: doing one-time init", thread_idx); + +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + dump_args_init(orch_to_sched_ ? aicpu_thread_num_ : sched_thread_num_); + } +#endif + +#if PTO2_PROFILING + // Initialize PMU: program events, start counters, and pop initial buffers + if (is_pmu_enabled()) { + pmu_aicpu_init(physical_core_ids_, cores_total_num_); + LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_); + } +#endif + + LOG_INFO_V0("Thread %d: one-time init done", thread_idx); + pto2_init_complete_.store(true, std::memory_order_release); + } else { + while (!pto2_init_complete_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } + } + + LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num()); + int32_t cur_thread_completed = 0; + // Non-zero once a scheduler-hang timeout latches; returned in place of the + // completed count so the caller still sees the negative error rc while the + // shared end-of-loop flush below runs. 
+ int32_t timeout_rc = 0; + int32_t idle_iterations = 0; + int32_t last_progress_count = 0; +#if PTO2_PROFILING + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; + l2_swimlane.reset(); + l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED); +#endif + + constexpr int LOCAL_READY_CAP_PER_TYPE = 64; + PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE]; + PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES]; + for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE); + } + PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP]; + int32_t deferred_release_count = 0; + + bool cores_released = false; + + // PMU runs require single-issue dispatch — overlapping in-flight tasks + // pollute per-task PMU counters, so skip the PENDING pre-load phase. + // Cached at function scope: is_pmu_enabled() is extern "C" and the + // compiler cannot hoist it across the dispatch loop on its own. + const bool pmu_active = is_pmu_enabled(); + +#if PTO2_PROFILING + l2_swimlane.sched_start_ts = get_sys_cnt_aicpu(); +#endif + +#if PTO2_PROFILING + // Queue-depth snapshot carried across the iteration boundary: each phase + // emit consumes (phase_start_*) and refreshes them with its own end snapshot + // so the next phase's "at_start" equals the previous phase's "at_end". + // + // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX. + // + // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer) + // is a single int read on a register-cached stack — free. Shared depth + // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines + // that all peer sched threads also write to (enqueue_pos and dequeue_pos + // bounce on every flush_local_bufs + every pop). 
With both phases emitting + // per iter that's 12 cross-core loads × thousands of iters per run, a + // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared + // snapshot, refreshed at most once per iteration. The complete-emit and + // dispatch-emit in the same iter both reuse the same shared sample; the + // big transitions (local→shared flush) still show up across iter boundaries. + static_assert( + L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES, + "queue snapshot width must match runtime resource shape count" + ); + int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0}; + int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0}; + int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0}; + bool iter_shared_sampled = false; + auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) { + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + local_out[s] = static_cast(local_bufs[s].count); + } + }; + auto get_or_sample_shared = [&]() -> const int16_t * { + if (!iter_shared_sampled) { + // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE + // is in the low thousands today but could grow with platform + // scaling — without clamp, sizes above 32767 wrap to negatives + // and silently corrupt the snapshot. 
+ constexpr size_t kMax = static_cast(std::numeric_limits::max()); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + const size_t qsize = sched_->ready_queues[s].size(); + iter_shared_snapshot[s] = static_cast(std::min(qsize, kMax)); + } + iter_shared_sampled = true; + } + return iter_shared_snapshot; + }; + auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES], + int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) { + capture_local_snapshot(local_out); + const int16_t *shared_cached = get_or_sample_shared(); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) + shared_out[s] = shared_cached[s]; + }; + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + capture_phase_end(phase_start_local, phase_start_shared); + } +#endif + + // Wall-clock timestamp of the last completed task on this thread. + // Updated on made_progress; consulted to decide whether the wall-clock + // budget for declaring a scheduler hang has elapsed. Initialized to + // "now" so the first budget cycle starts when this thread does, not at + // an undefined value. + uint64_t last_progress_ts = get_sys_cnt_aicpu(); + + while (true) { + if (completed_.load(std::memory_order_acquire)) { + break; + } + bool made_progress = false; +#if PTO2_PROFILING + CYCLE_COUNT_START(); + l2_swimlane.sched_loop_count++; + uint64_t _t0_phase = _t0; + // Release is the only "no Complete/Dispatch bar" attribution we keep — + // emitted with its own span in the idle branch below. Iterations that + // only scan/poll show as blank gaps; the per-loop Poll/Scan bars (PR + // #1079 debug overlay) were removed since "scheduler is polling when + // there's nothing to do" carries no actionable signal. + // Per-iter lazy shared-queue snapshot: first phase emit in this iter + // pays the atomic-load cost, subsequent emits in the same iter reuse + // the cached value. Reset here so we re-sample exactly once per iter + // (or skip entirely on iters with no phase emit). 
+ iter_shared_sampled = false; +#endif + int32_t task_count = 0; + if (!tracker.has_any_running_cores()) { + LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count); + if (action == LoopAction::BREAK_LOOP) break; + } + + if (!cores_released && orch_to_sched_) { + LoopAction action = handle_core_transition(cores_released); + if (action == LoopAction::BREAK_LOOP) break; + } + +#if PTO2_PROFILING + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); +#endif + + // Phase 1: Check running cores for completion + int32_t completed_this_turn = 0; + + bool try_completed = tracker.has_any_running_cores(); + if (try_completed) { + check_running_cores_for_completion( + thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, + deferred_release_slot_states, deferred_release_count, local_bufs + ); + } + if (completed_this_turn > 0) { +#if PTO2_SCHED_PROFILING + sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed); +#endif + int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed); + int32_t new_total = prev + completed_this_turn; + last_progress_count = new_total; + if (thread_idx == 0 && task_count > 0) { + if (new_total <= PROGRESS_VERBOSE_THRESHOLD || + new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) { + LOG_INFO_V9( + "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count, + 100.0 * new_total / task_count + ); + } + } + } + + if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && + (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) { + AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete( + rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count, + PTO2_DEFERRED_RELEASE_CAP +#if PTO2_SCHED_PROFILING + , + thread_idx +#endif + ); + if (poll_result.error_code != PTO2_ERROR_NONE) { + int32_t expected = PTO2_ERROR_NONE; + 
header->sched_error_code.compare_exchange_strong( + expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire + ); + completed_.store(true, std::memory_order_release); + break; + } + if (poll_result.completed > 0) { +#if PTO2_SCHED_PROFILING + sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed); +#endif + int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed); + int32_t new_total = prev + poll_result.completed; + last_progress_count = new_total; + made_progress = true; + } + } + +#if PTO2_PROFILING + if (!try_completed) { + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); + } else { + CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle); + // Emit on any completion work this iteration — a finished slot OR + // sub-block retires that did not finish a slot. The latter makes the + // SPMD harvest tail visible (count field = blocks processed this + // iteration; on a pure-retire iteration phase_complete_count is 0). + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && + (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) { + // Local depth is cheap (this thread's own buffer counter). + // Shared depth is NOT sampled here: complete's release_fanin + // pushes to local_bufs in the fast path (try_push succeeds + // until cap=64). Shared only changes on dispatch's flush + // path. Carrying phase_start_shared forward as end_shared + // is the right answer 99% of the time AND skips three + // contended atomic loads per emit. 
+ int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_local_snapshot(phase_end_local); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count, /*pop_hit=*/0, + /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + // phase_start_shared unchanged — carried forward + } + _t0_phase = _t1; + l2_swimlane.phase_complete_count = 0; + l2_swimlane.phase_subretire_count = 0; + } + } +#endif + + bool try_pushed = false; + + // Phase 2 drain check + if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + handle_drain_mode(thread_idx); + continue; + } + + // Phase 3: Drain wiring queue (thread 0 only) + int wired = 0; + if (thread_idx == 0) { + wired = sched_->drain_wiring_queue(orchestrator_done_); + if (wired > 0) { + made_progress = true; +#if PTO2_SCHED_PROFILING + l2_swimlane.phase_wiring_count += wired; +#endif + } + } +#if PTO2_PROFILING + CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle); + // Wire outer phase: emit one bar covering this iter's drain_wiring_queue + // pass when it wired any tasks. tasks_processed = wired count. Resolve + // does NOT nest under Wire — wiring only enqueues, the consumer release + // happens later in Complete/Dummy. 
+ if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && wired > 0) { + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_local_snapshot(phase_end_local); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Wire, _t0_phase, _t1, l2_swimlane.sched_loop_count, + static_cast(wired), /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local, phase_start_shared, + phase_end_local, phase_start_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + } + _t0_phase = _t1; + } +#endif + + // Phase 3b: Drain dummy ready queue (thread 0 only). + // + // Dependency-only tasks bypass AICore dispatch: they go through the + // scheduler so fanin/fanout edges stay consistent, but completion is + // signalled inline here. Pinned to thread 0 to avoid cross-thread + // races and to keep cache hot near the wiring drain above. + if (thread_idx == 0) { + constexpr int DUMMY_DRAIN_BATCH = 16; + PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH]; + int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH); +#if PTO2_PROFILING + // Dummy outer phase: covers handling of all dummies popped this + // iter. Per-dummy DummyTask markers are emitted to a SEPARATE lane + // (Worker View AICPU_N) by the converter, so they do not nest + // under this bar. Resolve emits below DO land on the sched lane + // and nest under this Dummy outer by time containment. + uint64_t dummy_outer_t0 = + (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; +#endif + for (int di = 0; di < dummy_got; di++) { + PTO2TaskSlotState &dummy_slot = *dummy_batch[di]; + + // ----- DummyTask phase: dummy "task" identity marker. -------- + // The dummy has no AICore presence — start ≈ end (1 cycle + // wide, just "we identified it"). Converter renders this on + // Worker View's DUMMY_T{thread} lane so the DAG node is + // visually present. 
tasks_processed = task_token low 32 bits + // (= local_id within ring) so deps.json flow arrows can land. + // The Resolve work that follows is emitted separately below. +#if PTO2_PROFILING + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + uint64_t dummy_marker_t = get_sys_cnt_aicpu(); + uint32_t dummy_id_low32 = static_cast(dummy_slot.task->task_id.raw & 0xFFFFFFFFu); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::DummyTask, dummy_marker_t, dummy_marker_t, + sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_id_low32 + ); + } +#endif + + // ----- Resolve work: walk this dummy's consumer list. ------ + // Same 1 µs filter as the main-path Resolve emit suppresses + // dummies whose consumer release runs sub-microsecond. +#if PTO2_PROFILING + uint64_t dummy_resolve_t0 = + (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; +#endif + // [[maybe_unused]] silences -Werror=unused-but-set-variable on + // the profiling-flags-smoke build path where PTO2_PROFILING is + // OFF and the Resolve emit below is excluded. 
+ [[maybe_unused]] uint32_t dummy_consumers = 0; +#if PTO2_SCHED_PROFILING + dummy_consumers = sched_->on_task_complete(dummy_slot, thread_idx, local_bufs).fanout_edges; +#else + dummy_consumers = sched_->on_task_complete(dummy_slot, local_bufs); +#endif +#if PTO2_PROFILING + if (dummy_resolve_t0 != 0) { + uint64_t dummy_resolve_t1 = get_sys_cnt_aicpu(); + constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000; // 1 µs + if (dummy_resolve_t1 - dummy_resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) { + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Resolve, dummy_resolve_t0, dummy_resolve_t1, + sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_consumers + ); + } + } +#endif + // Dummy tasks have no subtasks to retire and no fanout pre-conditions + // beyond their own producers; release self-reference so the slot can + // reach CONSUMED once all consumers drain. + deferred_release_slot_states[deferred_release_count++] = &dummy_slot; + if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) { + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sched_->on_task_release( + *deferred_release_slot_states[--deferred_release_count], thread_idx + ); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + } + int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed); + last_progress_count = prev + 1; + cur_thread_completed++; + } + if (dummy_got > 0) { + made_progress = true; + } +#if PTO2_PROFILING + // Emit Dummy outer over the dummy_drain pass. Span starts at + // dummy_outer_t0 (captured right after pop_batch returns, and only when it popped entries) and ends at "now". + // tasks_processed = dummy_got. Advancing _t0_phase here makes the + // following Dispatch / EarlyDispatch / second-Complete bars start + // at this end. 
+ if (dummy_outer_t0 != 0) { + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_local_snapshot(phase_end_local); + uint64_t dummy_outer_t1 = get_sys_cnt_aicpu(); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Dummy, dummy_outer_t0, dummy_outer_t1, + l2_swimlane.sched_loop_count, static_cast(dummy_got), /*pop_hit=*/0, + /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + } + _t0_phase = dummy_outer_t1; + // We do NOT re-sync _t0/_t1 — the dummy span will be absorbed + // into the next CYCLE_COUNT_LAP accumulator. The phase-model + // anchor (_t0_phase) is the authoritative source for bar spans + // on the swimlane; the cycle accumulators are coarse aggregates. + } +#endif + } + + // Phase 4: MIX-strict-priority dispatch with phase-split and + // cross-thread idle gating. See dispatch_ready_tasks for the policy. +#if PTO2_PROFILING + uint64_t dispatch_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; +#endif + dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed); +#if PTO2_PROFILING + // Emit Dispatch IMMEDIATELY after dispatch_ready_tasks so its span + // covers the actual publish work — not the trailing second-poll / + // early-dispatch time. (Pre-redesign the Dispatch emit lived at iter + // end with span extending past the second poll, which made finish_time + // events from the second poll fall under the Dispatch bar rather than + // a Complete bar of their own — confusing for trace consumers.) 
+ if (dispatch_t0 != 0 && try_pushed && l2_swimlane.phase_dispatch_count > 0) { + uint64_t dispatch_t1 = get_sys_cnt_aicpu(); + uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; + debug_assert(pop_hit_delta < (1ULL << 32)); + debug_assert(pop_miss_delta < (1ULL << 32)); + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_phase_end(phase_end_local, phase_end_shared); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, dispatch_t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_dispatch_count, static_cast(pop_hit_delta), + static_cast(pop_miss_delta), phase_start_local, phase_start_shared, phase_end_local, + phase_end_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + phase_start_shared[s] = phase_end_shared[s]; + } + _t0_phase = dispatch_t1; + l2_swimlane.phase_dispatch_count = 0; + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; + } +#endif + + // Phase 4b: early-dispatch onto spare cores, but ONLY when this thread is + // otherwise idle — nothing was dispatched this iteration AND no ready work is + // queued for any shape. Early-dispatch competes with normal dispatch for + // pending slots, so gating on "no ready work" keeps it from delaying a real + // ready task; skipping the early_dispatch_queue drain when busy also removes its + // per-iteration cost (the drain only runs on genuinely idle passes). 
+ bool any_ready_work = try_pushed; + for (int s = 0; !any_ready_work && s < PTO2_NUM_RESOURCE_SHAPES; s++) { + if (sched_->ready_queues[s].size() > 0 || local_bufs[s].count > 0) any_ready_work = true; + } +#if PTO2_PROFILING + bool early_dispatch_record = l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES; + uint64_t early_dispatch_t0 = early_dispatch_record ? get_sys_cnt_aicpu() : 0; +#endif + // Skip speculative early-dispatch under PMU: dispatch_ready_tasks already + // withholds PENDING dispatch when pmu_active to preserve single-issue PMU + // windows, and staging gated work into idle/pending slots would perturb the + // same windows. + [[maybe_unused]] int32_t staged_count = + (pmu_active || any_ready_work) ? 0 : try_speculative_early_dispatch(thread_idx); +#if PTO2_PROFILING + // Emit an EarlyDispatch bar so a staging-dominated iteration is attributed + // to early-dispatch rather than disappearing into a blank gap. + if (early_dispatch_record && staged_count > 0) { + uint64_t early_dispatch_t1 = get_sys_cnt_aicpu(); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::EarlyDispatch, early_dispatch_t0, early_dispatch_t1, + sched_l2_swimlane_[thread_idx].sched_loop_count, static_cast(staged_count) + ); + // prepare_block_for_dispatch bumped phase_dispatch_count while staging; + // those blocks belong to this EarlyDispatch bar, so clear the counter + // before it leaks into the next Dispatch bar. + sched_l2_swimlane_[thread_idx].phase_dispatch_count = 0; + // Advance _t0_phase so the following second-poll's Complete bar + // starts at the EarlyDispatch end, not before it (otherwise their + // spans overlap and the outer-phase mutual-exclusion breaks). + _t0_phase = early_dispatch_t1; + } +#endif + + // Second completion poll. 
dispatch_ready_tasks + try_speculative_early_dispatch + // above can take several us in a busy window; a producer block that FINs + // during them would otherwise wait for the NEXT iteration's top-of-loop + // Phase-1 poll (the ~7us detection latency that delays a flagged + // producer's doorbell). Re-polling here observes those FINs immediately, + // so the doorbell fires this iteration. Idempotent (the poll is a poll); + // we drain deferred releases eagerly to keep the buffer from growing. +#if PTO2_PROFILING + uint64_t complete2_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; +#endif + if (tracker.has_any_running_cores()) { + int32_t completed_2nd = 0; + check_running_cores_for_completion( + thread_idx, hank, completed_2nd, cur_thread_completed, made_progress, deferred_release_slot_states, + deferred_release_count, local_bufs + ); + if (completed_2nd > 0) { +#if PTO2_SCHED_PROFILING + sched_->tasks_completed.fetch_add(completed_2nd, std::memory_order_relaxed); +#endif + completed_tasks_.fetch_add(completed_2nd, std::memory_order_relaxed); + last_progress_count = completed_tasks_.load(std::memory_order_relaxed); + } + // Eager drain so the second poll can't push deferred_release toward + // its cap between idle iterations. + while (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP - 96) { +#if PTO2_SCHED_PROFILING + (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + } +#if PTO2_PROFILING + // Complete2 outer phase: covers second-poll FIN observation. Without + // this emit, FIN counts from the second poll would carry over into the + // next iter's first-Complete bar and be displayed with a span that + // doesn't actually include those FINs' timestamps (visible mismatch + // between Complete bar span and per-task finish_time in Worker / + // Scheduler View). 
+ if (complete2_t0 != 0 && (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) { + uint64_t complete2_t1 = get_sys_cnt_aicpu(); + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_local_snapshot(phase_end_local); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Complete, complete2_t0, complete2_t1, + l2_swimlane.sched_loop_count, l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count, + /*pop_hit=*/0, + /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + } + _t0_phase = complete2_t1; + l2_swimlane.phase_complete_count = 0; + l2_swimlane.phase_subretire_count = 0; + } + + // Cycle-counter LAP for the iter tail. Dispatch's emit moved earlier + // (see Phase 4 above) so this branch only routes the time accumulator. + if (!try_pushed) { + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); + } else { + CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle); + } +#endif + +#if !PTO2_PROFILING + (void)try_completed; + (void)try_pushed; +#endif + + if (made_progress) { + idle_iterations = 0; + last_progress_ts = get_sys_cnt_aicpu(); + } else { +#if PTO2_PROFILING + uint64_t rel_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && deferred_release_count > 0) ? + get_sys_cnt_aicpu() : + 0; +#endif + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } +#if PTO2_PROFILING + // Release is a distinct operation from the poll scan — emit it with + // its own span (Perfetto nests it inside the surrounding poll/idle + // run by time-containment) rather than competing with poll for one + // per-iteration label. 
+ if (rel_t0 != 0) { + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Release, rel_t0, get_sys_cnt_aicpu(), + l2_swimlane.sched_loop_count, /*tasks_processed=*/0 + ); + } +#endif + idle_iterations++; + + if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) { + LoopAction action = check_idle_fatal_error(thread_idx, header, runtime); + if (action == LoopAction::BREAK_LOOP) break; + } + + if (idle_iterations % STALL_LOG_INTERVAL == 0) { + log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count); + } + // Wall-clock budget gate, with two fatal-latch branches: + // + // 1. Self owns a RUNNING task — first-hand evidence the + // dispatch is stuck. Latch. + // 2. No thread anywhere owns a RUNNING task AND tasks remain + // unfinished — the system is in a pre-dispatch / WAIT-only + // deadlock (e.g. dependency cycle). Ownerless idle threads + // are the only observers; let this one latch on the global + // evidence (`completed_tasks_ < total_tasks_` and + // `no_thread_owns_running_task()`). + // + // Otherwise: a sibling thread owns a RUNNING task but hasn't + // hit its own budget yet (typical distributed startup-skew + // case) — refresh last_progress_ts and keep spinning. The + // STALL diagnostic above still fires periodically so + // observability is preserved. + if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) { + bool self_owns = self_owns_running_task(thread_idx); + bool global_stuck = !self_owns && total_tasks_ > 0 && + completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && + no_thread_owns_running_task(); + if (self_owns || global_stuck) { + // Latch the error + emergency_shutdown, then break to the + // shared end-of-loop cleanup so the diagnostic buffers get + // flushed to the host. 
An early return here would strand the + // stuck task's already-dumped inputs and every completed + // task's in/out records in the unflushed per-thread dump + // buffer — exactly the state we need to triage the hang. + timeout_rc = handle_timeout_exit( + thread_idx, header, runtime, idle_iterations, last_progress_count +#if PTO2_PROFILING + , + l2_swimlane.sched_start_ts +#endif + ); + break; + } + last_progress_ts = get_sys_cnt_aicpu(); + } + SPIN_WAIT_HINT(); +#if PTO2_PROFILING + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); + // _t0_phase advances through idle laps so the next emitted + // COMPLETE/DISPATCH bar starts at the iter it actually ran in, not + // at the start of the preceding idle stretch. The idle/poll time + // itself is attributed by the activity-fill below — no blanks. + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + _t0_phase = _t1; + } +#endif + } + } + + // Drain any entries left in the deferred-release batch. The in-loop flush + // only fires on idle iterations and on buffer-full; a loop exit while the + // last iteration made progress can leave entries un-released. Drop them + // here so every consumed producer slot completes its on_task_release + // regardless of which loop-exit path fired. + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + +#if PTO2_PROFILING + // Final-drain: emit any pop_hit / pop_miss accrued since the last + // dispatch emit (typically the trailing idle loops while waiting for + // orchestrator_done_) as a zero-duration synthetic dispatch record so + // sum(record.pop_*) reconciles with the run-cumulative counter. + // Gate on SCHED_PHASES — at lower levels the phase buffer is never + // flushed (see below), so writing this record would be wasted work. 
+ if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; + debug_assert(final_pop_hit_delta < (1ULL << 32)); + debug_assert(final_pop_miss_delta < (1ULL << 32)); + if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) { + uint64_t t_now = get_sys_cnt_aicpu(); + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_phase_end(phase_end_local, phase_end_shared); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0, + static_cast(final_pop_hit_delta), static_cast(final_pop_miss_delta), + phase_end_local, phase_end_shared, phase_end_local, phase_end_shared + ); + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; + } + } + log_l2_swimlane_summary(thread_idx, cur_thread_completed); +#endif + +#if PTO2_PROFILING + if (l2_swimlane.l2_swimlane_enabled) { + l2_swimlane_aicpu_flush( + thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num() + ); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx); + } + } +#endif +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + dump_args_flush(thread_idx); + } +#endif +#if PTO2_PROFILING + if (is_pmu_enabled()) { + pmu_aicpu_flush_buffers( + thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num() + ); + } +#endif + + return timeout_rc != 0 ? 
timeout_rc : cur_thread_completed; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h new file mode 100644 index 000000000..f1dc5d7f8 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h @@ -0,0 +1,468 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +#ifndef SCHEDULER_TYPES_H +#define SCHEDULER_TYPES_H + +#include +#include + +#include "common/core_type.h" +#include "common/platform_config.h" +#include "pto_runtime2_types.h" +#include "spin_hint.h" + +// ============================================================================= +// Profiling macros (compile-time gated) +// ============================================================================= + +#if PTO2_PROFILING +#include "aicpu/device_time.h" +// Accumulates get_sys_cnt_aicpu() deltas per sub-step, in raw counter ticks (NOTE(review): presumably PLATFORM_PROF_SYS_CNT_FREQ ticks/s rather than nanoseconds - confirm). CYCLE_COUNT_START declares _t0/_t1 in the enclosing scope; CYCLE_COUNT_LAP adds (_t1 - _t0) to acc and restarts the lap. +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) +#else +#define CYCLE_COUNT_START() +#define CYCLE_COUNT_LAP(acc) +#endif + +// ============================================================================= +// Scheduler constants +// ============================================================================= + +constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; + +// Periodic cadence (in idle iterations) for emitting the per-thread STALL +// diagnostic while no progress is being made. Purely an observability knob, +// independent of the wall-clock timeout below: small enough to fire a few times +// before the budget expires, large enough not to flood device_log. +constexpr int32_t STALL_LOG_INTERVAL = 480000; +constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024; // Check orchestrator error every N idle iters + +// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces +// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS +// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread +// diagnostic cadence. 
+// +// Using wall-clock here is load-bearing for distributed runs: with per-thread +// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in +// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the +// same iteration count. The fast spinner racing ahead and latching fatal +// kills the slower-but-correct poller mid-poll — see the distributed +// startup-skew scenario in issue #897. +// +// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h) +// because the safe value differs per variant: onboard trims it to 2 s so the +// AICPU detects a hang and flushes its diagnostics (tensor dump, in-flight +// partial output) before STARS reaps the op and poisons the context (chain: +// this < op-exec < host stream-sync, platform_config.h); sim has no STARS to +// race and keeps the full 5 s #897 headroom. See spin_hint.h for the per-variant +// rationale. +constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS; +constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES = + static_cast(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000); // budget in system-counter ticks: ms * ticks-per-ms; the division truncates if the frequency is not a multiple of 1000 +constexpr int32_t STALL_DUMP_READY_MAX = 8; +constexpr int32_t STALL_DUMP_WAIT_MAX = 4; +constexpr int32_t STALL_DUMP_CORE_MAX = 8; +constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks +constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold + +// ============================================================================= +// Control flow signal from cold-path helpers back to the main dispatch loop. 
+// ============================================================================= + +enum class LoopAction : int8_t { + NONE, // cold path did not trigger; proceed normally + BREAK_LOOP, // equivalent to 'break' from the while(true) loop +}; + +// ============================================================================= +// Per-core state: one cache line per core to eliminate false sharing +// and co-locate all hot-path fields for minimal cache misses. +// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup). +// ============================================================================= + +struct alignas(64) CoreExecState { + // --- Hot fields (completion + dispatch, every iteration) --- + uint64_t reg_addr; // offset 0: register base address (set once in handshake) + PTO2TaskSlotState *running_slot_state; // offset 8: slot state for running task (nullptr = empty) + PTO2TaskSlotState *pending_slot_state; // offset 16: slot state for pending task (nullptr = empty) + int32_t running_reg_task_id; // offset 24: register task ID (AICPU_TASK_INVALID = idle) + int32_t pending_reg_task_id; // offset 28: pending register task ID (AICPU_TASK_INVALID = none) + uint32_t dispatch_seq; // offset 32: monotonic dispatch counter + PTO2SubtaskSlot running_subslot; // offset 36: which subtask slot is running + PTO2SubtaskSlot pending_subslot; // offset 37: which subtask slot is pending + uint8_t pad0_[2]; // offset 38: alignment padding + // Precomputed COND register pointer; resolved once in handshake so the + // hot completion poll does a single volatile load instead of recomputing + // reg_base + reg_offset(COND) on every iteration. 
+ volatile uint32_t *cond_ptr; // offset 40: precomputed pointer to COND register +#if PTO2_PROFILING + // --- Profiling fields (dispatch path, compile-time gated); these occupy bytes 48-63 in place of the cold fields below --- + uint64_t running_dispatch_timestamp; // offset 48: AICPU dispatch timestamp for running task + uint64_t pending_dispatch_timestamp; // offset 56: AICPU dispatch timestamp for pending task +#else + // --- Cold fields (init/diagnostics only, never in hot path); declared only when PTO2_PROFILING is off, so code reading them must be gated the same way --- + int32_t worker_id; // offset 48: index in runtime.workers[] + uint32_t physical_core_id; // offset 52: hardware physical core ID + CoreType core_type; // offset 56: AIC or AIV (enum class : int32_t) + uint8_t pad2_[4]; // offset 60: pad to 64 bytes +#endif +}; +static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line"); + +// ============================================================================= +// CoreTracker: cluster-based bitmask tracker for idle/running core state. +// +// core_states_ encodes per-cluster core idle/running in 3 bits per cluster: +// bit i*3 = AIC of cluster i (1 = idle, 0 = running) +// bit i*3+1 = AIV0 of cluster i +// bit i*3+2 = AIV1 of cluster i +// Max 21 clusters per tracker (63 bits in uint64_t). 
+// ============================================================================= + +class alignas(64) CoreTracker { +public: + static constexpr int32_t MAX_CORE_PER_THREAD = 63; // compile-time constant (was a mutable static): 63 state bits fit one uint64_t word + static constexpr int32_t MAX_CLUSTERS = 63 / 3; // 3 bits (AIC, AIV0, AIV1) per cluster + +public: + CoreTracker() = default; + + class BitStates { + public: + BitStates() = default; + + explicit BitStates(uint64_t states) : + states_(states) {} + void init() { states_ = 0; } + + BitStates operator~() const { return BitStates(~states_); } + BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); } + BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); } + BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); } + BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); } + BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); } + void operator&=(const BitStates &other) { states_ &= other.states_; } + void operator|=(const BitStates &other) { states_ |= other.states_; } + void operator^=(const BitStates &other) { states_ ^= other.states_; } + + bool has_value() const { return states_ > 0; } + int32_t count() const { return __builtin_popcountll(states_); } + void clear_bit(int32_t offset) { states_ &= ~(1ULL << offset); } + + // Extract the lowest set bit from mask, clear it, and return its position. + // Returns -1 if mask is empty. 
+ int32_t pop_first() { + if (states_ == 0) return -1; + int32_t pos = __builtin_ctzll(states_); + states_ &= states_ - 1; + return pos; + } + + private: + uint64_t states_{0}; + }; + +public: + void init(int32_t cluster_count) { + cluster_count_ = cluster_count; + aic_mask_.init(); + aiv_mask_.init(); + pending_occupied_.init(); + for (int32_t i = 0; i < cluster_count; i++) { + aic_mask_ |= BitStates(1ULL << (i * 3)); + aiv_mask_ |= BitStates(6ULL << (i * 3)); + } + core_states_ = aic_mask_ | aiv_mask_; + } + + void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) { + core_id_map_[cluster_idx * 3] = aic_wid; + core_id_map_[cluster_idx * 3 + 1] = aiv0_wid; + core_id_map_[cluster_idx * 3 + 2] = aiv1_wid; + } + + int32_t get_cluster_count() const { return cluster_count_; } + + // --- Running core queries --- + + template <CoreType CT> + bool has_running_cores() const { + if constexpr (CT == CoreType::AIC) { + return ((~core_states_) & aic_mask_).has_value(); + } else { + return ((~core_states_) & aiv_mask_).has_value(); + } + } + + bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); } + + template <CoreType CT> + int32_t get_running_count() const { + if constexpr (CT == CoreType::AIC) { + return ((~core_states_) & aic_mask_).count(); + } else { + return ((~core_states_) & aiv_mask_).count(); + } + } + + // Return an opaque bitmask for iterating running cores of a given type. + // Use pop_first() to extract core bit offsets one at a time. 
+ template <CoreType CT> + BitStates get_running_cores() const { + if constexpr (CT == CoreType::AIC) { + return (~core_states_) & aic_mask_; + } else { + return (~core_states_) & aiv_mask_; + } + } + + BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); } + BitStates get_cluster_offset_states() const { return aic_mask_; } + + // --- Cluster matching --- + + BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const { + switch (shape) { + case PTO2ResourceShape::AIC: + return core_states_ & aic_mask_; + case PTO2ResourceShape::AIV: + return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_; + case PTO2ResourceShape::MIX: + return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_; + case PTO2ResourceShape::DUMMY: + // DUMMY tasks never reach the core-tracker dispatch path; they are + // completed inline by resolve_and_dispatch via dummy_ready_queue. + return BitStates(0ULL); + } + return BitStates(0ULL); + } + + int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; } + int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; } + int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; } + + int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; } + int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; } + int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; } + + bool is_aic_core_idle(int32_t cluster_offset) const { + return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value(); + } + bool is_aiv0_core_idle(int32_t cluster_offset) const { + return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value(); + } + bool is_aiv1_core_idle(int32_t cluster_offset) const { + return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value(); + } + + // --- State 
mutation --- + + // Toggle bit at the given bit offset (running <-> idle) + void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); } + + // --- Pending-occupied tracking --- + // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK). + // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed. + + void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); } + void clear_pending_occupied(int32_t bit_offset) { + pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset)); + } + + // --- Two-phase dispatch queries --- + + // Idle dispatch: returns bit offsets of idle cores for the given shape. + // For AIC: 1 bit per cluster (core offset == cluster offset). + // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions). + // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1) + // always have pending_occupied=0, so AIV/MIX need no extra filtering. + // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core + // would incorrectly block AIV idle dispatch on the same cluster. + BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const { + if (shape == PTO2ResourceShape::AIC) { + return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_); + } + if (shape == PTO2ResourceShape::AIV) { + return core_states_ & aiv_mask_; + } + return get_valid_cluster_offset_states(shape); // MIX: cluster-level + } + + // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch. + // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions). + // Runtime MIX dispatch uses classify_mix_cluster() so the decision follows the task's active_mask. 
+ enum class MixPlacement : uint8_t { RUNNING, PENDING, REJECT }; + + // A MIX block must place all cores named by active_mask the same way: + // all idle means running placement, all running means pending placement, + // and any mixed state is retried later. + MixPlacement classify_mix_cluster(int32_t cluster_offset, uint8_t core_mask) const { + BitStates used(0ULL); + if (core_mask & PTO2_SUBTASK_MASK_AIC) { + used |= BitStates(1ULL << cluster_offset); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV0) { + used |= BitStates(1ULL << (cluster_offset + 1)); + } + if (core_mask & PTO2_SUBTASK_MASK_AIV1) { + used |= BitStates(1ULL << (cluster_offset + 2)); + } + if (!used.has_value() || (pending_occupied_ & used).has_value()) { + return MixPlacement::REJECT; + } + + BitStates idle = core_states_ & used; + if (idle.count() == used.count()) { + return MixPlacement::RUNNING; + } + if (!idle.has_value()) { + return MixPlacement::PENDING; + } + return MixPlacement::REJECT; + } + + BitStates get_mix_running_cluster_offset_states(uint8_t core_mask) const { + BitStates result(0ULL); + BitStates candidates = get_cluster_offset_states(); + while (candidates.has_value()) { + int32_t cluster_offset = candidates.pop_first(); + if (classify_mix_cluster(cluster_offset, core_mask) == MixPlacement::RUNNING) { + result |= BitStates(1ULL << cluster_offset); + } + } + return result; + } + + int32_t count_mix_running_clusters(uint8_t core_mask) const { + return get_mix_running_cluster_offset_states(core_mask).count(); + } + + BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const { + if (shape == PTO2ResourceShape::MIX) { + // Shape-level query kept conservative for legacy callers/tests. + // The real MIX dispatch path applies active_mask in classify_mix_cluster(). + // Any core without a pending payload can accept a dispatch (idle or running). 
+ BitStates available = ~pending_occupied_; + BitStates mix_available = + (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_); + // Pending MIX can only reuse a fully-running cluster. Partially-running clusters + // could split one MIX block across immediate and pending placement. + BitStates running = ~core_states_; + BitStates cluster_all_running = + (running & aic_mask_) & ((running >> 1) & aic_mask_) & ((running >> 2) & aic_mask_); + return mix_available & cluster_all_running; + } + if (shape == PTO2ResourceShape::AIC) { + return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_); + } + // AIV + return (~core_states_) & aiv_mask_ & ~pending_occupied_; + } + + // --- Two-phase dispatch unified query --- + + enum class DispatchPhase : uint8_t { IDLE, PENDING }; + + BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const { + return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) : + get_pending_core_offset_states(shape); + } + + // --- Bit offset <-> worker_id mapping --- + + int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; } + + const int32_t *core_ids() const { return core_id_map_; } + int32_t core_num() const { return cluster_count_ * 3; } + +private: + int32_t cluster_count_{0}; // zero until init() runs, so core_num()/get_cluster_count() are defined on a default-initialized tracker + BitStates aic_mask_; + BitStates aiv_mask_; + BitStates core_states_; + BitStates pending_occupied_; + int32_t core_id_map_[63]; // bit_position -> worker_id, max 21 clusters * 3 +}; + +// ============================================================================= +// SlotTransition: pure event signals from a single register poll. +// true = event occurred, false = no-op (maintain current state). 
+// ============================================================================= + +struct SlotTransition { + bool running_done = false; // running task completed + bool pending_done = false; // pending task completed + bool running_freed = false; // running slot data should be released + bool pending_freed = false; // pending_occupied can be cleared + bool matched = false; // some case was hit (otherwise skip apply) +}; + +// ============================================================================= +// Profiling counters (compile-time gated) +// ============================================================================= + +#if PTO2_PROFILING +struct alignas(64) SchedL2SwimlaneCounters { + bool l2_swimlane_enabled{false}; + uint64_t sched_start_ts{0}; + uint64_t sched_complete_cycle{0}; + uint64_t sched_dispatch_cycle{0}; + uint64_t sched_wiring_cycle{0}; + uint64_t sched_idle_cycle{0}; + uint64_t sched_loop_count{0}; + uint32_t phase_complete_count{0}; + // Sub-block retires that did NOT finish a slot (SPMD blocks of a multi-block + // task retiring one at a time). Counted separately so the Complete-phase + // emit can fire on poll iterations that only retired sub-blocks — otherwise + // the serial-harvest tail of an SPMD slot is invisible (no slot completes + // until the last block, leaving the scheduler lane blank for that window). + uint32_t phase_subretire_count{0}; + uint32_t phase_dispatch_count{0}; + // Per-emit delta is (current - *_at_last_emit). Accumulated only when + // l2_swimlane_level_ >= SCHED_PHASES. 
+ uint64_t pop_hit{0}; + uint64_t pop_miss{0}; + uint64_t pop_hit_at_last_emit{0}; + uint64_t pop_miss_at_last_emit{0}; +#if PTO2_SCHED_PROFILING + uint32_t phase_wiring_count{0}; + uint64_t complete_probe_count{0}; + uint64_t complete_hit_count{0}; + uint64_t sched_complete_perf_cycle{0}; + uint64_t sched_dispatch_pop_cycle{0}; + uint64_t sched_dispatch_setup_cycle{0}; +#endif + void reset() { *this = SchedL2SwimlaneCounters{}; } // restores every counter to its default member initializer +}; +#endif + +// ============================================================================= +// sync_start drain coordination +// ============================================================================= + +// When sync_start_pending != 0, all scheduler threads skip dispatch +// (only process completions) until the drain worker finishes launching all blocks. +struct alignas(64) SyncStartDrainState { + std::atomic sync_start_pending{0}; // 0=normal; -1=initializing; >0=active (value=block_num) + std::atomic drain_worker_elected{0}; // 0=none; >0: elected thread's (thread_idx+1) + std::atomic drain_ack_mask{0}; // bit per thread; all-set = all threads reached ack barrier + std::atomic pending_task{nullptr}; // held task (not re-queued) + int32_t _pad[10]; // explicit tail padding; the total size is pinned to 64 bytes by the static_assert below +}; +static_assert(sizeof(SyncStartDrainState) == 64); + +#endif // SCHEDULER_TYPES_H diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp new file mode 100644 index 000000000..0ee5919ce --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp @@ -0,0 +1,466 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Host/AICPU shared runtime-arena layout, init_data and wire implementations. + * + * Lives under runtime/shared/ so it is included in both the host_runtime.so + * build (host pre-populates the prebuilt arena image) and the aicpu_runtime + * build (AICPU runs wire_arena_pointers + destroy after attach). The + * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp + * (ops table, scope/submit/dispatch business logic, profiling) stay in their + * original files and the aicpu build only. + */ + +#include +#include + +#include + +#include "pto_orchestrator.h" +#include "pto_runtime2.h" +#include "pto_ring_buffer.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "scheduler/pto_scheduler.h" + +static bool sum_ring_heap_sizes(const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], uint64_t *total) { + uint64_t sum = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (heap_sizes[r] > std::numeric_limits::max() - sum) { + LOG_ERROR("Total ring heap size overflows uint64_t"); + return false; + } + sum += heap_sizes[r]; + } + *total = sum; + return true; +} + +// ============================================================================= +// Ready queue +// ============================================================================= + +size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) { + // Align the slots[] base to a full cache line so MPMC CAS traffic on the + // first slot cannot false-share with whatever region sits in front of us + // (e.g. 
orchestrator tensormap heads written by the orch thread). + return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE); +} + +bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) { + // Address the slots region for data writes without storing the pointer in + // queue->slots — that field is set by ready_queue_wire_arena_pointers. + auto *slots_arena = static_cast(arena.region_ptr(slots_off)); + queue->capacity = capacity; + queue->mask = capacity - 1; + queue->enqueue_pos.store(0, std::memory_order_relaxed); + queue->dequeue_pos.store(0, std::memory_order_relaxed); + + for (uint64_t i = 0; i < capacity; i++) { + slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed); + slots_arena[i].slot_state = nullptr; + } + + return true; +} + +void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) { + queue->slots = static_cast(arena.region_ptr(slots_off)); +} + +void ready_queue_destroy(PTO2ReadyQueue *queue) { + // Arena owns the slots[] buffer; just forget the pointer. + queue->slots = nullptr; +} + +// ============================================================================= +// Scheduler +// ============================================================================= + +bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) { + // ring stores the device address of the SM ring header — pure offset + // arithmetic, no SM load. 
+ ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id); + last_task_alive = 0; + advance_lock.store(0, std::memory_order_relaxed); +#if PTO2_PROFILING + dep_pool_snapshot_tail.store(1, std::memory_order_relaxed); + dep_pool_snapshot_top.store(1, std::memory_order_relaxed); +#endif + + // Per-slot SM-side initialization (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle:: + // init_header_per_ring so the AICPU performs it during SM reset; host + // prebuilt-arena init skips SM access here. + + return true; +} + +void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; } + +PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) { + int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + dep_pool_capacities[r] = dep_pool_capacity; + } + return reserve_layout(arena, dep_pool_capacities); +} + +PTO2SchedulerLayout +PTO2SchedulerState::reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]) { + PTO2SchedulerLayout layout{}; + layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE; + layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + layout.dep_pool_capacities[r] = dep_pool_capacities[r]; + } + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + } + layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE); + layout.off_early_dispatch_queue_slots = ready_queue_reserve_layout(arena, PTO2_EARLY_DISPATCH_QUEUE_SIZE); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + // Force a cache-line base so writes from scheduler thread 0 (sole + // writer of this ring's dep_pool) do not invalidate adjacent + // multi-threaded regions like ready_queue.slots. 
+ layout.off_dep_pool_entries[r] = + arena.reserve(static_cast(dep_pool_capacities[r]) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE); + } + layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE); + return layout; +} + +bool PTO2SchedulerState::init_data_from_layout( + const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base +) { + PTO2SchedulerState *sched = this; + sched->sm_header = reinterpret_cast(sm_dev_base); +#if PTO2_SCHED_PROFILING + sched->tasks_completed.store(0, std::memory_order_relaxed); + sched->tasks_consumed.store(0, std::memory_order_relaxed); +#endif + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) { + return false; + } + } + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + if (!ready_queue_init_data_from_layout( + &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity + )) { + return false; + } + } + if (!ready_queue_init_data_from_layout( + &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity + )) { + return false; + } + if (!ready_queue_init_data_from_layout( + &sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots, PTO2_EARLY_DISPATCH_QUEUE_SIZE + )) { + return false; + } + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + memset(dep_entries, 0, static_cast(layout.dep_pool_capacities[r]) * sizeof(PTO2DepListEntry)); + sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacities[r], orch_err); + } + + if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) { + return false; + } + sched->wiring.batch_count = 0; + sched->wiring.batch_index = 0; + sched->wiring.backoff_counter = 0; + 
+ return true; +} + +void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) { + PTO2SchedulerState *sched = this; + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]); + } + ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots); + ready_queue_wire_arena_pointers(&sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].dep_pool.base = + static_cast(arena.region_ptr(layout.off_dep_pool_entries[r])); + } + sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer); +} + +void PTO2SchedulerState::destroy() { + PTO2SchedulerState *sched = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + sched->ring_sched_states[r].destroy(); + sched->ring_sched_states[r].dep_pool.base = nullptr; + } + sched->wiring.queue.destroy(); + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + ready_queue_destroy(&sched->ready_queues[i]); + } + ready_queue_destroy(&sched->dummy_ready_queue); + ready_queue_destroy(&sched->early_dispatch_queue); +} + +// ============================================================================= +// Orchestrator +// ============================================================================= + +PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity +) { + int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + dep_pool_capacities[r] = dep_pool_capacity; + } + return reserve_layout(arena, task_window_sizes, dep_pool_capacities); +} + +PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout( + DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const int32_t 
dep_pool_capacities[PTO2_MAX_RING_DEPTH] +) { + PTO2OrchestratorLayout layout{}; + layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP; + layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + layout.dep_pool_capacities[r] = dep_pool_capacities[r]; + } + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + const size_t fanin_pool_bytes = + PTO2_ALIGN_UP(static_cast(dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE); + layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE); + + always_assert(task_window_sizes[r] > 0 && (task_window_sizes[r] & (task_window_sizes[r] - 1)) == 0); + const size_t seen_epoch_bytes = + PTO2_ALIGN_UP(static_cast(task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE); + layout.off_fanin_seen_epoch[r] = arena.reserve(seen_epoch_bytes, PTO2_ALIGN_SIZE); + } + layout.off_scope_tasks = + arena.reserve(static_cast(layout.scope_tasks_cap) * sizeof(uintptr_t), alignof(PTO2TaskSlotState *)); + layout.off_scope_begins = + arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t)); + layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes); + return layout; +} + +bool PTO2OrchestratorState::init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, + uint64_t task_window_size +) { + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + heap_sizes[r] = heap_size; + task_window_sizes[r] = task_window_size; + } + return init_data_from_layout(layout, arena, sm_dev_base, gm_heap, heap_sizes, task_window_sizes); +} + +bool PTO2OrchestratorState::init_data_from_layout( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t 
task_window_sizes[PTO2_MAX_RING_DEPTH] +) { + auto *orch = this; + *orch = PTO2OrchestratorState{}; + + orch->sm_header = reinterpret_cast(sm_dev_base); + orch->gm_heap_base = gm_heap; + uint64_t total_heap_size = 0; + if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) { + return false; + } + orch->gm_heap_size = total_heap_size; + orch->fatal = false; + + auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base); + uint64_t heap_offset = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + void *ring_heap_base = reinterpret_cast(gm_heap) + heap_offset; + auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r); + auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r); + auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r); + + orch->rings[r].task_allocator.init( + task_descs_dev, static_cast(task_window_sizes[r]), cur_idx_dev, last_alive_dev, ring_heap_base, + heap_sizes[r], orch_err + ); + heap_offset += heap_sizes[r]; + + const size_t fanin_pool_bytes = PTO2_ALIGN_UP( + static_cast(layout.dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE + ); + auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + memset(fanin_entries, 0, fanin_pool_bytes); + orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacities[r], orch_err); + + const size_t seen_epoch_bytes = PTO2_ALIGN_UP( + static_cast(layout.tensor_map.task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE + ); + auto *seen_epoch = static_cast(arena.region_ptr(layout.off_fanin_seen_epoch[r])); + memset(seen_epoch, 0, seen_epoch_bytes); + orch->fanin_seen_epoch[r] = seen_epoch; + } + + if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) { + return false; + } + + orch->scope_tasks_size = 0; + orch->scope_tasks_capacity = layout.scope_tasks_cap; + orch->scope_stack_top = -1; + orch->scope_stack_capacity = 
layout.scope_stack_capacity; + orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; + + return true; +} + +void PTO2OrchestratorState::wire_arena_pointers( + const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg +) { + auto *orch = this; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = static_cast(arena.region_ptr(layout.off_fanin_pool[r])); + orch->fanin_seen_epoch[r] = static_cast(arena.region_ptr(layout.off_fanin_seen_epoch[r])); + } + orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena); + orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks)); + orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins)); + orch->scheduler = scheduler_arg; +} + +void PTO2OrchestratorState::destroy() { + auto *orch = this; + orch->tensor_map.destroy(); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + orch->rings[r].fanin_pool.base = nullptr; + orch->fanin_seen_epoch[r] = nullptr; + } + orch->scope_tasks = nullptr; + orch->scope_begins = nullptr; +} + +void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; } + +// ============================================================================= +// Top-level runtime arena +// ============================================================================= + +PTO2RuntimeArenaLayout +runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; + int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + heap_sizes[r] = 0; + dep_pool_capacities[r] = dep_pool_capacity; + } + return runtime_reserve_layout(arena, task_window_sizes, heap_sizes, dep_pool_capacities); +} + +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, const uint64_t 
task_window_sizes[PTO2_MAX_RING_DEPTH], + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH] +) { + PTO2RuntimeArenaLayout layout{}; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + layout.task_window_sizes[r] = task_window_sizes[r]; + layout.heap_sizes[r] = heap_sizes[r]; + layout.dep_pool_capacities[r] = dep_pool_capacities[r]; + } + + layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); + int32_t task_window_sizes_i32[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes_i32[r] = static_cast(task_window_sizes[r]); + } + layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities); + layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities); + layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE); + layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox)); + + layout.arena_size = arena.total_size(); + return layout; +} + +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, + uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size +) { + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + heap_sizes[r] = heap_size; + } + return runtime_init_data_from_layout(arena, layout, mode, sm_dev_base, 0, gm_heap_dev_base, heap_sizes); +} + +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, + uint64_t /*sm_size*/, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] +) { + PTO2Runtime *rt = static_cast(arena.region_ptr(layout.off_runtime)); + memset(rt, 0, sizeof(*rt)); + + auto *sm_wrap = static_cast(arena.region_ptr(layout.off_sm_handle)); + 
memset(sm_wrap, 0, sizeof(*sm_wrap)); + + // rt->ops is filled by the AICPU at boot. + rt->mode = mode; + rt->gm_heap = gm_heap_dev_base; + uint64_t total_heap_size = 0; + if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) { + return nullptr; + } + rt->gm_heap_size = total_heap_size; + rt->gm_heap_owned = false; + rt->total_cycles = 0; + + if (!rt->orchestrator.init_data_from_layout( + layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes, layout.task_window_sizes + )) { + return nullptr; + } + if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) { + return nullptr; + } + + auto *mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + memset(mailbox, 0, sizeof(*mailbox)); + + return rt; +} + +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) { + rt->sm_handle = static_cast(arena.region_ptr(layout.off_sm_handle)); + rt->aicore_mailbox = static_cast(arena.region_ptr(layout.off_mailbox)); + rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler); + rt->scheduler.wire_arena_pointers(layout.sched, arena); +} + +void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) { + // Arena buffer is pooled across runs by DeviceRunner — never freed here. + if (!rt) return; + rt->scheduler.destroy(); + rt->orchestrator.destroy(); + rt->aicore_mailbox = nullptr; + rt->sm_handle = nullptr; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp new file mode 100644 index 000000000..d704bd85d --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Shared Memory Implementation + * + * Implements shared memory allocation, initialization, and management + * for Orchestrator-Scheduler communication. + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_shared_memory.h" +#include +#include +#include +#include "common/unified_log.h" + +// ============================================================================= +// Size Calculation +// ============================================================================= + +uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + } + return calculate_size_per_ring(task_window_sizes); +} + +uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { + uint64_t size = 0; + + // Header (aligned to cache line) + size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + + // Per-ring task descriptors and payloads + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + + return size; +} + +// 
============================================================================= +// Creation and Destruction +// ============================================================================= + +void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) { + char *ptr = (char *)sm_base; + + // Header + header = (PTO2SharedMemoryHeader *)ptr; + ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + + // Per-ring task descriptors, payloads, and slot states + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &ring = header->rings[r]; + ring.task_descriptors = (PTO2TaskDescriptor *)ptr; + ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + + ring.task_payloads = (PTO2TaskPayload *)ptr; + ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + + ring.slot_states = (PTO2TaskSlotState *)ptr; + ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } +} + +void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + } + setup_pointers_per_ring(task_window_sizes); +} + +bool PTO2SharedMemoryHandle::init( + void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size +) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + heap_sizes[r] = heap_size; + } + return init_per_ring(sm_base_arg, sm_size_arg, task_window_sizes, heap_sizes); +} + +bool PTO2SharedMemoryHandle::init_per_ring( + void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] +) { + if (!sm_base_arg || sm_size_arg == 0) return false; + 
if (sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false; + + sm_base = sm_base_arg; + sm_size = sm_size_arg; + is_owner = false; + setup_pointers_per_ring(task_window_sizes); + init_header_per_ring(task_window_sizes, heap_sizes); + return true; +} + +PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) { + const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE); + const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); + const size_t off_buffer = arena.reserve(static_cast(buffer_size), PTO2_ALIGN_SIZE); + if (arena.commit() == nullptr) return nullptr; + + auto *handle = static_cast(arena.region_ptr(off_handle)); + memset(handle, 0, sizeof(*handle)); + void *buffer = arena.region_ptr(off_buffer); + memset(buffer, 0, static_cast(buffer_size)); + if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr; + return handle; +} + +void PTO2SharedMemoryHandle::destroy() { + // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release(); + // calling destroy on them is a no-op so existing callers stay safe. 
+ if (is_owner && sm_base) { + free(sm_base); + free(this); + } +} + +// ============================================================================= +// Initialization +// ============================================================================= +// +// The pool data needs no initialization here; it is initialized when it is used. +void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + heap_sizes[r] = heap_size; + } + init_header_per_ring(task_window_sizes, heap_sizes); +} + +void PTO2SharedMemoryHandle::init_header_per_ring( + const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] +) { + // Per-ring flow control (start at 0) + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + header->rings[r].fc.init(); + } + + header->orchestrator_done.store(0, std::memory_order_relaxed); + + // Per-ring layout info + uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + header->rings[r].task_window_size = task_window_sizes[r]; + header->rings[r].task_window_mask = static_cast(task_window_sizes[r] - 1); + header->rings[r].heap_size = heap_sizes[r]; + header->rings[r].task_descriptors_offset = offset; + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + + header->total_size = sm_size; + header->graph_output_ptr.store(0, std::memory_order_relaxed); + header->graph_output_size.store(0, std::memory_order_relaxed); + + // Error reporting + header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); +
header->sched_error_bitmap.store(0, std::memory_order_relaxed); + header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); + header->sched_error_thread.store(-1, std::memory_order_relaxed); + + // Per-ring slot_states reset. Previously lived in + // PTO2SchedulerState::RingSchedState::init(), but it writes into + // ring->slot_states[] which is SM-side storage — keeping it here lets + // host-side prebuilt-arena init skip all SM dereferences. + // bind_ring() pins the ring_id (slot-invariant after this point); + // reset_for_reuse() prepares dynamic fanout/refcount fields so the first + // submit doesn't need an explicit reset. + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &ring = header->rings[r]; + for (uint64_t i = 0; i < task_window_sizes[r]; i++) { + ring.slot_states[i].bind_ring(static_cast(r)); + ring.slot_states[i].reset_for_reuse(); + ring.slot_states[i].fanin_count = 0; + ring.slot_states[i].active_mask = ActiveMask{}; + } + } +} + +// ============================================================================= +// Debug Utilities +// ============================================================================= + +void PTO2SharedMemoryHandle::print_layout() { + if (!header) return; + + PTO2SharedMemoryHeader *h = header; + + LOG_INFO_V0("=== PTO2 Shared Memory Layout ==="); + LOG_INFO_V0("Base address: %p", sm_base); + LOG_INFO_V0("Total size: %" PRIu64 " bytes", h->total_size); + LOG_INFO_V0("Ring depth: %d", PTO2_MAX_RING_DEPTH); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + LOG_INFO_V0("Ring %d:", r); + LOG_INFO_V0(" task_window_size: %" PRIu64, h->rings[r].task_window_size); + LOG_INFO_V0(" heap_size: %" PRIu64 " bytes", h->rings[r].heap_size); + LOG_INFO_V0( + " descriptors_off: %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset, + h->rings[r].task_descriptors_offset + ); + LOG_INFO_V0(" current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire)); + LOG_INFO_V0(" 
last_task_alive: %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire)); + } + LOG_INFO_V0("orchestrator_done: %d", h->orchestrator_done.load(std::memory_order_acquire)); + LOG_INFO_V0("Error state:"); + LOG_INFO_V0(" orch_error_code: %d", h->orch_error_code.load(std::memory_order_relaxed)); + LOG_INFO_V0(" sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed)); + LOG_INFO_V0(" sched_error_code: %d", h->sched_error_code.load(std::memory_order_relaxed)); + LOG_INFO_V0(" sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed)); + LOG_INFO_V0("================================"); +} + +bool PTO2SharedMemoryHandle::validate() { + if (!sm_base) return false; + if (!header) return false; + + PTO2SharedMemoryHeader *h = header; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!h->rings[r].fc.validate(this, r)) return false; + } + + return true; +} + +bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const { + if (!handle) return false; + if (!handle->header) return false; + if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false; + + const PTO2SharedMemoryHeader *h = handle->header; + + // Check that offsets are within bounds + if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false; + + // Check pointer alignment + if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false; + + // Check flow control pointer sanity + int32_t current = current_task_index.load(std::memory_order_acquire); + int32_t last_alive = last_task_alive.load(std::memory_order_acquire); + if (current < 0) return false; + if (last_alive < 0) return false; + + return true; +} diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp new file mode 100644 index 000000000..b99c67233 --- /dev/null +++ 
b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - TensorMap Implementation + * + * Implements TensorMap with ring buffer pool, lazy invalidation, + * and chain truncation optimization. + * + * Key features: + * 1. O(1) insert at bucket head + * 2. O(valid_entries) lookup with chain truncation + * 3. Automatic stale entry cleanup during lookup + * 4. 
Periodic explicit cleanup for long chains + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_tensormap.h" + +#include +#include + +#include "common.h" +#include "common/unified_log.h" + +// ============================================================================= +// TensorMap Lookup Chain Length Statistics (compile-time toggle) +// ============================================================================= +#if PTO2_TENSORMAP_PROFILING +uint64_t g_lookup_chain_total = 0; +uint64_t g_lookup_count = 0; +int32_t g_lookup_chain_max = 0; +uint64_t g_lookup_overlap_checks = 0; +uint64_t g_lookup_overlap_hits = 0; +uint64_t g_insert_count = 0; +#endif + +// ============================================================================= +// Initialization and Destruction +// ============================================================================= + +PTO2TensorMapLayout PTO2TensorMap::reserve_layout( + DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size, + const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH] +) { + // num_buckets must be a power of two for the hash truncation to work. 
+ always_assert(new_num_buckets > 0 && (new_num_buckets & (new_num_buckets - 1)) == 0); + + PTO2TensorMapLayout layout{}; + layout.num_buckets = new_num_buckets; + layout.pool_size = new_pool_size; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + layout.task_window_sizes[r] = new_task_window_sizes[r]; + } + + layout.off_buckets = arena.reserve( + static_cast(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *) + ); + layout.off_entry_pool = + arena.reserve(static_cast(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry)); + layout.off_free_entry_list = + arena.reserve(static_cast(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + layout.off_task_entry_heads[r] = arena.reserve( + static_cast(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *) + ); + } + return layout; +} + +PTO2TensorMapLayout +PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) { + return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes); +} + +bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { + num_buckets = layout.num_buckets; + pool_size = layout.pool_size; + + // Address arena regions for data writes; do not store these in struct + // fields (wire_arena_pointers does that). + auto *buckets_arena = static_cast(arena.region_ptr(layout.off_buckets)); + auto *entry_pool_arena = static_cast(arena.region_ptr(layout.off_entry_pool)); + auto *free_list_arena = static_cast(arena.region_ptr(layout.off_free_entry_list)); + + // buckets[]: empty == nullptr. + for (int32_t i = 0; i < num_buckets; i++) { + buckets_arena[i] = nullptr; + } + + // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
+ // The pool's persistent invariant after init is "bucket_index == -1 means + // not linked", set explicitly below. + memset(entry_pool_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); + for (int32_t i = 0; i < pool_size; i++) { + entry_pool_arena[i].bucket_index = -1; + entry_pool_arena[i].next_in_bucket = nullptr; + entry_pool_arena[i].prev_in_bucket = nullptr; + entry_pool_arena[i].next_in_task = nullptr; + entry_pool_arena[i].prev_in_task = nullptr; + entry_pool_arena[i].producer_task_id = PTO2TaskId{}; + } + + // free_entry_list: zeroed (was calloc'd before); contents become meaningful + // only after entries are freed back, so the body of the array stays as 0. + memset(free_list_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); + + next_entry_idx = 0; + free_num = 0; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto *heads_arena = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) { + heads_arena[i] = nullptr; + } + task_window_sizes[r] = layout.task_window_sizes[r]; + last_task_alives[r] = 0; + last_cleanup[r] = 0; + } + + return true; +} + +void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) { + buckets = static_cast(arena.region_ptr(layout.off_buckets)); + entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); + free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + } +} + +void PTO2TensorMap::destroy() { + // Arena owns the backing memory; here we only forget our pointers so any + // stray post-destroy access trips a nullptr dereference instead of reading + // a recycled allocation. 
+ buckets = nullptr; + entry_pool = nullptr; + free_entry_list = nullptr; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = nullptr; + } +} + +// ============================================================================= +// Debug Utilities +// ============================================================================= + +void PTO2TensorMap::print_stats() { + int32_t valid = 0; + int32_t stale = 0; + int32_t empty_buckets = 0; + int32_t max_chain = 0; + int64_t total_chain = 0; + int32_t non_empty_buckets = 0; + + // Count entries + for (int32_t i = 0; i < pool_size; i++) { + if (entry_pool[i].bucket_index != -1) { + if (entry_valid(entry_pool[i])) { + valid++; + } else { + stale++; + } + } + } + + // Count bucket stats + for (int32_t b = 0; b < num_buckets; b++) { + int32_t chain_len = 0; + auto cur_entry = buckets[b]; + + while (cur_entry != nullptr) { + chain_len++; + cur_entry = cur_entry->next_in_bucket; + } + + if (chain_len == 0) { + empty_buckets++; + } else { + non_empty_buckets++; + total_chain += chain_len; + if (chain_len > max_chain) { + max_chain = chain_len; + } + } + } + + LOG_INFO_V0("=== TensorMap Statistics ==="); + LOG_INFO_V0("Pool size: %d", pool_size); + LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx); + LOG_INFO_V0("Pool free_num: %d", free_num); + LOG_INFO_V0("Num buckets: %d", num_buckets); + LOG_INFO_V0("Valid entries: %d", valid); + LOG_INFO_V0("Stale entries: %d", stale); + LOG_INFO_V0("Empty buckets: %d", empty_buckets); + LOG_INFO_V0("Max chain len: %d", max_chain); + LOG_INFO_V0("Avg chain len: %.2f", non_empty_buckets > 0 ? 
(float)total_chain / non_empty_buckets : 0); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + LOG_INFO_V0("Last task alive[%d]: %d", r, last_task_alives[r]); + } + LOG_INFO_V0("============================"); +} + +int32_t PTO2TensorMap::valid_count() { + int32_t count = 0; + + for (int32_t i = 0; i < pool_size; i++) { + if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) { + count++; + } + } + + return count; +} + +void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) { + auto ring_id = task_id.ring(); + auto local_id = task_id.local(); + sync_validity(ring_id, sm_last_task_alive); + + // Only attempt cleanup when last_task_alive has actually advanced; + // otherwise cleanup_retired would empty-loop and we'd spin forever. + auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]); + if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) { + cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive); + last_cleanup[ring_id] = sm_last_task_alive; + } +} + +// ============================================================================= +// TensorMap Lookup Profiling +// ============================================================================= +#if PTO2_TENSORMAP_PROFILING +PTO2TensorMapProfilingData pto2_tensormap_get_profiling() { + PTO2TensorMapProfilingData d; + d.lookup_chain_total = g_lookup_chain_total; + d.lookup_count = g_lookup_count; + d.lookup_chain_max = g_lookup_chain_max; + d.overlap_checks = g_lookup_overlap_checks; + d.overlap_hits = g_lookup_overlap_hits; + d.insert_count = g_insert_count; + + // Reset + g_lookup_chain_total = 0; + g_lookup_count = 0; + g_lookup_chain_max = 0; + g_lookup_overlap_checks = 0; + g_lookup_overlap_hits = 0; + g_insert_count = 0; + return d; +} +#endif diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp 
b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp new file mode 100644 index 000000000..d19e52724 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Runtime Class - Implementation + * + * Device execution and handshake control. + * Task graph construction is handled by PTO2Runtime. + */ + +#include "runtime.h" + +#include "common/unified_log.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" + +// ============================================================================= +// Constructor +// ============================================================================= + +Runtime::Runtime() { + // NOTE: host_api is initialized in InitRuntime() (host-only code) + // because the CApi functions don't exist when compiled for device. 
+ + // Initialize handshake buffers + memset(workers, 0, sizeof(workers)); + worker_count = 0; + aicpu_thread_num = 1; + ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS; + orch_to_sched = false; + + // fully_distributed_within_core handoff fields + dist.core_main_fn = 0; + dist.go = 0; + dist.num_workers = 0; + dist.done_count = 0; + + // Initialize device orchestration state + gm_sm_ptr_ = nullptr; + gm_heap_ptr_ = nullptr; + slot_states_ptr_ = nullptr; + orch_args_storage_.clear(); + prebuilt_arena_base_ = nullptr; + prebuilt_runtime_offset_ = 0; + + // Initialize device orchestration SO binary + dev_orch_so_addr_ = 0; + dev_orch_so_size_ = 0; + active_callable_id_ = -1; + register_new_callable_id_ = false; + device_orch_func_name_[0] = '\0'; + device_orch_config_name_[0] = '\0'; + + // Initialize kernel binary tracking + registered_kernel_count_ = 0; + + // Initialize function address mapping + for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) { + func_id_to_addr_[i] = 0; + } +} + +// ============================================================================= +// Device orchestration +// ============================================================================= + +void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; } +void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; } +const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; } +void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; } +void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } +void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } +void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } + +void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) { + prebuilt_arena_base_ = arena_base; + prebuilt_runtime_offset_ = runtime_off; +} +void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; } +size_t Runtime::get_prebuilt_runtime_offset() const { return 
prebuilt_runtime_offset_; } + +// Device orchestration SO metadata (bytes live in a separate device buffer +// owned by DeviceRunner; only the address/size travels in Runtime). +void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { + dev_orch_so_addr_ = dev_addr; + dev_orch_so_size_ = size; +} + +uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; } + +uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } + +void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; +} + +int32_t Runtime::get_active_callable_id() const { return active_callable_id_; } + +bool Runtime::register_new_callable_id() const { return register_new_callable_id_; } + +void Runtime::set_device_orch_func_name(const char *name) { + if (name == nullptr) { + device_orch_func_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; } + +void Runtime::set_device_orch_config_name(const char *name) { + if (name == nullptr) { + device_orch_config_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; } + +uint64_t Runtime::get_function_bin_addr(int func_id) const { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; + return func_id_to_addr_[func_id]; +} + +void Runtime::set_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); + return; + } + if (addr != 0 && 
func_id_to_addr_[func_id] == 0) { + if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) { + registered_kernel_func_ids_[registered_kernel_count_++] = func_id; + } else { + LOG_ERROR( + "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID, + func_id + ); + } + } + func_id_to_addr_[func_id] = addr; +} + +void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); + return; + } + func_id_to_addr_[func_id] = addr; +} + +int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; } + +int Runtime::get_registered_kernel_func_id(int index) const { + if (index < 0 || index >= registered_kernel_count_) return -1; + return registered_kernel_func_ids_[index]; +} + +void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; } diff --git a/src/a2a3/runtime/fully_distributed_within_core/runtime/tensor_create_info.h b/src/a2a3/runtime/fully_distributed_within_core/runtime/tensor_create_info.h new file mode 100644 index 000000000..912839a34 --- /dev/null +++ b/src/a2a3/runtime/fully_distributed_within_core/runtime/tensor_create_info.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * TensorCreateInfo — submit-time create-info for runtime-allocated outputs. + * + * Runtime-only: this header (and the materialization helpers below) are NOT + * part of the wire/host-facing Tensor in src/common/task_interface/tensor.h. + * It carries the metadata required to materialize a fresh contiguous output: + * dtype, ndims, shapes, manual_dep, and an optional initial value fill. Its + * 64B layout mirrors Tensor cache line 1 so init_tensor_from_create_info() can + * copy the whole line with a single memcpy. + */ + +#pragma once + +#include +#include +#include + +#include "data_type.h" +#include "tensor.h" + +class alignas(64) TensorCreateInfo { +public: + TensorCreateInfo( + const uint32_t shapes_in[], uint32_t ndims_in, DataType dtype_in = DataType::FLOAT32, bool manual_dep_in = false + ) : + initial_value(0), + has_initial_value(false), + __pad2__(0), + start_offset(0), // mirrors Tensor::start_offset; pre-zeroed for create-info outputs + version(0), + ndims(ndims_in), + dtype(dtype_in), + manual_dep(manual_dep_in), + is_contiguous(true), // mirrors Tensor::is_contiguous; pre-set for create-info outputs + __pad_flags__(0) { + // Bound the write below: shapes[] holds MAX_TENSOR_DIMS, and ndims_in + // comes from user-submitted output shapes — guard before the loop so an + // oversized rank can't overrun the fixed array. 
+ always_assert(ndims_in > 0 && ndims_in <= MAX_TENSOR_DIMS); + for (uint32_t i = 0; i < ndims_in; i++) { + shapes[i] = shapes_in[i]; + } + } + + void copy(const TensorCreateInfo &other) { memcpy(this, &other, sizeof(other)); } + + template + void set_initial_value(T value) { + has_initial_value = true; + initial_value = to_u64(value); + } + + uint64_t buffer_size_bytes() const { + uint64_t total = 1; + for (uint32_t i = 0; i < ndims; i++) { + total *= shapes[i]; + } + return total * get_element_size(dtype); + } + +public: + // --- Bytes [0, 32): TensorCreateInfo-only fields --- + // These occupy the same positions as Tensor::buffer, Tensor::owner_task_id, + // and Tensor::start_offset. The runtime overwrites owner metadata after the + // memcpy and recomputes start_offset / stride during payload materialization. + uint64_t initial_value; + bool has_initial_value; + uint8_t __pad1__[7]; + uint64_t __pad2__; // → Tensor::owner_task_id (overwritten post-memcpy) + uint64_t start_offset; // mirrors Tensor::start_offset; always 0 for create-info outputs + + // --- Bytes [32, 64): Matches Tensor cache line 1 layout --- + int32_t version; // Always 0 for create-info outputs + uint32_t ndims; + DataType dtype; + bool manual_dep; + bool is_contiguous; // Always true for create-info outputs + uint8_t __pad_flags__; // → Tensor::child_memory (always 0 for create-info outputs) + uint32_t shapes[MAX_TENSOR_DIMS]; // → Tensor::shapes + + TensorCreateInfo() = default; +}; + +// TensorCreateInfo layout must match Tensor cacheline 1 for memcpy optimization +static_assert(sizeof(TensorCreateInfo) == 64, "TensorCreateInfo must match Tensor cacheline 1 size (64 bytes)"); +static_assert(offsetof(TensorCreateInfo, start_offset) == offsetof(Tensor, start_offset)); +static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version)); +static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims)); +static_assert(offsetof(TensorCreateInfo, dtype) == 
offsetof(Tensor, dtype)); +static_assert(offsetof(TensorCreateInfo, manual_dep) == offsetof(Tensor, manual_dep)); +static_assert(offsetof(TensorCreateInfo, is_contiguous) == offsetof(Tensor, is_contiguous)); +static_assert(offsetof(TensorCreateInfo, __pad_flags__) == offsetof(Tensor, child_memory)); +static_assert(offsetof(TensorCreateInfo, shapes) == offsetof(Tensor, shapes)); + +// ============================================================================ +// Materialization helpers — operate on a Tensor& through its public members. +// Factored out of Tensor (which now lives in the wire/host-facing common +// header) so the create-info dependency stays runtime-only. +// ============================================================================ + +/// Fill the entire backing buffer of `t` with `initial_value` (doubling memcpy). +inline void fill_tensor_initial_value(Tensor &t, uint64_t initial_value) { + always_assert(reinterpret_cast(t.buffer.addr) != nullptr); + uint64_t elem_size = get_element_size(t.dtype); + char *dst = reinterpret_cast(t.buffer.addr); + constexpr uint64_t blk_size = 64; + uint64_t blk = (t.buffer.size < blk_size) ? t.buffer.size : blk_size; + for (uint64_t b = 0; b < blk; b += elem_size) { + memcpy(dst + b, &initial_value, elem_size); + } + uint64_t filled = blk; + while (filled < t.buffer.size) { + uint64_t copy_size = ((t.buffer.size - filled) < filled) ? (t.buffer.size - filled) : filled; + memcpy(dst + filled, dst, copy_size); + filled += copy_size; + } +} + +/// Materialize a TensorCreateInfo into `t` (fresh contiguous output). +/// Single 64B memcpy covers cache line 1; `ci` pre-initialises start_offset (=0) +/// and is_contiguous (=true) in its line-1 slots so they need no reset here. +/// Cache line 2 (stride/extent) is computed from `ci.shapes` in a single reverse pass. 
+inline void init_tensor_from_create_info(Tensor &t, const TensorCreateInfo &ci, void *addr, uint64_t buffer_size) { + always_assert(ci.ndims > 0 && ci.ndims <= MAX_TENSOR_DIMS); + memcpy(&t, &ci, 64); + t.buffer = {reinterpret_cast(addr), buffer_size}; + t.owner_task_id = PTO2TaskId::invalid(); // caller (orchestrator) overwrites with actual task_id + uint32_t s = 1; + for (int32_t i = static_cast(t.ndims) - 1; i >= 0; --i) { + t.strides[i] = s; + s *= t.shapes[i]; + } + t.extent_elem_cache = s; + if (ci.has_initial_value) { + fill_tensor_initial_value(t, ci.initial_value); + } +} diff --git a/src/a5/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp b/src/a5/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp new file mode 100644 index 000000000..21e79b3ed --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/aicore/aicore_executor.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include "aicore/aicore.h" +#include "aicore/aicore_profiling_state.h" +#include "aicore/l2_swimlane_collector_aicore.h" +#include "aicore/pmu_collector_aicore.h" +#include "common/l2_swimlane_profiling.h" +#include "common/platform_config.h" // Register-based communication +#include "common/pmu_profiling.h" +#include "pto2_dispatch_payload.h" +#include "runtime.h" + +/** + * Unified function pointer type for kernel dispatch + * + * All kernels follow the same signature: void kernel(__gm__ int64_t* args) + * This enables simple, switch-free dispatch. + */ +typedef void (*UnifiedKernelFunc)(__gm__ int64_t *); + +/** + * Execute task from PTO2DispatchPayload. + * + * Reads function_bin_addr and args from the dispatch payload. + * + * @param payload Pointer to PTO2DispatchPayload in global memory + */ +__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2DispatchPayload *payload) { + if (payload == nullptr || payload->function_bin_addr == 0) { + return; + } + + UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr; + kernel(reinterpret_cast<__gm__ int64_t *>(payload->args)); + OUT_OF_ORDER_STORE_BARRIER(); +} + +/** + * AICore main execution loop + * + * Implements the AICPU-AICore register-based dispatch protocol: + * 1. Wait for AICPU ready signal via handshake buffer + * 2. Report physical core ID and core type, signal AICore ready + * 3. Cache per-core PTO2DispatchPayload pointer from hank->task + * 4. Poll DATA_MAIN_BASE register for task dispatch until exit signal + * + * AICPU writes &s_payload_per_core[i] to hank->task before setting + * aicpu_ready=1. AICore caches this pointer and reads function_bin_addr + + * args pointer from it on each dispatch. reg_val is a monotonically + * increasing task ID used only for dispatch signaling and ACK/FIN protocol. 
+ * + * @param runtime Pointer to Runtime in global memory + * @param s_block_idx Block index (core ID) + * @param core_type Core type (AIC or AIV) + */ +__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int s_block_idx, CoreType core_type) { + __gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[s_block_idx]); + + // Phase 1: Wait for AICPU initialization signal + while (my_hank->aicpu_ready == 0) { + dcci(my_hank, SINGLE_CACHE_LINE); + SPIN_WAIT_HINT(); + } + + // Phase 2: Report physical core ID, signal ready + my_hank->physical_core_id = get_physical_core_id(); + OUT_OF_ORDER_STORE_BARRIER(); + my_hank->aicore_regs_ready = 1; + dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT); + while (my_hank->aicpu_regs_ready == 0) { + dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE); + SPIN_WAIT_HINT(); + } + // Report initial idle status via register + write_reg(RegId::COND, AICORE_IDLE_VALUE); + + // Phase 3: Report core type, signal ready + my_hank->core_type = core_type; + OUT_OF_ORDER_STORE_BARRIER(); + my_hank->aicore_done = s_block_idx + 1; // Signal ready (use s_block_idx + 1 to avoid 0) + + dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); + + // Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready) + __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); + + // Cache profiling state once after Phase 3. The L2 / PMU rings and the + // PMU MMIO base are all stable for the entire run (host-resolved at + // AICore kernel entry from KernelArgs::regs[physical_core_id]), so + // they are safe to cache here. 
+ uint32_t profiling_flag = get_aicore_profiling_flag(); + bool l2_swimlane_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_L2_SWIMLANE); + bool dump_tensor_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_DUMP_TENSOR); + bool pmu_enabled = GET_PROFILING_FLAG(profiling_flag, PROFILING_FLAG_PMU); + // Per-core L2SwimlaneActiveHead channel — lazy-resolved on first task; the + // table slot AICPU populates inside `l2_swimlane_aicpu_init` runs + // concurrently with kernel entry, so we cannot deref at startup. The + // first dispatch is proof AICPU init is done. + __gm__ L2SwimlaneActiveHead *l2_swimlane_head = nullptr; + L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, UINT32_MAX, 0}; + __gm__ PmuAicoreRing *pmu_ring = pmu_enabled ? get_aicore_pmu_ring() : nullptr; + uint64_t pmu_reg_base = pmu_enabled ? get_aicore_pmu_reg_base() : 0; + + // Phase 4: Main execution loop - poll register for tasks until exit signal + // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit + uint32_t reg_val = AICPU_IDLE_TASK_ID; + uint32_t last_reg_val = AICPU_IDLE_TASK_ID; + + while (true) { + reg_val = static_cast(read_reg(RegId::DATA_MAIN_BASE)); + if (reg_val == AICORE_EXIT_SIGNAL) { + // Signal exit acknowledgment to AICPU + write_reg(RegId::COND, AICORE_EXITED_VALUE); + break; + } + + // Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task) + if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) { + SPIN_WAIT_HINT(); + continue; + } + + { + // receive_time is captured the instant DATA_MAIN_BASE returned a + // new task_id, BEFORE the per-task dcci + ack pair. Paired with + // start_time (captured after dcci + ack) it lets DFX split head_OH + // into the AICPU→AICore NoC propagation (dispatch_ts → receive_time, + // hardware-bound) and the AICore-local dcci+ack cost + // (receive_time → start_time, software-tunable). Stored in the + // record as a 32-bit delta `start_time - receive_time`. 
+ uint64_t receive_time = get_sys_cnt_aicore(); + + uint32_t task_id = reg_val; // Decode: register holds task_id directly + + // First-task lazy resolve of the rotation channel. + if (l2_swimlane_enabled && l2_swimlane_head == nullptr) { + l2_swimlane_head = get_l2_swimlane_aicore_head(); + } + + // Select dual-buffer slot: same bit as AICPU used when writing payload + __gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u); + + // Invalidate payload buffer (AICPU updates its content each dispatch) + dcci(exec_payload, ENTIRE_DATA_CACHE); + + write_reg(RegId::COND, MAKE_ACK_VALUE(task_id)); + + // Performance profiling: record start time + uint64_t start_time = get_sys_cnt_aicore(); + + if (pmu_enabled) { + pmu_aicore_begin(); + } + + // Execute the task + execute_task(exec_payload); + + if (pmu_enabled) { + pmu_aicore_end(); + pmu_aicore_record_task(pmu_ring, pmu_reg_base, task_id); + } + + if (dump_tensor_enabled) { + pipe_barrier(PIPE_ALL); + } + + // Performance profiling: record task execution. task_token_raw is + // the PTO2 identity (already in AICore cache from the dispatch + // payload); reg_task_id is the per-core dispatch token AICore just + // read. Host uses reg_task_id as join key vs the AICPU stream. + if (l2_swimlane_enabled) { + uint64_t end_time = get_sys_cnt_aicore(); + uint64_t task_token_raw = exec_payload->local_context.async_ctx.task_token.raw; + l2_swimlane_aicore_record_task( + l2_swimlane_head, &l2_swimlane_local, task_token_raw, task_id, receive_time, start_time, end_time + ); + } + + last_reg_val = reg_val; + write_reg(RegId::COND, MAKE_FIN_VALUE(task_id)); + } + } + + // Flush all dirty cache lines to HBM before kernel exit. 
+ dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT); +} diff --git a/src/a5/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp b/src/a5/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp new file mode 100644 index 000000000..313e3a36e --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/aicpu/aicpu_executor.cpp @@ -0,0 +1,848 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif + +#include "aicpu/device_time.h" +#include "aicpu/orch_so_file.h" +#include "aicpu/platform_aicpu_affinity.h" +#include "callable_protocol.h" +#include "pto2_dispatch_payload.h" +#include "runtime.h" +#include "spin_hint.h" + +// Runtime headers (full struct definition for create/destroy + PTO2_SCOPE) +#include "pto_runtime2.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" + +// Performance profiling headers +#include "aicpu/dep_gen_collector_aicpu.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" +#include "aicpu/scope_stats_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" +#include "common/l2_swimlane_profiling.h" +#include "common/unified_log.h" + +// Register-based communication +#include 
"aicpu/platform_regs.h" +#include "common/platform_config.h" + +// Core type definitions +#include "common/core_type.h" + +// CoreCallable for resolved dispatch address +#include "callable.h" + +// Scheduler data structures (CoreExecState, CoreTracker, etc.) +#include "scheduler/scheduler_types.h" + +// Scheduler context class +#include "scheduler/scheduler_context.h" + +// Device orchestration function signature (loaded via dlopen). +// The executor binds the current thread's PTO2Runtime into orchestration TLS +// before calling the user entry. +typedef void (*DeviceOrchestrationFunc)(const L2TaskArgs &orch_args); +typedef void (*DeviceOrchestrationBindRuntimeFunc)(PTO2Runtime *rt); + +// Config function exported by orchestration .so +typedef PTO2OrchestrationConfig (*DeviceOrchestrationConfigFunc)(const L2TaskArgs &orch_args); + +// From orchestration/common.cpp linked into this DSO — updates g_current_runtime here (distinct from +// framework_bind_runtime in the dlopen'd libdevice_orch_*.so). +extern "C" void framework_bind_runtime(PTO2Runtime *rt); + +constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry"; +constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config"; + +static int32_t read_runtime_status(Runtime *runtime) { + if (runtime == nullptr) { + return 0; + } + + void *sm = runtime->get_gm_sm_ptr(); + if (sm == nullptr) { + return 0; + } + + auto *header = static_cast(sm); + int32_t orch_error_code = header->orch_error_code.load(std::memory_order_acquire); + int32_t sched_error_code = header->sched_error_code.load(std::memory_order_acquire); + return runtime_status_from_error_codes(orch_error_code, sched_error_code); +} + +static PTO2Runtime *rt{nullptr}; + +// Per-callable_id orchestration SO table. The executor dispatches +// `orch_so_table_[active_callable_id_]` (created on first sighting of +// that callable_id, kept warm across runs). 
+// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values +// (mailbox uint32 callable_id, register() returns small ints) and is shared +// with the host bounds check in DeviceRunner::register_callable — +// see src/common/task_interface/callable_protocol.h. + +struct OrchSoEntry { + bool in_use{false}; + void *handle{nullptr}; + char path[256]{}; + DeviceOrchestrationFunc func{nullptr}; + DeviceOrchestrationBindRuntimeFunc bind{nullptr}; + DeviceOrchestrationConfigFunc config_func{nullptr}; +}; + +struct AicpuExecutor { + int32_t sched_thread_num_; + bool orch_to_sched_{false}; + + // ===== Thread management state ===== + std::atomic thread_idx_{0}; + std::atomic initialized_{false}; + std::atomic init_done_{false}; + std::atomic init_failed_{false}; + std::atomic finished_{false}; + + int32_t aicpu_thread_num_{0}; + + // ===== Task queue state (managed by scheduler ready queues) ===== + + std::atomic finished_count_{0}; + std::atomic runtime_init_ready_{false}; + + // Per-Worker arena attaching to the pooled prebuilt runtime image. Host + // populates the layout + data on its own arena, rtMemcpys into a pooled + // device buffer owned by DeviceRunner, and the AICPU attach()es to that + // buffer on each boot — no AICPU-side commit, no per-boot rtMalloc. + // Default-constructed: libc-backed backend, no ctx. + DeviceArena runtime_arena_; + + // Entry-arg L2TaskArgs built (via create_from_chip_args) from get_orch_args() + // before scheduler init; consumed by the (*p_func)(orch_args_cached_) below. + L2TaskArgs orch_args_cached_; + + // Per-callable_id table. Single orch thread today, so first-write/read + // race is not possible; if multiple orch threads are ever introduced, + // guard the in_use=false→true transition with a mutex. 
+ OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS]; + + // ===== Scheduler context (owns all dispatch/completion/drain state) ===== + SchedulerContext sched_ctx_; + + // ===== Methods ===== + int32_t init(Runtime *runtime); + int32_t run(Runtime *runtime); + void deinit(Runtime *runtime); + + ~AicpuExecutor() { + // Process-wide teardown (the single static instance dies here). Every + // in-use callable_id slot is dlclose()'d here; each is otherwise kept + // alive across runs for cache-hit reuse. + for (auto &e : orch_so_table_) { + if (!e.in_use) continue; + if (e.handle != nullptr) dlclose(e.handle); + if (e.path[0] != '\0') unlink(e.path); + e = OrchSoEntry{}; + } + } +}; + +static AicpuExecutor g_aicpu_executor; + +// ===== AicpuExecutor Method Implementations ===== + +int32_t AicpuExecutor::init(Runtime *runtime) { + bool expected = false; + if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) { + return 0; + } + + LOG_INFO_V0("AicpuExecutor: Initializing"); + + if (runtime == nullptr) { + LOG_ERROR("runtime is nullptr"); + init_failed_.store(true, std::memory_order_release); + return -1; + } + + // Read execution parameters from runtime. The 0 → 1 fixup runs before the + // sched_thread_num_ derivation so a zero input doesn't leave the scheduler + // count at -1. 
+ aicpu_thread_num_ = runtime->aicpu_thread_num; + if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1; + sched_thread_num_ = aicpu_thread_num_ - 1; + orch_to_sched_ = runtime->orch_to_sched; + + if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) { + LOG_ERROR("Invalid aicpu_thread_num: %d", aicpu_thread_num_); + init_failed_.store(true, std::memory_order_release); + return -1; + } + + if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) { + init_failed_.store(true, std::memory_order_release); + return -1; + } + + finished_count_.store(0, std::memory_order_release); + + init_done_.store(true, std::memory_order_release); + LOG_INFO_V0("AicpuExecutor: Init complete"); + return 0; +} + +/** + * Shutdown AICore - Send exit signal via registers to all AICore kernels + */ +int32_t AicpuExecutor::run(Runtime *runtime) { + // Prefer the filter gate's deterministic exec_idx so role assignment + // (sched 0..N-2 / orch N-1) is driven by host-computed ALLOWED_CPUS, + // not arrival order. Fall back to the legacy fetch-add counter on + // platforms where the filter gate is inactive (sim sets exec_idx via + // its own stub; the fallback covers any path that bypassed the gate). + int32_t affinity_exec_idx = platform_aicpu_affinity_thread_idx(); + int32_t thread_idx = (affinity_exec_idx >= 0) ? affinity_exec_idx : (thread_idx_++); + int32_t run_rc = 0; + LOG_INFO_V0("Thread %d: Start (exec_idx=%d)", thread_idx, affinity_exec_idx); + + // Orchestrator check + if (thread_idx >= sched_thread_num_) { +#if PTO2_PROFILING + uint64_t orch_cycle_start = 0; + int32_t submitted_tasks = -1; +#endif + // Orchestrator thread: load + run the device orchestration SO. The braces + // scope the per-callable dlopen / SO-table locals to this block. + { + // Per-callable_id dispatch: the orch SO state lives in + // `orch_so_table_[callable_id]` keyed by registration order; + // reload is governed by `register_new_callable_id_`. 
+ const int32_t callable_id = runtime->get_active_callable_id(); + if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) { + LOG_ERROR( + "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS + ); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + void **p_handle = &orch_so_table_[callable_id].handle; + char *p_path = orch_so_table_[callable_id].path; + DeviceOrchestrationFunc *p_func = &orch_so_table_[callable_id].func; + DeviceOrchestrationBindRuntimeFunc *p_bind = &orch_so_table_[callable_id].bind; + DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func; + const bool reload_so = runtime->register_new_callable_id(); + + if (reload_so) { + LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id); + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; + *p_func = nullptr; + *p_bind = nullptr; + if (p_path[0] != '\0') { + // Unlink the old file so the new open() lands on a + // fresh inode — protects against SIGBUS / ETXTBSY when + // the kernel still has the old mapping pinned. + unlink(p_path); + p_path[0] = '\0'; + } + } + + const void *so_data = reinterpret_cast(runtime->get_dev_orch_so_addr()); + size_t so_size = runtime->get_dev_orch_so_size(); + + if (so_data == nullptr || so_size == 0) { + LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + // Try multiple paths that may allow execution on AICPU + char so_path[256]; + bool file_created = false; + const char *candidate_dirs[] = { + "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" + }; + const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); + + for (int32_t i = 0; i < num_candidates && !file_created; i++) { + int32_t fd = create_orch_so_file( + candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path) + ); + if (fd < 0) { + LOG_INFO_V0( + "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + continue; + } + ssize_t written = write(fd, so_data, so_size); + close(fd); + if (written != static_cast(so_size)) { + LOG_INFO_V0( + "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + unlink(so_path); + continue; + } + file_created = true; + LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); + } + + if (!file_created) { + LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + dlerror(); + void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + const char *dlopen_err = dlerror(); + if (handle == nullptr) { + LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown"); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); + + // Unlink the on-disk SO immediately: dlopen has already mmap'd + // the image, so the kernel keeps the inode alive until the + // matching dlclose / process exit. This prevents stale + // libdevice_orch__.so files from accumulating in + // /tmp when child processes exit via os._exit(0), which skips + // ~AicpuExecutor (worker.py: _sub/_chip/_child loops). + unlink(so_path); + + const char *entry_symbol = runtime->get_device_orch_func_name(); + if (entry_symbol == nullptr || entry_symbol[0] == '\0') { + entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL; + } + const char *config_symbol = runtime->get_device_orch_config_name(); + if (config_symbol == nullptr || config_symbol[0] == '\0') { + config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL; + } + + dlerror(); + DeviceOrchestrationFunc orch_func = + reinterpret_cast(dlsym(handle, entry_symbol)); + const char *entry_dlsym_error = dlerror(); + if (entry_dlsym_error != nullptr) { + LOG_ERROR( + "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error + ); + dlclose(handle); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + if (orch_func == nullptr) { + LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol); + dlclose(handle); + unlink(so_path); + // Unblock scheduler threads before returning so they don't spin forever. 
+ runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + dlerror(); + auto config_func = reinterpret_cast(dlsym(handle, config_symbol)); + const char *config_dlsym_error = dlerror(); + if (config_dlsym_error != nullptr || config_func == nullptr) { + LOG_ERROR( + "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol, + config_dlsym_error ? config_dlsym_error : "NULL function pointer" + ); + config_func = nullptr; + } + + dlerror(); + auto bind_runtime_func = + reinterpret_cast(dlsym(handle, "framework_bind_runtime")); + const char *bind_runtime_error = dlerror(); + if (bind_runtime_error != nullptr) { + LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error); + bind_runtime_func = nullptr; + } + + *p_handle = handle; + *p_func = orch_func; + *p_bind = bind_runtime_func; + *p_config_func = config_func; + snprintf(p_path, 256, "%s", so_path); + orch_so_table_[callable_id].in_use = true; + } else { + LOG_INFO_V0( + "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id + ); + if (*p_handle == nullptr || *p_func == nullptr) { + LOG_ERROR( + "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx, + callable_id + ); + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + } + + // Build the entry-arg once per run; both the config call below and + // the orchestration entry (consumed at orch_args_cached_) use it. + orch_args_cached_.create_from_chip_args(runtime->get_orch_args()); + + // Validate arg count on every run (reload or cache hit). 
+ if (*p_config_func != nullptr) { + PTO2OrchestrationConfig cfg = (*p_config_func)(orch_args_cached_); + LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count); + if (cfg.expected_arg_count > 0) { + const ChipStorageTaskArgs &args_validate = runtime->get_orch_args(); + int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count(); + if (actual_arg_count < cfg.expected_arg_count) { + LOG_ERROR( + "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count, + cfg.expected_arg_count + ); + // Clean up cached state so a subsequent run does a full reload. + if (*p_handle != nullptr) { + dlclose(*p_handle); + *p_handle = nullptr; + } + if (p_path[0] != '\0') { + unlink(p_path); + p_path[0] = '\0'; + } + *p_func = nullptr; + *p_bind = nullptr; + *p_config_func = nullptr; + orch_so_table_[callable_id].in_use = false; + // Unblock scheduler threads before returning so they don't spin forever. + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + } + } else { + LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx); + } + + // sm_handle / rt are bound to *this* run's memory and must be + // (re)created every run, regardless of whether the SO itself was + // reused above. 
+ const ChipStorageTaskArgs &args = runtime->get_orch_args(); + int32_t arg_count = args.tensor_count() + args.scalar_count(); + LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count); + for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) { + const Tensor &t = args.tensor(i); + LOG_INFO_V0( + "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i, + static_cast(t.buffer.addr), t.ndims, static_cast(t.dtype) + ); + } + for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) { + LOG_INFO_V0( + "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i, + static_cast(args.scalar(i)) + ); + } + + void *sm_ptr = runtime->get_gm_sm_ptr(); + + // Prebuilt-arena fast path. Host has pre-populated the entire + // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map + // sub-regions + sm_handle wrapper + mailbox) and uploaded it via + // rtMemcpy into the pooled runtime_arena buffer. We attach to it, + // wire arena-internal pointers to their device addresses, reset + // the SM, and finalize the few device-only fields the host could + // not know at image-build time. + void *prebuilt_arena = runtime->get_prebuilt_arena_base(); + size_t off_runtime = runtime->get_prebuilt_runtime_offset(); + if (prebuilt_arena == nullptr) { + LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx); + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + runtime_arena_.attach(prebuilt_arena, DeviceArena::kDefaultBaseAlign); + rt = reinterpret_cast(static_cast(prebuilt_arena) + off_runtime); + + // Wire every arena-internal pointer field (host wrote host-mirror + // addresses; we overwrite them with device addresses). 
+ runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt); + uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.task_window_sizes); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) { + LOG_INFO_V0( + "Thread %d: Ring %d sizes: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%d", thread_idx, r, + rt->prebuilt_layout.task_window_sizes[r], rt->prebuilt_layout.heap_sizes[r], + rt->prebuilt_layout.dep_pool_capacities[r] + ); + } + + // Reset SM state. setup_pointers + init_header_per_ring restore + // ring flow-control counters, layout metadata, error flags, and + // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse + + // fanin_count/active_mask zero — previously done inside + // RingSchedState::init). + memset(rt->sm_handle, 0, sizeof(*rt->sm_handle)); + if (!rt->sm_handle->init_per_ring( + sm_ptr, sm_size, rt->prebuilt_layout.task_window_sizes, rt->prebuilt_layout.heap_sizes + )) { + LOG_ERROR("Thread %d: sm_handle->init_per_ring failed", thread_idx); + rt = nullptr; + runtime_init_ready_.store(true, std::memory_order_release); + return -1; + } + + // AICore completion mailbox lives in the arena; reset it each + // boot so stale completion notifications from a previous run do + // not leak. + memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox)); + + // Fill ops / core counts (host can't resolve s_runtime_ops's + // device address nor know the SchedulerContext's core fan-out). 
+ runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count()); + +#if PTO2_PROFILING + rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level(); + { + auto &orch = rt->orchestrator; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &alloc = orch.rings[r].task_allocator; + scope_stats_set_ring_capacity( + r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacities[r] + ); + } + scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity()); + } +#endif + + // With multi-ring, slot_states are per-ring inside the scheduler. + runtime->set_slot_states_ptr(nullptr); + + // Wire scheduler context to the newly created PTO2Runtime before + // releasing scheduler threads from runtime_init_ready_. + sched_ctx_.bind_runtime(rt); + + runtime_init_ready_.store(true, std::memory_order_release); + + // Wait for scheduler's one-time init to complete + sched_ctx_.wait_init_complete(); + +#if PTO2_PROFILING + if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { + l2_swimlane_aicpu_set_orch_thread_idx(thread_idx); + } + // scope_stats streams scope_end records off the orchestrator thread: + // record the per-thread ready_queue index. No-op (writer shared + // state null) when scope_stats is disabled; the current buffer is + // popped lazily on the first scope_end append. + scope_stats_aicpu_set_orch_thread_idx(thread_idx); +#endif + + // dep_gen plugs into the orchestrator thread (single-instance subsystem): + // set the per-thread queue index and pop the initial buffer before any + // submit_task can fire inside orch_func_. 
+ if (is_dep_gen_enabled()) { + dep_gen_aicpu_set_orch_thread_idx(thread_idx); + dep_gen_aicpu_init(); + } + +#if PTO2_PROFILING + orch_cycle_start = get_sys_cnt_aicpu(); +#endif + framework_bind_runtime(rt); + if (*p_bind != nullptr) { + (*p_bind)(rt); + } + rt_scope_begin(rt); + (*p_func)(orch_args_cached_); + rt_scope_end(rt); + + // Flush the (potentially partially-filled) DepGenBuffer so the host + // collector can pick it up before this orchestrator thread joins. + if (is_dep_gen_enabled()) { + dep_gen_aicpu_flush(); + } +#if PTO2_PROFILING + // Push the partially-filled scope_stats buffer so the host gets the + // final scope_end records. Idempotent / no-op when disabled. + scope_stats_aicpu_flush_buffers(); +#endif +#if PTO2_PROFILING + uint64_t orch_cycle_end = get_sys_cnt_aicpu(); + (void)orch_cycle_end; +#endif + + // Print orchestrator profiling data +#if PTO2_ORCH_PROFILING + PTO2OrchProfilingData p = orchestrator_get_profiling(); + uint64_t total = + p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle; + if (total == 0) total = 1; // avoid div-by-zero + LOG_INFO_V9( + "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx, + static_cast(p.submit_count), cycles_to_us(total) + ); + LOG_INFO_V9( + "Thread %d: task+heap_alloc: %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", + thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total, + cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle), + static_cast(p.alloc_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle), + p.sync_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: lookup+dep : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle), + p.lookup_cycle * 100.0 / total + ); + LOG_INFO_V9( + "Thread %d: tensormap_ins : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle), + p.insert_cycle * 
100.0 / total + ); + LOG_INFO_V9( + "Thread %d: param_copy : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, + cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast(p.args_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: fanin+ready : %.3fus (%.1f%%) work=%.3fus wait=%.3fus", thread_idx, + cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total, + cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle) + ); + LOG_INFO_V9( + "Thread %d: avg/task : %.3fus", thread_idx, + p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0 + ); + +#if PTO2_TENSORMAP_PROFILING + PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling(); + LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx); + LOG_INFO_V9( + "Thread %d: lookups : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx, + static_cast(tp.lookup_count), static_cast(tp.insert_count) + ); + LOG_INFO_V9( + "Thread %d: chain walked : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx, + static_cast(tp.lookup_chain_total), + tp.lookup_count > 0 ? static_cast(tp.lookup_chain_total) / tp.lookup_count : 0.0, + tp.lookup_chain_max + ); + LOG_INFO_V9( + "Thread %d: overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx, + static_cast(tp.overlap_checks), static_cast(tp.overlap_hits), + tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0 + ); +#endif +#endif // PTO2_ORCH_PROFILING + + // Latch task count from PTO2 shared memory to hand off to the + // scheduler. The orchestrator's run window (start_time / end_time / + // submit_count) is no longer published to shared memory — the + // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line + // below carries the same envelope info for debugging, and + // host-side swimlane derives per-phase timing from the per-event + // L2SwimlaneAicpuPhaseRecord[] stream that already covers everything inside + // submit_task(). 
+ int32_t total_tasks = 0; + if (rt->orchestrator.sm_header) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + total_tasks += + rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire); + } + } + +#if PTO2_PROFILING + submitted_tasks = total_tasks; +#endif + + // Signal completion to the orchestrator state machine + rt_orchestration_done(rt); + + sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks); + } +#if PTO2_PROFILING + uint64_t orch_end_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9( + "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx, + static_cast(orch_cycle_start), static_cast(orch_end_ts), + cycles_to_us(orch_end_ts - orch_cycle_start) + ); + if (submitted_tasks >= 0) { + LOG_INFO_V9( + "PTO2 total submitted tasks = %d, already executed %d tasks", submitted_tasks, + sched_ctx_.completed_tasks_count() + ); + } +#endif + LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx); + } + + // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false) + if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) { + // Device orchestration: wait for the primary orchestrator to initialize the SM header + while (!runtime_init_ready_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } + if (rt == nullptr) { + LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx); + } else { + sched_ctx_.bind_runtime(rt); + int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx); + if (completed < 0) { + LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed); + run_rc = completed; + } else { + LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed); + } + } + } + + // Always shutdown AICore — even if sched_ctx_.completed_ was already true. 
+ // platform_deinit_aicore_regs is idempotent; orchestrator threads have + // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly. + int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx); + if (shutdown_rc != 0 && run_rc == 0) { + run_rc = shutdown_rc; + } + + LOG_INFO_V0("Thread %d: Completed", thread_idx); + + // Check if this is the last thread to finish + int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel); + if (prev_finished + 1 == aicpu_thread_num_) { + finished_.store(true, std::memory_order_release); + // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we + // always tear them down here, but we keep the per-cid orch SO entries + // alive for the next run's cache-hit reuse (see run() reload_so branch). + if (rt != nullptr) { + // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt. + const int32_t callable_id = runtime->get_active_callable_id(); + framework_bind_runtime(nullptr); + if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) { + DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind; + if (bind != nullptr) { + bind(nullptr); + } + } + runtime_destroy(rt, runtime_arena_); + rt = nullptr; + } + } + + return run_rc; +} + +void AicpuExecutor::deinit(Runtime *runtime) { + // 1. Invalidate AICPU cache for Runtime address range. + // Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but + // bypasses this cache. Invalidating now ensures next round reads from HBM. + cache_invalidate_range(runtime, sizeof(Runtime)); + + // Reset all SchedulerContext-owned state in one place. 
+ sched_ctx_.deinit(); + + finished_count_.store(0, std::memory_order_release); + runtime_init_ready_.store(false, std::memory_order_release); + + aicpu_thread_num_ = 0; + sched_thread_num_ = 0; + orch_to_sched_ = false; + + orch_args_cached_.reset(); + // orch_so_table_ entries are intentionally preserved across deinit: the + // next run reuses cached handles when register_new_callable_id() returns + // false. The destructor releases them at process teardown. + + // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit) + rt = nullptr; + + // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled. + dep_gen_aicpu_finalize(); + + LOG_INFO_V0("DeInit: Runtime execution state reset"); + + initialized_.store(false, std::memory_order_release); + init_done_.store(false, std::memory_order_release); + init_failed_.store(false, std::memory_order_release); + thread_idx_.store(0, std::memory_order_release); + finished_.store(false, std::memory_order_release); + + LOG_INFO_V0("DeInit: AicpuExecutor reset complete"); +} + +// ===== Public Entry Point ===== + +/** + * aicpu_execute - Main AICPU kernel execution entry point + * + * This is called by DynTileFwkBackendKernelServer in kernel.cpp. + * Orchestrates the complete task runtime execution: + * 1. Initialize executor (thread-safe, first thread only) + * 2. Wait for initialization to complete + * 3. Execute tasks on managed cores + * 4. 
Cleanup when last thread finishes + * + * @param runtime Pointer to Runtime structure + * @return 0 on success, non-zero on error + */ +extern "C" int32_t aicpu_execute(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("%s", "Invalid argument: null Runtime pointer"); + return -1; + } + + LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution"); + + g_aicpu_executor.init(runtime); + + while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) { + if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) { + LOG_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution"); + return -1; + } + } + + int32_t rc = g_aicpu_executor.run(runtime); + if (rc != 0) { + LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc); + } + + int32_t runtime_rc = read_runtime_status(runtime); + + // Last thread cleans up + if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) { + LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up"); + g_aicpu_executor.deinit(runtime); + } + + if (runtime_rc != 0) { + LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc); + return runtime_rc; + } + + if (rc != 0) { + return rc; + } + + LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully"); + return 0; +} diff --git a/src/a5/runtime/fully_distributed_within_core/build_config.py b/src/a5/runtime/fully_distributed_within_core/build_config.py new file mode 100644 index 000000000..da34f14f9 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/build_config.py @@ -0,0 +1,32 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +# fully_distributed_within_core runtime build configuration +# All paths are relative to this file's directory (src/a5/runtime/fully_distributed_within_core/) +# +# Goal: orchestration + scheduling + execution run on the AI cores themselves in +# SPMD fashion, removing AICPU from orchestration/scheduling. See the design spec: +# docs/fully_distributed_within_core.md +# +# This tree is currently re-based on the tensormap_and_ringbuffer runtime so it +# is discoverable and compiles; it reuses TensorMap, MixedKernels/ActiveMask, +# L0TaskArgs, the pto_orchestration_api submit API, and kernel-address +# resolution. The distributed model (claim race + per-core TensorMap + private +# task ring + global completion-flag ring) is layered on incrementally per the +# spec; the AICPU is reduced to an init/teardown stub. +# +# The "orchestration" directory contains source files compiled into both +# runtime targets AND the orchestration .so (e.g., tensor methods needed +# by the Tensor constructor's validation logic).
+ +BUILD_CONFIG = { + "aicore": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicore", "orchestration"]}, + "aicpu": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["aicpu", "runtime", "orchestration"]}, + "host": {"include_dirs": ["runtime", "common", ".."], "source_dirs": ["host", "runtime/shared", "orchestration"]}, + "orchestration": {"include_dirs": ["runtime", "orchestration", "common", ".."], "source_dirs": ["orchestration"]}, +} diff --git a/src/a5/runtime/fully_distributed_within_core/common/intrinsic.h b/src/a5/runtime/fully_distributed_within_core/common/intrinsic.h new file mode 100644 index 000000000..99803483a --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/common/intrinsic.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file intrinsic.h + * @brief SPMD execution context for AICore user kernels + * + * Topology data exposed to user kernels has two distinct lifetimes: + * + * 1. Global topology (per-core, fixed after runtime init): + * - sub_block_id : identifies the AIV lane within a cluster + * (0 = AIV0/left, 1 = AIV1/right). Initialized once at runtime + * startup based on each core's cluster position; never changes. + * Only meaningful for AIV kernels in MIX tasks. + * + * 2. 
Local per-dispatch context (changes each dispatch): + * - s_block_idx : which logical block the current worker is executing + * - s_block_num : total number of blocks in this task (= block_dim) + * Written by build_payload() before each dispatch. + * + * Both categories are injected via two pointer slots appended at the tail + * of the kernel args[] array: + * + * args layout: + * [0 .. tensor_count-1] = tensor GM pointers + * [tensor_count .. +scalar_count-1] = scalar values + * ... + * [SPMD_LOCAL_CONTEXT_INDEX] = (uint64_t)&LocalContext (per-dispatch) + * [SPMD_GLOBAL_CONTEXT_INDEX] = (uint64_t)&GlobalContext (per-core) + * + * The suffix positions are compile-time constants and do not depend on the + * runtime tensor_count or scalar_count. + * + * Include this header in AICore kernel source files to use the Get* accessors. + * Do NOT depend on the raw index constants; always use the accessor functions. + * + * On CCEC (real hardware), __gm__ and __aicore__ must be defined before + * including this header (e.g. via or manual #define). + * The #ifndef guards below provide fallbacks for non-kernel builds + * (AICPU, HOST) where these qualifiers are not needed. + * + * IMPORTANT — do NOT mix these with the CCE built-in topology intrinsics + * (`get_subblockid()`, `get_block_idx()`, `get_block_num()` declared in + * `kernel_operator.h` / tikcfw). Those intrinsics read AICore hardware + * registers that simpler's tensormap_and_ringbuffer runtime does NOT + * program. Specifically: + * + * - CCE `get_subblockid()` returns whatever stale value the AICore + * sub-block register holds — under simpler's MIX dispatch it is 0 + * for BOTH AIV0 and AIV1 of every cluster, so a kernel that uses + * it to partition heads will silently have AIV1 redo AIV0's work + * and the AIV1 share of the output is never written. 
This is the + * exact failure mode that produced the partial-zero output in + * issue #900 (PR #899 spmd_paged_attention_highperf); the kernel + * compiled, ran without error, and produced wrong output. Use + * `get_sub_block_id(args)` instead, which reads from the runtime's + * `GlobalContext.sub_block_id` that the scheduler initializes per + * AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`. + * + * - `get_block_idx()` and `get_block_num()` are not redirected to + * simpler's LocalContext either — use the `(args)` variants below + * so the values reflect simpler's logical block_dim (which can + * differ from `RUNTIME_CONFIG.block_dim`, the physical core count). + * + * If you are porting a kernel originally written for native CANN dispatch + * (AscendC, ascend-transformer-boost, etc.), every reference to those + * three CCE intrinsics needs to be rewritten against this header. See + * `docs/aicore-kernel-programming.md` for the full author contract, + * porting checklist, and the worked example from PR #899 / issue #900. + */ + +#pragma once + +#include + +#include "aicore_completion_mailbox_types.h" +#include "pto_task_id.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ +#endif + +/** Number of extra pointer slots appended to the args[] tail (LocalContext + GlobalContext). */ +static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2; + +/** + * Args[] suffix indices for context pointers. + * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16). + * Users should not depend on these values; use the Get* functions below. + */ +static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48; +static constexpr int32_t SPMD_GLOBAL_CONTEXT_INDEX = 49; +static constexpr int32_t PAYLOAD_LOCAL_CONTEXT_INDEX = SPMD_LOCAL_CONTEXT_INDEX; +static constexpr int32_t PAYLOAD_GLOBAL_CONTEXT_INDEX = SPMD_GLOBAL_CONTEXT_INDEX; + +/** + * Per-core global context, stored in PTO2DispatchPayload. 
+ * Initialized once at runtime startup (init_global_context) based on each + * core's cluster position. Never modified after initialization. + */ +struct GlobalContext { + // AIV lane within cluster: 0=AIV0(left), 1=AIV1(right). + // Used by AIV to select the correct intra-cluster hw instruction. + // Not meaningful for AIC kernels or single-AIV tasks. + int32_t sub_block_id; +}; + +struct AsyncCtx { + volatile __gm__ uint32_t *completion_count; + volatile __gm__ int32_t *completion_error_code; + volatile __gm__ DeferredCompletionEntry *completion_entries; + uint32_t completion_capacity; + PTO2TaskId task_token; + + static inline AsyncCtx make(PTO2TaskId task_token, volatile __gm__ DeferredCompletionSlab *buffer) { + AsyncCtx ctx{}; + ctx.task_token = task_token; + if (buffer == nullptr) { + ctx.task_token = PTO2TaskId::invalid(); + return ctx; + } + ctx.completion_count = &buffer->count; + ctx.completion_error_code = &buffer->error_code; + ctx.completion_entries = &buffer->entries[0]; + ctx.completion_capacity = MAX_COMPLETIONS_PER_TASK; + return ctx; + } +}; + +/** + * Per-dispatch local context, stored in PTO2DispatchPayload. + * Written by build_payload() before each dispatch. Different blocks of the + * same task receive different s_block_idx values but the same s_block_num. + * + * NOTE: Fields are prefixed with s_ to avoid collisions with compiler + * built-in symbols block_idx / block_num on the a5 AICore target, which + * would cause a compile error if the unprefixed names were used. + */ +struct LocalContext { + int32_t s_block_idx; // Logical block index within the task [0, s_block_num) + int32_t s_block_num; // How many logical blocks this task requires. + // Currently fixed to 1 (block_dim > 1 not yet implemented). + // NOT the same as RUNTIME_CONFIG.block_dim in kernel_config.py, + // which controls how many physical cores the runtime launches. + AsyncCtx async_ctx; +}; + +/** + * Return the AIV lane index within the cluster. 
+ * In a MIX 1C2V task: AIV0(left)=0, AIV1(right)=1. + * + * This value is only meaningful for AIV kernels in MIX tasks. It tells + * the AIV whether it is the left lane or the right lane within the cluster, + * which determines the correct hardware instruction for intra-cluster + * communication. + * + * AIC kernels should NOT call this function. + * Single-AIV tasks have no intra-cluster communication, so sub_block_id + * has no meaning and should not be used. + */ +static __aicore__ inline int32_t get_sub_block_id(__gm__ int64_t *args) { + __gm__ GlobalContext *ctx = + reinterpret_cast<__gm__ GlobalContext *>(static_cast(args[SPMD_GLOBAL_CONTEXT_INDEX])); + return ctx->sub_block_id; +} + +/** + * Return the logical block index assigned to the current worker. + * Range: [0, get_block_num(args)). + * Within the same task, different blocks receive different indices. + */ +static __aicore__ inline int32_t get_block_idx(__gm__ int64_t *args) { + __gm__ LocalContext *ctx = + reinterpret_cast<__gm__ LocalContext *>(static_cast(args[SPMD_LOCAL_CONTEXT_INDEX])); + return ctx->s_block_idx; +} + +/** + * Return how many logical blocks the current task requires. + * All blocks of the same task see the same value. + * Currently always returns 1 (block_dim>1 not yet implemented). + * + * Note: this is NOT the same as RUNTIME_CONFIG.block_dim in + * kernel_config.py, which controls how many physical cores are launched. 
+ */ +static __aicore__ inline int32_t get_block_num(__gm__ int64_t *args) { + __gm__ LocalContext *ctx = + reinterpret_cast<__gm__ LocalContext *>(static_cast(args[SPMD_LOCAL_CONTEXT_INDEX])); + return ctx->s_block_num; +} diff --git a/src/a5/runtime/fully_distributed_within_core/common/pto_runtime_status.h b/src/a5/runtime/fully_distributed_within_core/common/pto_runtime_status.h new file mode 100644 index 000000000..e663ef477 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/common/pto_runtime_status.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO2 Runtime Status Helpers + * + * Shared error-code contract used inside the fully_distributed_within_core runtime (inherited from tensormap_and_ringbuffer). + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_ + +#include + +// Orchestrator errors (1-99): detected in orchestrator thread +#define PTO2_ERROR_NONE 0 // Explicitly means "no error"; it is not an "unknown/unspecified" error code.
+#define PTO2_ERROR_SCOPE_DEADLOCK 1 +#define PTO2_ERROR_HEAP_RING_DEADLOCK 2 +#define PTO2_ERROR_FLOW_CONTROL_DEADLOCK 3 +#define PTO2_ERROR_DEP_POOL_OVERFLOW 4 +#define PTO2_ERROR_INVALID_ARGS 5 // Arg construction error (invalid args) +#define PTO2_ERROR_DEPENDENCY_OVERFLOW 6 // Too many unique fanin dependencies for one task +#define PTO2_ERROR_REQUIRE_SYNC_START_INVALID 7 +#define PTO2_ERROR_TENSOR_WAIT_TIMEOUT 8 +#define PTO2_ERROR_EXPLICIT_ORCH_FATAL 9 +#define PTO2_ERROR_SCOPE_TASKS_OVERFLOW 10 // scope_tasks buffer saturated (all rings full) + +// Scheduler errors (100+): detected in scheduler threads +#define PTO2_ERROR_SCHEDULER_TIMEOUT 100 +#define PTO2_ERROR_ASYNC_COMPLETION_INVALID 101 +#define PTO2_ERROR_ASYNC_WAIT_OVERFLOW 102 +#define PTO2_ERROR_ASYNC_REGISTRATION_FAILED 103 + +static inline int32_t runtime_status_from_error_codes(int32_t orch_error_code, int32_t sched_error_code) { + if (orch_error_code != PTO2_ERROR_NONE) { + return orch_error_code < 0 ? orch_error_code : -orch_error_code; + } + if (sched_error_code != PTO2_ERROR_NONE) { + return sched_error_code < 0 ? sched_error_code : -sched_error_code; + } + return 0; +} + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_COMMON_PTO_RUNTIME_STATUS_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/docs/MULTI_RING.md b/src/a5/runtime/fully_distributed_within_core/docs/MULTI_RING.md new file mode 100644 index 000000000..db4cda386 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/docs/MULTI_RING.md @@ -0,0 +1,330 @@ +# Multi-Ring Buffer Architecture + +> Extension to the PTO2 runtime. For the base architecture, see [RUNTIME_LOGIC.md](RUNTIME_LOGIC.md). + +## 1. Problem + +The single-ring design uses one `last_task_alive` watermark shared by HeapRing, TaskRing, and DepPool. When tasks from an inner scope (e.g., per-block iteration) complete, their resources cannot be reclaimed until **all** prior tasks — including those from the outer scope — also complete. 
This wastes ring capacity and can trigger deadlocks when ring sizes are small. + +## 2. Solution + +Split HeapRing, TaskRing, and DepPool into arrays of `PTO2_MAX_RING_DEPTH` (4) independent instances. Each scope depth maps to its own ring, with an independent `last_task_alive` watermark. + +```text +Scope depth 0 ──► rings[0] = { HeapRing, TaskRing, DepPool } +Scope depth 1 ──► rings[1] = { HeapRing, TaskRing, DepPool } +Scope depth 2 ──► rings[2] = { HeapRing, TaskRing, DepPool } +Scope depth ≥3 ──► rings[3] = { HeapRing, TaskRing, DepPool } (clamped) +``` + +Inner-scope tasks can now be reclaimed independently without waiting for outer-scope tasks to complete. + +## 3. Task ID Encoding + +Task IDs are widened from 32-bit to 64-bit to carry the ring identity: + +```text +task_id.raw = (ring_id << 32) | local_id +``` + +`PTO2TaskId` exposes direct accessors in `pto_runtime2_types.h`: + +| API | Purpose | +| --- | ------- | +| `pto2_make_task_id(ring_id, local_id)` | Compose a 64-bit task ID (`PTO2TaskId`) | +| `task_id.ring()` | Extract `ring_id` (bits 63-32) | +| `task_id.local()` | Extract `local_id` (bits 31-0) | +| `task_id.raw` | Access the packed 64-bit encoding | + +Type changes: + +| Field | Before | After | +| ----- | ------ | ----- | +| `PTO2TaskDescriptor.task_id` | `int32_t` | `PTO2TaskId` | +| `PTO2TensorMapEntry.producer_task_id` | `int32_t` | `PTO2TaskId` | +| `PTO2TaskSlotState.ring_id` | N/A | `uint8_t` (new, denormalized for fast access) | + +## 4. 
Data Structures + +### 4.1 PTO2RingSet (new) + +Bundles the three per-ring resources into a single aggregate (`pto_ring_buffer.h`): + +```cpp +struct PTO2RingSet { + PTO2HeapRing heap_ring; + PTO2TaskRing task_ring; + PTO2FaninPool fanin_pool; +}; +``` + +### 4.2 PTO2OrchestratorState (modified) + +```cpp +// Before: single ring +PTO2HeapRing heap_ring; +PTO2TaskRing task_ring; +PTO2DepListPool dep_pool; + +// After: per-ring array (dep_pool moved to scheduler, see §4.5) +PTO2RingSet rings[PTO2_MAX_RING_DEPTH]; +``` + +Ring selection: `current_ring_id() = min(scope_stack_top, PTO2_MAX_RING_DEPTH - 1)`. + +### 4.3 PTO2SharedMemoryHeader (modified) + +Per-ring flow control and per-ring layout info are grouped together: + +```cpp +struct PTO2RingFlowControl { + std::atomic current_task_index; // task ring head + std::atomic last_task_alive; // task ring tail + std::atomic heap_top; // heap alloc pointer + std::atomic heap_tail; // heap reclaim pointer +}; + +struct alignas(64) PTO2SharedMemoryRingHeader { + PTO2RingFlowControl fc; + + // Layout metadata (set once at init) + uint64_t task_window_size; + int32_t task_window_mask; // task_window_size - 1 + uint64_t heap_size; + uint64_t task_descriptors_offset; + + // Per-ring data pointers (host-side, set by PTO2SharedMemoryHandle::setup_pointers) + PTO2TaskDescriptor *task_descriptors; + PTO2TaskPayload *task_payloads; + PTO2TaskSlotState *slot_states; + + // Accessors (slot = local_id & task_window_mask) + PTO2TaskDescriptor &get_task_by_slot(int32_t slot); + PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id); + PTO2TaskPayload &get_payload_by_slot(int32_t slot); + PTO2TaskPayload &get_payload_by_task_id(int32_t local_id); + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot); + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id); +}; + +// In header: +PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH]; +``` + +Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from 
interleaving watermark writes within the same ring. `FaninPool`/`DepListPool` `reclaim`/`ensure_space` take `PTO2SharedMemoryRingHeader&` directly (no `ring_id` or `fc` parameters). + +### 4.4 PTO2SharedMemoryHandle (lifecycle-only) + +Slimmed to lifecycle management only. Per-ring data pointers now live in `PTO2SharedMemoryRingHeader` (§4.3). Runtime components (orchestrator, scheduler) store `PTO2SharedMemoryHeader*` directly, eliminating one indirection on every per-ring access. + +```cpp +struct PTO2SharedMemoryHandle { + void *sm_base; + uint64_t sm_size; + PTO2SharedMemoryHeader *header; + bool is_owner; +}; +``` + +### 4.5 PTO2SchedulerState (modified) + +```cpp +struct RingSchedState { + // Cache Line 0: ring pointer (read-only) + hot path (read-write) + PTO2SharedMemoryRingHeader *ring; // direct pointer, no indirection + int32_t last_task_alive; + std::atomic advance_lock; // multi-thread CAS + + // Cache Line 1+: Thread 0 only (wiring dep_pool, cache-isolated) + alignas(64) PTO2DepListPool dep_pool; +}; + +RingSchedState ring_sched_states[PTO2_MAX_RING_DEPTH]; +PTO2SpscQueue wiring_queue; // global SPSC queue: orchestrator pushes, scheduler thread 0 drains +``` + +`slot_states`, `task_window_size`, and `task_window_mask` are no longer duplicated — callers access them via `ring->get_slot_state_by_*()` and other ring header accessors. The ring pointer shares cache line 0 with `last_task_alive` and `advance_lock`. 
+ +### 4.6 PTO2TensorMap (modified) + +```cpp +PTO2TensorMapEntry** task_entry_heads[PTO2_MAX_RING_DEPTH]; +int64_t last_task_alives[PTO2_MAX_RING_DEPTH]; +``` + +Entry validity checks and `cleanup_retired` operate per-ring: + +```cpp +bool entry_valid(const PTO2TensorMapEntry& e) { + int32_t ring = e.producer_task_id.ring(); + int32_t local = e.producer_task_id.local(); + return local >= last_task_alives[ring]; +} +``` + +### 4.7 Unchanged Structures + +| Structure | Reason | +| --------- | ------ | +| `PTO2DepListEntry` | Stores `PTO2TaskSlotState*` pointer — naturally crosses ring boundaries | +| `PTO2TaskPayload` | `fanin_slot_states[]` are pointers — no ring coupling | +| `PTO2ReadyQueue` | Global ready queues shared across all rings (tasks ready to dispatch regardless of origin ring) | +| `PTO2DispatchPayload` | Built per-dispatch, no ring state needed | + +## 5. Reclamation + +### 5.1 Per-Ring Watermark Advancement + +Each ring's `last_task_alive` advances independently: + +```text +advance_ring_pointers(ring_id): // protected by per-ring advance_lock + la = ring->fc.last_task_alive + while ring->get_slot_state_by_task_id(la).task_state >= CONSUMED: + reset slot for reuse + la++ + sync_to_sm() // release-store last_task_alive +``` + +Per-ring try-locks in the scheduler state prevent concurrent scheduler threads from interleaving heap_tail writes within the same ring. 
+ +### 5.2 Cross-Ring Dependencies + +Dependency edges use `PTO2TaskSlotState*` pointers, which naturally span rings: + +- Ring 1 task depends on ring 0 producer → ring 0's `fanout_head` linked list contains a ring 1 `PTO2TaskSlotState*` +- When ring 0 task completes, it walks its fanout list and decrements ring 1 consumers' `fanin_refcount` +- No special cross-ring logic needed — pointer-based design is ring-agnostic + +### 5.3 DepPool Reclamation + +DepPool is exclusively managed by scheduler thread 0 (allocation during wiring, reclamation during watermark advancement): + +```text +// Called by scheduler thread 0 during wiring_queue drain: +dep_pool_reclaim(ring_id): + la = ring->fc.last_task_alive + newest_consumed = la - 1 + mark = ring->get_slot_state_by_task_id(newest_consumed).dep_pool_mark + if mark > 0: + ring_sched_states[ring_id].dep_pool.advance_tail(mark) +``` + +Note: dep entries from ring N's pool may appear in ring M's fanout lists. Reclamation is safe because the entries are accessed during fanout traversal (completion time), which always happens before the consumer task — and therefore the dep entry — becomes eligible for reclamation. + +## 6. AICPU Register Protocol Fix + +The AICore dispatch protocol uses 32-bit registers. With multi-ring, `task_id` truncation to 32-bit loses the `ring_id`, causing collisions: + +```text +Ring 0, local_id=0 → DATA_MAIN_BASE = 0 + 1 = 1 +Ring 1, local_id=0 → DATA_MAIN_BASE = 0 + 1 = 1 (collision!) +``` + +AICore uses `last_reg_val` to detect new dispatches — identical values cause skipped tasks and false completions from stale COND registers. + +**Fix**: Per-core monotonic dispatch counter `s_dispatch_seq[core_id]` replaces `task_id` in register writes, guaranteeing unique `DATA_MAIN_BASE` values per core regardless of ring origin. + +## 7. 
Configuration + +### 7.1 Compile-Time Defaults (per ring) + +| Constant | Default | Total (×4 rings) | +| -------- | ------- | ---------------- | +| `PTO2_TASK_WINDOW_SIZE` | 16384 | 65536 | +| `PTO2_HEAP_SIZE` | 256 MB | 1 GB | +| `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 | + +### 7.2 Runtime Overrides + +Ring sizing can be configured either uniformly for every ring or independently +per ring. Precedence is resolved independently for each resource and ring: + +```text +per-ring CallConfig value + > scalar CallConfig value + > per-ring PTO2_RING_* env value + > scalar PTO2_RING_* env value + > compile-time default +``` + +`ring_id` is the scope-depth ring selected by the runtime: + +```text +scope depth 0 -> ring 0 +scope depth 1 -> ring 1 +scope depth 2 -> ring 2 +scope depth >=3 -> ring 3 +``` + +Per-task via `CallConfig.runtime_env` — different L2 tasks in one launch can +each carry their own sizes. Invalid values raise at submit time (`validate()`). +The scalar fields preserve the old behavior and broadcast one value to all +rings: + +```python +cfg = CallConfig() +cfg.runtime_env.ring_task_window = 128 # power of 2, >= 4 +cfg.runtime_env.ring_heap = 262144 # bytes/ring, >= 1024 +cfg.runtime_env.ring_dep_pool = 256 # 4 .. INT32_MAX +orchestrator.submit_next_level(handle, args, cfg) +``` + +Set the array fields to tune the four scope-depth rings independently. Each +array must contain exactly four entries; use `0` for an entry that should fall +through to the next precedence tier. All `CallConfig` values are integer +byte/count values. 
+ +```python +cfg = CallConfig() +cfg.runtime_env.ring_task_windows = [8192, 16384, 131072, 524288] +cfg.runtime_env.ring_heaps = [ + 128 * 1024 * 1024, + 256 * 1024 * 1024, + 384 * 1024 * 1024, + 512 * 1024 * 1024, +] +cfg.runtime_env.ring_dep_pools = [4096, 8192, 16384, 32768] +orchestrator.submit_next_level(handle, args, cfg) +``` + +Scene tests set the same keys under a nested `runtime_env` block in the +per-case `config` dict: + +```python +"config": { + "runtime_env": { + "ring_task_windows": [8192, 16384, 131072, 524288], + "ring_heaps": [134217728, 268435456, 402653184, 536870912], + "ring_dep_pools": [4096, 8192, 16384, 32768], + } +} +``` + +Process-wide env fallback accepts either one scalar value or exactly four +comma-separated per-ring values. Invalid env values are logged and ignored, then +fall through to defaults. `PTO2_RING_HEAP` values are integer bytes: + +```bash +# Uniform, old behavior: +PTO2_RING_TASK_WINDOW=1024 +PTO2_RING_HEAP=1048576 +PTO2_RING_DEP_POOL=1024 + +# Per-ring, indexed by ring_id 0..3: +PTO2_RING_TASK_WINDOW=8192,16384,131072,524288 +PTO2_RING_HEAP=134217728,268435456,402653184,536870912 +PTO2_RING_DEP_POOL=4096,8192,16384,32768 +``` + +Use `--enable-scope-stats` to confirm the effective values for a real run. The +first line of `scope_stats/scope_stats.jsonl` includes `task_window_max`, +`heap_max`, and `dep_pool_max`, indexed by `ring`. 
+ +### 7.3 Sizing Guidelines + +- `task_window` must be ≥ max tasks in any single scope + headroom for concurrent scopes +- `heap` must accommodate peak output buffer allocation across all in-flight tasks on that ring +- `dep_pool` must be ≥ total dependency entries for all in-flight tasks on that ring +- On hardware, back-pressure latency is higher than in simulation — size conservatively +- Adding inner `PTO2_SCOPE` reduces peak per-ring usage, enabling smaller sizes diff --git a/src/a5/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md b/src/a5/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md new file mode 100644 index 000000000..e6760fb1e --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/docs/RUNTIME_LOGIC.md @@ -0,0 +1,39 @@ +# Runtime Logic: fully_distributed_within_core + +**Target design.** Orchestration, scheduling, and execution all run on the AI +cores in SPMD fashion; the AICPU is removed from orchestration/scheduling. The +authoritative specification is: + +- [`docs/fully_distributed_within_core.md`](../../../../../docs/fully_distributed_within_core.md) + +Core elements (see the spec): + +- Task ownership via a claim race over two global cursors (`cube_cursor`, + `vector_cursor`); `owner = builder = executor`. +- Per-core full-duplicate TensorMap for dependency discovery (pull model via a + global `task_completed_flag` ring). +- Per-core private task ring + block-shared `block.won[N]` deposit table for + multi-core (MIX / 2V) co-ownership (anchor push + follower async drain). +- Deterministic, per-core-replicated GM output heap with frontier-based + reclamation. + +## Current state (re-based on tensormap_and_ringbuffer) + +This runtime is re-based on `tensormap_and_ringbuffer` to reuse its +`PTO2TensorMap`, `MixedKernels`/`ActiveMask`, `L0TaskArgs`, the +`pto_orchestration_api.h` submit API, and kernel-address resolution. 
The +distributed model is layered on incrementally: + +- `runtime/` — adds global claim cursors, a global completion-flag ring, a + deterministic GM output heap, and per-core replicated TensorMap + private task + ring on top of the reused types. +- `aicore/` — the SPMD run-ahead orchestrate+execute loop (spec section 6). +- `aicpu/` — reduced to an init/wire/signal/wait stub (no orchestration, + scheduling, or dispatch). +- `host/` — runtime maker / compile info (orchestration entry is invoked on the + cores). +- `orchestration/` — the PTO2 orchestration API (unchanged surface). + +The legacy AICPU orchestrator/scheduler sources inherited from +`tensormap_and_ringbuffer` (`runtime/scheduler/`, the orchestrator pipeline) are +progressively replaced or bypassed by the distributed path. diff --git a/src/a5/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md b/src/a5/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md new file mode 100644 index 000000000..ef1de83b4 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/docs/SCALAR_DATA_ACCESS.md @@ -0,0 +1,137 @@ +# Scalar Data Access — get/set_tensor_data Design + +## 1. Overview + +During task graph construction, orchestration sometimes needs to read InCore kernel results (for control-flow decisions) or write initial values into tensors. `get_tensor_data` / `set_tensor_data` provide **blocking** cross-layer data access, allowing orchestration to safely read and write tensor data. + +**Core design principle**: Reuse the existing TensorMap dependency tracking mechanism — no new synchronization infrastructure. + +## 2. 
API + +```cpp +// Blocking read: returns value at the given indices (default: raw uint64_t bits) +// Specify T for typed read: float val = get_tensor_data<float>(tensor, 1, idx); +template <typename T = uint64_t> +T get_tensor_data(const Tensor& tensor, uint32_t ndims, const uint32_t indices[]); + +// Blocking write: stores value at the given indices (type deduced from argument) +// Typed write: set_tensor_data(tensor, 1, idx, 42.0f); +template <typename T> +void set_tensor_data(Tensor& tensor, uint32_t ndims, const uint32_t indices[], T value); +``` + +Both call into the runtime through the ops table — orchestration .so needs no runtime symbol linkage. + +## 3. Blocking Interface Design + +### 3.1 get_tensor_data Flow + +```text +addr null-check → TensorMap lookup → spin-wait producer COMPLETED → compute flat offset → memcpy read +``` + +- **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0 +- **TensorMap lookup**: find producer task by `buffer.addr` +- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED` +- **No producer** (lookup callback never fires): skip waiting, read immediately + +### 3.2 set_tensor_data Flow + +```text +addr null-check → TensorMap lookup → spin-wait producer COMPLETED → spin-wait consumers done → memcpy write +``` + +One extra step versus get_tensor_data: wait for all consumers to finish (`fanout_refcount >= fanout_count - 1`, excluding the scope reference). + +### 3.3 Timeout + +- Uses cycle counter (`get_sys_cnt_aicpu()`), checked every 1024 spins +- Threshold: `PTO2_TENSOR_DATA_TIMEOUT_CYCLES` (~10 s at 1.5 GHz) +- On timeout: sets `orch.fatal = true`, preventing further task submission + +## 4. add_output with Initial Value + +```cpp +TensorCreateInfo ci(shapes, ndims, dtype); +ci.set_initial_value(initial_value); +args.add_output(ci); +``` + +**Mechanism**: + +1. `ci.set_initial_value(value)` marks the create-info with an initial value before submission +2. 
`add_output(ci)` stores a pointer to `ci` in `L0TaskArgs` (the original must remain valid until submit) +3. During payload init, the output tensor is materialized via `init_from_create_info()` which triggers the fill +4. Fill strategy: + - Small buffer (< 64 B): element-by-element memcpy directly into dst + - Large buffer (≥ 64 B): fill the first 64 bytes as a template block, then bulk-memcpy in 64 B chunks; partial tail copy for remainder + +**Constraint**: existing tensors are write targets only through `add_inout()`. + +## 5. Scalar Dependencies via 1-Element Tensors + +Traditional scalars (`L0TaskArgs::add_scalar`) are one-way inputs with no TensorMap tracking. For cross-task scalar values, use a 1-element tensor as the carrier: + +```cpp +uint32_t shapes[1] = {1}; +TensorCreateInfo scalar_ci(shapes, 1, DataType::FLOAT32); + +// Submit with initial value and keep the returned tensor +scalar_ci.set_initial_value(float_to_u64(77.0f)); +L0TaskArgs args; +args.add_output(scalar_ci); +TaskOutputTensors outs = rt_submit_aiv_task(FUNC_NOOP, args); +const Tensor& scalar_tensor = outs.get_ref(0); + +// Orchestration-side blocking read (waits for kernel completion) +uint32_t idx[1] = {0}; +float val = get_tensor_data<float>(scalar_tensor, 1, idx); +``` + +**Advantage**: Fully reuses existing TensorMap (producer tracking, fanin/fanout dependencies) — no new infrastructure needed. + +## 6. Data Hazard Analysis + +Three actors: + +- **Kernel**: InCore task submitted via add_input/add_output/add_inout (asynchronous execution) +- **Orch Read**: orchestration calls `get_tensor_data` (blocking read) +- **Orch Write**: orchestration calls `set_tensor_data` (blocking write) + +### Hazard Matrix (earlier operation → later operation) + +| # | Earlier Op | Later Op | Hazard | Guarantee | Safe? 
| +| - | ---------- | -------- | ------ | --------- | ----- | +| 1 | Kernel write (OUTPUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes | +| 2 | Kernel write (OUTPUT) | Orch Write | WAW | spin-wait producer COMPLETED | Yes | +| 3 | Kernel read (INPUT) | Orch Write | WAR | spin-wait fanout_refcount | **Needs INOUT** | +| 4 | Kernel read-write (INOUT) | Orch Read | RAW | spin-wait producer COMPLETED | Yes | +| 5 | Kernel read-write (INOUT) | Orch Write | WAW+WAR | spin-wait producer + consumers | Yes | +| 6 | Orch Write | Kernel read (INPUT) | RAW | blocking completes before next submit | Yes | +| 7 | Orch Write | Kernel write (OUTPUT) | WAW | same — serial guarantee | Yes | +| 8 | Orch Read | Kernel write (OUTPUT) | WAR | same — serial guarantee | Yes | +| 9–12 | Orch ↔ Orch | — | — | same-thread serial execution | Yes | + +### Key Design Points + +**Scenario #3 is the only case requiring special attention**: + +TensorMap tracks only producers (OUTPUT/INOUT), not pure INPUT consumers. If a tensor is only registered via `add_input()`, TensorMap has no producer entry for it. `set_tensor_data`'s `wait_for_tensor_ready()` finds no matching producer (the lookup callback never fires) and returns immediately — but the kernel may still be reading → **WAR data race**. + +**Solution**: For tensors that may later be written via `set_tensor_data`, use `add_inout()` instead of `add_input()`. INOUT registers a producer entry in TensorMap, enabling `set_tensor_data` to track all consumers through `fanout_refcount`. + +**Scenarios #6–8 serial guarantee**: + +get/set_tensor_data are blocking calls, and orchestration is single-threaded serial submission. After a blocking operation completes, subsequent code (including task submissions) executes strictly afterward. + +## 7. External Tensor Behavior + +`make_tensor_external()` creates tensors with a pre-set `buffer.addr` (pointing to host-allocated device memory). 
+ +| Scenario | Behavior | +| -------- | -------- | +| External tensor never submitted as OUTPUT/INOUT | No TensorMap entry — get/set execute immediately | +| External tensor previously submitted as OUTPUT/INOUT | TensorMap has producer entry — get/set spin-wait | +| External tensor submitted as INPUT, then set_tensor_data | **WAR risk** — must use INOUT instead (same as scenario #3) | + +**Key rule**: If an external tensor will later be written via `set_tensor_data`, all prior kernel accesses must use `add_inout()`, not `add_input()`. diff --git a/src/a5/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md b/src/a5/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md new file mode 100644 index 000000000..8cba7e90c --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/docs/SUBMIT_BY_CLUSTER.md @@ -0,0 +1,222 @@ +# Submit by Cluster - Requirements and Main-Branch-Aligned Design + +## 1. Goal + +Define a single, main-branch-aligned specification for PTO2 cluster submission that combines: + +1. Product requirements (what must be true). +2. Runtime design (how it is implemented on current main baseline). + +The target model is: one submitted graph node is one `MixedTask`, and dispatch/completion is mixed-task-granular. + +## 2. Background and Motivation + +Future Ascend hardware is expected to provide stronger locality within an AICore cluster (`1 AIC + 2 AIV`). +The runtime therefore needs a "submit together, run together" model for related AIC/AIV kernels. + +Legacy per-task submit (`kernel_id + worker_type`) cannot express atomic co-dispatch of multiple kernels to one cluster. + +## 3. Scope + +### In Scope + +1. New orchestration-facing submit API for cluster-aware mixed submission. +2. Runtime/backend scheduler and executor changes to treat a mixed submit as one atomic scheduling unit. +3. Dependency gating, readiness, dispatch, completion, and reclamation at mixed-task granularity. +4. 
AIV slot equivalence (`AIV0` and `AIV1` are equivalent execution targets). + +### Out of Scope + +1. User-facing cluster pinning (`allocate_cluster/free_cluster`-style APIs). +2. New worker types beyond AIC/AIV. +3. Cross-cluster user placement policies. +4. Hardware topology changes beyond `1 AIC + 2 AIV` per cluster. + +## 4. Main-Branch Baseline Constraints + +Design must preserve the current main runtime architecture: + +1. Executor threading split (orchestrator thread vs scheduler threads), and post-orchestrator transition (`transition_requested_` + `reassign_cores_for_all_threads()`). +2. Shared-memory hot/cold split (`PTO2TaskDescriptor` hot + `PTO2TaskPayload` cold). + +## 5. Terminology + +1. `cluster`: one physical unit with `1 AIC + 2 AIV`. +2. `MixedKernels`: 3 submit slots (`AIC`, `AIV0`, `AIV1`) with `INVALID_KERNEL_ID` for inactive slots. +3. `MixedTask`: one runtime graph node created by one submit call. +4. `active_mask`: bitmask of active subtask slots. +5. `resource shape`: normalized lane demand class of a mixed task. + +## 6. API Contract + +```cpp +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; +}; + +static inline void rt_submit_task(PTO2Runtime* rt, + const MixedKernels& mixed_kernels, + Arg* args, + int32_t num_args); + +static inline void rt_submit_aic_task(PTO2Runtime* rt, + int32_t kernel_id, + Arg* args, + int32_t num_args); + +static inline void rt_submit_aiv_task(PTO2Runtime* rt, + int32_t kernel_id, + Arg* args, + int32_t num_args); +``` + +Rules: + +1. One submit call creates one `MixedTask`. +2. All active slots share the same `args` and `num_args`. +3. At least one slot must be active. +4. `aiv0_kernel_id` and `aiv1_kernel_id` are semantically equivalent. +5. Wrappers are orchestration sugar only (inline in orchestration API); no dedicated runtime ops entries. +6. 
Submit-contract types are defined once in a shared header-only submit-types surface consumed by orchestration and runtime headers. +7. Invalid submits follow existing PTO2 behavior (`always_assert`), not a new recoverable return-code API. + +## 7. Data Model (Requirements + Design) + +`PTO2TaskDescriptor` (hot path) carries mixed-task identity/state: + +1. `task_id` +2. `active_mask` +3. `completed_subtasks` (atomic counter, incremented per subtask completion) +4. `kernel_id[3]` for `(AIC, AIV0, AIV1)` +5. dependency heads/counters and packed-buffer metadata + +`PTO2TaskPayload` (cold path) carries: + +1. shared args/tensors/scalars copied once per mixed submit +2. fanin mixed-task IDs +3. other cold-path submit metadata + +Producer identity in TensorMap is mixed-task ID end-to-end. + +## 8. Scheduling Model + +### 8.1 Resource Shapes + +Runtime uses shape-based ready queues (not worker-type queues): + +1. `AIC_ONLY` +2. `AIV_X1` +3. `AIV_X2` +4. `AIC_AIV_X1` +5. `AIC_AIV_X2` + +Queueing key is normalized resource shape (not raw slot label). + +### 8.2 Atomic Cluster Dispatch + +1. Dispatch decision unit is one mixed task. +2. For multi-slot mixed tasks, partial launch is forbidden. +3. A mixed task is dispatchable only when one local owned cluster can satisfy all required lanes. +4. Compatible mixed tasks may co-reside over time if they use disjoint free lanes. + +### 8.3 Dependency and Completion + +1. Fanin release/readiness remains dependency-correct and graph-level. +2. Two-stage completion: + - `on_subtask_complete(task_id, subslot)` + - `on_task_complete(task_id)` only when `completed_subtasks == total_required_subtasks` +3. Downstream release is triggered once per mixed task completion, not once per subslot. + +## 9. Executor Ownership and Numbering + +### 9.1 Canonical Flattened Numbering (Unchanged) + +Given `block_dim` clusters: + +1. AIC IDs: `[0, block_dim)` +2. AIV IDs: `[block_dim, 3 * block_dim)` +3. 
Cluster `i`: `{i, block_dim + i, 2 * block_dim + i}` + +This project-defined flattened numbering is kept unchanged. + +### 9.2 Cluster Ownership + +1. One cluster must be owned by one scheduler domain/thread at a time. +2. No split-cluster ownership in either: + - initial `assign_cores_to_threads()` + - post-orchestrator `reassign_cores_for_all_threads()` +3. Lane occupancy bookkeeping must remain consistent with ownership after reassignment. + +## 10. Functional Requirements + +### 10.1 Valid Mixed Shapes + +1. AIC only +2. AIV only (1 or 2 AIV lanes) +3. AIC + 1 AIV +4. AIC + 2 AIV + +### 10.2 Runtime Behavior per Submit + +1. Validate submit arguments. +2. Allocate mixed-task ID and initialize descriptor/payload/slot_state once. +3. Lookup producers via TensorMap; collect fanin metadata and increment producers' `fanout_count`. +4. Push task to scheduler's wiring queue (scheduler thread 0 asynchronously wires fanout edges and determines readiness). +5. Dispatch all active lanes atomically when resources allow. +6. Aggregate completion and release downstream once. + +## 11. Non-Functional Requirements + +1. Correctness: no dependency violation, no partial mixed-task dispatch. +2. Determinism: dependency-correct ordering preserved; AIV lane choice may vary but remains semantically equivalent. +3. Fairness: resource-aware polling heuristic is allowed; strict starvation-free guarantee across all shapes is not required. +4. Performance: no obvious regression for non-cluster workflows. +5. Observability: lifecycle visibility for submit/ready/dispatch/block/complete. + +## 12. Acceptance Criteria + +Feature is accepted when: + +1. Orchestration compiles and submits via `MixedKernels` API/wrappers. +2. Scheduler dispatches each mixed task as one cluster scheduling decision. +3. Dependencies gate mixed-task readiness correctly. +4. AIV execution remains cluster-local and semantically equivalent across lanes. +5. 
Existing non-cluster workflows continue to pass without behavior regression. +6. Cluster ownership is never split across scheduler domains before/after transition. + +## 13. Verification Matrix + +Recommended validation coverage: + +1. Mapping correctness for cluster-to-core ID relation. +2. Atomic dispatch for multi-slot shapes. +3. Dependency gating and completion aggregation (`completed_subtasks == total_required_subtasks`). +4. Lane-occupancy co-residency behavior for compatible shapes. +5. Core-transition ownership stability. +6. Invalid submit handling (`always_assert` path). +7. Regression coverage for existing examples/tests. + +Milestone command (device): + +```bash +python tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py \ + -p a2a3 -d 9 +``` + +Final validation: + +```bash +pytest examples tests/st --platform a2a3 +``` + +## 14. Resolved Decisions + +1. Legacy orchestration-facing single-task submit is replaced by mixed submit contract. +2. Invalid mixed submits fail with existing submit-time assert behavior. +3. Per-cluster concurrent capacity is lane-occupancy-driven, not a fixed constant. +4. Submit-contract types live in one shared header-only surface. +5. Resource-aware dispatch heuristics are allowed without a strict starvation-free guarantee. diff --git a/src/a5/runtime/fully_distributed_within_core/docs/device_log_profiling.md b/src/a5/runtime/fully_distributed_within_core/docs/device_log_profiling.md new file mode 100644 index 000000000..af661d440 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/docs/device_log_profiling.md @@ -0,0 +1,166 @@ +# PTO2 Device Log Profiling Guide + +## How to Find Device Logs + +AICPU logs (via `LOG_INFO_V9`) are written by CANN's **dlog** subsystem and do **not** appear in the `python test_*.py` / pytest terminal output. 
They are written to CANN's device log directory: + +```text +$HOME/ascend/log/debug/device-<device_id>/device-<pid>_<timestamp>.log +``` + +Each run produces a new log file (or appends to an existing one). Find the most recent file by modification time: + +```bash +ls -lt $HOME/ascend/log/debug/device-<device_id>/ | head -5 +``` + +## Log Structure Overview + +A single run produces two profiling blocks in the device log: + +| Block | Emitted by | Function | Content | +| ----- | ---------- | -------- | ------- | +| **Orchestrator Profiling** | Thread 3 (orchestrator) | `aicpu_orchestration_entry` | Time breakdown of graph construction on device | +| **PTO2 Scheduler Summary** | Threads 0/1/2 (schedulers) | `SchedulerContext::resolve_and_dispatch` | Per-thread scheduling statistics, phase timing, and lock contention | + +All timing values are in microseconds (us), converted from AICPU cycle counters. + +--- + +## Block 1: Orchestrator Profiling + +Thread 3 loads the orchestration `.so` via `dlopen`, calls `aicpu_orchestration_entry`, and prints a profiling summary after it returns. 
+ +### Example (from a real run: batch=64, 16704 tasks) + +```text +Thread 3: Calling aicpu_orchestration_entry from SO +Thread 3: aicpu_orchestration_entry returned, cost 20943.940us +Thread 3: === Orchestrator Profiling: 16704 tasks, total=14601.580us === +Thread 3: sync_tensormap : 286.300us (2.0%) +Thread 3: task_ring_alloc: 380.400us (2.6%) +Thread 3: param_copy : 2147.800us (14.7%) +Thread 3: lookup+dep : 7290.300us (49.9%) +Thread 3: heap_alloc : 701.500us (4.8%) +Thread 3: tensormap_ins : 1890.380us (12.9%) +Thread 3: fanin+ready : 1207.400us (8.3%) +Thread 3: finalize+SM : 697.500us (4.8%) +Thread 3: scope_end : 364.080us +Thread 3: avg/task : 0.874us +Thread 3: PTO2 total submitted tasks = 16704 +``` + +### Field Reference + +| Field | Source (`pto_orchestrator.cpp`) | Description | +| ----- | ------------------------------- | ----------- | +| **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead | +| **total** | Sum of all sub-steps below | Accumulated time inside `submit_task` across all tasks | +| **sync_tensormap** | `g_orch_sync_cycle` | TensorMap validity sync and optional cleanup before each submission | +| **task_ring_alloc** | `g_orch_alloc_cycle` | Allocating a task slot from the task ring buffer | +| **param_copy** | `g_orch_args_cycle` | Copying param descriptors + tensor descriptor copies into task-owned storage | +| **lookup+dep** | `g_orch_lookup_cycle` | TensorMap lookup for inputs/inouts + building fanin/fanout dependency edges | +| **heap_alloc** | `g_orch_heap_cycle` | Allocating packed output buffers from the heap ring | +| **tensormap_ins** | `g_orch_insert_cycle` | Inserting output/inout tensors into the TensorMap | +| **fanin+ready** | `g_orch_fanin_cycle` | Building the fanin list + checking if task is already ready (Step 5/5b) | +| **scope_end** | `g_orch_scope_end_cycle` | `end_scope` overhead (notifying scheduler of scope completion) | +| **avg/task** | `total / submit_count` 
| Average orchestrator time per task submission | + +### Interpreting the Numbers + +- **cost > total**: The difference is overhead outside `submit_task` (the orchestration user code itself, scope_begin/end, TensorCreateInfo construction, etc.). +- **lookup+dep** is typically the dominant cost (~50%) because it involves TensorMap hash lookups and building dependency edges with spinlock-protected fanout list insertions. +- **param_copy** scales with the number of parameters per task. +- **avg/task < 1us** indicates efficient graph construction. + +--- + +## Block 2: PTO2 Scheduler Summary + +Each of the 3 scheduler threads (Thread 0, 1, 2) prints its own summary after completing all tasks. The output has two sub-sections: **summary** and **phase breakdown**. + +### Example (Thread 0, from a different run: batch=1, 1044 tasks) + +```text +Thread 0: completed=352 tasks in 3477.420us (147 loops, 2.4 tasks/loop) +Thread 0: --- Phase Breakdown --- +Thread 0: complete: 1485.020us (42.7%) +Thread 0: scan: 14.400us (0.4%) +Thread 0: dispatch: 1973.060us (56.7%) +Thread 0: idle: 4.940us (0.1%) +``` + +### Summary Line + +```text +Thread N: completed=X tasks in Yus (Z loops, W tasks/loop) +``` + +| Field | Description | +| ----- | ----------- | +| **completed** | Number of tasks this thread processed to completion | +| **Y us** | Total scheduler loop time (sum of all phase cycles) | +| **Z loops** | Number of scheduler loop iterations | +| **W tasks/loop** | Average tasks completed per loop iteration; higher = better throughput | + +### Phase Breakdown + +The scheduler loop runs four phases each iteration. Each phase's time is accumulated across all loop iterations. 
+ +| Phase | What it does | Inline stats | +| ----- | ------------ | ------------ | +| **complete** | Polls handshake on each managed core; when a core completes, calls `on_subtask_complete(task_id, subslot)` to increment the completion counter; when `completed_subtasks == total_required_subtasks`, triggers `on_task_complete` which traverses fanout list (notify consumers) and fanin list (release producers) | `fanout`: edges/max_degree/avg for consumer notification; `fanin`: edges/max_degree/avg for producer release | +| **scan** | Updates the perf profiling header with latest scheduler state | — | +| **dispatch** | For each idle core, pops a task from the shape-based ready queue via `get_ready_task(shape)`, builds the dispatch payload, and writes the task to the core's handshake register | `pop`: `hit` = successful pops (task dispatched), `miss` = empty queue pops, `hit_rate` = hit/(hit+miss) | +| **idle** | Scheduler loop iteration where no progress was made (no completions, no dispatches) | — | + +**Interpreting phase percentages:** + +- **dispatch** is typically the largest (~55-60%) because it includes ready-queue pops (with spinlock), payload construction, and cache flush (`dc cvac` + `dsb sy`). +- **complete** is the second largest (~40-45%) because it traverses both fanout (CAS-based fanin decrement, conditional ready-queue push) and fanin (release_producer, check_consumed, ring pointer advancement). +- **scan** is small (<1%) — only updates the perf header. +- **idle** is negligible when tasks are flowing; high idle% indicates the scheduler is starved. + +**Interpreting pop hit_rate:** + +- **High hit_rate (>50%)**: Ready queue is well-supplied; dispatch is efficient. +- **Low hit_rate (<10%)**: Ready queue is mostly empty when cores become idle. The bottleneck is upstream (orchestrator submission speed or fanout resolution latency), not dispatch itself. 
+ +### Per-Task Averages + +Divide each thread's phase times by its `completed` count to get per-task scheduling cost: + +| Metric | Formula | Typical value | +| ------ | ------- | ------------- | +| Scheduling overhead per task | total_time / completed | ~5-10 us/task | +| Dispatch per task | dispatch_time / completed | ~3-6 us/task | +| Complete per task | complete_time / completed | ~2-4 us/task | + +--- + +## Cross-Referencing with Host Profiling + +When `--enable-l2-swimlane` is used, the host terminal prints a **Task Statistics by Function** table with `Total_Exec` (total AICore kernel execution time). Combined with device log data: + +| Metric | Source | Description | +| ------ | ------ | ----------- | +| Avg kernel exec time | `Total_Exec / total_tasks` (host) | Time AICore spends executing each kernel | +| Avg scheduling overhead | `sum(thread_total) / total_tasks` (device log) | Time AICPU spends scheduling each task | +| Sched/Exec ratio | scheduling / execution | Scheduling overhead relative to kernel execution | + +A high sched/exec ratio (e.g., >3x) indicates that scheduling overhead dominates, and optimizations should target the scheduler's dispatch hot path (cache flush, payload construction) or upstream task flow. 
+ +--- + +## Quick Reference: Extracting Profiling Data + +```bash +# Find the latest device log for device 2 +LOG=$(ls -t $HOME/ascend/log/debug/device-2/device-*.log | head -1) + +# Extract orchestrator profiling (Thread 3) +grep "Thread 3:" "$LOG" + +# Extract scheduler profiling (Threads 0/1/2) +grep -E "Thread [012]:" "$LOG" +``` diff --git a/src/a5/runtime/fully_distributed_within_core/docs/profiling_levels.md b/src/a5/runtime/fully_distributed_within_core/docs/profiling_levels.md new file mode 100644 index 000000000..2ef6c1b6a --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/docs/profiling_levels.md @@ -0,0 +1,450 @@ +# PTO Runtime2 Profiling Levels + +This document describes the profiling macro hierarchy and logging control in the PTO Runtime2 system. + +## Overview + +PTO Runtime2 uses a hierarchical profiling system with compile-time macros to control profiling code compilation and log output. The `enable_l2_swimlane` runtime flag (integer perf_level 0–4) controls data collection granularity (performance buffers, shared memory writes) but does NOT control log output. + +## Profiling Macro Hierarchy + +Defaults and dependency validation are centralized in +`src/common/task_interface/profiling_config.h`. Runtime headers include that +file before using the macros, so both a2a3 and a5 share the same default +values and compile-time checks.
+ +```text +PTO2_PROFILING (base level, default=1) +├── PTO2_ORCH_PROFILING (orchestrator, default=0, requires PTO2_PROFILING=1) +│ └── PTO2_TENSORMAP_PROFILING (tensormap, default=0, requires PTO2_ORCH_PROFILING=1) +├── PTO2_SCHED_PROFILING (scheduler, default=0, requires PTO2_PROFILING=1) +└── --enable-l2-swimlane [PERF_LEVEL] (L2 swimlane data collection, 0-4, bare=4, requires PTO2_PROFILING=1) + +``` + +### Compile-Time Validation + +Each sub-level macro requires its parent level to be enabled (`PTO2_PROFILING=1`, or `PTO2_ORCH_PROFILING=1` for `PTO2_TENSORMAP_PROFILING`): + +```cpp +#if PTO2_ORCH_PROFILING && !PTO2_PROFILING +#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1" +#endif + +#if PTO2_SCHED_PROFILING && !PTO2_PROFILING +#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" +#endif + +#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING +#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" +#endif +``` + +## Profiling Levels + +### Level 0: No Profiling (PTO2_PROFILING=0) + +**What's compiled:** + +- Debug/diagnostic logs (always present) +- Progress tracking (`PTO2 progress: completed=...`) +- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget) +- Deadlock/livelock detection (`diagnose_stuck_state`, called on stall) + +**What's NOT compiled:** + +- All `CYCLE_COUNT_*` timing counters (`sched_*_cycle`, orchestrator cost counters) +- Scheduler/Orchestrator profiling summary logs guarded by `#if PTO2_PROFILING` +- Performance data collection paths (`enable_l2_swimlane` runtime flag becomes ineffective because profiling code is not compiled) + +**Log output (normal run, no stall):** + +- No `sched_start/sched_end/sched_cost` timestamps +- No `orch_start/orch_end/orch_cost` timestamps +- No `Scheduler summary: total_time=...` +- No `PTO2 total submitted tasks` log +- `PTO2 progress: completed=...
total=...` may appear (thread 0 only, at task completion milestones) + +--- + +### Level 1: Basic Profiling (PTO2_PROFILING=1) + +**What's compiled:** + +- Base timing counters for scheduler loop (`sched_complete/dispatch/idle/scan`) +- Per-thread orchestration timing (`orch_start`, `orch_end`, `orch_cost`) +- Stage-level orchestration end timestamp (`orch_stage_end`, printed by last orch thread only, marks the moment all orch threads have finished and core transition is about to be requested; only when `orch_to_sched_` is true) +- PTO2 total submitted tasks count (printed by last orch thread, after orch timing line) +- Scheduler summary output (`total_time`, `loops`, `tasks_scheduled`) +- Scheduler lifetime timestamps and cost (`sched_start`, `sched_end`, `sched_cost` — captured inside `resolve_and_dispatch_pto2()`, printed before Scheduler summary) + +**What's NOT compiled:** + +- Detailed phase breakdowns +- TensorMap statistics + +**Log output (additional lines vs Level 0, per normal run):** + +- `Thread %d: orch_start=%llu orch_end=%llu orch_cost=%.3fus` — each orch thread, after orchestration fully complete +- `PTO2 total submitted tasks = %d, already executed %d tasks` — last orch thread only (×1), after orch timing line +- `Thread %d: orch_stage_end=%llu` — last orch thread only (×1), only when `orch_to_sched_=true` +- `Thread %d: sched_start=%llu sched_end=%llu sched_cost=%.3fus` — each sched thread, printed before Scheduler summary +- `Thread %d: Scheduler summary: total_time=%.3fus, loops=%llu, tasks_scheduled=%d` — each sched thread +- `Thread %d: sched_start=%llu sched_end(timeout)=%llu sched_cost=%.3fus` — timeout path only (replaces normal `sched_end`) + +**LOG_INFO_V9 count (normal run):** + +- `orch_to_sched_=false` (default): `N_sched*2 + N_orch*1 + 1` (orch_timing + PTO2_total + sched_timing + Scheduler_summary) +- `orch_to_sched_=true` (`PTO2_ORCH_TO_SCHED=1`): adds 1 (`orch_stage_end`) + +> See the table at the end for concrete counts based on 
the `paged_attention` example. + +**Example log output — `orch_to_sched_=false`** (from `paged_attention`, device 10): + +```text +Thread 2: orch_start=48214752948321 orch_end=48214752959379 orch_cost=230.000us +Thread 3: orch_start=48214752948316 orch_end=48214752961505 orch_cost=275.000us +PTO2 total submitted tasks = 13, already executed 13 tasks +Thread 1: sched_start=48214752948235 sched_end=48214752962379 sched_cost=295.000us +Thread 1: Scheduler summary: total_time=159.560us, loops=3782, tasks_scheduled=6 +Thread 0: sched_start=48214752948200 sched_end=48214752963571 sched_cost=320.000us +Thread 0: Scheduler summary: total_time=183.180us, loops=4611, tasks_scheduled=7 +``` + +**Example log output — `orch_to_sched_=true`** (`PTO2_ORCH_TO_SCHED=1`, from `paged_attention`, device 11): + +```text +Thread 3: orch_stage_end=48236915058307 +Thread 3: orch_start=48236915044001 orch_end=48236915058781 orch_cost=308.000us +Thread 2: orch_start=48236915044003 orch_end=48236915058782 orch_cost=308.000us +PTO2 total submitted tasks = 13, already executed 13 tasks +Thread 0: sched_start=48236915043911 sched_end=48236915059191 sched_cost=318.000us +Thread 0: Scheduler summary: total_time=187.920us, loops=4561, tasks_scheduled=4 +Thread 1: sched_start=48236915043947 sched_end=48236915061881 sched_cost=372.000us +Thread 1: Scheduler summary: total_time=168.620us, loops=3880, tasks_scheduled=9 +``` + +> With `orch_to_sched_=true`, orch threads transition to schedulers after orchestration. They print `orch_end` but do NOT print `Scheduler summary` or `sched_end` (they have no cores assigned at shutdown time). + +**Note:** + +- All logs above are controlled by compile-time macro `PTO2_PROFILING`, not by `enable_l2_swimlane`. +- `enable_l2_swimlane` only controls shared-memory data collection / swimlane export. +- Enable `orch_to_sched_` via environment variable: `PTO2_ORCH_TO_SCHED=1`. 
+ +--- + +### Level 2: Scheduler Detailed Profiling (PTO2_SCHED_PROFILING=1) + +**Requires:** `PTO2_PROFILING=1` + +**What's compiled:** + +- All Level 1 features +- Detailed scheduler phase counters +- Phase-specific statistics (complete, scan, dispatch, idle) +- Hit rate tracking (complete poll, ready queue pop) + +**Log output:** 18 LOG_INFO_V9 logs (11 debug + 2 basic + 7 scheduler detailed - 2 replaced) + +- Replaces scheduler summary with detailed breakdown + +**Scheduler output:** + +```text +Thread X: === Scheduler Phase Breakdown: total=XXXus, XXX tasks === +Thread X: complete : XXXus (XX.X%) +Thread X: poll : XXXus (XX.X%) hit=XXX, miss=XXX, hit_rate=XX.X% +Thread X: otc_lock : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: otc_fanout : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: otc_fanin : XXXus (XX.X%) atomics=XXX +Thread X: otc_self : XXXus (XX.X%) atomics=XXX +Thread X: perf : XXXus (XX.X%) +Thread X: dispatch : XXXus (XX.X%) +Thread X: poll : XXXus (XX.X%) +Thread X: pop : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: setup : XXXus (XX.X%) +Thread X: scan : XXXus (XX.X%) +Thread X: idle : XXXus (XX.X%) +Thread X: avg/complete : XXXus +Thread X: Scheduler summary: total_time=XXXus, loops=XXX, tasks_scheduled=XXX +``` + +Per-thread fanout / fanin edge counts and ready-queue pop hit / miss +stats live in `aicpu_scheduler_phases[]` (in `l2_swimlane_records.json` +captured at l2_swimlane_level >= 3) and `deps.json`; consume them via +`simpler_setup/tools/sched_overhead_analysis.py`. 
+ +--- + +### Level 3: Orchestrator Detailed Profiling (PTO2_ORCH_PROFILING=1) + +**Requires:** `PTO2_PROFILING=1` + +**What's compiled:** + +- All Level 1 features +- Detailed orchestrator phase counters +- Per-phase cycle tracking +- Atomic operation counters +- Wait time tracking + +**Log output:** 30 LOG_INFO_V9 logs (11 debug + 2 basic + 1 scheduler summary + 17 orchestrator detailed - 1 replaced) + +- Replaces basic orchestration completion with detailed breakdown + +**Orchestrator output:** + +```text +Thread X: === Orchestrator Profiling: XXX tasks, total=XXXus === +Thread X: sync_tensormap : XXXus (XX.X%) +Thread X: task_ring_alloc: XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: param_copy : XXXus (XX.X%) atomics=XXX +Thread X: lookup+dep : XXXus (XX.X%) +Thread X: heap_alloc : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: tensormap_ins : XXXus (XX.X%) +Thread X: fanin+ready : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: finalize+SM : XXXus (XX.X%) work=XXXus wait=XXXus atomics=XXX +Thread X: scope_end : XXXus atomics=XXX +Thread X: avg/task : XXXus +``` + +**Note:** Orchestrator logs always print when `PTO2_ORCH_PROFILING=1`, regardless of `enable_l2_swimlane` flag. + +--- + +### Level 4: TensorMap Profiling (PTO2_TENSORMAP_PROFILING=1) + +**Requires:** `PTO2_PROFILING=1` AND `PTO2_ORCH_PROFILING=1` + +**What's compiled:** + +- All Level 3 features +- TensorMap lookup statistics +- Hash chain walk tracking +- Overlap check counters + +**Log output:** 34 LOG_INFO_V9 logs (30 from Level 3 + 4 tensormap) + +**TensorMap output:** + +```text +Thread X: === TensorMap Lookup Stats === +Thread X: lookups : XXX, inserts: XXX +Thread X: chain walked : total=XXX, avg=X.X, max=X +Thread X: overlap checks : XXX, hits=XXX (XX.X%) +``` + +--- + +## Runtime Flag: enable_l2_swimlane (perf_level) + +`--enable-l2-swimlane` accepts an integer perf_level (0–4). 
Transport +mirrors the PMU pattern — two independent channels (one binary, one int): + +- **Binary on/off** — `KernelArgs::enable_profiling_flag` bit1 + (`PROFILING_FLAG_L2_SWIMLANE`). Set by the host whenever level > 0; read + by AICore (which only needs on/off to decide whether to write timing) and + by AICPU kernel entry via `set_l2_swimlane_enabled(bool)`. +- **Granular level (0–4)** — `L2SwimlaneDataHeader::l2_swimlane_level` + (shared memory). Host writes it in `L2SwimlaneCollector::initialize`; AICPU + promotes it from the header in `l2_swimlane_aicpu_init` and exposes it via + `get_l2_swimlane_level()` (typed `L2SwimlaneLevel`) for + `>= AICPU_TIMING / SCHED_PHASES / ORCH_PHASES` gates. + +On sim, the binary on/off travels via the dlsym'd `set_l2_swimlane_enabled` +entry point; the granular level still goes through the shared-memory +header, just as it does onboard. + +| Level | Collects | +| ----- | -------- | +| 0 | Nothing (disabled) | +| 1 | AICore timing only (start/end/task_id/func_id/core_type) | +| 2 | + dispatch_time, finish_time | +| 3 | + Scheduler phases (`SCHED_*`) | +| 4 | + Orchestrator phases (full) | + +Bare `--enable-l2-swimlane` = level 4 (backward compatible). + +### Level gating in AICPU code + +Use the strongly-typed `L2SwimlaneLevel` enum so each gate names the +content it depends on instead of relying on magic numbers: + +```cpp +// Any level > 0: AICPU task record buffer init / flush. +// Cheap binary check, available immediately after kernel entry. +if (is_l2_swimlane_enabled()) { ... } + +// AICPU dispatch/finish timestamps. +// Granular checks below require l2_swimlane_aicpu_init to have already run +// (so the level has been promoted from the shared-memory header). +if (get_l2_swimlane_level() >= L2SwimlaneLevel::AICPU_TIMING) { ... } + +// Scheduler main-loop phase records (SCHED_*) +if (get_l2_swimlane_level() >= L2SwimlaneLevel::SCHED_PHASES) { ...
} + +// Orchestrator phase records +if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) { ... } +``` + +`L2SwimlaneLevel` is defined in `common/l2_swimlane_profiling.h` with +underlying type `uint32_t` (matches the `L2SwimlaneDataHeader::l2_swimlane_level` +shared-memory field and mirrors `PmuEventType : uint32_t`): + +| Enumerator | Underlying value | +| ---------- | ---------------- | +| `DISABLED` | 0 | +| `AICORE_TIMING` | 1 | +| `AICPU_TIMING` | 2 | +| `SCHED_PHASES` | 3 | +| `ORCH_PHASES` | 4 | + +### When enable_l2_swimlane=0 + +- No performance data collection +- No shared memory writes +- Logs still print (controlled by macros only) + +--- + +## Common Profiling Configurations + +### Development (minimal overhead) + +```bash +# No profiling overhead +PTO2_PROFILING=0 +``` + +### Basic Performance Monitoring + +```bash +# Minimal overhead, summary logs only +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=0 +PTO2_SCHED_PROFILING=0 +``` + +### Scheduler Performance Analysis + +```bash +# Detailed scheduler breakdown +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=0 +PTO2_SCHED_PROFILING=1 +``` + +### Orchestrator Performance Analysis + +```bash +# Detailed orchestrator breakdown +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=1 +PTO2_SCHED_PROFILING=0 +``` + +### Full Profiling (maximum overhead) + +```bash +# All profiling features enabled +PTO2_PROFILING=1 +PTO2_ORCH_PROFILING=1 +PTO2_SCHED_PROFILING=1 +PTO2_TENSORMAP_PROFILING=1 +``` + +--- + +## Setting Profiling Macros + +### At compile time + +Pass compile definitions through the build command or CI `CXXFLAGS`. +This overrides the defaults in `profiling_config.h` without changing source. + +```bash +# Example: disable all profiling code +CXXFLAGS="-DPTO2_PROFILING=0" pip install --no-build-isolation -e . + +# Example: enable orchestrator and tensormap profiling +CXXFLAGS="-DPTO2_ORCH_PROFILING=1 -DPTO2_TENSORMAP_PROFILING=1" \ + pip install --no-build-isolation -e . 
+``` + +### In source code (before including headers) + +Source-level overrides are only for local experiments. They must appear before +any header includes `profiling_config.h`; do not add duplicated fallback +definitions to runtime headers. + +```cpp +#define PTO2_PROFILING 1 +#define PTO2_ORCH_PROFILING 1 +#include "pto_runtime2_types.h" +``` + +--- + +## Log Output Summary + +> Example: `paged_attention` on Ascend hardware, 2 sched threads + 2 orch threads, normal run (no stall/timeout). + +| Level | Macro Settings | LOG_INFO_V9 Count (`orch_to_sched_=false`) | LOG_INFO_V9 Count (`orch_to_sched_=true`) | Description | +| ----- | -------------- | ------------------------------------------ | ----------------------------------------- | ----------- | +| 0 | `PTO2_PROFILING=0` | 0 | 0 | No timing output | +| 1 | `PTO2_PROFILING=1` | 7 | 8 | Timing timestamps + scheduler summary | +| 2 | `+PTO2_SCHED_PROFILING=1` | — | — | Scheduler detailed phase breakdown | +| 3 | `+PTO2_ORCH_PROFILING=1` | — | — | Orchestrator detailed phase breakdown | +| 4 | `+PTO2_TENSORMAP_PROFILING=1` | — | — | TensorMap lookup stats | + +--- + +## Implementation Notes + +### Key Principles + +1. **Macros control compilation and logging** + - `#if PTO2_PROFILING` controls whether profiling code is compiled + - Logs print when macro is enabled, regardless of runtime flag + +2. **Runtime flag controls data collection** + - `enable_l2_swimlane` controls performance buffer allocation + - Controls shared memory writes for host-side export + - Does NOT control log output + +3. 
**Consistent behavior across components** + - Scheduler logs: macro-controlled only + - Orchestrator logs: macro-controlled only + - Data collection: runtime flag controlled + +### Code Locations + +- Macro defaults and validation: `src/common/task_interface/profiling_config.h` +- Scheduler profiling: `src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp` +- Orchestrator profiling: `src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp` +- TensorMap profiling: `src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h` + +--- + +## Performance Impact + +### Compilation overhead + +- Level 0: No overhead +- Level 1: Minimal (counter increments, basic arithmetic) +- Level 2-4: Low to moderate (additional counters, cycle measurements) + +### Runtime overhead + +- Logging: Negligible (device logs are asynchronous) +- Data collection (`enable_l2_swimlane>0`): Low to moderate + - Performance buffer writes + - Shared memory updates + - Per-task timing measurements + +### Recommendation + +- Use Level 0 for production +- Use Level 1-2 for performance monitoring +- Use Level 3-4 for detailed performance analysis only diff --git a/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp new file mode 100644 index 000000000..55565e885 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.cpp @@ -0,0 +1,784 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file dep_gen_replay.cpp + * @brief Replay in-memory DepGenRecord stream → deps.json (strided tensor + * representation, tensor-annotated) via a host-resident PTO2TensorMap, + * with a differential check against the runtime template `compute_task_fanin`. + * + * Two passes run per record against two parallel PTO2TensorMap instances that + * evolve in lockstep: + * + * ORACLE pass (read-only contract): + * Drives `compute_task_fanin` (the same template the device orchestrator + * uses in pto_orchestrator.cpp:submit_task) against `tm_oracle`. Emits + * only PTO2TaskId values — the canonical set of producer IDs the runtime + * would have wired. We never widen this template's emit signature: this + * pass IS the contract, and any future change to `compute_task_fanin` + * automatically refreshes the oracle. + * + * ANNOT pass (this file's feature): + * Inlines the same STEP A (creator retention) + STEP B (tensormap lookup) + * against `tm_annot`, but the callback fires with the full + * `PTO2TensorMapEntry&` + the consumer Tensor* + the arg index, so the + * replay can record per-edge tensor metadata (producer/consumer + * shape/offset, dtype, version). + * + * After both passes finish per record, we compare the producer-ID set the + * oracle emitted to the producer-ID set the annot pass emitted. They MUST + * match. 
If they diverge, deps.json is not written and the function returns + * non-zero — this is the "no shotgun modifications" guarantee: anyone who + * changes `compute_task_fanin` will trip this gate immediately and know to + * mirror the change in the annot pass. + * + * STEP 1 (explicit_deps) is emitted at the call site (per pto_dep_compute.h's + * "kept at call site" note); both passes run the same explicit-deps loop, so + * the comparison covers it too. + * + * STEP 4 (`register_task_outputs`) runs on BOTH tensor maps after both passes + * complete, keeping `tm_oracle` and `tm_annot` bit-equivalent for the next + * record's INOUT+COVERED `remove_entry` mutations. + * + * Pool sizing: replay never advances last_task_alive, so each tensor map's + * entry pool must accommodate every output write across the whole trace. We + * scan the record buffer once to count INOUT + OUTPUT_EXISTING slots and size + * the pool accordingly. Both maps get the same size. + */ + +#include "dep_gen_replay.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/dep_gen.h" +#include "common/unified_log.h" +#include "data_type.h" +#include "pto_dep_compute.h" +#include "pto_task_id.h" +#include "pto_tensormap.h" +#include "tensor.h" + +namespace { + +int32_t ceil_pow2(int32_t v) { + if (v <= 1) return 1; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + +// Count INOUT + OUTPUT_EXISTING slots across the record buffer — +// register_task_outputs only inserts those, and skips entries with manual_dep +// set. Counting both without inspecting manual_dep is a conservative upper +// bound (manual_dep is rare; the small over-allocation pays for itself in +// avoided pool exhaustion). 
+int32_t count_outputs(const DepGenRecord *records, size_t n) { + int32_t total = 0; + for (size_t i = 0; i < n; i++) { + const DepGenRecord &r = records[i]; + // Overflow chain slots are reinterpret_cast views with no tensor data; + // their `tensor_count` bytes are actually the overflow `dep_count` field, + // which would mislead the loop below if read as a tensor count. + if (r.flags & DEP_GEN_FLAG_OVERFLOW) continue; + for (uint16_t j = 0; j < r.tensor_count; j++) { + auto t = static_cast(r.arg_types[j]); + if (t == TensorArgType::INOUT || t == TensorArgType::OUTPUT_EXISTING) { + total++; + } + } + } + return total; +} + +// --------------------------------------------------------------------------- +// JSON output accumulators (in-memory tables that get serialized at the end) +// --------------------------------------------------------------------------- + +// Edge categories — matches the three places a runtime fanin edge is born. +enum class EdgeSource { EXPLICIT, CREATOR, TENSORMAP }; + +const char *edge_source_str(EdgeSource s) { + switch (s) { + case EdgeSource::EXPLICIT: + return "explicit"; + case EdgeSource::CREATOR: + return "creator"; + case EdgeSource::TENSORMAP: + return "tensormap"; + } + return "unknown"; +} + +const char *overlap_status_str(OverlapStatus s) { + switch (s) { + case OverlapStatus::COVERED: + return "covered"; + case OverlapStatus::OTHER: + return "other"; + case OverlapStatus::NO_OVERLAP: + return "no_overlap"; + } + return "unknown"; +} + +// One annotated edge. consumer_* always populated. producer_* populated for +// TENSORMAP source only — the explicit/creator emit paths don't have a +// matched tensormap entry to copy from. +// +// Slice description follows the strided Tensor model: (start_offset, strides[]) +// in element units. 
// Byte offset of element coords[] is
//   (start_offset + Σ coords[i] · strides[i]) · dtype_bytes
struct EdgeAnnot {
    // Serialized under the "pred" / "succ" keys of each deps.json edge.
    // NOTE(review): presumably producer task id -> consumer task id; confirm
    // at the emit sites.
    uint64_t pred;
    uint64_t succ;
    int32_t consumer_arg_idx;  // -1 for EXPLICIT (not tied to a tensor arg)
    EdgeSource source;
    OverlapStatus overlap;  // only meaningful for TENSORMAP
    uint64_t tensor_id;     // 0 for EXPLICIT
    // Consumer side (the Tensor the submitting task is reading).
    uint8_t consumer_dtype;
    uint32_t consumer_ndims;
    uint32_t consumer_shape[MAX_TENSOR_DIMS];
    uint64_t consumer_start_offset;  // 1D element offset
    uint32_t consumer_strides[MAX_TENSOR_DIMS];
    // Producer side (the slice the producer wrote, from the tensormap entry).
    // Only populated when source == TENSORMAP.
    uint32_t producer_ndims;
    uint32_t producer_shape[MAX_TENSOR_DIMS];
    uint64_t producer_start_offset;
    uint32_t producer_strides[MAX_TENSOR_DIMS];
};

// One entry in the tensors[] table: the underlying storage, keyed by
// (buffer_addr, version). buffer_numel is the storage element count;
// per-edge fields describe the slice (start_offset + stride).
struct TensorTableEntry {
    uint64_t tensor_id;  // hash of (buffer_addr, version); see make_tensor_id
    uint64_t buffer_addr;
    uint64_t buffer_numel;  // storage size in elements (= buffer.size / dtype_bytes)
    int32_t version;
    uint8_t dtype;  // DataType value narrowed to one byte
};

// One arg slot of a task, captured for the `tasks[].args[]` block so
// downstream viewers can render per-task input / output compartments without
// having to scan every edge. `has_tensor_info` is false only for OUTPUT slots:
// the runtime hasn't materialized a Tensor yet at submit_task time, so the
// captured blob is zeroed.
+struct TaskArgEntry { + int32_t idx; + TensorArgType arg_type; + bool has_tensor_info; + uint64_t tensor_id; + uint8_t dtype; + uint32_t ndims; + uint32_t shape[MAX_TENSOR_DIMS]; + uint64_t start_offset; // 1D element offset + uint32_t strides[MAX_TENSOR_DIMS]; +}; + +struct TaskTableEntry { + uint64_t task_id; + bool in_manual_scope; + int32_t kernel_id[3]; // per-subslot {AIC, AIV0, AIV1}, -1 = inactive + std::vector args; +}; + +const char *arg_type_str(TensorArgType t) { + switch (t) { + case TensorArgType::INPUT: + return "INPUT"; + case TensorArgType::OUTPUT: + return "OUTPUT"; + case TensorArgType::INOUT: + return "INOUT"; + case TensorArgType::OUTPUT_EXISTING: + return "OUTPUT_EXISTING"; + } + return "UNKNOWN"; +} + +// FNV-1a 64-bit hash of (buffer_addr, version) — stable tensor identity +// across runs (no time-dependent inputs). +uint64_t make_tensor_id(uint64_t buffer_addr, int32_t version) { + constexpr uint64_t FNV_OFFSET = 0xcbf29ce484222325ULL; + constexpr uint64_t FNV_PRIME = 0x100000001b3ULL; + uint64_t h = FNV_OFFSET; + const uint8_t *p; + p = reinterpret_cast(&buffer_addr); + for (size_t i = 0; i < sizeof(buffer_addr); i++) { + h ^= p[i]; + h *= FNV_PRIME; + } + uint32_t v = static_cast(version); + p = reinterpret_cast(&v); + for (size_t i = 0; i < sizeof(v); i++) { + h ^= p[i]; + h *= FNV_PRIME; + } + return h; +} + +// Register a tensor in the tensors[] table on first sight of (addr, +// version). buffer_numel describes the underlying storage size in elements; +// per-edge fields describe the slice via (start_offset, strides[]). Subsequent +// sightings of the same (addr, version) are no-ops. 
+uint64_t register_tensor( + std::unordered_map &index_by_id, std::vector &table, const Tensor &t +) { + uint64_t id = make_tensor_id(t.buffer.addr, t.version); + auto it = index_by_id.find(id); + if (it != index_by_id.end()) { + return id; + } + TensorTableEntry e; + e.tensor_id = id; + e.buffer_addr = t.buffer.addr; + e.version = t.version; + e.dtype = static_cast(t.dtype); + const uint64_t elem_size = get_element_size(t.dtype); + e.buffer_numel = (elem_size == 0) ? 0 : (t.buffer.size / elem_size); + index_by_id[id] = table.size(); + table.push_back(e); + return id; +} + +// Copy a Tensor's slice description (shape + start_offset + stride) into an +// EdgeAnnot's consumer_* fields. +void fill_consumer(EdgeAnnot &e, const Tensor &t) { + e.consumer_dtype = static_cast(t.dtype); + e.consumer_ndims = t.ndims; + e.consumer_start_offset = t.start_offset; + for (uint32_t i = 0; i < t.ndims && i < MAX_TENSOR_DIMS; i++) { + e.consumer_shape[i] = t.shapes[i]; + e.consumer_strides[i] = t.strides[i]; + } +} + +// Copy a PTO2TensorMapEntry's slice description into an EdgeAnnot's producer_* +// fields. Only called from the TENSORMAP emit path. 
+void fill_producer(EdgeAnnot &e, const PTO2TensorMapEntry &entry) { + e.producer_ndims = entry.ndims; + e.producer_start_offset = entry.start_offset; + for (uint32_t i = 0; i < entry.ndims && i < MAX_TENSOR_DIMS; i++) { + e.producer_shape[i] = entry.shapes[i]; + e.producer_strides[i] = entry.strides[i]; + } +} + +// --------------------------------------------------------------------------- +// JSON writer +// --------------------------------------------------------------------------- + +void write_uint_array(std::ofstream &out, const uint32_t *data, uint32_t n) { + out << '['; + for (uint32_t i = 0; i < n; i++) { + if (i > 0) out << ','; + out << data[i]; + } + out << ']'; +} + +bool write_deps_json( + const char *path, const std::vector &tasks, const std::vector &tensors, + const std::vector &edges +) { + std::ofstream out(path, std::ios::out | std::ios::trunc); + if (!out) { + LOG_ERROR("dep_gen replay: failed to open '%s' for write", path); + return false; + } + // Strided tensor representation. tensors[].buffer_numel is the underlying + // storage element count; tasks[].args[] and edges[] carry per-slice + // geometry as (start_offset uint64, strides[] uint32 — runtime invariant + // forbids zero / negative strides, see runtime/tensor.h). + out << "{\"tasks\":["; + for (size_t i = 0; i < tasks.size(); i++) { + if (i > 0) out << ','; + const auto &t = tasks[i]; + // uint64 fields are quoted as strings — task_id/tensor_id/buffer_addr/ + // pred/succ can exceed Number.MAX_SAFE_INTEGER (2^53-1), silently + // losing precision in JS-based JSON parsers. Python consumers already + // pass these through int(...) and don't care which form they receive. + out << "{\"task_id\":\"" << t.task_id << '"'; + out << ",\"scope\":\"" << (t.in_manual_scope ? "manual" : "auto") << '"'; + // Per-subslot kernel ids {AIC, AIV0, AIV1}; INVALID_KERNEL_ID = -1 for + // inactive subslots. 
Emitted as a plain int triple — downstream viewers + // (and the swimlane host post-processor) use it to resolve task_id → + // kernel without the AICore record carrying the field itself. + out << ",\"kernel_ids\":[" << t.kernel_id[0] << ',' << t.kernel_id[1] << ',' << t.kernel_id[2] << ']'; + out << ",\"args\":["; + for (size_t a = 0; a < t.args.size(); a++) { + if (a > 0) out << ','; + const auto &arg = t.args[a]; + out << "{\"idx\":" << arg.idx; + out << ",\"type\":\"" << arg_type_str(arg.arg_type) << '"'; + if (arg.has_tensor_info) { + out << ",\"tensor_id\":\"" << arg.tensor_id << '"'; + out << ",\"dtype\":\"" << get_dtype_name(static_cast(arg.dtype)) << '"'; + out << ",\"shape\":"; + write_uint_array(out, arg.shape, arg.ndims); + out << ",\"start_offset\":\"" << arg.start_offset << '"'; + out << ",\"strides\":"; + write_uint_array(out, arg.strides, arg.ndims); + } + out << '}'; + } + out << "]}"; + } + out << ']'; + + out << ",\"tensors\":["; + for (size_t i = 0; i < tensors.size(); i++) { + if (i > 0) out << ','; + const auto &t = tensors[i]; + out << "{\"tensor_id\":\"" << t.tensor_id << '"'; + out << ",\"buffer_addr\":\"" << t.buffer_addr << '"'; + out << ",\"version\":" << t.version; + out << ",\"dtype\":\"" << get_dtype_name(static_cast(t.dtype)) << '"'; + out << ",\"buffer_numel\":\"" << t.buffer_numel << '"'; + out << '}'; + } + out << ']'; + + out << ",\"edges\":["; + for (size_t i = 0; i < edges.size(); i++) { + if (i > 0) out << ','; + const auto &e = edges[i]; + out << "{\"pred\":\"" << e.pred << "\",\"succ\":\"" << e.succ << '"'; + out << ",\"arg\":" << e.consumer_arg_idx; + out << ",\"source\":\"" << edge_source_str(e.source) << '"'; + if (e.source == EdgeSource::TENSORMAP) { + out << ",\"overlap\":\"" << overlap_status_str(e.overlap) << '"'; + } + if (e.source != EdgeSource::EXPLICIT) { + out << ",\"tensor_id\":\"" << e.tensor_id << '"'; + out << ",\"consumer_dtype\":\"" << get_dtype_name(static_cast(e.consumer_dtype)) << '"'; + out << 
",\"consumer_shape\":"; + write_uint_array(out, e.consumer_shape, e.consumer_ndims); + out << ",\"consumer_start_offset\":\"" << e.consumer_start_offset << '"'; + out << ",\"consumer_strides\":"; + write_uint_array(out, e.consumer_strides, e.consumer_ndims); + } + if (e.source == EdgeSource::TENSORMAP) { + out << ",\"producer_shape\":"; + write_uint_array(out, e.producer_shape, e.producer_ndims); + out << ",\"producer_start_offset\":\"" << e.producer_start_offset << '"'; + out << ",\"producer_strides\":"; + write_uint_array(out, e.producer_strides, e.producer_ndims); + } + out << '}'; + } + out << "]}\n"; + return static_cast(out); +} + +// --------------------------------------------------------------------------- +// Annot pass — mirrors compute_task_fanin step-by-step against tm_annot. +// Must stay bit-equivalent to pto_dep_compute.h::compute_task_fanin in terms +// of which producer IDs are emitted (the differential check enforces this). +// --------------------------------------------------------------------------- + +template +void annot_pass( + const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, EmitCreator emit_creator, + EmitTM emit_tensormap +) { + if (in_manual_scope) { + return; + } + for (int32_t i = 0; i < inputs.tensor_count; i++) { + TensorArgType ptype = inputs.arg_types[i]; + if (ptype == TensorArgType::OUTPUT) { + continue; + } + const Tensor *tensor = &inputs.tensors[i].ref(); + + // STEP A: creator retention. + PTO2TaskId owner = tensor->owner_task_id; + if (owner.is_valid()) { + emit_creator(owner, i, *tensor); + } + + // STEP B: tensormap lookup (only INPUT/INOUT, skip manual_dep). 
+ if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) { + continue; + } + if (tensor->manual_dep) { + continue; + } + + tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool { + emit_tensormap(entry.producer_task_id, i, *tensor, entry, overlap_status); + if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) { + tensor_map.remove_entry(entry); + } + return true; + }); + } +} + +} // namespace + +extern "C" int +dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, const char *deps_json_path) { + if (deps_json_path == nullptr) { + LOG_ERROR("dep_gen replay: null deps_json_path"); + return -1; + } + if (num_records > 0 && records == nullptr) { + LOG_ERROR("dep_gen replay: num_records=%zu but records pointer is null", num_records); + return -1; + } + LOG_INFO_V0("dep_gen replay: processing %zu in-memory records (dual-pass)", num_records); + + // Per-ring task window sizes — tensormap masks slot indices and requires + // each to be a power of two. Auto-size from the records themselves so each + // ring's window comfortably covers its observed max local_id (no slot + // aliasing during INOUT+COVERED remove_from_task). Same sizes feed both + // maps so they stay in lockstep. + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint32_t max_local[PTO2_MAX_RING_DEPTH] = {0}; + for (size_t i = 0; i < num_records; i++) { + PTO2TaskId tid{records[i].task_id}; + uint8_t ring = tid.ring(); + uint32_t local = tid.local(); + if (ring < PTO2_MAX_RING_DEPTH && local > max_local[ring]) { + max_local[ring] = local; + } + } + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t need = static_cast(max_local[r] + 1); + task_window_sizes[r] = ceil_pow2(need < 16 ? 
16 : need); + } + + int32_t output_count = count_outputs(records, num_records); + int32_t pool_size = output_count + (output_count / 10) + 64; + if (pool_size < PTO2_TENSORMAP_POOL_SIZE) { + pool_size = PTO2_TENSORMAP_POOL_SIZE; + } + + PTO2TensorMap tm_oracle; + PTO2TensorMap tm_annot; + std::memset(&tm_oracle, 0, sizeof(tm_oracle)); + std::memset(&tm_annot, 0, sizeof(tm_annot)); + + // Libc-backed arena (default ctor) that owns both replay tensormaps' + // storage. Released by the arena destructor when this function returns. + DeviceArena replay_arena; + + auto oracle_layout = + PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); + auto annot_layout = + PTO2TensorMap::reserve_layout(replay_arena, PTO2_TENSORMAP_NUM_BUCKETS, pool_size, task_window_sizes); + if (replay_arena.commit() == nullptr || !tm_oracle.init_data_from_layout(oracle_layout, replay_arena) || + !tm_annot.init_data_from_layout(annot_layout, replay_arena)) { + LOG_ERROR("dep_gen replay: tensormap.init failed (buckets=%d, pool=%d)", PTO2_TENSORMAP_NUM_BUCKETS, pool_size); + return -3; + } + // Replay tensormaps live entirely on host; only arena-internal pointer + // fields need wiring (no parent-orch back-reference exists anymore). + tm_oracle.wire_arena_pointers(oracle_layout, replay_arena); + tm_annot.wire_arena_pointers(annot_layout, replay_arena); + + // JSON output accumulators. + std::vector task_table; + std::vector tensor_table; + std::unordered_map tensor_index; // tensor_id → table idx + std::vector annot_edges; + annot_edges.reserve(num_records * 2); + + TensorRef tref_buf[CORE_MAX_TENSOR_ARGS]; + TensorArgType atype_buf[CORE_MAX_TENSOR_ARGS]; + + // Per-record dedup of producer IDs — must match runtime's + // PTO2FaninBuilder::append_fanin_or_fail semantics, which collapses STEP 1 + // (explicit_deps) + STEP A (creator retention) + STEP B (tensormap lookup) + // into a single per-task fanin list. 
Both oracle and annot use this same + // semantics so the divergence check is meaningful. + std::unordered_set oracle_preds; + std::unordered_set annot_preds; + + // Scratch buffer for assembling full dep lists across overflow chains. + // Declared outside the loop so it can be reused (clear() keeps capacity). + std::vector full_deps_buf; + + for (size_t rec_i = 0; rec_i < num_records; rec_i++) { + const DepGenRecord &rec = records[rec_i]; + + // Overflow chain records are consumed by the preceding base; skip + // them in the main scan so we don't double-process or read the + // overflow's reinterpreted bytes as tensor/dep info. + if (rec.flags & DEP_GEN_FLAG_OVERFLOW) continue; + + PTO2TaskId task_id{rec.task_id}; + bool in_manual_scope = (rec.flags & DEP_GEN_FLAG_IN_MANUAL_SCOPE) != 0; + + oracle_preds.clear(); + annot_preds.clear(); + + int32_t tc = static_cast(rec.tensor_count); + if (tc > CORE_MAX_TENSOR_ARGS) { + tc = CORE_MAX_TENSOR_ARGS; + } + for (int32_t i = 0; i < tc; i++) { + tref_buf[i] = reinterpret_cast(&rec.tensors[i][0]); + atype_buf[i] = static_cast(rec.arg_types[i]); + } + + // Assemble the full dep list. Fast path: ≤ DEP_GEN_MAX_EXPLICIT_DEPS, + // no chain, point straight at rec.explicit_deps. Slow path: gather + // base + chain into full_deps_buf and point at the buffer. + // + // `explicit_dep_count` / `over->dep_count` originate from device + // shared memory and are bounded by the writer to the array sizes, but + // we clamp on read too so a corrupted record never drives an OOB read + // off the end of rec.explicit_deps[64] / over->deps[582]. 
+ const uint64_t *deps_data; + int32_t dc; + if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) { + full_deps_buf.clear(); + uint16_t base_dc = rec.explicit_dep_count; + if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) { + LOG_ERROR( + "dep_gen replay: clamping base explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")", + base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id + ); + base_dc = DEP_GEN_MAX_EXPLICIT_DEPS; + } + full_deps_buf.reserve(static_cast(base_dc) + DEP_GEN_OVERFLOW_DEPS_PER_RECORD); + full_deps_buf.insert(full_deps_buf.end(), rec.explicit_deps, rec.explicit_deps + base_dc); + bool chain_complete = false; + for (size_t j = rec_i + 1; j < num_records; j++) { + const DepGenRecord &maybe = records[j]; + if (!(maybe.flags & DEP_GEN_FLAG_OVERFLOW)) { + LOG_ERROR( + "dep_gen replay: unterminated overflow chain at rec_idx=%zu (task_id=%" PRIu64 ")", rec_i, + rec.task_id + ); + break; + } + if (maybe.task_id != rec.task_id) { + LOG_ERROR( + "dep_gen replay: orphan overflow at rec_idx=%zu (expected task_id=%" PRIu64 ", found %" PRIu64 + ")", + j, rec.task_id, maybe.task_id + ); + break; + } + const auto *over = reinterpret_cast(&maybe); + uint16_t over_dc = over->dep_count; + if (over_dc > DEP_GEN_OVERFLOW_DEPS_PER_RECORD) { + LOG_ERROR( + "dep_gen replay: clamping overflow dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")", + over_dc, DEP_GEN_OVERFLOW_DEPS_PER_RECORD, j, rec.task_id + ); + over_dc = DEP_GEN_OVERFLOW_DEPS_PER_RECORD; + } + full_deps_buf.insert(full_deps_buf.end(), over->deps, over->deps + over_dc); + if (over->flags & DEP_GEN_FLAG_LAST_OVERFLOW) { + chain_complete = true; + break; + } + } + if (!chain_complete) { + LOG_ERROR( + "dep_gen replay: chain for task_id=%" PRIu64 " missing LAST_OVERFLOW marker — " + "using partial dep list (%zu deps)", + rec.task_id, full_deps_buf.size() + ); + } + deps_data = full_deps_buf.data(); + dc = static_cast(full_deps_buf.size()); + } else { + deps_data = rec.explicit_deps; + uint16_t base_dc = 
rec.explicit_dep_count; + if (base_dc > DEP_GEN_MAX_EXPLICIT_DEPS) { + LOG_ERROR( + "dep_gen replay: clamping no-chain explicit_dep_count %u > %d at rec_idx=%zu (task_id=%" PRIu64 ")", + base_dc, DEP_GEN_MAX_EXPLICIT_DEPS, rec_i, rec.task_id + ); + base_dc = DEP_GEN_MAX_EXPLICIT_DEPS; + } + dc = static_cast(base_dc); + } + + DepInputs inputs; + inputs.tensor_count = tc; + inputs.tensors = tref_buf; + inputs.arg_types = atype_buf; + inputs.explicit_dep_count = dc; + inputs.explicit_deps = reinterpret_cast(deps_data); + + // Register tasks[] entry (with per-arg slot info) and any unseen + // tensors[] entries up-front. Tensors are registered from the + // consumer-side blob so raw_shapes / dtype are populated (the + // producer-side PTO2TensorMapEntry drops raw_shapes to fit in two + // cache lines). + TaskTableEntry task_entry; + task_entry.task_id = rec.task_id; + task_entry.in_manual_scope = in_manual_scope; + task_entry.kernel_id[0] = rec.kernel_id[0]; + task_entry.kernel_id[1] = rec.kernel_id[1]; + task_entry.kernel_id[2] = rec.kernel_id[2]; + task_entry.args.reserve(tc); + for (int32_t i = 0; i < tc; i++) { + TaskArgEntry slot{}; + slot.idx = i; + slot.arg_type = atype_buf[i]; + if (atype_buf[i] == TensorArgType::OUTPUT) { + // OUTPUT blob is zero at submit time (writer has no Tensor + // yet); leave has_tensor_info=false. Viewers render this as + // a placeholder "alloc" output slot. 
+ slot.has_tensor_info = false; + } else { + const Tensor &t = tref_buf[i].ref(); + register_tensor(tensor_index, tensor_table, t); + slot.has_tensor_info = true; + slot.tensor_id = make_tensor_id(t.buffer.addr, t.version); + slot.dtype = static_cast(t.dtype); + slot.ndims = t.ndims; + slot.start_offset = t.start_offset; + for (uint32_t d = 0; d < t.ndims && d < MAX_TENSOR_DIMS; d++) { + slot.shape[d] = t.shapes[d]; + slot.strides[d] = t.strides[d]; + } + } + task_entry.args.push_back(slot); + } + task_table.push_back(std::move(task_entry)); + + // ============ STEP 1 — explicit_deps (call-site emit) ============ + // Same loop on both passes; they MUST produce identical sets here + // because they read the same record. Annot records explicit edges + // with consumer_arg_idx = -1 (not tied to any tensor arg). Reads + // from deps_data (base record's explicit_deps[] on fast path, the + // gathered base+chain buffer on overflow path). + for (int32_t i = 0; i < dc; i++) { + uint64_t pred_raw = deps_data[i]; + if (oracle_preds.insert(pred_raw).second) { + // First time this pred is seen at runtime call site. 
+ } + if (annot_preds.insert(pred_raw).second) { + EdgeAnnot e{}; + e.pred = pred_raw; + e.succ = rec.task_id; + e.consumer_arg_idx = -1; + e.source = EdgeSource::EXPLICIT; + annot_edges.push_back(e); + } + } + + // ============ ORACLE pass — drive compute_task_fanin ============ + bool ok = compute_task_fanin(inputs, tm_oracle, in_manual_scope, [&](PTO2TaskId producer) -> bool { + oracle_preds.insert(producer.raw); + return true; + }); + if (!ok) { + LOG_ERROR("dep_gen replay: compute_task_fanin returned fatal at task_id=%" PRIu64, rec.task_id); + tm_oracle.destroy(); + tm_annot.destroy(); + return -4; + } + + // ============ ANNOT pass — inline mirror, full entry capture ============ + annot_pass( + inputs, tm_annot, in_manual_scope, + // emit_creator(producer, arg_idx, consumer_tensor) + [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer) { + if (!annot_preds.insert(producer.raw).second) { + return; // already covered by an earlier emit on this record + } + EdgeAnnot e{}; + e.pred = producer.raw; + e.succ = rec.task_id; + e.consumer_arg_idx = arg_idx; + e.source = EdgeSource::CREATOR; + e.tensor_id = make_tensor_id(consumer.buffer.addr, consumer.version); + fill_consumer(e, consumer); + annot_edges.push_back(e); + }, + // emit_tensormap(producer, arg_idx, consumer_tensor, entry, status) + [&](PTO2TaskId producer, int32_t arg_idx, const Tensor &consumer, const PTO2TensorMapEntry &entry, + OverlapStatus status) { + // Per-(succ, arg_idx, producer_buffer_addr, producer_version) + // dedup gives us "the same producer slice fired twice for the + // same consumer arg" collapse — but two distinct slices from + // the same producer (different version), or two different + // producers, both yield their own edges. The producer-id-set + // comparison below uses annot_preds, which dedups by pred + // only, matching runtime PTO2FaninBuilder semantics. 
+ annot_preds.insert(producer.raw); + EdgeAnnot e{}; + e.pred = producer.raw; + e.succ = rec.task_id; + e.consumer_arg_idx = arg_idx; + e.source = EdgeSource::TENSORMAP; + e.overlap = status; + e.tensor_id = make_tensor_id(entry.buffer_addr, entry.version); + fill_consumer(e, consumer); + fill_producer(e, entry); + annot_edges.push_back(e); + } + ); + + // ============ Differential check ============ + if (oracle_preds != annot_preds) { + LOG_ERROR( + "dep_gen replay: DIVERGENCE at task_id=%" PRIu64 " (rec_idx=%zu): oracle has %zu preds, annot has %zu", + rec.task_id, rec_i, oracle_preds.size(), annot_preds.size() + ); + // Log the symmetric difference for debugging. + for (uint64_t p : oracle_preds) { + if (annot_preds.find(p) == annot_preds.end()) { + LOG_ERROR(" only-in-oracle pred: %" PRIu64, p); + } + } + for (uint64_t p : annot_preds) { + if (oracle_preds.find(p) == oracle_preds.end()) { + LOG_ERROR(" only-in-annot pred: %" PRIu64, p); + } + } + tm_oracle.destroy(); + tm_annot.destroy(); + return -6; + } + + // ============ STEP 4 — publish outputs on BOTH maps ============ + register_task_outputs(inputs, task_id, tm_oracle, in_manual_scope); + register_task_outputs(inputs, task_id, tm_annot, in_manual_scope); + } + + tm_oracle.destroy(); + tm_annot.destroy(); + + if (!write_deps_json(deps_json_path, task_table, tensor_table, annot_edges)) { + return -5; + } + LOG_INFO_V0( + "dep_gen replay: wrote deps.json to %s (tasks=%zu, tensors=%zu, edges=%zu)", deps_json_path, task_table.size(), + tensor_table.size(), annot_edges.size() + ); + return 0; +} diff --git a/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.h b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.h new file mode 100644 index 000000000..49cc2331c --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/host/dep_gen_replay.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file dep_gen_replay.h + * @brief Host-side replay of in-memory DepGenRecord stream → deps.json. + * + * Takes the records the host collector drained from the device ring buffer + * (``DepGenCollector::records()``) and runs them back through a host-resident + * PTO2TensorMap using the same ``compute_task_fanin`` / ``register_task_outputs`` + * primitives the device orchestrator uses, emitting the full + * predecessor → successor edge list to deps.json. + * + * The records buffer is passed in directly — there is no intermediate + * ``submit_trace.bin`` on disk. The host already has the records once the + * device run completes, so going through the filesystem would just be + * extra I/O and an extra file in the output directory. + * + * deps.json is the sole source of truth for fanout: the L2 swimlane hot + * path no longer records ``L2SwimlaneAicpuTaskRecord::fanout[]`` (taking the per-task + * 1 KB GM store off the scheduler critical path). Replay sees every + * submit and reconstructs the complete dependency graph. 
+ *
+ * Output format (deps.json, strided tensor representation):
+ *
+ *   {"tasks":   [{"task_id":"<u64>", "scope":"auto|manual",
+ *                 "kernel_ids":[<aic>, <aiv0>, <aiv1>],
+ *                 "args":[{"idx":<int>, "type":"<INPUT|OUTPUT|INOUT|OUTPUT_EXISTING>",
+ *                          "tensor_id":"<u64>", "dtype":"...", "shape":[...],
+ *                          "start_offset":"<u64>", "strides":[...]}, ...]}, ...],
+ *    "tensors": [{"tensor_id":"<u64>", "buffer_addr":"<u64>", "version":<int>,
+ *                 "dtype":"FLOAT32", "buffer_numel":"<u64>"}, ...],
+ *    "edges":   [{"pred":"<u64>", "succ":"<u64>", "arg":<int>,
+ *                 "source":"explicit|creator|tensormap",
+ *                 "overlap":"covered|other"         (tensormap only),
+ *                 "tensor_id":"<u64>"               (non-explicit),
+ *                 "consumer_dtype":"...", "consumer_shape":[...],
+ *                 "consumer_start_offset":"<u64>", "consumer_strides":[...]
+ *                                                   (non-explicit),
+ *                 "producer_shape":[...]            (tensormap),
+ *                 "producer_start_offset":"<u64>"   (tensormap),
+ *                 "producer_strides":[...]          (tensormap)},
+ *                ...]}
+ *
+ * - 64-bit values (task ids, tensor ids, buffer addresses, element counts and
+ *   offsets) are written as quoted decimal strings so that JavaScript-based
+ *   JSON parsers do not lose precision above 2^53-1.
+ * - All task ids are ``PTO2TaskId::raw`` values (``(ring_id << 32) | local_id``).
+ * - ``kernel_ids`` lists the per-subslot kernel ids {AIC, AIV0, AIV1}; -1 marks
+ *   an inactive subslot.
+ * - An ``args`` entry of type OUTPUT carries only ``idx`` and ``type``; the
+ *   tensor fields are present for every other argument type.
+ * - ``arg`` is the consumer argument index; explicit edges use -1 because they
+ *   are not tied to a tensor argument.
+ * - ``tensor_id`` is a stable FNV-1a hash of ``(buffer_addr, version)``.
+ * - ``buffer_numel`` is the underlying storage element count; tensor shapes
+ *   are carried per-arg / per-edge alongside ``start_offset`` + ``strides``.
+ * - Distinct producers / arg indices / sources keep their own edges; per-record
+ *   deduplication of producer ids mirrors the runtime
+ *   ``PTO2FaninBuilder::append_fanin_or_fail`` semantics so the set of
+ *   ``(pred, succ)`` pairs is identical to what the runtime would have
+ *   recorded.
+ *
+ * Self-checking: the replay runs two parallel tensormap instances per record —
+ * an "oracle" map driven by the canonical ``compute_task_fanin`` template, and
+ * an "annotated" map driven by an inlined mirror that captures the per-edge
+ * tensor metadata. If the producer-id set on the two passes ever diverges,
+ * deps.json is NOT written and the function returns a non-zero error code.
+ * This is the guarantee against silent shotgun modifications: anyone who
+ * changes ``compute_task_fanin`` semantics has to mirror the change here too
+ * or the gate fires immediately.
+ * + * The replay is single-threaded and pure CPU: no device handle is required. + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_ + +#include +#include + +// Opaque forward decl — the canonical layout lives in common/dep_gen.h, but +// replay's API only needs to take a pointer + count. Callers who construct +// the buffer must include common/dep_gen.h themselves. +struct DepGenRecord; + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Replay an in-memory DepGenRecord stream and write deps.json. + * + * Per-ring task window sizes are auto-derived from the trace itself so each + * ring's window covers its observed max local_id without slot aliasing. + * + * @param records Pointer to a contiguous DepGenRecord array + * (typically ``DepGenCollector::records().data()``). + * @param num_records Number of records in the array. + * @param deps_json_path Output path; truncated if it exists. + * @return 0 on success; negative on error (see source for codes). + */ +int dep_gen_replay_emit_deps_json(const struct DepGenRecord *records, size_t num_records, const char *deps_json_path); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_HOST_DEP_GEN_REPLAY_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp b/src/a5/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp new file mode 100644 index 000000000..dfc5590c1 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/host/runtime_compile_info.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#include "host/platform_compile_info.h" +#include "host/runtime_compile_info.h" +#include + +extern "C" { + +ToolchainType get_incore_compiler(void) { + if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_CCEC; + return TOOLCHAIN_HOST_GXX_15; +} + +ToolchainType get_orchestration_compiler(void) { + // tensormap_and_ringbuffer: a2a3 needs aarch64 cross-compile (AICPU is aarch64) + if (strcmp(get_platform(), "a2a3") == 0) return TOOLCHAIN_AARCH64_GXX; + return TOOLCHAIN_HOST_GXX; +} +} diff --git a/src/a5/runtime/fully_distributed_within_core/host/runtime_maker.cpp b/src/a5/runtime/fully_distributed_within_core/host/runtime_maker.cpp new file mode 100644 index 000000000..b95411a6c --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/host/runtime_maker.cpp @@ -0,0 +1,691 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Runtime Builder - rt2 Implementation (Device Orchestration) + * + * Provides init_runtime_impl and validate_runtime_impl functions for rt2 runtime. + * Supports device orchestration where AICPU thread 3 runs the orchestrator. + * + * init_runtime_impl: + * - Converts host tensor pointers to device pointers (all inputs copied H2D; + * only OUTPUT/INOUT tensors are copied back D2H) + * - Copies orchestration SO to device memory + * - Sets up runtime state for device orchestration + * + * validate_runtime_impl: + * - Copies OUTPUT/INOUT tensors back from device to host (read-only inputs + * are skipped) + * - Frees device memory + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/pto_runtime_status.h" +#include "../runtime/pto_runtime2.h" +#include "../runtime/pto_shared_memory.h" +#include "../runtime/runtime.h" +#include "../../../../common/task_interface/call_config.h" +#include "utils/device_arena.h" +#include "callable.h" +#include "common/platform_config.h" +#include "common/unified_log.h" +#include "prepare_callable_common.h" + +static_assert( + RUNTIME_ENV_RING_COUNT == PTO2_MAX_RING_DEPTH, "RuntimeEnv ring count must match PTO2 runtime ring depth" +); + +// Helper: return current time in milliseconds +static int64_t _now_ms() { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; +} + +static bool is_power_of_2_u64(uint64_t value) { return value != 0 && (value & (value - 1)) == 0; } + +template +static std::string format_ring_array(const T (&values)[PTO2_MAX_RING_DEPTH]) { + std::string out = "["; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) { + if (r != 0) { + out += ", "; + } + out += std::to_string(values[r]); + } + out += "]"; + return out; +} + +static std::string 
trim_copy(const std::string &input) { + size_t begin = 0; + while (begin < input.size() && std::isspace(static_cast(input[begin]))) { + ++begin; + } + size_t end = input.size(); + while (end > begin && std::isspace(static_cast(input[end - 1]))) { + --end; + } + return input.substr(begin, end - begin); +} + +static bool parse_uint_token( + const char *name, const std::string &raw, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t *out +) { + std::string token = trim_copy(raw); + if (token.empty()) { + LOG_WARN("%s has an empty value in '%s', ignored", name, raw.c_str()); + return false; + } + + if (token[0] == '-') { + LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str()); + return false; + } + char *endptr = nullptr; + errno = 0; + unsigned long long parsed = std::strtoull(token.c_str(), &endptr, 10); + if (errno == ERANGE || endptr == token.c_str() || *endptr != '\0') { + LOG_WARN("%s=%s invalid (must be a non-negative integer), ignored", name, token.c_str()); + return false; + } + uint64_t val = static_cast(parsed); + + if (val < min_val || val > max_val) { + LOG_WARN( + "%s=%s invalid (must be in [%" PRIu64 ", %" PRIu64 "]), ignored", name, token.c_str(), min_val, max_val + ); + return false; + } + if (require_power_of_2 && !is_power_of_2_u64(val)) { + LOG_WARN("%s=%s invalid (must be a power of 2), ignored", name, token.c_str()); + return false; + } + *out = val; + return true; +} + +static void apply_env_ring_values( + const char *name, uint64_t min_val, uint64_t max_val, bool require_power_of_2, uint64_t out[PTO2_MAX_RING_DEPTH] +) { + const char *env = std::getenv(name); + if (!env) return; + + std::string text(env); + if (text.find(',') == std::string::npos) { + uint64_t value = 0; + if (!parse_uint_token(name, text, min_val, max_val, require_power_of_2, &value)) { + return; + } + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + out[r] = value; + } + return; + } + + uint64_t 
parsed[PTO2_MAX_RING_DEPTH]{}; + size_t pos = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + size_t comma = text.find(',', pos); + std::string token = text.substr(pos, comma == std::string::npos ? std::string::npos : comma - pos); + if (!parse_uint_token(name, token, min_val, max_val, require_power_of_2, &parsed[r])) { + return; + } + if (comma == std::string::npos) { + if (r != PTO2_MAX_RING_DEPTH - 1) { + LOG_WARN( + "%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env, + PTO2_MAX_RING_DEPTH + ); + return; + } + pos = text.size(); + } else { + pos = comma + 1; + } + } + if (pos < text.size() || (!text.empty() && text.back() == ',')) { + LOG_WARN("%s=%s invalid (expected exactly %d comma-separated values), ignored", name, env, PTO2_MAX_RING_DEPTH); + return; + } + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + out[r] = parsed[r]; + } +} + +static bool resolve_ring_config( + uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool, const uint64_t *ring_task_windows, + const uint64_t *ring_heaps, const uint64_t *ring_dep_pools, uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH], + uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH], int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH] +) { + uint64_t dep_pool_values[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + eff_task_window_sizes[r] = PTO2_TASK_WINDOW_SIZE; + eff_heap_sizes[r] = PTO2_HEAP_SIZE; + dep_pool_values[r] = PTO2_DEP_LIST_POOL_SIZE; + } + + apply_env_ring_values("PTO2_RING_TASK_WINDOW", 4, static_cast(INT32_MAX), true, eff_task_window_sizes); + apply_env_ring_values("PTO2_RING_HEAP", 1024, std::numeric_limits::max(), false, eff_heap_sizes); + apply_env_ring_values("PTO2_RING_DEP_POOL", 4, static_cast(INT32_MAX), false, dep_pool_values); + + if (ring_task_window != 0) { + if (ring_task_window < 4 || ring_task_window > static_cast(INT32_MAX) || + !is_power_of_2_u64(ring_task_window)) { + LOG_ERROR( + "runtime_env.ring_task_window=%" 
PRIu64 " must be a power of 2 in [4, INT32_MAX]", ring_task_window + ); + return false; + } + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + eff_task_window_sizes[r] = ring_task_window; + } + } + if (ring_heap != 0) { + if (ring_heap < 1024) { + LOG_ERROR("runtime_env.ring_heap=%" PRIu64 " must be >= 1024", ring_heap); + return false; + } + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + eff_heap_sizes[r] = ring_heap; + } + } + if (ring_dep_pool != 0) { + if (ring_dep_pool < 4 || ring_dep_pool > static_cast(INT32_MAX)) { + LOG_ERROR("runtime_env.ring_dep_pool=%" PRIu64 " must be in [4, INT32_MAX]", ring_dep_pool); + return false; + } + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + dep_pool_values[r] = ring_dep_pool; + } + } + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (ring_task_windows != nullptr && ring_task_windows[r] != 0) { + eff_task_window_sizes[r] = ring_task_windows[r]; + } + if (ring_heaps != nullptr && ring_heaps[r] != 0) { + eff_heap_sizes[r] = ring_heaps[r]; + } + if (ring_dep_pools != nullptr && ring_dep_pools[r] != 0) { + dep_pool_values[r] = ring_dep_pools[r]; + } + + if (eff_task_window_sizes[r] < 4 || eff_task_window_sizes[r] > static_cast(INT32_MAX) || + !is_power_of_2_u64(eff_task_window_sizes[r])) { + LOG_ERROR( + "ring_task_windows[%d]=%" PRIu64 " must be a power of 2 in [4, INT32_MAX]", r, eff_task_window_sizes[r] + ); + return false; + } + if (eff_heap_sizes[r] < 1024) { + LOG_ERROR("ring_heaps[%d]=%" PRIu64 " must be >= 1024", r, eff_heap_sizes[r]); + return false; + } + if (dep_pool_values[r] < 4 || dep_pool_values[r] > static_cast(INT32_MAX)) { + LOG_ERROR("ring_dep_pools[%d]=%" PRIu64 " must be in [4, INT32_MAX]", r, dep_pool_values[r]); + return false; + } + eff_dep_pool_capacities[r] = static_cast(dep_pool_values[r]); + } + + return true; +} + +static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) { + if (runtime == nullptr || host_header == nullptr) { + return 0; + } + + void 
*sm_ptr = runtime->get_gm_sm_ptr(); + if (sm_ptr == nullptr) { + return 0; + } + + int hdr_rc = runtime->host_api.copy_from_device(host_header, sm_ptr, sizeof(PTO2SharedMemoryHeader)); + if (hdr_rc != 0) { + LOG_WARN("Failed to copy PTO2 header from device"); + return 0; + } + + int32_t orch_error_code = host_header->orch_error_code.load(std::memory_order_relaxed); + int32_t sched_error_code = host_header->sched_error_code.load(std::memory_order_relaxed); + return runtime_status_from_error_codes(orch_error_code, sched_error_code); +} + +/** + * Stage the per-callable resources (kernel binaries + orchestration SO) into `out` so a subsequent + * bind_callable_to_runtime_impl can use them. This is the cacheable half of init_runtime_impl: nothing here + * depends on per-run argument values, so the prepare_callable / run_prepared split lets us run this once per + * callable_id and amortize across runs. + * + * @param callable ChipCallable carrying the orch SO + child kernel binaries + * @param upload_fn Upload callback forwarded to upload_and_collect_child_addrs + * @param out Receives the staged artifacts (signature, kernel addrs, orch SO bytes, func/config names) + * @return 0 on success, -1 on failure + */ +extern "C" int +prepare_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const void *), CallableArtifacts *out) { + if (callable == nullptr) { + LOG_ERROR("Callable pointer is null"); + return -1; + } + if (upload_fn == nullptr || out == nullptr) { + LOG_ERROR("upload_fn or out is null"); + return -1; + } + *out = CallableArtifacts{}; + out->signature.assign(callable->signature_, callable->signature_ + callable->sig_count()); + + LOG_INFO_V0("Registering %d kernel(s) in prepare_callable_impl", callable->child_count()); + if (upload_and_collect_child_addrs(callable, upload_fn, &out->kernel_addrs) != 0) { + LOG_ERROR("Failed to upload ChipCallable buffer"); + return -1; + } + for (const ChildKernelAddr &c : out->kernel_addrs) { + if (c.func_id < 0 || c.func_id >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("func_id=%d is out of range [0, %d)",
c.func_id, RUNTIME_MAX_FUNC_ID); + return -1; + } + } + + const uint8_t *orch_so_binary = static_cast(callable->binary_data()); + size_t orch_so_size = callable->binary_size(); + + if (orch_so_binary == nullptr || orch_so_size == 0) { + LOG_ERROR("Orchestration SO binary is required for device orchestration"); + return -1; + } + + out->orch_so_data = orch_so_binary; + out->orch_so_size = orch_so_size; + out->func_name = callable->func_name(); + out->config_name = callable->config_name(); + LOG_INFO_V0("Orchestration SO: %zu bytes staged (host-only)", orch_so_size); + return 0; +} + +/** + * Per-run binding: build device-side argument storage (tensor copy-out, GM + * heap, PTO2 shared memory) and publish it to the runtime. Assumes the + * callable-side state (kernel binaries, orch SO bytes, func/config names) + * is already populated by prepare_callable_impl. + * + * Splitting this from prepare_callable_impl matches the per-callable_id + * design: register/run_prepared invokes this every call, while the prep + * half runs only once per callable_id. + * + * @param runtime Pointer to pre-constructed Runtime (host_api populated) + * @param orch_args Separated tensor/scalar arguments for this run + * @return 0 on success, -1 on failure + */ +extern "C" int bind_callable_to_runtime_impl( + Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature, + int sig_count, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool, + const uint64_t *ring_task_windows, const uint64_t *ring_heaps, const uint64_t *ring_dep_pools +) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + if (orch_args == nullptr) { + LOG_ERROR("orch_args pointer is null"); + return -1; + } + // trb runs orchestration on the device — there is no host-side orch + // function pointer to invoke. The c_api signature accepts one for + // symmetry with hbg; assert the trb-side invariant here. 
+ if (host_orch_func_ptr != nullptr) { + LOG_ERROR("bind_callable_to_runtime_impl: trb does not accept a host_orch_func_ptr"); + return -1; + } + + int tensor_count = orch_args->tensor_count(); + int scalar_count = orch_args->scalar_count(); + LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); + + int64_t t_total_start = _now_ms(); + + uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH]; + int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH]; + if (!resolve_ring_config( + ring_task_window, ring_heap, ring_dep_pool, ring_task_windows, ring_heaps, ring_dep_pools, + eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities + )) { + return -1; + } + const std::string task_window_log = format_ring_array(eff_task_window_sizes); + const std::string heap_log = format_ring_array(eff_heap_sizes); + const std::string dep_pool_log = format_ring_array(eff_dep_pool_capacities); + LOG_INFO_V0( + "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(), + dep_pool_log.c_str() + ); + + // Build device args: copy from input, replace host tensor pointers with device pointers + ChipStorageTaskArgs device_args; + + int64_t t_args_start = _now_ms(); + for (int i = 0; i < tensor_count; i++) { + Tensor t = orch_args->tensor(i); + + if (t.is_child_memory()) { + LOG_INFO_V0(" Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr); + device_args.add_tensor(t); + continue; + } + + void *host_ptr = reinterpret_cast(static_cast(t.buffer.addr)); + size_t size = static_cast(t.nbytes()); + + void *dev_ptr = runtime->host_api.device_malloc(size); + if (dev_ptr == nullptr) { + LOG_ERROR("Failed to allocate device memory for tensor %d", i); + return -1; + } + + // Pure write-only OUTPUT buffers carry no meaningful host content, so + // the H2D copy-in is wasted. 
Zero them on-device instead (cheap HBM + // memset, no PCIe) so any region the kernel leaves unwritten reads as 0 + // rather than pooled-allocator garbage. INOUT (read-before-write) + // and IN keep the H2D copy. Falls back to copy_to_device if a backend + // did not wire device_memset. + bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT); + int rc; + if (is_pure_output && runtime->host_api.device_memset != nullptr) { + rc = runtime->host_api.device_memset(dev_ptr, 0, size); + } else { + rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size); + } + if (rc != 0) { + LOG_ERROR("Failed to stage tensor %d to device", i); + runtime->host_api.device_free(dev_ptr); + return -1; + } + // Read-only INPUT tensors are never written by the kernel, so there is + // no point copying them back D2H at the end. Index the signature + // by the orch tensor index `i` (child_memory tensors are skipped above + // but do not consume a separate signature slot — scalars follow the + // tensor entries). Anything not provably IN keeps the safe default of + // copying back. + bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN); + runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back}); + LOG_INFO_V0(" Tensor %d: %zu bytes at %p", i, size, dev_ptr); + + t.buffer.addr = reinterpret_cast(dev_ptr); + device_args.add_tensor(t); + } + for (int i = 0; i < scalar_count; i++) { + device_args.add_scalar(orch_args->scalar(i)); + } + int64_t t_args_end = _now_ms(); + + // Read orchestrator-to-scheduler transition flag from environment + { + const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED"); + if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) { + runtime->orch_to_sched = true; + } + LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled"); + } + + // Lay out the per-Worker static device arena. 
GM heap, PTO2 shared memory, + // and the prebuilt runtime arena all live in a single backing allocation; + // setup_static_arena reserves the three regions and commits in one shot. + // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the + // free is deferred to DeviceRunner::finalize(). The runtime-arena size is + // determined by replaying the reserve sequence on a host-side arena. + uint64_t total_heap_size = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (eff_heap_sizes[r] > std::numeric_limits::max() - total_heap_size) { + LOG_ERROR("Total ring heap size overflows uint64_t"); + return -1; + } + total_heap_size += eff_heap_sizes[r]; + } + uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(eff_task_window_sizes); + + int64_t t_prebuilt_start = _now_ms(); + DeviceArena host_arena; // libc malloc backend by default + PTO2RuntimeArenaLayout layout = + runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities); + if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) { + LOG_ERROR("Failed to commit host arena for prebuilt runtime image"); + return -1; + } + + int64_t t_setup_start = _now_ms(); + if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) { + LOG_ERROR("Failed to setup pooled static arena"); + return -1; + } + int64_t t_setup_end = _now_ms(); + + int64_t t_heap_start = _now_ms(); + void *gm_heap = runtime->host_api.acquire_pooled_gm_heap(); + int64_t t_heap_end = _now_ms(); + if (gm_heap == nullptr) { + LOG_ERROR("Failed to acquire pooled GM heap"); + return -1; + } + runtime->set_gm_heap(gm_heap); + + int64_t t_sm_start = _now_ms(); + void *sm_ptr = runtime->host_api.acquire_pooled_gm_sm(); + int64_t t_sm_end = _now_ms(); + if (sm_ptr == nullptr) { + LOG_ERROR("Failed to acquire pooled PTO2 shared memory"); + return -1; + } + runtime->set_gm_sm_ptr(sm_ptr); + + void *runtime_arena_dev = 
runtime->host_api.acquire_pooled_runtime_arena(); + if (runtime_arena_dev == nullptr) { + LOG_ERROR("Failed to acquire pooled runtime arena"); + return -1; + } + + // Set up device orchestration state + runtime->set_orch_args(device_args); + + // ------------------------------------------------------------------------- + // Build the prebuilt runtime-arena image on host. + // + // We pre-compute every byte the AICPU's runtime arena would otherwise have + // to write at boot: layout offsets, sub-structure init data, and pointers + // back to the SM / GM heap. Then we rtMemcpy the image into the pooled + // runtime-arena region that DeviceRunner keeps alive across runs. AICPU + // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM + // reset) + a handful of device-only field fixups. + // ------------------------------------------------------------------------- + PTO2Runtime *rt = + runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_sizes); + if (rt == nullptr) { + LOG_ERROR("runtime_init_data_from_layout failed"); + return -1; + } + runtime_wire_arena_pointers(host_arena, layout, rt); + + // Stash the layout inside the PTO2Runtime image so the AICPU can recover + // every arena-internal offset after rtMemcpy. The runtime arena's device + // base does NOT travel in this image — it's on the host Runtime + // (set_prebuilt_arena below), since the AICPU needs that pointer + // *before* it can dereference the image. 
+ rt->prebuilt_layout = layout; + + int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size); + if (rc_upload != 0) { + LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload); + return -1; + } + runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime); + int64_t t_prebuilt_end = _now_ms(); + + LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count); + + int64_t t_total_end = _now_ms(); + LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start); + LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start); + LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start); + LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start); + LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start); + LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); + + return 0; +} + +/** + * Validate runtime results and cleanup. + * + * This function: + * 1. Copies recorded tensors from device back to host + * 2. Frees device memory for recorded tensors + * 3. 
Clears tensor pair state + * + * @param runtime Pointer to Runtime + * @return 0 on success, -1 on failure + */ +extern "C" int validate_runtime_impl(Runtime *runtime) { + if (runtime == nullptr) { + LOG_ERROR("Runtime pointer is null"); + return -1; + } + + int rc = 0; + + LOG_INFO_V0("=== Copying Results Back to Host ==="); + + // Copy all recorded tensors from device back to host + TensorPair *tensor_pairs = runtime->tensor_pairs_.data(); + int tensor_pair_count = static_cast(runtime->tensor_pairs_.size()); + + LOG_INFO_V0("Tensor pairs to process: %d", tensor_pair_count); + + // PTO2 (device orchestration): graph output may be in packed buffer + uint64_t graph_out_ptr = 0; + uint64_t graph_out_size = 0; + bool skip_tensor_copy_back = false; + int32_t runtime_status = 0; + PTO2SharedMemoryHeader host_header; + memset(&host_header, 0, sizeof(host_header)); + + runtime_status = read_runtime_status(runtime, &host_header); + if (runtime_status != 0) { + int32_t orch_error_code = host_header.orch_error_code.load(std::memory_order_relaxed); + int32_t sched_error_code = host_header.sched_error_code.load(std::memory_order_relaxed); + LOG_ERROR( + "PTO2 runtime failed: orch_error_code=%d sched_error_code=%d runtime_status=%d", orch_error_code, + sched_error_code, runtime_status + ); + skip_tensor_copy_back = true; + } else { + graph_out_ptr = host_header.graph_output_ptr; + graph_out_size = host_header.graph_output_size; + if (graph_out_ptr != 0) { + LOG_INFO_V0("Graph output buffer: ptr=0x%" PRIx64 ", size=%" PRIu64, graph_out_ptr, graph_out_size); + } + } + + if (skip_tensor_copy_back) { + LOG_WARN("Skipping tensor copy-back because PTO2 runtime reported fatal status"); + } else { + bool first_output_tensor = true; + for (int i = 0; i < tensor_pair_count; i++) { + const TensorPair &pair = tensor_pairs[i]; + + // Skip if device pointer is null + if (pair.dev_ptr == nullptr) { + LOG_WARN("Tensor %d has null device pointer, skipping", i); + continue; + } + + // If host 
pointer is null, this is a device-only allocation (no copy-back) + if (pair.host_ptr == nullptr) { + LOG_INFO_V0("Tensor %d: device-only allocation (no copy-back)", i); + continue; + } + + // Read-only INPUT tensors were uploaded H2D but the kernel never + // wrote them — copying them back (potentially ~GB) is pure waste. + // They are still device_free'd in the cleanup loop below. + if (!pair.needs_copy_back) { + LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i); + continue; + } + + void *src_ptr = pair.dev_ptr; + size_t copy_size = pair.size; + + // Use graph_output_ptr for the first output tensor if available + if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { + src_ptr = reinterpret_cast(static_cast(graph_out_ptr)); + copy_size = static_cast(graph_out_size); + LOG_INFO_V0("Using packed output buffer for tensor %d", i); + first_output_tensor = false; + } + + int copy_rc = runtime->host_api.copy_from_device(pair.host_ptr, src_ptr, copy_size); + if (copy_rc != 0) { + LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); + rc = copy_rc; + } else { + LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, pair.size); + } + } + } + + // Cleanup device tensors + LOG_INFO_V0("=== Cleaning Up ==="); + for (int i = 0; i < tensor_pair_count; i++) { + if (tensor_pairs[i].dev_ptr != nullptr) { + runtime->host_api.device_free(tensor_pairs[i].dev_ptr); + } + } + LOG_INFO_V0("Freed %d device allocations", tensor_pair_count); + + // Clear the per-run dispatch-table entries staged by prepare_callable_impl. + // The underlying chip-callable device buffer is pool-managed by + // DeviceRunner (keyed by content hash) and bulk-freed in + // DeviceRunner::finalize(). 
+ int kernel_count = runtime->get_registered_kernel_count(); + for (int i = 0; i < kernel_count; i++) { + int func_id = runtime->get_registered_kernel_func_id(i); + runtime->set_function_bin_addr(func_id, 0); + } + if (kernel_count > 0) { + LOG_INFO_V0("Cleared %d kernel dispatch-table entries", kernel_count); + } + runtime->clear_registered_kernels(); + + // Clear tensor pairs + runtime->tensor_pairs_.clear(); + + LOG_INFO_V0("=== Finalize Complete ==="); + + if (rc == 0 && runtime_status != 0) { + rc = runtime_status; + } + + return rc; +} diff --git a/src/a5/runtime/fully_distributed_within_core/orchestration/common.cpp b/src/a5/runtime/fully_distributed_within_core/orchestration/common.cpp new file mode 100644 index 000000000..c4878a1c2 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/orchestration/common.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#include "common.h" + +#ifdef __linux__ +#include +#include +#include +#include + +#include +#include +#include +#endif + +struct PTO2Runtime; + +// Unified-log error sink. Forward-declared here rather than pulled via +// common/unified_log.h: that header lives under common/log/include, which is +// not on the orchestration .so build's include path. 
The symbol resolves at +// link time for the runtime targets, and at dlopen time for the orchestration +// .so (against the executor's unified_log_device), so onboard diagnostics still +// reach the CANN device log. +extern "C" void unified_log_error(const char *func, const char *fmt, ...); + +namespace { +// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution +// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd +// between execution rounds. All orchestrator threads bind the same rt +// value, so per-thread storage is unnecessary. +PTO2Runtime *g_current_runtime = nullptr; +} // namespace + +extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) { + g_current_runtime = rt; +} + +// Keep current_runtime local to this .so so orchestration helpers do not +// accidentally bind to the AICPU binary's same-named symbol. +extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; } + +/** + * Use addr2line to convert an address to file:line information. + * Uses the -i flag to expand inlines; returns the first line (innermost actual code location). + * If inlining is present, also returns the outer call chain via inline_chain. 
+ */ +#ifdef __linux__ +static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) { + char cmd[512]; + snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr); + + std::array buffer; + std::string raw_output; + + FILE *pipe = popen(cmd, "r"); + if (pipe) { + while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) { + raw_output += buffer.data(); + } + pclose(pipe); + } + + if (raw_output.empty() || raw_output.find("??") != std::string::npos) { + return ""; + } + + // Split by lines + std::vector lines; + size_t pos = 0; + while (pos < raw_output.size()) { + size_t nl = raw_output.find('\n', pos); + if (nl == std::string::npos) nl = raw_output.size(); + std::string line = raw_output.substr(pos, nl - pos); + while (!line.empty() && line.back() == '\r') + line.pop_back(); + if (!line.empty()) lines.push_back(line); + pos = nl + 1; + } + + if (lines.empty()) return ""; + + // First line is the innermost actual code location; subsequent lines are outer inline callers + if (inline_chain && lines.size() > 1) { + *inline_chain = ""; + for (size_t j = 1; j < lines.size(); j++) { + *inline_chain += " [inlined by] " + lines[j] + "\n"; + } + } + + return lines.front(); +} +#endif + +/** + * Get current stack trace information (including file paths and line numbers). + * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses. 
+ */ +std::string get_stacktrace(int skip_frames) { + (void)skip_frames; // May be unused on non-Linux platforms + std::string result; +#ifdef __linux__ + const int max_frames = 64; + void *buffer[max_frames]; + int nframes = backtrace(buffer, max_frames); + char **symbols = backtrace_symbols(buffer, nframes); + + if (symbols) { + result = "Stack trace:\n"; + for (int i = skip_frames; i < nframes; i++) { + std::string frame_info; + + void *addr = (void *)((char *)buffer[i] - 1); + + Dl_info dl_info; + std::string inline_chain; + if (dladdr(addr, &dl_info) && dl_info.dli_fname) { + void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase); + std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain); + + if (addr2line_result.empty()) { + addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain); + } + + if (!addr2line_result.empty()) { + frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result; + } + } + + if (frame_info.empty()) { + std::string frame(symbols[i]); + + size_t start = frame.find('('); + size_t end = frame.find('+', start); + if (start != std::string::npos && end != std::string::npos) { + std::string mangled = frame.substr(start + 1, end - start - 1); + int status; + char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status); + if (status == 0 && demangled) { + frame = frame.substr(0, start + 1) + demangled + frame.substr(end); + free(demangled); + } + } + frame_info = frame; + } + + char buf[16]; + snprintf(buf, sizeof(buf), " #%d ", i - skip_frames); + result += buf + frame_info + "\n"; + if (!inline_chain.empty()) { + result += inline_chain; + } + } + free(symbols); + } +#else + result = "(Stack trace is only available on Linux)\n"; +#endif + return result; +} + +// AssertionError constructor +static std::string build_assert_message(const char *condition, const char *file, int line) { + std::string msg = "Assertion failed: " + std::string(condition) + "\n"; + 
msg += " Location: " + std::string(file) + ":" + std::to_string(line) + "\n"; + msg += get_stacktrace(3); + return msg; +} + +AssertionError::AssertionError(const char *condition, const char *file, int line) : + std::runtime_error(build_assert_message(condition, file, line)), + condition_(condition), + file_(file), + line_(line) {} + +[[noreturn]] void assert_impl(const char *condition, const char *file, int line) { + // Use unified_log_error directly rather than the LOG_ERROR macro: that macro + // lives in pto_orchestration_api.h and expands to + // current_runtime()->ops->log_error, but the ops table's definition pulls in + // pto_types.h (Arg → __aicore__-only to_u64), which the AICore build of this + // TU cannot compile. unified_log_error reaches the same sink without that + // dependency. + unified_log_error(__FUNCTION__, "\n========================================"); + unified_log_error(__FUNCTION__, "Assertion failed: %s", condition); + unified_log_error(__FUNCTION__, "Location: %s:%d", file, line); + unified_log_error(__FUNCTION__, "%s", get_stacktrace(2).c_str()); + unified_log_error(__FUNCTION__, "========================================\n"); + + throw AssertionError(condition, file, line); +} diff --git a/src/a5/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h new file mode 100644 index 000000000..ed2f03989 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_arg_with_deps.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Convenience layer over Arg: bundles a fixed-capacity dependency buffer with + * an Arg and exposes an incremental add_dep(...) API on top of the runtime + * primitive L0TaskArgs::set_dependencies(ptr, count). + * + * Layering: + * - Primitive: Arg + set_dependencies(ptr, count) in pto_types.h. + * No cap, caller owns the deps buffer. + * - Convenience: L0TaskArgsWithDeps in this header. Owns a stack-sized dep + * buffer of capacity N (default 16); provides add_dep(). + * Submitted via the rt_submit_*_task overloads below, which + * forward the bundled deps into the underlying Arg. + * + * This file is auto-included at the bottom of pto_orchestration_api.h so + * orchestration sources see L0TaskArgsWithDeps after a single `#include + * "pto_orchestration_api.h"`. The split is purely organizational — + * orchestration code should not include this header directly. Code generated + * from pypto can ignore the convenience layer entirely and target Arg + + * set_dependencies(ptr, count) directly. + * + * L0TaskArgsWithDeps uses private inheritance from Arg so that set_dependencies and + * the explicit_dep* accessors are NOT reachable on a wrapper instance — users + * who pick the convenience layer cannot accidentally mix it with the + * primitive layer's dep API on the same object. 
+ */ + +#pragma once + +#include +#include + +#include + +#include "pto_orchestration_api.h" // Arg, MixedKernels, rt_submit_* primitives + +template +class L0TaskArgsWithDeps : private L0TaskArgs { +public: + // Tensor / scalar setters — forward to Arg + using L0TaskArgs::add_inout; + using L0TaskArgs::add_input; + using L0TaskArgs::add_no_dep; + using L0TaskArgs::add_output; + using L0TaskArgs::add_scalar; + using L0TaskArgs::add_scalars; + using L0TaskArgs::add_scalars_i32; + using L0TaskArgs::copy_scalars_from; + + // Error / status — forward to Arg + using L0TaskArgs::error_msg; + using L0TaskArgs::has_error; + using L0TaskArgs::launch_spec; + using L0TaskArgs::set_error; + + // NOT exposed: set_dependencies, explicit_dep_count, explicit_dep, + // explicit_deps_data — these are the primitive-layer dep API. Users of + // the convenience layer reach dependencies only through add_dep() below. + + /** + * Append one or more dependencies to the bundled buffer. May be called + * multiple times; deps accumulate. Variadic accepts any non-zero number + * of PTO2TaskId arguments. + * + * Overflow (more than MAX_DEP_COUNT total) records an error on the + * underlying Arg; the error surfaces at submit time. + */ + template + void add_dep(Ids... ids) { + static_assert(sizeof...(Ids) >= 1, "add_dep: at least one task id is required"); + static_assert( + (std::is_same_v, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId" + ); + if (count_ + sizeof...(Ids) > MAX_DEP_COUNT) { + L0TaskArgs::set_error( + "L0TaskArgsWithDeps::add_dep: dep count exceeds MAX_DEP_COUNT (bump the template arg)" + ); + return; + } + ((deps_[count_++] = ids), ...); + } + + /** + * Clear the bundled dep buffer and reset the underlying Arg. + * Use this to recycle an L0TaskArgsWithDeps across loop iterations. + */ + void reset() { + L0TaskArgs::reset(); + count_ = 0; + } + + /** + * Submit-only hook: bind the bundled deps onto the underlying Arg and + * return it as Arg&. 
Called by the rt_submit_*_task overloads below; + * orchestration code does not invoke this directly. + * + * Idempotent: explicitly clears any prior dep binding before re-setting, + * so a wrapper can be re-finalized (e.g. resubmitted) without tripping + * the primitive layer's single-shot check. + */ + L0TaskArgs &finalize_for_submit() { + L0TaskArgs::set_dependencies(nullptr, 0); + L0TaskArgs::set_dependencies(deps_, count_); + return *this; + } + +private: + PTO2TaskId deps_[MAX_DEP_COUNT]; + uint32_t count_ = 0; +}; + +// ============================================================================= +// Submit overloads — accept L0TaskArgsWithDeps transparently +// ============================================================================= + +template +static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, L0TaskArgsWithDeps &awd) { + return rt_submit_task(mixed_kernels, awd.finalize_for_submit()); +} + +template +static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, L0TaskArgsWithDeps &awd) { + return rt_submit_aic_task(kernel_id, awd.finalize_for_submit()); +} + +template +static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, L0TaskArgsWithDeps &awd) { + return rt_submit_aiv_task(kernel_id, awd.finalize_for_submit()); +} diff --git a/src/a5/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h new file mode 100644 index 000000000..fa0fc9c8f --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/orchestration/pto_orchestration_api.h @@ -0,0 +1,386 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Orchestration API - Slim header for orchestration .so files + * + * This header provides everything an orchestration source needs without + * pulling in runtime implementation headers. The orchestration .so has + * zero link dependencies on runtime .cpp files; all runtime calls go + * through the PTO2RuntimeOps function-pointer table embedded in + * PTO2Runtime. + * + * Orchestration sources include ONLY this header: + * #include "pto_orchestration_api.h" + * + * Runtime sources continue to use pto_runtime2.h (which defines the + * full PTO2Runtime struct with all internal fields). + */ + +#pragma once + +#include +#include +#include + +#include + +// Type headers needed by orchestration +#include "common.h" // framework_bind_runtime / framework_current_runtime +#include "pto_runtime2_types.h" // PTO2_ERROR_* +#include "pto_submit_types.h" // MixedKernels, INVALID_KERNEL_ID, subtask slots +#include "pto_types.h" // Arg, TaskOutputTensors, TensorArgType +#include "task_args.h" // ChipStorageTaskArgs, Tensor +#include "tensor.h" // Tensor, TensorCreateInfo + +// ============================================================================= +// Tensor Factory Helpers +// ============================================================================= + +// make_tensor_external(...) — canonical factory for pre-allocated external +// memory — is defined in the unified tensor.h (common), so host and runtime +// build Tensors through the same controlled path. 
+ +// ============================================================================= +// Ops Table and Opaque Runtime +// ============================================================================= + +/** + * Forward declaration — the orchestration sees PTO2Runtime as a partial + * struct whose first field is the ops pointer. The full definition + * lives in pto_runtime2.h (used only by runtime .cpp files). + */ +typedef struct PTO2Runtime PTO2Runtime; + +/** + * Function-pointer table for runtime operations. + * Populated by the runtime; called by orchestration through inline wrappers. + */ +typedef struct PTO2RuntimeOps { + TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args); + void (*scope_begin)(PTO2Runtime *rt); + void (*scope_end)(PTO2Runtime *rt); + void (*orchestration_done)(PTO2Runtime *rt); + bool (*is_fatal)(PTO2Runtime *rt); // polled by most inline wrappers in this header before they call into the table + void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...); + + // Logging (populated by runtime, called by orchestration) + void (*log_error)(const char *func, const char *fmt, ...); + void (*log_warn)(const char *func, const char *fmt, ...); + void (*log_debug)(const char *func, const char *fmt, ...); + // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside). + void (*log_info_v)(const char *func, int v, const char *fmt, ...); + + // Cross-layer data access (orchestration reads/writes tensor values via runtime) + // Placed after logging to avoid shifting hot-path field offsets. 
+ uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]); + void (*set_tensor_data)( + PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value + ); + TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args); + TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args); + + // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats] + // collector can log it. Always present to keep ops-table layout stable + // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0. Callers must null-check before calling (PTO2ScopeGuard does). + void (*scope_set_site)(const char *file, int line); +} PTO2RuntimeOps; + +/** + * Partial PTO2Runtime definition for orchestration. + * + * Exposes the ops pointer (for runtime calls) and pending_scope_mode + * (written directly by the inline scope wrappers before they call scope_begin). The real struct (in + * pto_runtime2.h) has the same first fields, so accessing them through + * this definition is well-defined (C struct layout guarantee).
+ */ +struct PTO2Runtime { + const PTO2RuntimeOps *ops; // function table every wrapper below calls through + PTO2ScopeMode pending_scope_mode; // stored by rt_scope_begin / PTO2ScopeGuard right before ops->scope_begin +}; + +// ============================================================================= +// Inline Convenience Wrappers (call through ops table). The alloc/submit/scope wrappers return early (default-constructed result or no-op) when ops->is_fatal(rt) is true. +// ============================================================================= + +static inline PTO2Runtime *current_runtime() { return framework_current_runtime(); } + +static inline TaskOutputTensors alloc_tensors(const L0TaskArgs &args) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + return rt->ops->alloc_tensors(rt, args); +} + +static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + L0TaskArgs args; + for (uint32_t i = 0; i < count; i++) { + args.add_output(create_infos[i]); + } + if (args.has_error) { + rt->ops->report_fatal( + rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", + args.error_msg ? 
args.error_msg : "alloc_tensors failed to construct output-only Arg" + ); + return TaskOutputTensors{}; + } + return alloc_tensors(args); // delegate to the L0TaskArgs overload above +} + +template +static inline TaskOutputTensors alloc_tensors(const CIs &...cis) { + static_assert(sizeof...(cis) > 0, "alloc_tensors requires at least one TensorCreateInfo"); + static_assert( + (std::is_same_v, TensorCreateInfo> && ...), + "alloc_tensors only accepts TensorCreateInfo arguments" + ); + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + L0TaskArgs args; + (args.add_output(cis), ...); // comma fold: one add_output per TensorCreateInfo, in argument order + if (args.has_error) { + rt->ops->report_fatal( + rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", + args.error_msg ?
args.error_msg : "alloc_tensors failed to construct output-only Arg" + ); + return TaskOutputTensors{}; + } + return alloc_tensors(args); // delegate to the L0TaskArgs overload +} + +static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + return rt->ops->submit_task(rt, mixed_kernels, args); +} + +/** + * Convenience wrapper: submit an AIC-only task. Sets only aic_kernel_id on a default-constructed MixedKernels and forwards to rt_submit_task. + */ +static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const L0TaskArgs &args) { + MixedKernels mk; + mk.aic_kernel_id = kernel_id; + return rt_submit_task(mk, args); +} + +/** + * Convenience wrapper: submit an AIV-only task (uses AIV0 slot). Sets only aiv0_kernel_id and forwards to rt_submit_task. + */ +static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const L0TaskArgs &args) { + MixedKernels mk; + mk.aiv0_kernel_id = kernel_id; + return rt_submit_task(mk, args); +} + +/** + * Submit a dependency-only task. Accepts the same Arg shape as rt_submit_task + * (inputs, outputs, inouts, explicit_deps, scalars) but does not run any + * AICore kernel. The task still participates in the dependency graph: it + * waits on its fanin and notifies its fanout. Useful as a synchronization + * barrier or as a placeholder producer for tests / dep-graph wiring.
+ */ +static inline TaskOutputTensors rt_submit_dummy_task(const L0TaskArgs &args) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return TaskOutputTensors{}; + } + return rt->ops->submit_dummy_task(rt, args); +} + +static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return; + } + rt->pending_scope_mode = mode; // stash the mode on the runtime struct before calling scope_begin + rt->ops->scope_begin(rt); +} + +static inline void rt_scope_end() { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return; + } + rt->ops->scope_end(rt); +} + +static inline void rt_orchestration_done() { + PTO2Runtime *rt = current_runtime(); + rt->ops->orchestration_done(rt); // unlike the wrappers above, not gated on is_fatal +} + +static inline bool rt_is_fatal() { + PTO2Runtime *rt = current_runtime(); + return rt->ops->is_fatal(rt); +} + +#define rt_report_fatal(code, fmt, ...) \ + do { \ + PTO2Runtime *_rt = current_runtime(); \ + _rt->ops->report_fatal(_rt, (code), __FUNCTION__, (fmt), ##__VA_ARGS__); \ + } while (0) + +// ============================================================================= +// Logging Macros for Orchestration (call through ops table); each passes __FUNCTION__ as the func argument +// ============================================================================= + +#define LOG_ERROR(fmt, ...) current_runtime()->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__) +#define LOG_WARN(fmt, ...) current_runtime()->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__) +#define LOG_DEBUG(fmt, ...) current_runtime()->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__) + +// INFO verbosity tiers. v=0 most verbose, v=9 must-see, v=5 default. +#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__) +#define LOG_INFO_V1(fmt, ...) 
current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__) +#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__) +#define LOG_INFO_V3(fmt, ...)
current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__) +#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__) +#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__) +#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__) +#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__) +#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__) +#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__) + +// ============================================================================= +// Cross-Layer Data Access +// ============================================================================= + +/** + * Read a value from a tensor at the given multi-dimensional indices. + * + * Default T = uint64_t preserves old behavior (raw bits). + * Specify T to get automatic type conversion: + * + * uint64_t raw = get_tensor_data(tensor, 1, idx); // old usage unchanged + * float val = get_tensor_data<float>(tensor, 1, idx); // typed read + * + * If the tensor has a producer in TensorMap, spin-waits until the producer + * task completes before reading. External tensors (make_tensor_external) + * are read immediately without waiting. When the runtime is already fatal, no read is issued and the from_u64 conversion of 0 is returned. + */ +template +static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return from_u64(0); + } + return from_u64(rt->ops->get_tensor_data(rt, tensor, ndims, indices)); +} + +/** + * Write a value to a tensor at the given multi-dimensional indices.
+ * + * Type is deduced from value argument; uint64_t by default: + * + * set_tensor_data(tensor, 1, idx, raw_u64); // old usage unchanged + * set_tensor_data(tensor, 1, idx, 42.0f); // typed write (T = float) + * + * If the tensor has a producer in TensorMap, spin-waits until the producer + * and all its consumers complete before writing (WAW + WAR safety). + * External tensors (make_tensor_external) with no TensorMap entry are + * written immediately without waiting. + * + * Limitation: TensorMap only tracks producers (OUTPUT/INOUT), not consumers + * that used the tensor as INPUT. If a kernel reads this tensor as INPUT + * (not INOUT) and the tensor has no TensorMap producer entry, set_tensor_data + * cannot detect the reader and may cause a data race. + * + * To ensure WAR safety for all access patterns, use add_inout() instead of + * add_input() for kernel parameters that may later be written via + * set_tensor_data. INOUT creates a TensorMap entry that enables automatic + * consumer tracking via fanout_refcount. + * + * The tensor must already have an allocated buffer (addr != 0). + * For runtime-created outputs, call this only on the Tensor returned by + * add_output(TensorCreateInfo) after submit returns. 
+ */ +template +static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value) { + PTO2Runtime *rt = current_runtime(); + if (rt->ops->is_fatal(rt)) { + return; + } + rt->ops->set_tensor_data(rt, tensor, ndims, indices, to_u64(value)); +} + +// ============================================================================= +// C++ Scope Guards and Macros +// ============================================================================= + +/** + * RAII Scope Guard (calls through ops table). The constructor stores the mode in pending_scope_mode, records the call site through scope_set_site when that slot is non-null, then calls scope_begin; the destructor calls scope_end. Each side is skipped when is_fatal() is true at that moment. NOTE(review): copy operations are not deleted, so a copied guard would call scope_end twice — confirm whether they should be. + */ +class PTO2ScopeGuard { +public: + explicit PTO2ScopeGuard( + PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE() + ) : + rt_(current_runtime()) { + if (!rt_->ops->is_fatal(rt_)) { + rt_->pending_scope_mode = mode; + if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line); // optional slot: may be nullptr + rt_->ops->scope_begin(rt_); + } + } + ~PTO2ScopeGuard() { + if (!rt_->ops->is_fatal(rt_)) { + rt_->ops->scope_end(rt_); + } + } + +private: + PTO2Runtime *rt_; // runtime captured at construction and reused by the destructor +}; + +#define _PTO2_CONCATENATE_IMPL(x, y) x##y +#define _PTO2_CONCATENATE(x, y) _PTO2_CONCATENATE_IMPL(x, y) + +#define PTO2_SCOPE_GUARD(...) \ + [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__) { __VA_ARGS__ } + +/** + * Scoped block macro: + * PTO2_SCOPE() { + * rt_submit_task(...); + * } + */ +#define PTO2_SCOPE(...) 
if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true) + +// ============================================================================= +// Orchestration Config +// ============================================================================= + +/** + * Configuration exported by orchestration .so via aicpu_orchestration_config(). + * The executor reads these values to set up shared memory and runtime. + * + * This struct is defined identically in pto_runtime2.h (with an include + * guard) so the executor can use the same type without including this header.
+ */ +#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED +#define PTO2_ORCHESTRATION_CONFIG_DEFINED +struct PTO2OrchestrationConfig { + int expected_arg_count; // NOTE(review): presumably the argument count the orchestration entry expects — confirm against the executor +}; +#endif + +// Convenience layer (L0TaskArgsWithDeps + matching rt_submit_*_task overloads). +// Pulled in at the bottom so the wrapper sees L0TaskArgs, MixedKernels, and the +// rt_submit_*_task primitives defined above. Orchestration sources include +// only this single header to access both the primitive and convenience APIs. pto_arg_with_deps.h includes this header back; both use #pragma once, which breaks the cycle. +#include "pto_arg_with_deps.h" // NOLINT(build/include_subdir) diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h new file mode 100644 index 000000000..f914bfddf --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License.
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_ + +#include +#include + +#include "aicore_completion_mailbox_types.h" +#include "pto_constants.h" +#include "pto_task_id.h" + +// AICPU-only MPSC ring used to convey deferred-completion observations from +// FIN-handling scheduler threads to the dispatch thread. Producers push under +// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList:: +// busy) drains in seq order. Kernel-side code never touches this struct — +// AICore writes go into DeferredCompletionSlab (see +// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens +// into messages here, and forwards. + +#define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u +#define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u) + +static_assert( + (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0, + "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two" +); + +// Mailbox message discriminator. CONDITION carries one deferred-completion +// observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE +// carries the slot_state pointer in `addr` so the consumer can finalize the +// AsyncWaitEntry.slot_state binding for tasks whose conditions arrived +// before the FIN thread saw task_complete. New kinds may be added in future +// without growing the message — the `_pad[5]` slack is reserved for +// kind-specific payload extension. +#define MSG_KIND_CONDITION 0u +#define MSG_KIND_TASK_NORMAL_DONE 1u + +struct AICoreCompletionMailboxMessage { + // Per-slot ready flag. 
Producer publishes its claimed `head` value + 1 after filling the rest + // of the slot with a release store; consumer waits for the matching seq + // value (its `tail` + 1) with an acquire load.
The release-acquire pair publishes all + // other fields below as a side effect, so they stay plain. + std::atomic seq; + PTO2TaskId task_token; + // CONDITION: completion observation addr (counter / SDMA event record). + // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer + // so it can finalize the AsyncWaitEntry.slot_state binding. + uint64_t addr; + uint32_t expected_value; + uint32_t engine; + int32_t completion_type; + uint32_t kind; // MSG_KIND_CONDITION or MSG_KIND_TASK_NORMAL_DONE, set by the matching try_push_* + uint32_t _pad[5]; +}; + +static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift"); +static_assert( + sizeof(std::atomic) == sizeof(uint64_t), + "std::atomic must be layout-compatible with uint64_t for the message slot layout to hold" +); +static_assert( + std::atomic::is_always_lock_free, + "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target" +); + +// POD view of a drained message. `seq` is the ring's publication flag, not +// payload, so try_pop copies out only the fields below (and seq is not even +// copyable — it is a std::atomic). +struct AICoreCompletionMsgView { + PTO2TaskId task_token{PTO2TaskId::invalid()}; + uint64_t addr{0}; + uint32_t expected_value{0}; + uint32_t engine{0}; + int32_t completion_type{0}; + uint32_t kind{0}; +}; + +struct AICoreCompletionMailbox { + // head and tail live on their own cache lines so producer CAS contention + // on head can't false-share with the consumer's tail updates. + alignas(PTO2_ALIGN_SIZE) std::atomic head; + uint8_t _head_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)]; + alignas(PTO2_ALIGN_SIZE) std::atomic tail; + uint8_t _tail_pad[PTO2_ALIGN_SIZE - sizeof(uint64_t)]; + alignas(PTO2_ALIGN_SIZE) AICoreCompletionMailboxMessage entries[AICORE_COMPLETION_MAILBOX_CAPACITY]; + + // Cheap, lock-free pending hint. 
Callers may invoke this outside the + // consumer lock; a stale answer only over/under-triggers a drain attempt.
+ bool has_pending() { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); } + + // MPSC push for a CONDITION message. Returns false when the ring is full + // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry. + // Lock-free: CAS the shared head to claim a slot, write the fields, then + // release-store seq so the single consumer observes the publication. + // + // The head CAS is relaxed: head is a pure ticket counter and carries no + // data to the consumer — publication is solely the seq release-store, and + // slot-reuse safety rests on the acquire load of tail. The relaxed failure + // order is likewise sufficient since a lost CAS just re-reads head and + // retries. compare_exchange_weak is used because this loop already re-reads + // head and re-checks fullness, so masking LL/SC spurious failures (what + // _strong adds on aarch64) would only be a redundant inner retry. + // + // Safe to call concurrently from any number of producers; structurally + // independent of the AsyncWaitList::busy lock. 
+ bool try_push_condition( + PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type + ) { + while (true) { + uint64_t h = head.load(std::memory_order_relaxed); + uint64_t t = tail.load(std::memory_order_acquire); + if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false; + uint64_t new_head = h + 1; + if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) { + AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK]; + slot->task_token.raw = task_token.raw; + slot->addr = addr; + slot->expected_value = expected_value; + slot->engine = engine; + slot->completion_type = completion_type; + slot->kind = MSG_KIND_CONDITION; + slot->seq.store(new_head, std::memory_order_release); + return true; + } + // CAS lost (another producer claimed the slot, or the weak CAS failed spuriously): retry with refreshed head.
+ } + } + + // MPSC push for a TASK_NORMAL_DONE sentinel. Carries the PTO2TaskSlotState + // pointer in the `addr` field so the consumer can finish binding the + // AsyncWaitEntry.slot_state without going back to the FIN-handling thread. Uses the same claim/publish protocol as try_push_condition and likewise returns false when the ring is full. + bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) { + while (true) { + uint64_t h = head.load(std::memory_order_relaxed); + uint64_t t = tail.load(std::memory_order_acquire); + if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false; + uint64_t new_head = h + 1; + if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) { + AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK]; + slot->task_token.raw = task_token.raw; + slot->addr = slot_state_addr; + slot->expected_value = 0; + slot->engine = 0; + slot->completion_type = 0; + slot->kind = MSG_KIND_TASK_NORMAL_DONE; + slot->seq.store(new_head, std::memory_order_release); + return true; + } + } + } + + // Single-consumer transport-level dequeue (caller holds the consumer lock). + // Returns false at the first not-yet-published slot (gap) or when empty; + // otherwise copies the next message in tail order into `out`, advances + // tail, and returns true. tail is consumer-only-written (relaxed read); + // head bounds the scan (relaxed); the seq acquire is the real publication + // gate; the tail release publishes "slot free" to reusing producers.
+ bool try_pop(AICoreCompletionMsgView &out) { + uint64_t t = tail.load(std::memory_order_relaxed); + uint64_t h = head.load(std::memory_order_relaxed); + if (t >= h) return false; + AICoreCompletionMailboxMessage *slot = &entries[t & AICORE_COMPLETION_MAILBOX_MASK]; + if (slot->seq.load(std::memory_order_acquire) != t + 1) return false; // slot claimed by a producer but not yet published + out.task_token.raw = slot->task_token.raw; + out.addr = slot->addr; + out.expected_value = slot->expected_value; + out.engine = slot->engine; + out.completion_type = slot->completion_type; + out.kind = slot->kind; + tail.store(t + 1, std::memory_order_release); + return true; + } +}; + +static_assert( + sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned" +); + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h new file mode 100644 index 000000000..24c04c09e --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/aicore_completion_mailbox_types.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License.
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_ + +#include + +#include "pto_constants.h" + +// Types shared across the AICore↔AICPU boundary. +// +// This header is reachable from AICore-side translation units (via +// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h) +// and must stay parseable by every AICore toolchain configuration: no +// <atomic>, no __atomic_* intrinsics, no MPSC ring buffer struct. +// +// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in +// aicore_completion_mailbox.h, which is AICPU-only. + +inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64; + +#define COMPLETION_ENGINE_SDMA 0u +#define COMPLETION_ENGINE_ROCE 1u +#define COMPLETION_ENGINE_URMA 2u +#define COMPLETION_ENGINE_CCU 3u + +#define COMPLETION_TYPE_COUNTER 0 +#define COMPLETION_TYPE_SDMA_EVENT_RECORD 1 + +// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch +// area that AICore writes into to record "this completion has to be observed +// before the task can retire." The FIN-handling scheduler thread reads the +// slab, flattens entries into AICoreCompletionMailbox messages, and forwards +// them to the dispatch thread. `volatile` here is load-bearing: writers live +// on AICore and readers on AICPU, so the qualifier is the correct way to +// pin the compiler against caching / reordering on either side.
+struct DeferredCompletionEntry { + uint64_t addr; + uint32_t expected_value; + uint32_t engine; // presumably one of COMPLETION_ENGINE_* — confirm + int32_t completion_type; // presumably one of COMPLETION_TYPE_* — confirm + uint32_t _pad; // explicit padding; the five fields total the 24 bytes asserted below +}; + +static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift"); + +struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab { + volatile uint32_t count; // presumably the number of valid entries[] — confirm against the writer + volatile int32_t error_code; + DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK]; // NOTE(review): not volatile-qualified, unlike count / error_code — confirm this is intended +}; + +static_assert( + sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0, + "DeferredCompletionSlab size must preserve array element cache-line boundaries" +); + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h new file mode 100644 index 000000000..5e596e17b --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_kernel.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License.
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_KERNEL_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_KERNEL_H_ + +#include + +#include +#include + +#include "pto_async_kernel_api.h" +#include "aicore_completion_mailbox_types.h" +#include "pto_runtime_status.h" + +#ifndef __aicore__ +#define __aicore__ +#endif +#ifndef __gm__ +#define __gm__ +#endif + +// Re-exposed PTO-ISA constant so examples / callers don't need to include +// just to spell their scratch tile. +inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE; + +enum class SdmaOp : uint8_t { + TGET = 0, // send_request_entry issues pto::comm::TGET_ASYNC for this value + TPUT = 1, // send_request_entry issues pto::comm::TPUT_ASYNC for any non-TGET value +}; + +// SdmaRequestDescriptor bundles everything send_request_entry needs to drive +// one SDMA transfer + completion registration. It is a template because the +// destination / source / scratch types carry tensor shape & stride at compile +// time; the SdmaTget() / SdmaTput() helpers below let callers skip the +// template arguments. +// +// sync_id selects which event-record slot inside the workspace the engine +// writes into. Concurrent dispatches must use distinct sync_ids; today every +// caller submits one request per kernel invocation so passing 0 is safe. +// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2) +// will fold sync_id allocation into the adapter.
+template +struct SdmaRequestDescriptor { + SdmaOp op; + DstTensor dst; + SrcTensor src; + ScratchTileT scratch; + __gm__ uint8_t *workspace; + uint32_t sync_id; +}; + +template +inline __aicore__ SdmaRequestDescriptor SdmaTget( + const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, + uint32_t sync_id = 0 +) { + return SdmaRequestDescriptor{SdmaOp::TGET, dst, src, + scratch, workspace, sync_id}; +} + +template +inline __aicore__ SdmaRequestDescriptor SdmaTput( + const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, + uint32_t sync_id = 0 +) { + return SdmaRequestDescriptor{SdmaOp::TPUT, dst, src, + scratch, workspace, sync_id}; +} + +namespace pto2::detail { + +inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) { + CompletionToken token{ + reinterpret_cast(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0 + }; + (void)register_completion_condition(ctx, token); +} + +template +inline __aicore__ void +register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) { + if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) { + (void)event.Wait(session); + return; + } + if (event.handle == 0) { + return; + } + + const uint32_t engine = static_cast(event.engine); + if (engine != static_cast(::pto::comm::DmaEngine::SDMA)) { + defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID); + return; + } + + ::pto::comm::sdma::detail::UbTmpBuf tmp_buf; + uint32_t sync_id = 0; + __gm__ uint8_t *recv_workspace = nullptr; + uint32_t queue_num = 0; + if (!::pto::comm::sdma::detail::PrepareEventCheck( + session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num + )) { + defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID); + return; + } + for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) { + 
register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id)); + } +} + +} // namespace pto2::detail + +// SDMA overload of the runtime's send_request_entry. Submits the descriptor +// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the +// AsyncCtx deferred-wait slab and flushes. Returns false only when the async session cannot be built +// (error recorded in ctx.completion_error_code); event-registration errors are recorded too but return true. +template +inline __aicore__ bool +send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor desc) { + pto::comm::AsyncSession session; + if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) { + pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID); + return false; + } + + pto::comm::AsyncEvent event; + if (desc.op == SdmaOp::TGET) { + event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session); + } else { + event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session); + } + pto2::detail::register_pto_async_event(ctx, event, session); + pto2::detail::defer_flush(ctx); + return true; +} + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_KERNEL_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h new file mode 100644 index 000000000..107fab62d --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/backend/sdma/sdma_completion_scheduler.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_SCHEDULER_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_SCHEDULER_H_ + +#include +#include + +#include "aicpu/platform_regs.h" +#include "aicore_completion_mailbox.h" +#include "pto_completion_token.h" +#include "pto_runtime_status.h" + +// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only +// allowed holder of this ABI knowledge; the generic scheduler dispatches into +// the helpers below through the completion ops table. +struct SdmaEventRecord { + uint32_t flag; + uint32_t sq_tail; + uint64_t channel_info; +}; + +static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift"); +static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift"); + +inline uintptr_t sdma_completion_cache_line(const volatile void *addr) { + return reinterpret_cast(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); +} + +inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) { + if (record_addr == 0) { + return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID}; + } + volatile SdmaEventRecord *record = + reinterpret_cast(static_cast(record_addr)); + cache_invalidate_range(reinterpret_cast(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE); + uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE); + return {flag != 0 ? 
CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE}; +} + +inline void retire_sdma_event_record(uint64_t record_addr) { + if (record_addr == 0) return; + volatile SdmaEventRecord *record = + reinterpret_cast(static_cast(record_addr)); + cache_invalidate_range(reinterpret_cast(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE); + uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE); + uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE); + + volatile uint64_t *record_head = reinterpret_cast(record); + __atomic_store_n(record_head, 0ULL, __ATOMIC_RELEASE); + cache_flush_range(const_cast(reinterpret_cast(record_head)), sizeof(uint64_t)); + + if (channel_info_addr == 0) return; + uint64_t packed = (static_cast(completed_tail) << 32) | static_cast(completed_tail); + volatile uint64_t *channel_info = reinterpret_cast(static_cast(channel_info_addr)); + __atomic_store_n(channel_info, packed, __ATOMIC_RELEASE); + cache_flush_range(const_cast(reinterpret_cast(channel_info)), sizeof(uint64_t)); +} + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_BACKEND_SDMA_SDMA_COMPLETION_SCHEDULER_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/common.h b/src/a5/runtime/fully_distributed_within_core/runtime/common.h new file mode 100644 index 000000000..9dcf438ed --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/common.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#pragma once + +#include +#include + +// Assertion macros (always_assert / debug_assert), AssertionError, and the +// MAYBE_UNINITIALIZED diagnostics live in the shared header so the unified +// Tensor (src/common/task_interface/tensor.h) can use them without depending +// on this runtime-specific header. assert_impl / get_stacktrace are defined in +// orchestration/common.cpp for runtime targets. +#include "assert_compat.h" + +// Framework-internal TLS bridge. The executor binds the current thread's +// runtime before invoking the orchestration entry, so orchestration helpers can +// fetch the current PTO2Runtime without explicit parameter threading. Declared +// here (rather than in pto_orchestration_api.h) so that framework TUs the AICore +// build also compiles — notably orchestration/common.cpp — can see these symbols +// without pulling in pto_types.h, whose Arg::add_scalar → to_u64 path is +// __aicore__-only and would break the ccec build. +#ifdef __cplusplus +extern "C" { +#endif +struct PTO2Runtime; +PTO2Runtime *framework_current_runtime(void); +void framework_bind_runtime(PTO2Runtime *rt); +#ifdef __cplusplus +} +#endif diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h new file mode 100644 index 000000000..cae275625 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto2_dispatch_payload.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file pto2_dispatch_payload.h + * @brief Per-core dispatch payload for AICore kernel execution + * + * PTO2DispatchPayload holds the kernel function address, a per-core args[] + * array, and embedded SPMD context (LocalContext + GlobalContext). AICPU + * maintains a static array of these (one per core). + * + * GlobalContext (sub_block_id) is initialized once at runtime startup via + * init_global_context() and never modified afterwards. + * + * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload() + * before each dispatch. Both context struct pointers are written into the + * args[] suffix on every dispatch (since args[] is rebuilt entirely each time). + * + * AICore caches a pointer to its per-core slot at startup and reads from + * it on each dispatch. The struct is cache-line aligned to avoid false + * sharing across concurrently dispatched cores. + * + * The DATA_MAIN_BASE register protocol is unchanged from the base runtime: + * a monotonically increasing reg_task_id signals new work to AICore. 
+ */ + +#pragma once + +#include + +#include "arg_direction.h" +#include "intrinsic.h" + +/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */ +#ifndef PTO2_DISPATCH_MAX_ARGS +#define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT) +#endif + +#ifndef PTO2_ALIGN_UP +#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1)) +#endif + +// Verify hardcoded indices in intrinsic.h match the computed values. +static_assert( + (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h" +); +static_assert( + (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX, + "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h" +); + +/** + * Per-core dispatch payload: function address + args[] + SPMD context. + * + * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER]. + * AICore caches a pointer to its per-core slot at startup (via Handshake.task) + * and reads from it on each dispatch. + * + * The struct is cache-line aligned to prevent false sharing across + * concurrently dispatched cores. + */ +struct alignas(64) PTO2DispatchPayload { + uint64_t function_bin_addr; /**< Kernel entry address in GM (set by Scheduler) */ + uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */ + + /** Per-dispatch context: block_idx and block_num. + * Written by build_payload() before each dispatch. + * args[SPMD_LOCAL_CONTEXT_INDEX] points here. */ + LocalContext local_context; + + /** Per-core global context: sub_block_id (AIV lane identity). + * Initialized once by init_global_context() at runtime startup. + * args[SPMD_GLOBAL_CONTEXT_INDEX] points here. 
*/ + GlobalContext global_context; + + uint8_t reserved_payload_abi_pad[8]; + + static_assert(sizeof(args[0]) == 8); + static_assert( + PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) == + (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]) + ); +}; + +static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift"); diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h new file mode 100644 index 000000000..cf6eb4790 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_kernel_api.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef PTO_ASYNC_KERNEL_API_H +#define PTO_ASYNC_KERNEL_API_H + +#include + +#include +#include + +#include "intrinsic.h" +#include "aicore_completion_mailbox_types.h" +#include "pto_completion_token.h" +#include "pto_runtime_status.h" + +#ifndef __aicore__ +#define __aicore__ +#endif +#ifndef __gm__ +#define __gm__ +#endif + +// Public surface: get_async_ctx, async_ctx_is_deferred, +// register_completion_condition, send_notification, +// save_expected_notification_counter. 
Everything else lives in +// pto2::detail and is reserved for backend adapters / internal use. +namespace pto2::detail { + +inline __aicore__ void defer_load_slab(AsyncCtx &ctx) { + if (ctx.completion_count == nullptr) return; +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uintptr_t line = reinterpret_cast(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); + dcci((__gm__ int32_t *)line, SINGLE_CACHE_LINE); +#else + __asm__ __volatile__("" ::: "memory"); +#endif +} + +inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) { + if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) { + *ctx.completion_error_code = error_code; + } +} + +inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) { + if (addr == nullptr || size_bytes == 0) return; +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uintptr_t start = reinterpret_cast(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); + uintptr_t end = + (reinterpret_cast(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); + for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) { + dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT); + } +#else + (void)addr; + (void)size_bytes; +#endif +} + +inline __aicore__ void defer_flush(AsyncCtx &ctx) { + if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return; +#if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__) + uint32_t count = *ctx.completion_count; + if (count > ctx.completion_capacity) { + count = ctx.completion_capacity; + } + uint32_t flush_bytes = static_cast(sizeof(*ctx.completion_count)); + if (ctx.completion_error_code != nullptr) { + flush_bytes += static_cast(sizeof(*ctx.completion_error_code)); + } + if (ctx.completion_entries != nullptr) { + flush_bytes += count * static_cast(sizeof(DeferredCompletionEntry)); + } + 
defer_flush_range(ctx.completion_count, flush_bytes); +#if defined(__CPU_SIM) + dsb(0); +#else + dsb(DSB_DDR); +#endif + pipe_barrier(PIPE_ALL); +#else + (void)ctx; + __asm__ __volatile__("" ::: "memory"); +#endif +} + +} // namespace pto2::detail + +inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) { + __gm__ LocalContext *lc = + reinterpret_cast<__gm__ LocalContext *>(static_cast(args[PAYLOAD_LOCAL_CONTEXT_INDEX])); + AsyncCtx ctx{}; + ctx.completion_count = lc->async_ctx.completion_count; + ctx.completion_error_code = lc->async_ctx.completion_error_code; + ctx.completion_entries = lc->async_ctx.completion_entries; + ctx.completion_capacity = lc->async_ctx.completion_capacity; + ctx.task_token.raw = lc->async_ctx.task_token.raw; + pto2::detail::defer_load_slab(ctx); + return ctx; +} + +inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); } + +// Canonical writer: backend submit handlers build a CompletionToken and pass +// it here. Writes one DeferredCompletionEntry to the AsyncCtx slab and +// bumps completion_count. Returns false on overflow (also stores +// PTO2_ERROR_ASYNC_WAIT_OVERFLOW in ctx.completion_error_code) or when ctx is +// not currently a deferred context. 
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) { + if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) { + return false; + } + + uint32_t idx = *ctx.completion_count; + if (idx >= ctx.completion_capacity) { + if (ctx.completion_error_code != nullptr) { + *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW; + } + return false; + } + + volatile __gm__ DeferredCompletionEntry *slot = &ctx.completion_entries[idx]; + slot->addr = token.addr; + slot->expected_value = token.expected_value; + slot->engine = token.engine; + slot->completion_type = token.completion_type; + slot->_pad = 0; + *ctx.completion_count = idx + 1; + return true; +} + +inline __aicore__ void +send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) { + __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr)); + pto::comm::Signal signal(counter); + pto::comm::TNOTIFY(signal, value, notify_op); +} + +inline __aicore__ void +save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) { + CompletionToken token{ + reinterpret_cast(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0 + }; + (void)register_completion_condition(ctx, token); + pto2::detail::defer_flush(ctx); +} + +#endif // PTO_ASYNC_KERNEL_API_H diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_wait.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_wait.h new file mode 100644 index 000000000..65608ad2f --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_async_wait.h @@ -0,0 +1,303 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef PTO_ASYNC_WAIT_H +#define PTO_ASYNC_WAIT_H + +#include +#include +#include + +#include "aicpu/platform_regs.h" +#include "backend/sdma/sdma_completion_scheduler.h" +#include "intrinsic.h" +#include "aicore_completion_mailbox.h" +#include "pto_completion_token.h" +#include "pto_runtime2_types.h" + +struct PTO2SchedulerState; +struct PTO2LocalReadyBuffer; +struct CompletionStats; + +inline constexpr int32_t MAX_ASYNC_WAITS = 64; + +// The mailbox transport (has_pending / try_push_condition / +// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member +// functions in aicore_completion_mailbox.h. This file only holds the +// application layer: translating drained messages into wait-list state. 
+ +inline uintptr_t mailbox_cache_line(const volatile void *addr) { + return reinterpret_cast(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u); +} + +struct CompletionCondition; + +using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &); +using CompletionRetireFn = void (*)(CompletionCondition &); + +struct CompletionBackendOps { + CompletionPollFn poll; + CompletionRetireFn retire; +}; + +struct CompletionCondition { + AsyncEngine engine{ASYNC_ENGINE_SDMA}; + int32_t completion_type{COMPLETION_TYPE_COUNTER}; + bool satisfied{false}; + bool retired{false}; + volatile uint32_t *counter_addr{nullptr}; + uint64_t addr{0}; + uint32_t expected_value{0}; + + CompletionPollResult test() const; + void retire(); +}; + +// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in +// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin +// glue mapping CompletionCondition.addr into the backend's raw-addr helpers. +inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) { + if (cond.counter_addr == nullptr) { + return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID}; + } + return { + *cond.counter_addr >= cond.expected_value ? 
CompletionPollState::READY : CompletionPollState::PENDING, + PTO2_ERROR_NONE + }; +} + +inline void counter_retire_op(CompletionCondition & /*cond*/) {} + +inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) { + return poll_sdma_event_record(cond.addr); +} + +inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); } + +inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) { + static const CompletionBackendOps kOps[] = { + {counter_poll_op, counter_retire_op}, // COMPLETION_TYPE_COUNTER = 0 + {sdma_event_record_poll_op, sdma_event_record_retire_op}, // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1 + }; + constexpr int kOpsCount = static_cast(sizeof(kOps) / sizeof(kOps[0])); + if (completion_type < 0 || completion_type >= kOpsCount) return nullptr; + return &kOps[completion_type]; +} + +inline CompletionPollResult CompletionCondition::test() const { + if (satisfied) { + return {CompletionPollState::READY, PTO2_ERROR_NONE}; + } + const CompletionBackendOps *ops = completion_backend_ops_for(completion_type); + if (ops == nullptr || ops->poll == nullptr) { + return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID}; + } + return ops->poll(*this); +} + +inline void CompletionCondition::retire() { + if (retired) return; + const CompletionBackendOps *ops = completion_backend_ops_for(completion_type); + if (ops != nullptr && ops->retire != nullptr) { + ops->retire(*this); + } + retired = true; +} + +struct AsyncWaitEntry { + PTO2TaskSlotState *slot_state{nullptr}; + PTO2TaskId task_token{PTO2TaskId::invalid()}; + CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK]; + int32_t condition_count{0}; + int32_t waiting_completion_count{0}; + bool normal_done{false}; +}; + +struct AsyncPollResult { + int32_t completed{0}; + int32_t error_code{PTO2_ERROR_NONE}; + PTO2TaskSlotState *failed_slot_state{nullptr}; +}; + +inline const char 
*async_engine_name(AsyncEngine engine) { + switch (engine) { + case ASYNC_ENGINE_SDMA: + return "SDMA"; + case ASYNC_ENGINE_ROCE: + return "ROCE"; + case ASYNC_ENGINE_URMA: + return "URMA"; + case ASYNC_ENGINE_CCU: + return "CCU"; + default: + return "UNKNOWN"; + } +} + +struct AsyncWaitList { + std::atomic busy{0}; + AsyncWaitEntry entries[MAX_ASYNC_WAITS]; + int32_t count{0}; + // Diagnostic: counts every FIN-side try_push that hit a full mailbox. + // Expected to stay zero on real workloads (ring is 4096 entries); a + // non-zero value means consumers are too slow or the ring is undersized. + // Read by scheduler shutdown / l2 perf summary; not on the hot path. + std::atomic mpsc_skipped_count{0}; + + bool try_lock() { + int32_t expected = 0; + return busy.compare_exchange_strong(expected, 1, std::memory_order_acquire, std::memory_order_relaxed); + } + + void unlock() { busy.store(0, std::memory_order_release); } + + AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) { + for (int32_t i = 0; i < count; i++) { + if (entries[i].task_token == token) return &entries[i]; + } + return nullptr; + } + + // Captures the side-channel a scheduler-aware drain needs to complete + // NotDeferred tasks inline (without storing a transient entry in + // entries[]). + struct DrainCompletionSink { + PTO2SchedulerState *sched{nullptr}; + PTO2LocalReadyBuffer *local_bufs{nullptr}; + PTO2TaskSlotState **deferred_release_slot_states{nullptr}; + int32_t *deferred_release_count{nullptr}; + int32_t deferred_release_capacity{0}; + int32_t inline_completed{0}; +#if PTO2_SCHED_PROFILING + int32_t thread_idx{0}; +#endif + + bool can_inline_complete() const { return sched != nullptr; } + }; + + // Inline-complete a NotDeferred task during drain. Returns false on + // deferred_release_slot_states overflow. 
+ bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state); + + // Single-consumer drain: pop each published message in tail order and + // translate it into wait-list state. An empty sink (sched == nullptr) just + // materializes entries; a sched-aware sink additionally inline-completes + // lonely NotDeferred NORMAL_DONEs without ever growing entries[]. + int32_t drain_aicore_completion_mailbox_locked( + AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code + ) { + error_code = PTO2_ERROR_NONE; + if (aicore_mailbox == nullptr) return 0; + + int32_t drained = 0; + AICoreCompletionMsgView msg; + // try_pop is the transport layer (seq-gated, in-order dequeue); this + // loop is the application layer (translate each message into wait-list + // state). try_pop returns false at the first gap or when empty. + while (aicore_mailbox->try_pop(msg)) { + drained++; + if (msg.kind == MSG_KIND_CONDITION) { + AsyncWaitEntry *entry = find_entry_by_token(msg.task_token); + if (entry == nullptr) { + // First message for this task — materialize the entry here. + // slot_state stays null until the matching TASK_NORMAL_DONE + // sentinel arrives. 
+ if (count >= MAX_ASYNC_WAITS) { + error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW; + return drained; + } + entry = &entries[count++]; + entry->task_token = msg.task_token; + entry->slot_state = nullptr; + entry->condition_count = 0; + entry->waiting_completion_count = 0; + entry->normal_done = false; + } + if (!append_condition_locked( + *entry, msg.addr, msg.expected_value, static_cast(msg.engine), msg.completion_type, + error_code + )) { + return drained; + } + } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) { + PTO2TaskSlotState *slot_state_ptr = + reinterpret_cast(static_cast(msg.addr)); + AsyncWaitEntry *entry = find_entry_by_token(msg.task_token); + if (entry == nullptr) { + // Producers strictly order: all CONDITIONs for token T are + // pushed before the matching NORMAL_DONE (the acq_rel on + // on_subtask_complete enforces this across producers). So + // observing NORMAL_DONE first => the task registered no + // conditions => NotDeferred. Complete it inline when the + // sink allows; otherwise fall back to the entry-store path. 
+ if (sink.can_inline_complete()) { + (void)try_inline_complete_locked(sink, *slot_state_ptr); + continue; + } + if (count >= MAX_ASYNC_WAITS) { + error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW; + return drained; + } + entry = &entries[count++]; + entry->task_token = msg.task_token; + entry->slot_state = slot_state_ptr; + entry->condition_count = 0; + entry->waiting_completion_count = 0; + entry->normal_done = true; + } else { + if (entry->slot_state == nullptr) { + entry->slot_state = slot_state_ptr; + } + entry->normal_done = true; + } + } else { + error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED; + return drained; + } + } + return drained; + } + + bool append_condition_locked( + AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type, + int32_t &error_code + ) { + if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) { + error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED; + return false; + } + CompletionCondition &cond = entry.conditions[entry.condition_count++]; + cond.engine = engine; + cond.completion_type = completion_type; + cond.satisfied = false; + cond.retired = false; + cond.addr = addr; + cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ? 
+ reinterpret_cast(static_cast(addr)) : + nullptr; + cond.expected_value = expected_value; + entry.waiting_completion_count++; + return true; + } + + template + AsyncPollResult poll_and_complete( + AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs, + PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, + int32_t deferred_release_capacity +#if PTO2_SCHED_PROFILING + , + int thread_idx +#endif + ); +}; + +#endif // PTO_ASYNC_WAIT_H diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_completion_token.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_completion_token.h new file mode 100644 index 000000000..45cdb0b51 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_completion_token.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_ + +#include + +#include "aicore_completion_mailbox_types.h" +#include "pto_runtime_status.h" + +// CompletionToken is the runtime-internal POD that backend submit handlers +// produce and the generic register_completion_condition() consumes. It is the +// ABI contract for "this is one completion to wait on" — independent of which +// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's +// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by +// completion_type. +struct CompletionToken { + uint64_t addr; + uint32_t expected_value; + uint32_t engine; + int32_t completion_type; + uint64_t backend_cookie; +}; + +enum class CompletionPollState : uint8_t { + PENDING = 0, + READY = 1, + FAILED = 2, +}; + +struct CompletionPollResult { + CompletionPollState state{CompletionPollState::PENDING}; + int32_t error_code{PTO2_ERROR_NONE}; +}; + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_COMPLETION_TOKEN_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_constants.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_constants.h new file mode 100644 index 000000000..0707f53f9 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_constants.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_ + +#define PTO2_ALIGN_SIZE 64 // Cache line alignment +#define PTO2_PACKED_OUTPUT_ALIGN 1024 // Each output in packed buffer aligned to 1024B; gap is padding +#define PTO2_ALIGN_UP(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_CONSTANTS_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h new file mode 100644 index 000000000..1f78a78e5 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_dep_compute.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file pto_dep_compute.h + * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay. + * + * Two header-only template entry points: + * + * compute_task_fanin — STEP 3 in submit_task: per-tensor creator retention (Step A) + * + tensormap.lookup for INPUT/INOUT (Step B). Calls back into + * user-supplied `emit` for each producer it identifies. + * + * register_task_outputs — STEP 4 in submit_task: tensormap.insert for INOUT and + * OUTPUT_EXISTING tensors. No callbacks. + * + * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its + * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the + * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would + * require two emit semantics or a marginal behavior change in transients — not worth + * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own. + * + * The Emit callback contract: + * bool emit(PTO2TaskId producer); + * - return true to continue (whether or not the producer was actually recorded — + * producer-not-alive / dedup-hit / etc. all return true silently) + * - return false to signal fatal (e.g. fanin spill overflow); caller bails + * + * Performance: Emit is a template parameter, not std::function. Both runtime + * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge + * vector) instantiate at the call site and inline through. Do NOT replace with + * std::function — it would break the inlining and add ~5 ns/call to the orch hot path. 
+ */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_ + +#include + +#include "pto_task_id.h" +#include "pto_tensormap.h" +#include "pto_types.h" // TensorRef +#include "tensor.h" + +/** + * View struct for inputs to compute_task_fanin / register_task_outputs. + * + * Both runtime and replay assemble one of these from their own data sources + * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All + * pointer arrays must remain valid for the duration of the call. + */ +struct DepInputs { + int32_t tensor_count; + const TensorRef *tensors; // length = tensor_count (union; OUTPUT slots' .ptr is unused) + const TensorArgType *arg_types; // length = tensor_count + int32_t explicit_dep_count; + const PTO2TaskId *explicit_deps; // length = explicit_dep_count (validity checked by caller) +}; + +/** + * Compute fanin for a task being submitted (STEP 3: Step A creator retention + + * Step B tensormap modifier lookup). + * + * No-op (returns true without emitting) when in_manual_scope. For each non-OUTPUT tensor: + * - If owner_task_id is valid, emit(owner) + * - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit + * each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry). + * + * @return true on success (or producer-skipped-silently); false if emit signaled + * fatal — caller should propagate (after any fatal bookkeeping done by emit). + */ +template +[[nodiscard]] inline bool +compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) { + if (in_manual_scope) { + return true; + } + + for (int32_t i = 0; i < inputs.tensor_count; i++) { + TensorArgType ptype = inputs.arg_types[i]; + if (ptype == TensorArgType::OUTPUT) { + // Runtime-created OUTPUT tensors are not looked up in the TensorMap since + // they have no dependencies.
+ continue; + } + + const Tensor *tensor = &inputs.tensors[i].ref(); + + // Step A: creator retention — all existing tensors extend their creator lifetime. + PTO2TaskId owner = tensor->owner_task_id; + if (owner.is_valid()) { + if (!emit(owner)) { + return false; + } + } + + // Step B: only INPUT/INOUT need modifier dependency lookup. + if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) { + continue; + } + if (tensor->manual_dep) { + continue; + } + + bool fatal = false; + tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool { + if (!emit(entry.producer_task_id)) { + fatal = true; + return false; // stop iteration + } + if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) { + tensor_map.remove_entry(entry); + } + return true; + }); + if (fatal) { + return false; + } + } + return true; +} + +/** + * Register a task's outputs in the tensormap (STEP 4 in submit_task). + * + * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the + * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer. + * + * No-op when in_manual_scope. 
+ */ +inline void +register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) { + if (in_manual_scope) { + return; + } + for (int32_t i = 0; i < inputs.tensor_count; i++) { + TensorArgType ptype = inputs.arg_types[i]; + if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) { + const Tensor *tensor = &inputs.tensors[i].ref(); + if (!tensor->manual_dep) { + tensor_map.insert(*tensor, task_id); + } + } + } +} + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp new file mode 100644 index 000000000..09e0f35a5 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.cpp @@ -0,0 +1,977 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Orchestrator Implementation + * + * Implements orchestrator state management, scope handling, and task submission. 
+ * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_orchestrator.h" + +#include +#include +#include +#include +#include +#include + +#include "aicpu/dep_gen_collector_aicpu.h" +#include "common/dep_gen.h" +#include "common/unified_log.h" +#include "pto_dep_compute.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "pto_types.h" +#include "tensor.h" + +#if PTO2_PROFILING +#include "aicpu/tensor_dump_aicpu.h" +#endif + +// Verify the captured Tensor blob size in DepGenRecord matches the runtime +// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without +// including runtime/tensor.h, so this check lives at the orch callsite. +static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)"); +// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime +// imposes no hard cap on explicit dep count. If a submit exceeds this cap, +// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is +// unaffected, only the captured replay record is truncated. + +// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in +// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay) +// link these no-op stubs so the runtime translation unit is self-contained. +// Visibility is hidden so the HOST .so doesn't export them into the global +// dynamic symbol table where they'd shadow the AICPU .so's strong symbols +// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below).
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; } +__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit( + uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, const int32_t[3] +) {} + +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" + +// Scope_stats enable gate, queried via the same predicate idiom as +// is_dep_gen_enabled. The AICPU collector links the strong definition; host +// builds fall back to this weak `false`. Gating here still skips the +// cross-agent occupancy reads that feed the sample when scope_stats is disabled. +extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; } + +// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each +// wrap. Strong definition lives in the AICPU collector; host builds fall back to +// this weak no-op so the runtime translation unit stays self-contained. +extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {} +#endif + +// ============================================================================= +// Orchestrator Profiling (compile-time toggle) +// ============================================================================= +#if PTO2_ORCH_PROFILING +#include "aicpu/device_time.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" +// Weak fallback for builds that don't link device_time.cpp (e.g. host). +// The strong symbol from platform/.../device_time.cpp wins in the AICPU build. +// +// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from +// exporting this weak fallback into the global dynamic symbol table via +// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry +// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's +// weak definition first (already in global table) and uses it — returning 0. 
+// With hidden visibility, the HOST .so does not export this symbol globally, +// so the AICPU .so's PLT resolves to its own strong definition from +// device_time.cpp. +__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } +// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp. +// The strong symbol from the AICPU build wins when profiling is available. +// Also hidden to prevent HOST .so from polluting the global symbol table. +__attribute__((weak, visibility("hidden"))) void +l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {} +// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export) +static uint64_t g_orch_sync_cycle = 0; // tensormap sync +static uint64_t g_orch_alloc_cycle = 0; // unified task+heap alloc +static uint64_t g_orch_args_cycle = 0; // param copy +static uint64_t g_orch_lookup_cycle = 0; // tensormap lookup + dep building +static uint64_t g_orch_insert_cycle = 0; // tensormap insert +static uint64_t g_orch_fanin_cycle = 0; // fanin list + early-return check +static uint64_t g_orch_scope_end_cycle = 0; // scope_end overhead +static int64_t g_orch_submit_count = 0; +static uint32_t g_orch_submit_idx = 0; +uint64_t g_orch_alloc_wait_cycle = 0; +uint64_t g_orch_fanin_wait_cycle = 0; +uint64_t g_orch_alloc_atomic_count = 0; +uint64_t g_orch_args_atomic_count = 0; +uint64_t g_orch_scope_end_atomic_count = 0; +// Cycle accumulation feeds the per-sub-step `g_orch_*_cycle` cumulatives +// printed in the cold-path log. Per-sub-step swim-lane phase records were +// dropped; the per-submit envelope record (CYCLE_COUNT_ORCH_SUBMIT_RECORD) +// is the only swim-lane emit on the orch path. 
+#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ + uint64_t _t0 = get_sys_cnt_aicpu(), _t1; \ + uint64_t _submit_start_ts = _t0 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ + do { \ + if (_prof_active) { \ + l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \ + } \ + } while (0) +#elif PTO2_PROFILING +#include "aicpu/device_time.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" +__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } +__attribute__((weak, visibility("hidden"))) void +l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {} +// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) +static uint32_t g_orch_submit_idx = 0; +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \ + uint64_t _t0 = _prof_active ? 
get_sys_cnt_aicpu() : 0, _t1 = 0; \ + uint64_t _submit_start_ts = _t0 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + } while (0) +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) \ + do { \ + if (_prof_active) { \ + _t1 = get_sys_cnt_aicpu(); \ + l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \ + } \ + } while (0) +#else +#define CYCLE_COUNT_START() +#define CYCLE_COUNT_LAP(acc) +#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid) +#endif + +static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) { + always_assert(orch != nullptr); + orch->fatal = true; + if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) { + return PTO2_ERROR_NONE; + } + + int32_t expected = PTO2_ERROR_NONE; + std::atomic &orch_error_code = orch->sm_header->orch_error_code; + if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) { + return error_code; + } + return expected; +} + +static void +orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) { + int32_t latched_code = orch_mark_fatal(orch, error_code); +#if PTO2_PROFILING + // Flush the active scope's peaks before the FATAL line so the diagnostic + // context lands adjacent in the log. Latched internally — safe to call + // from every cascaded report_fatal. 
+ scope_stats_on_fatal(); +#endif + + if (fmt == nullptr || fmt[0] == '\0') { + if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) { + unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code); + } else { + unified_log_error(func, "FATAL(code=%d)", error_code); + } + return; + } + + char message[1024]; + vsnprintf(message, sizeof(message), fmt, args); + if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) { + unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message); + return; + } + unified_log_error(func, "FATAL(code=%d): %s", error_code, message); +} + +void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) { + auto *orch = this; + va_list args; + va_start(args, fmt); + orch_report_fatal_v(orch, error_code, func, fmt, args); + va_end(args); +} + +static uint32_t next_fanin_seen_epoch(PTO2OrchestratorState *orch) { + uint32_t next = orch->fanin_seen_current_epoch + 1; + if (next == 0) { + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + memset( + orch->fanin_seen_epoch[r], 0, + static_cast(orch->sm_header->rings[r].task_window_size) * sizeof(uint32_t) + ); + } + next = 1; + } + orch->fanin_seen_current_epoch = next; + return next; +} + +struct PTO2FaninBuilder { + PTO2FaninBuilder(PTO2OrchestratorState *orch, PTO2FaninPool &spill_pool, uint32_t seen_epoch) : + count(0), + spill_start(0), + orch(orch), + seen_epoch(seen_epoch), + spill_pool(spill_pool) {} + int32_t count{0}; + int32_t spill_start{0}; + PTO2OrchestratorState *orch{nullptr}; + uint32_t seen_epoch{0}; + PTO2FaninPool &spill_pool; + PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP]; + + template + PTO2FaninForEachReturn for_each(Fn &&fn) const { + return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast(fn)); + } + + bool mark_seen(uint8_t prod_ring, int32_t prod_slot) { + if (prod_ring >= PTO2_MAX_RING_DEPTH || prod_slot < 0) { + 
return false; + } + uint32_t *seen = orch->fanin_seen_epoch[prod_ring]; + uint32_t slot = static_cast(prod_slot); + if (seen[slot] == seen_epoch) { + return true; + } + seen[slot] = seen_epoch; + return false; + } +}; + +static bool append_fanin_or_fail( + PTO2OrchestratorState *orch, uint8_t prod_ring, int32_t prod_slot, PTO2TaskSlotState *prod_state, + PTO2FaninBuilder *fanin_builder, uint8_t ring_id +) { + if (fanin_builder->mark_seen(prod_ring, prod_slot)) { + return true; + } + + if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) { + fanin_builder->inline_slots[fanin_builder->count++] = prod_state; + return true; + } + + PTO2FaninPool &fanin_pool = fanin_builder->spill_pool; + if (!fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1)) { + orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW); + return false; + } + int32_t spill_idx = fanin_pool.top; + PTO2FaninSpillEntry *entry = fanin_pool.alloc(); + if (entry == nullptr) { + orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW); + return false; + } + if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) { + fanin_builder->spill_start = spill_idx; + } + entry->slot_state = prod_state; + fanin_builder->count++; + return true; +} + +static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state); + +struct PTO2PreparedTask { + PTO2TaskId task_id = PTO2TaskId::invalid(); + PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr}; + PTO2TaskDescriptor *task = nullptr; + PTO2TaskPayload *payload = nullptr; + PTO2TaskSlotState *slot_state = nullptr; +}; + +static PTO2OutputLayout calculate_output_layout(const L0TaskArgs &args) { + PTO2OutputLayout layout; + for (int32_t i = 0; i < args.tensor_count(); i++) { + if (args.tag(i) != TensorArgType::OUTPUT) { + continue; + } + layout.offsets[i] = layout.total_output_size; + layout.buffer_sizes[i] = + PTO2_ALIGN_UP(args.tensor(i).create_info().buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN); + layout.total_output_size += 
layout.buffer_sizes[i]; + } + return layout; +} + +static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) { + always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope"); + + int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top]; + if (scope_task_count < allocator.window_size() - 1) { + return true; + } + + int32_t active_count = allocator.active_count(); + + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id); + LOG_ERROR("========================================"); + LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size()); + LOG_ERROR(" scope_depth: %d", orch->scope_stack_top + 1); + LOG_ERROR(" ring_id: %d", ring_id); + LOG_ERROR(" scope_task_count: %d", scope_task_count); + LOG_ERROR(" active_tasks: %d / %d", active_count, allocator.window_size()); + LOG_ERROR("Root Cause:"); + LOG_ERROR(" Tasks within a scope hold a fanout_count reference that is only"); + LOG_ERROR(" released at scope_end. When scope task count >= window_size,"); + LOG_ERROR(" no slots can be reclaimed -> deadlock."); + LOG_ERROR("Solution:"); + LOG_ERROR(" 1. Reduce tasks per scope (use batching/unroll)"); + LOG_ERROR(" 2. Increase task window (current: %d)", allocator.window_size()); + LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW="); + LOG_ERROR(" 3. 
Split work across multiple scopes"); + LOG_ERROR("========================================"); + orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK); + return false; +} + +static void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count) { + for (int32_t i = 0; i < tensor_count; i++) { + __builtin_prefetch(&payload->tensors[i], 1, 3); + __builtin_prefetch(reinterpret_cast(&payload->tensors[i]) + 64, 1, 3); + } + for (int32_t i = 0; i < scalar_count; i += 8) { + __builtin_prefetch(&payload->scalars[i], 1, 3); + } + __builtin_prefetch(payload, 1, 3); + __builtin_prefetch(reinterpret_cast(payload) + 64, 1, 3); + __builtin_prefetch(reinterpret_cast(payload) + 128, 1, 3); +} + +static bool prepare_task( + PTO2OrchestratorState *orch, const L0TaskArgs &args, int32_t total_output_size, ActiveMask active_mask, + PTO2PreparedTask *out +) { + uint8_t ring_id = orch->current_ring_id(); + auto &allocator = orch->rings[ring_id].task_allocator; + + if (!check_scope_can_accept_task(orch, allocator, ring_id)) { + return false; + } + + out->alloc_result = allocator.alloc(total_output_size); + if (out->alloc_result.failed()) { + orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK); + return false; + } + + out->task_id = PTO2TaskId::make(ring_id, static_cast(out->alloc_result.task_id)); + out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot); + out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot]; + out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot]; + + prefetch_payload(out->payload, args.tensor_count(), args.scalar_count()); + + // Re-bind payload/task pointers each submit. Value is per-slot constant + // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing + // here lets RingSchedState::init_data_from_layout() skip the + // O(window_size) bind loop. 
Both writes hit the same 64B slot_state + // cache line we're about to dirty below, so the extra cost is two + // stores on an already-hot line. Must precede the scheduler + // wiring.queue.push at the end of submit_task_common — that push is + // the first read of slot_state->task / slot_state->payload by another + // thread. + out->slot_state->bind_buffers(out->payload, out->task); + + // Fields already reset by advance_ring_pointers (eager reset after CONSUMED): + // fanout_lock=0, fanout_count=1, fanout_head=nullptr, + // fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0 + // Fields immutable after RingSchedState::init_data_from_layout(): + // ring_id + // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor + // observers); set to PENDING here when orchestrator actually reuses the slot. + out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed); + int16_t block_num = args.launch_spec.core_num(); + out->slot_state->total_required_subtasks = + static_cast(block_num * __builtin_popcount(active_mask.core_mask())); + out->slot_state->logical_block_num = block_num; + out->slot_state->active_mask = active_mask; + // fanin_count is set by scheduler during wiring + scope_tasks_push(orch, out->slot_state); + + return true; +} + +// ============================================================================= +// Scope Management +// ============================================================================= + +static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) { + if (orch->scope_tasks_size >= orch->scope_tasks_capacity) { + // scope_tasks lives in the per-Worker arena (single backing allocation), + // so realloc is not legal. Capacity == PTO2_SCOPE_TASKS_CAP == + // PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH, the total in-flight slot + // budget — hitting it means every ring is saturated, so no further push + // could succeed regardless of buffer growth. 
+ orch->report_fatal( + PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__, + "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity + ); + return; + } + orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state; +} + +void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) { + auto *orch = this; + if (orch->fatal) { + return; + } + assert(orch->scope_stack_top < static_cast(orch->scope_stack_capacity - 1) && "Scope stack overflow"); + if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) { + report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported"); + return; + } + + bool already_in_manual_scope = orch->in_manual_scope(); + ++orch->scope_stack_top; + orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size; + if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) { + orch->manual_begin_depth = orch->scope_stack_top; + } +#if PTO2_PROFILING + // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the + // collector call: when disabled we pay nothing. Sample the current ring's + // task/heap start-end and tensormap usage at the scope boundary. 
+ if (is_scope_stats_enabled()) { + uint8_t ring_id = orch->current_ring_id(); + auto &alloc = orch->rings[ring_id].task_allocator; + int32_t dep_pool_tail = 0; + int32_t dep_pool_top = 0; + if (orch->scheduler) { + orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top); + } + scope_stats_begin( + ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail, + dep_pool_top, orch->tensor_map.current_used() + ); + } +#endif +} + +void PTO2OrchestratorState::end_scope() { + auto *orch = this; + if (orch->fatal) { + return; + } + assert(orch->scope_stack_top >= 0 && "Scope stack underflow"); + + // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks + // via scheduler->on_scope_end, so the end record reflects the scope's + // occupancy at close, not the residual after teardown. +#if PTO2_PROFILING + // Gate via is_scope_stats_enabled() (see begin_scope). One collector call + // emits the end-boundary record and tears down bookkeeping. 
+ if (is_scope_stats_enabled()) { + uint8_t ring_id = orch->current_ring_id(); + auto &alloc = orch->rings[ring_id].task_allocator; + int32_t dep_pool_tail = 0; + int32_t dep_pool_top = 0; + if (orch->scheduler) { + orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top); + } + scope_stats_end( + ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail, + dep_pool_top, orch->tensor_map.current_used() + ); + } +#endif + +#if PTO2_ORCH_PROFILING + uint64_t _se0 = get_sys_cnt_aicpu(); +#endif + + bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth; + int32_t begin = orch->scope_begins[orch->scope_stack_top--]; + int32_t count = orch->scope_tasks_size - begin; + if (ending_manual_scope) { + orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH; + } + + if (orch->scheduler && count > 0) { + orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count); + } + + // Rewind the task buffer — these entries are no longer needed + orch->scope_tasks_size = begin; + +#if PTO2_ORCH_PROFILING + uint64_t _se1 = get_sys_cnt_aicpu(); + g_orch_scope_end_cycle += (_se1 - _se0); +#endif +} + +// ============================================================================= +// Task Submission +// ============================================================================= + +// Shared body for submit_task / submit_dummy_task. Caller has already validated +// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot +// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin +// computation (explicit_deps + auto), output registration, slot init, and pushes +// to the scheduler wiring queue. 
+static TaskOutputTensors submit_task_common( + PTO2OrchestratorState *orch, const L0TaskArgs &args, ActiveMask active_mask, int32_t aic_kernel_id, + int32_t aiv0_kernel_id, int32_t aiv1_kernel_id +) { + CYCLE_COUNT_START(); + TaskOutputTensors result; + PTO2OutputLayout layout = calculate_output_layout(args); + PTO2PreparedTask prepared; + if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) { + return result; + } + uint8_t ring_id = prepared.task_id.ring(); + PTO2SchedulerState *sched = orch->scheduler; + PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc; + PTO2TaskId task_id = prepared.task_id; + PTO2TaskSlotState &cur_slot_state = *prepared.slot_state; + PTO2TaskDescriptor &task = *prepared.task; + PTO2TaskPayload &payload = *prepared.payload; + result.set_task_id(task_id); + + // dep_gen capture point: snapshot the orch submit_task inputs while the + // tensormap is still in its pre-lookup state for this task. Replay reads + // these records offline to reconstruct the complete dep graph — the sole + // source of truth for fanout now that the swimlane hot path no longer + // records it. + if (is_dep_gen_enabled()) { + const void *tensor_ptrs[MAX_TENSOR_ARGS]; + // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record + // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow + // each tag here rather than letting the AICPU writer reinterpret a + // 4×-wider array as bytes — that path silently lost two of every three + // tags on little-endian and synthesized phantom self-edges in replay. + uint8_t arg_types_u8[MAX_TENSOR_ARGS]; + // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at + // MAX_TENSOR_ARGS: defensive against any future builder bypass / + // shared-memory bit-flip that could otherwise overrun the two + // MAX_TENSOR_ARGS-sized stack buffers above. + const int tc_raw = args.tensor_count(); + const int tc = tc_raw > MAX_TENSOR_ARGS ? 
MAX_TENSOR_ARGS : tc_raw; + for (int i = 0; i < tc; i++) { + // OUTPUT slots carry create_info (not yet a Tensor); skip them — + // they have no producer to look up and replay's per-tensor loop + // also skips OUTPUT. + tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : &args.tensor(i).ref(); + arg_types_u8[i] = static_cast(args.tag(i)); + } + const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id}; + dep_gen_aicpu_record_submit( + task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8, + static_cast(args.explicit_dep_count()), reinterpret_cast(args.explicit_deps_data()), + kernel_ids_capture + ); + } + + PTO2FaninBuilder fanin_builder(orch, orch->rings[ring_id].fanin_pool, next_fanin_seen_epoch(orch)); + + CYCLE_COUNT_LAP(g_orch_alloc_cycle); + +#if PTO2_PROFILING + if (layout.total_output_size > 0) { + orch->buffers_allocated++; + orch->bytes_allocated += layout.total_output_size; + } +#endif + + // === STEP 2: Sync TensorMap validity and optional cleanup === + // Read current last_task_alive from shared memory for this ring + int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire); + + orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive); + + CYCLE_COUNT_LAP(g_orch_sync_cycle); + + for (uint32_t i = 0; i < args.explicit_dep_count(); i++) { + PTO2TaskId dep_task_id = args.explicit_dep(i); + if (!dep_task_id.is_valid()) { + orch->report_fatal( + PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) 
requires valid task ids" + ); + return result; + } + uint8_t dep_ring_id = dep_task_id.ring(); + PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_ring_id]; + int32_t dep_local_task_id = static_cast(dep_task_id.local()); + int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire); + if (dep_local_task_id < dep_last_task_alive) { + continue; + } + int32_t dep_slot = dep_ring.get_slot_by_task_id(dep_local_task_id); + PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_slot(dep_slot); + if (!append_fanin_or_fail(orch, dep_ring_id, dep_slot, producer_slot_state, &fanin_builder, ring_id)) { + return result; + } + } + + // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) === + DepInputs dep_inputs{ + args.tensor_count(), args.tensor_data(), args.tag_data(), static_cast(args.explicit_dep_count()), + args.explicit_deps_data(), + }; + + auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool { + uint8_t prod_ring = producer_task_id.ring(); + PTO2SharedMemoryRingHeader &producer_ring = orch->sm_header->rings[prod_ring]; + int32_t prod_slot = producer_ring.get_slot_by_task_id(static_cast(producer_task_id.local())); + PTO2TaskSlotState *prod_state = &producer_ring.get_slot_state_by_slot(prod_slot); + return append_fanin_or_fail(orch, prod_ring, prod_slot, prod_state, &fanin_builder, ring_id); + }; + + if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) { + return result; + } + + CYCLE_COUNT_LAP(g_orch_lookup_cycle); + + // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) === + register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope()); + + CYCLE_COUNT_LAP(g_orch_insert_cycle); + + // === STEP 5: Batch-write to GM (single cache line burst) === + // Deferred from allocation phase to avoid scattered GM writes that get + // evicted by TensorMap lookup/insert cache pressure. 
+ __builtin_prefetch(&task, 1, 1); + task.task_id = task_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = aic_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id; + task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id; + task.packed_buffer_base = prepared.alloc_result.packed_base; + task.packed_buffer_end = prepared.alloc_result.packed_end; + + // Increment fanout_count on each producer (no lock — only orch writes this field). + // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count. + for_each_fanin_storage( + fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool, + [](PTO2TaskSlotState *producer) { + producer->fanout_count++; + } + ); + + int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP); + // Store fanin metadata in payload for scheduler to iterate + payload.fanin_actual_count = fanin_builder.count; + payload.fanin_spill_start = fanin_builder.spill_start; + payload.fanin_spill_pool = &fanin_builder.spill_pool; + for (int i = 0; i < inline_count; i++) { + payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i]; + } + + payload.init(args, result, prepared.alloc_result, layout); +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + if (args.scalar_count() > 0) { + set_dump_args_task_scalar_dtypes( + task_id.raw, static_cast(args.scalar_count()), args.scalar_dtypes() + ); + } + // Selective vs full dump is latched at dump_args_init from DumpDataHeader + // (host-decided before any dispatch), so it is race-free regardless of + // submission order. Here we only record each marked task's arg mask and + // metadata flags, which selective collection consults. 
+        if (args.dump_arg_mask() != 0) {
+            set_dump_args_task_mask(task_id.raw, args.dump_arg_mask(), args.dump_arg_index_ambiguous_mask());
+        }
+    }
+#endif
+
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+#if PTO2_ORCH_PROFILING
+    g_orch_args_atomic_count += 2;  // fanout_lock.store + fanout_count.store
+#endif
+
+    // === STEP 6: push to wiring queue ===
+    // Deferred wiring: orchestrator only stores dependency metadata and increments
+    // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished)
+    // is handled asynchronously by scheduler thread 0 via the wiring queue.
+    // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness
+    while (!sched->wiring.queue.push(&cur_slot_state)) {
+        SPIN_WAIT_HINT();
+    }
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+    return result;
+}
+
+// Submit a kernel task described by `mixed_kernels` and `args`.
+//
+// Validates the submission, normalizes a lone-AIV1 task into the AIV0 slot,
+// optionally marks the task as sync-start, and forwards to
+// submit_task_common(). Returns an empty TaskOutputTensors{} when the
+// orchestrator is already fatal, when `args` carries a construction error, or
+// when the sync-start limit check below fails; otherwise returns whatever
+// submit_task_common() returns.
+TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args) {
+    auto *orch = this;
+
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    // Validate Arg construction (errors recorded by add_input/add_output/etc.)
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg Detected!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR(" tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("This is a bug in the orchestration code.");
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+    // === Validate submit inputs ===
+    ActiveMask active_mask = mixed_kernels.to_active_mask();
+    // NOTE(review): the static_cast target type is missing in this copy of the
+    // patch (angle-bracket text appears to have been stripped) — presumably
+    // <bool>; confirm against the real source.
+    always_assert(static_cast(active_mask) && "MixedKernels must have at least one active slot");
+
+    int16_t block_num = args.launch_spec.core_num();
+    always_assert(block_num >= 1 && "block_num must be >= 1");
+
+    // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move
+    // it to the aiv0 slot. This guarantees the dispatch path can always use
+    // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask.
+    // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct
+    // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time.
+    MixedKernels normalized = mixed_kernels;
+    bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
+    bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
+    bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
+    if (!has_aic && has_aiv1 && !has_aiv0) {
+        normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
+        normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
+        // Recompute the mask so it reflects the moved kernel id.
+        active_mask = normalized.to_active_mask();
+    }
+
+    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+    if (block_num > 1 && args.launch_spec.require_sync_start()) {
+        // Deadlock check: reject block_num greater than the total number of
+        // slots of the required type (block_num == limit is still accepted).
+        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
+        // For AIV: limit is total_aiv_count.
+        // A limit of 0 or less disables the check.
+        PTO2ResourceShape shape = active_mask.to_shape();
+        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+        if (limit > 0 && block_num > limit) {
+            report_fatal(
+                PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__,
+                "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit
+            );
+            return TaskOutputTensors{};
+        }
+        active_mask.set_sync_start();
+    }
+
+    return submit_task_common(
+        orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id
+    );
+}
+
+// Submit a dependency-only task: full dependency graph participation
+// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no
+// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready
+// bucket; dispatch loop short-circuits to completion. Accepts the same Arg
+// shape as submit_task; scalars are permitted but never consumed.
+//
+// Returns an empty TaskOutputTensors{} when the orchestrator is already fatal
+// or `args` carries a construction error; otherwise the result of
+// submit_task_common() called with an empty ActiveMask and INVALID_KERNEL_ID
+// for all three kernel slots.
+TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const L0TaskArgs &args) {
+    auto *orch = this;
+
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    if (args.has_error) {
+        LOG_ERROR("========================================");
+        LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!");
+        LOG_ERROR("========================================");
+        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
+        LOG_ERROR(" tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
+        LOG_ERROR("========================================");
+        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+        return TaskOutputTensors{};
+    }
+    always_assert(orch->scheduler != nullptr);
+
+    return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
+}
+
+// Allocate output tensors for `args` without dispatching any kernel.
+//
+// `args` must contain at least one tensor entry, no scalars, and only
+// TensorArgType::OUTPUT entries; any violation is reported through
+// report_fatal() and yields an empty TaskOutputTensors{}.
+TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const L0TaskArgs &args) {
+    auto *orch = this;
+    // Orchestration API should short-circuit after fatal, but keep this entry
+    // robust as a no-op in case a caller reaches it directly.
+    if (orch->fatal) {
+        return TaskOutputTensors{};
+    }
+
+    // Shape validation: at least one tensor entry, no scalars, OUTPUT tags only.
+    if (args.tensor_count() <= 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
+        return TaskOutputTensors{};
+    }
+    if (args.scalar_count() != 0) {
+        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+        return TaskOutputTensors{};
+    }
+    for (int32_t i = 0; i < args.tensor_count(); i++) {
+        if (args.tag(i) != TensorArgType::OUTPUT) {
+            report_fatal(
+                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"
+            );
+            return TaskOutputTensors{};
+        }
+    }
+
+    CYCLE_COUNT_START();
+
+    // NOTE(review): has_error is only consulted after the count/tag checks
+    // above — confirm that ordering is intended.
+    if (args.has_error) {
+        report_fatal(
+            PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
+            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
+        );
+        return TaskOutputTensors{};
+    }
+
+    // Reserve a task slot plus the packed output buffer; an empty ActiveMask
+    // is passed because no kernel is attached to this task.
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) {
+        return TaskOutputTensors{};
+    }
+
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+
+    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
+
+#if PTO2_PROFILING
+    if (layout.total_output_size > 0) {
+        orch->buffers_allocated++;
+        orch->bytes_allocated += layout.total_output_size;
+    }
+#endif
+
+    // Fill the descriptor: every kernel slot is INVALID_KERNEL_ID.
+    // NOTE(review): the static_cast target type is missing in this copy of the
+    // patch (angle-bracket text appears to have been stripped); confirm
+    // against the real source.
+    task.task_id = prepared.task_id;
+    task.kernel_id[static_cast(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
+    task.kernel_id[static_cast(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    TaskOutputTensors outputs;
+    outputs.set_task_id(prepared.task_id);
+    payload.init(args, outputs, prepared.alloc_result, layout);
+    // No fanin edges are recorded for this task; the spill-pool pointer is set
+    // to the fanin pool of the ring encoded in the task id.
+    payload.fanin_actual_count = 0;
+    payload.fanin_spill_start = 0;
+    payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool;
+    CYCLE_COUNT_LAP(g_orch_args_cycle);
+
+    if (prepared.slot_state != nullptr) {
+        // Hidden alloc tasks complete inline in the orchestrator before any
+        // consumer can exist, so they have no fanout to notify and no worker
+        // subtasks to retire. Running the full on_task_complete path
+        // would only pay unnecessary fanout_lock / traversal overhead here.
+        // The generic slot initialization done in prepare_task() is still
+        // required so scope_end can release the producer-side reference and
+        // drive the slot to CONSUMED, but worker dispatch fields are never
+        // observed for hidden alloc tasks.
+        prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+    }
+    // Counted even when slot_state is null (the increment is unconditional).
+    orch->inline_completed_tasks++;
+
+    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
+    CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw);
+
+#if PTO2_PROFILING
+    orch->tasks_submitted++;
+#if PTO2_ORCH_PROFILING
+    g_orch_submit_count++;
+#endif
+    g_orch_submit_idx++;
+#endif
+
+    return outputs;
+}
+
+// =============================================================================
+// Flow Control
+// =============================================================================
+
+// Signal that orchestration has finished.
+//
+// Logs per-ring task and fanin-pool usage, publishes orchestrator_done = 1 to
+// the shared-memory header with a release store, and resets the scope-stack
+// bookkeeping (scope_tasks_size, scope_stack_top, manual_begin_depth) to its
+// "no scope open" values.
+void PTO2OrchestratorState::mark_done() {
+    auto *orch = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        int32_t total_tasks = orch->rings[r].task_allocator.active_count();
+        if (total_tasks > 0) {
+            LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks);
+        }
+        auto &fanin_pool = orch->rings[r].fanin_pool;
+        // NOTE(review): the "> 1" threshold presumably skips pools that were
+        // never used — confirm the pool's initial `top` value.
+        if (fanin_pool.top > 1) {
+            LOG_INFO_V0(
+                "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top,
+                fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity
+            );
+        }
+    }
+    orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
+    orch->scope_tasks_size = 0;
+    orch->scope_stack_top = -1;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+#if !PTO2_ORCH_PROFILING && PTO2_PROFILING
+    // With orchestrator profiling enabled the index is reset in
+    // orchestrator_get_profiling() instead.
+    g_orch_submit_idx = 0;
+#endif
+}
+
+#if PTO2_ORCH_PROFILING
+// Return a snapshot of the accumulated orchestrator profiling counters and
+// reset all of them (including g_orch_submit_idx) to zero.
+PTO2OrchProfilingData orchestrator_get_profiling() {
+    PTO2OrchProfilingData d;
+    d.sync_cycle = g_orch_sync_cycle;
+    d.alloc_cycle = g_orch_alloc_cycle;
+    d.args_cycle = g_orch_args_cycle;
+    d.lookup_cycle = g_orch_lookup_cycle;
+    d.insert_cycle = g_orch_insert_cycle;
+    d.fanin_cycle = g_orch_fanin_cycle;
+    d.scope_end_cycle = g_orch_scope_end_cycle;
+    d.submit_count = g_orch_submit_count;
+    d.alloc_wait_cycle = g_orch_alloc_wait_cycle;
+    d.fanin_wait_cycle = g_orch_fanin_wait_cycle;
+    d.alloc_atomic_count = g_orch_alloc_atomic_count;
+    d.args_atomic_count = g_orch_args_atomic_count;
+    d.scope_end_atomic_count = g_orch_scope_end_atomic_count;
+
+    // Reset
+    g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0;
+    g_orch_lookup_cycle = g_orch_insert_cycle = 0;
+    g_orch_fanin_cycle = g_orch_scope_end_cycle = 0;
+    g_orch_submit_count = 0;
+    g_orch_submit_idx = 0;
+    g_orch_alloc_wait_cycle = 0;
+    g_orch_fanin_wait_cycle = 0;
+    g_orch_alloc_atomic_count = 0;
+    g_orch_args_atomic_count = 0;
+    g_orch_scope_end_atomic_count = 0;
+    return d;
+}
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h
new file mode 100644
index 000000000..a8ed3817f
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_orchestrator.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Orchestrator Interface + * + * The Orchestrator is responsible for: + * 1. Executing the orchestration function (Turing-complete control flow) + * 2. Allocating intermediate buffers from the heap + * 3. Submitting tasks via async InCore function calls + * 4. Building the dependency graph using TensorMap + * 5. Managing buffer scopes for lifecycle control + * + * The Orchestrator can run on either: + * - Host CPU (lower latency for complex control, easier debugging) + * - Device AI_CPU (lower latency for task submission) + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include "utils/device_arena.h" +#include "common/l2_swimlane_profiling.h" +#include "pto_ring_buffer.h" +#include "pto_runtime2_types.h" +#include "pto_submit_types.h" +#include "scheduler/pto_scheduler.h" +#include "pto_shared_memory.h" +#include "pto_tensormap.h" +#include "pto_types.h" + +/** + * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds + * arena offsets for every sub-region the orchestrator owns (per-ring fanin + * pools, scope arrays, plus the nested PTO2TensorMap layout). 
+ */
+struct PTO2OrchestratorLayout {
+    // Arena offsets of the orchestrator-owned sub-regions.
+    size_t off_fanin_pool[PTO2_MAX_RING_DEPTH];
+    size_t off_fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+    size_t off_scope_tasks;
+    size_t off_scope_begins;
+    // Nested layout of the tensor map.
+    PTO2TensorMapLayout tensor_map;
+    // Capacities recorded alongside the offsets.
+    // NOTE(review): presumably consumed by init_data_from_layout — confirm.
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    int32_t scope_tasks_cap;
+    uint64_t scope_stack_capacity;
+};
+
+// =============================================================================
+// Orchestrator State
+// =============================================================================
+
+/**
+ * Orchestrator state structure (private to Orchestrator)
+ *
+ * Contains all state needed for task graph construction and buffer management.
+ */
+struct PTO2OrchestratorState {
+    // === SHARED MEMORY ACCESS ===
+    PTO2SharedMemoryHeader *sm_header;
+
+    // === PER-RING RESOURCES ===
+    PTO2RingSet rings[PTO2_MAX_RING_DEPTH];
+    uint32_t *fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+    uint32_t fanin_seen_current_epoch{1};
+
+    // === TENSOR MAP (Private) ===
+    PTO2TensorMap tensor_map;  // Producer lookup
+
+    // === SCOPE STACK (Private) ===
+    // Single contiguous buffer of task slot-state pointers, partitioned by scope level.
+    // scope_begins[i] is the index into scope_tasks where scope i starts.
+    // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size).
+    PTO2TaskSlotState **scope_tasks;  // Flat buffer of taskSlotState (all scopes concatenated)
+    int32_t scope_tasks_size;         // Number of entries currently in the buffer
+    int32_t scope_tasks_capacity;     // Allocated capacity of scope_tasks
+    int32_t *scope_begins;            // scope_begins[i] = start index of scope i in scope_tasks
+    int32_t scope_stack_top;          // Current top of stack (-1 = no scope open)
+    uint64_t scope_stack_capacity;    // Max nesting depth (PTO2_MAX_SCOPE_DEPTH)
+    // Depth compared against scope_stack_top by in_manual_scope(); the default
+    // PTO2_MAX_SCOPE_DEPTH is also the value mark_done() resets it to.
+    int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH};
+
+    // === SCHEDULER REFERENCE ===
+    // Note: In simulated mode, orchestrator and scheduler share address space
+    // In real mode, they communicate via shared memory only
+    PTO2SchedulerState *scheduler;  // For simulated mode only
+
+    // Total core counts set once at executor init; used for submit-time deadlock detection.
+    int32_t total_cluster_count{0};  // AIC cores = MIX clusters
+    int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
+#if PTO2_PROFILING
+    // L2 swimlane_level copied from get_l2_swimlane_level().
+    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
+#endif
+
+    // === GM HEAP (for output buffers) ===
+    void *gm_heap_base;     // Base address of GM heap
+    uint64_t gm_heap_size;  // Total size of GM heap (all rings)
+
+    // === FATAL ERROR ===
+    // Fatal error flag (single-thread access by orchestrator, no atomic needed)
+    // Cross-thread notification uses shared memory orch_error_code (atomic)
+    // NOTE(review): no default member initializer, unlike neighbouring fields —
+    // presumably set during init_data_from_layout; confirm.
+    bool fatal;
+
+    // Hidden alloc tasks complete synchronously inside the orchestrator and
+    // therefore bypass the executor's normal worker-completion counter path.
+    // The executor adds this count into its completed_tasks_ progress counter
+    // after orchestration finishes so shutdown/profiling totals remain closed.
+    int64_t inline_completed_tasks{0};
+
+    // === STATISTICS ===
+#if PTO2_PROFILING
+    int64_t tasks_submitted;
+    int64_t buffers_allocated;
+    int64_t bytes_allocated;
+#endif
+
+    /**
+     * Get current ring index from scope depth.
+     * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
+     * A negative scope_stack_top (no scope open) is treated as depth 0.
+     */
+    uint8_t current_ring_id() const {
+        int32_t depth = scope_stack_top;
+        if (depth < 0) depth = 0;
+        // NOTE(review): the static_cast target type is missing in this copy of
+        // the patch (angle-bracket text appears to have been stripped) —
+        // presumably <uint8_t>; confirm against the real source.
+        return depth < PTO2_MAX_RING_DEPTH ? static_cast(depth) : PTO2_MAX_RING_DEPTH - 1;
+    }
+
+    // True when scope_stack_top >= manual_begin_depth.
+    bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; }
+
+    // === Cold-path API (defined in pto_orchestrator.cpp) ===
+
+    // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
+    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
+    // the nested tensor_map layout. Returned layout is consumed by
+    // init_data_from_layout.
+    // Two overloads: one dep-pool capacity for all rings, or one per ring.
+    static PTO2OrchestratorLayout reserve_layout(
+        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
+    );
+    static PTO2OrchestratorLayout reserve_layout(
+        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+        const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+    );
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // sm_dev_base is the SM device address (only stored, never dereferenced);
+    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
+    // on a host arena that holds the prebuilt image.
+    // Two overloads: uniform heap/window sizes, or per-ring arrays.
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+        uint64_t task_window_size
+    );
+    bool init_data_from_layout(
+        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
+        const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+    );
+
+    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
+    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
+    // free_entry_list,task_entry_heads}, scheduler reference).
+    // Idempotent — host runs once on the image, AICPU runs once after attach.
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+
+    // Forget pointers; arena owns the backing buffers.
+    void destroy();
+    void set_scheduler(PTO2SchedulerState *scheduler);
+    void report_fatal(int32_t error_code, const char *func, const char *fmt, ...);
+    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO);
+    void end_scope();
+    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const L0TaskArgs &args);
+    TaskOutputTensors submit_dummy_task(const L0TaskArgs &args);
+    TaskOutputTensors alloc_tensors(const L0TaskArgs &args);
+    void mark_done();
+};
+
+// =============================================================================
+// Orchestrator Profiling Data
+// =============================================================================
+
+#if PTO2_ORCH_PROFILING
+// Snapshot of the orchestrator profiling counters, as returned by
+// orchestrator_get_profiling().
+struct PTO2OrchProfilingData {
+    uint64_t sync_cycle;
+    uint64_t alloc_cycle;  // Combined task slot + heap allocation
+    uint64_t args_cycle;
+    uint64_t lookup_cycle;
+    uint64_t insert_cycle;
+    uint64_t fanin_cycle;
+    uint64_t scope_end_cycle;
+    int64_t submit_count;
+    // Wait time tracking for blocking phases
+    uint64_t alloc_wait_cycle;  // Cycles spent waiting in unified alloc
+    uint64_t fanin_wait_cycle;  // Cycles spent waiting in fanout_lock
+    // Atomic operation counts per phase
+    uint64_t alloc_atomic_count;
+    uint64_t args_atomic_count;
+    uint64_t scope_end_atomic_count;
+};
+
+PTO2OrchProfilingData orchestrator_get_profiling();
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp
new file mode 100644
index 000000000..f6009dc57
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Implementation
+ *
+ * Implements the FaninPool and DepListPool ring buffers for zero-overhead dependency management.
+ * TaskAllocator methods are defined inline in pto_ring_buffer.h.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+// NOTE(review): the two bare #include lines below have lost their header
+// names in this copy of the patch (angle-bracket text appears to have been
+// stripped); restore them from the real source.
+#include "pto_ring_buffer.h"
+#include
+#include
+#include "common/unified_log.h"
+#include "scheduler/pto_scheduler.h"
+
+// Latch `error_code` into *error_code_ptr only if it still holds
+// PTO2_ERROR_NONE, so the first recorded error wins. No-op when the pointer
+// is null.
+// NOTE(review): the std::atomic template argument is missing in this copy —
+// presumably <int32_t>, matching `expected`; confirm.
+static void latch_pool_error(std::atomic *error_code_ptr, int32_t error_code) {
+    if (error_code_ptr == nullptr) {
+        return;
+    }
+    int32_t expected = PTO2_ERROR_NONE;
+    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
+}
+
+// =============================================================================
+// Fanin Spill Pool Implementation
+// =============================================================================
+// Walk task ids in [reclaim_task_cursor, sm_last_task_alive) and, for every
+// payload whose fanin_spill_pool is this pool, advance the tail past that
+// task's spilled fanin entries. The cursor is then moved to
+// sm_last_task_alive.
+void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    if (sm_last_task_alive <= reclaim_task_cursor) return;
+
+    int32_t scan_end = sm_last_task_alive;
+    for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) {
+        PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id);
+        if (payload.fanin_spill_pool != this) {
+            continue;
+        }
+
+        // Only the edges beyond PTO2_FANIN_INLINE_CAP count as spilled.
+        int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP);
+        int32_t spill_edge_count = payload.fanin_actual_count - inline_count;
+        if (spill_edge_count > 0) {
+            advance_tail(payload.fanin_spill_start + spill_edge_count);
+        }
+    }
+    reclaim_task_cursor = scan_end;
+}
+
+// Spin until at least `needed` entries are available, calling reclaim() with
+// the latest observed last_task_alive on each iteration.
+//
+// Returns true once space is available. Returns false after
+// PTO2_DEP_POOL_SPIN_LIMIT consecutive spins in which last_task_alive did not
+// advance; in that case diagnostics are logged and
+// PTO2_ERROR_DEP_POOL_OVERFLOW is latched through latch_pool_error().
+bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;
+
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+
+        spin_count++;
+
+        // Progress detection: reset spin counter if last_task_alive advances
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+        }
+
+        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
+            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count);
+            LOG_ERROR(
+                " - Pool used: %d / %d (%.1f%%)", used(), capacity,
+                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+            );
+            LOG_ERROR(" - Pool top: %d (linear)", top);
+            LOG_ERROR(" - Pool tail: %d (linear)", tail);
+            LOG_ERROR(" - High water: %d", high_water);
+            LOG_ERROR(" - Needed: %d entries", needed);
+            LOG_ERROR(" - last_task_alive: %d (stuck here)", cur_last_alive);
+            LOG_ERROR(" - current_task: %d", current);
+            LOG_ERROR(" - In-flight tasks: %d", current - cur_last_alive);
+            LOG_ERROR("Diagnosis:");
+            LOG_ERROR(" last_task_alive is not advancing, so fanin spill pool tail");
+            LOG_ERROR(" cannot reclaim. Check TaskRing diagnostics for root cause.");
+            LOG_ERROR("Solution:");
+            LOG_ERROR(" Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
+            LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", high_water * 2);
+            LOG_ERROR("========================================");
+            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
+            return false;
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
+
+// =============================================================================
+// Dependency List Pool Implementation
+// =============================================================================
+// Rate-limited reclaim: acts only when sm_last_task_alive is positive and has
+// advanced by at least PTO2_DEP_POOL_CLEANUP_INTERVAL since last_reclaimed.
+// The tail is then advanced to the dep_pool_mark stored in the slot state of
+// task (sm_last_task_alive - 1), if that mark is positive.
+// NOTE(review): ensure_space() calls this while blocked, so the interval gate
+// also applies under pressure — confirm that a stall of fewer than
+// PTO2_DEP_POOL_CLEANUP_INTERVAL retired tasks cannot starve the pool.
+void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
+    if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) {
+        int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
+        if (mark > 0) {
+            advance_tail(mark);
+        }
+        last_reclaimed = sm_last_task_alive;
+    }
+}
+
+// Spin until at least `needed` entries are available, calling reclaim() with
+// the latest observed last_task_alive on each iteration.
+//
+// Returns true once space is available. Returns false after
+// PTO2_DEP_POOL_SPIN_LIMIT consecutive spins in which last_task_alive did not
+// advance; in that case diagnostics are logged and
+// PTO2_ERROR_DEP_POOL_OVERFLOW is latched through latch_pool_error().
+bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
+    if (available() >= needed) return true;
+
+    int spin_count = 0;
+    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+    while (available() < needed) {
+        reclaim(ring, prev_last_alive);
+        if (available() >= needed) return true;
+
+        spin_count++;
+
+        // Progress detection: reset spin counter if last_task_alive advances
+        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (cur_last_alive > prev_last_alive) {
+            spin_count = 0;
+            prev_last_alive = cur_last_alive;
+        }
+
+        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
+            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
+            LOG_ERROR("========================================");
+            LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!");
+            LOG_ERROR("========================================");
+            LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count);
+            LOG_ERROR(
+                " - Pool used: %d / %d (%.1f%%)", used(), capacity,
+                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
+            );
+            LOG_ERROR(" - Pool top: %d (linear)", top);
+            LOG_ERROR(" - Pool tail: %d (linear)", tail);
+            LOG_ERROR(" - High water: %d", high_water);
+            LOG_ERROR(" - Needed: %d entries", needed);
+            LOG_ERROR(" - last_task_alive: %d (stuck here)", cur_last_alive);
+            LOG_ERROR(" - current_task: %d", current);
+            LOG_ERROR(" - In-flight tasks: %d", current - cur_last_alive);
+            LOG_ERROR("Diagnosis:");
+            LOG_ERROR(" last_task_alive is not advancing, so dep pool tail");
+            LOG_ERROR(" cannot reclaim. Check TaskRing diagnostics for root cause.");
+            LOG_ERROR("Solution:");
+            LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
+            LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
+            LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", high_water * 2);
+            LOG_ERROR("========================================");
+            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
+            return false;
+        }
+        SPIN_WAIT_HINT();
+    }
+    return true;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h
new file mode 100644
index 000000000..b07435197
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_ring_buffer.h
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Ring Buffer Data Structures
+ *
+ * Implements ring buffer designs for zero-overhead memory management:
+ *
+ * 1.
TaskAllocator - Unified task slot + output buffer allocation + * - Combines task ring (slot allocation) and heap ring (output buffer allocation) + * - Single spin-wait loop with unified back-pressure and deadlock detection + * - O(1) bump allocation for both task slots and heap buffers + * + * 2. FaninPool - Fanin spill entry allocation + * - Ring buffer for spilled fanin entries + * - O(1) append allocation + * - Implicit reclamation with task ring + * + * 3. DepListPool - Dependency list entry allocation + * - Ring buffer for linked list entries + * - O(1) prepend operation + * - Implicit reclamation with task ring + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#ifndef PTO_RING_BUFFER_H +#define PTO_RING_BUFFER_H + +#include +#include +#include + +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" +#include "common/unified_log.h" + +#if PTO2_PROFILING +// Heap-ring wrap reporting — the allocator is the only place each individual +// wrap is observable, so it notifies the scope_stats collector here. Gated: +// pays nothing (no include, no call) when profiling is compiled out. +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + +// Block notification interval (in spin counts) +#define PTO2_BLOCK_NOTIFY_INTERVAL 10000 +// Alloc spin limit - after this, report deadlock and exit +#define PTO2_ALLOC_SPIN_LIMIT 100000 + +// Dep pool spin limit - if exceeded, dep pool capacity too small for workload +#define PTO2_DEP_POOL_SPIN_LIMIT 100000 + +// ============================================================================= +// Task Allocator (unified task slot + heap buffer allocation) +// ============================================================================= + +/** + * Unified task slot + heap buffer allocator. 
+ * + * Since task and heap are always allocated together and the orchestrator is + * single-threaded, both pointers (task index, heap top) are tracked locally + * and published to shared memory via plain store — no fetch_add or CAS needed. + * + * The alloc() method checks both resources BEFORE committing to either, + * eliminating the need for rollback on partial failure. + */ +class PTO2TaskAllocator { +public: + /** + * Initialize the allocator with task ring and heap ring resources. + * + * All pointer arguments are device addresses (live in SM / GM heap); this + * function only stores them, no dereferences, so it is safe to invoke + * from host code that constructs a prebuilt arena image. + * + * Production callers leave `initial_local_task_id` at 0: the SM ring + * flow-control counters that current_index_ptr / last_alive_ptr point at + * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM + * reset), so we keep local_task_id_ aligned with that without reading the + * SM. Tests that drive SM state directly may pass a non-zero seed to + * exercise corner cases like task IDs near INT32_MAX. + */ + void init( + PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic *current_index_ptr, + std::atomic *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic *error_code_ptr, + int32_t initial_local_task_id = 0 + ) { + descriptors_ = descriptors; + window_size_ = window_size; + window_mask_ = window_size - 1; /* used as `task_id & window_mask_`; assumes window_size is a power of two — TODO confirm */ + current_index_ptr_ = current_index_ptr; + last_alive_ptr_ = last_alive_ptr; + heap_base_ = heap_base; + heap_size_ = heap_size; + error_code_ptr_ = error_code_ptr; + local_task_id_ = initial_local_task_id; + heap_top_ = 0; + heap_tail_ = 0; + last_alive_seen_ = 0; + } + + /** + * Allocate a task slot and its associated output buffer in one call. + * + * Both task index and heap top are maintained as local counters and + * published to shared memory only on success.
Since the orchestrator is + * single-threaded, no CAS or fetch_add is needed — just check-then-commit. + * + * @param output_size Total packed output size in bytes (0 = no heap needed) + * @return Allocation result; check failed() for errors ({-1, -1, nullptr, nullptr} after PTO2_ALLOC_SPIN_LIMIT spins without progress) + */ + PTO2TaskAllocResult alloc(int32_t output_size) { + uint64_t aligned_size = + output_size > 0 ? PTO2_ALIGN_UP(static_cast(output_size), PTO2_ALIGN_SIZE) : 0; + + int spin_count = 0; + int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire); + int32_t last_alive = prev_last_alive; + update_heap_tail(last_alive); + bool blocked_on_heap = false; +#if PTO2_ORCH_PROFILING + uint64_t wait_start = 0; + bool waiting = false; +#endif + + while (true) { + // Check both resources; commit only if both available + if (local_task_id_ - last_alive + 1 < window_size_) { + void *heap_ptr = try_bump_heap(aligned_size); + if (heap_ptr) { + int32_t task_id = commit_task(); +#if PTO2_ORCH_PROFILING + record_wait(spin_count, wait_start, waiting); +#endif + return {task_id, task_id & window_mask_, heap_ptr, static_cast(heap_ptr) + aligned_size}; + } + blocked_on_heap = true; + } else { + blocked_on_heap = false; + } + + // Spin: wait for scheduler to advance last_task_alive + spin_count++; +#if PTO2_ORCH_PROFILING + if (!waiting) { + wait_start = get_sys_cnt_aicpu(); + waiting = true; + } +#endif + last_alive = last_alive_ptr_->load(std::memory_order_acquire); + update_heap_tail(last_alive); + if (last_alive > prev_last_alive) { + spin_count = 0; + prev_last_alive = last_alive; + } else { + if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0) { + LOG_WARN( + "[TaskAllocator] BLOCKED: tasks=%d/%d, heap=%" PRIu64 "/%" PRIu64 ", on=%s, spins=%d", + local_task_id_ - last_alive, window_size_, heap_top_, heap_size_, + blocked_on_heap ?
"heap" : "task", spin_count + ); + } + if (spin_count >= PTO2_ALLOC_SPIN_LIMIT) { + report_deadlock(output_size, blocked_on_heap); + return {-1, -1, nullptr, nullptr}; + } + } + SPIN_WAIT_HINT(); + } + } + + // ========================================================================= + // State queries + // ========================================================================= + + int32_t active_count() const { + int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire); + return local_task_id_ - last_alive; + } + + // Task ring start/end: tail = oldest live task (last_task_alive), head = + // next task id to allocate. head - tail == active_count(). + int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); } + int32_t task_head() const { return local_task_id_; } + + int32_t window_size() const { return window_size_; } + + uint64_t heap_available() const { + uint64_t tail = heap_tail_; + if (heap_top_ >= tail) { + uint64_t at_end = heap_size_ - heap_top_; + uint64_t at_begin = tail; + return at_end > at_begin ? at_end : at_begin; + } + return tail - heap_top_; + } + + uint64_t heap_top() const { return heap_top_; } + // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is + // the end (next allocation). (heap_top - heap_tail) mod heap_capacity == heap_used_bytes().
+ uint64_t heap_tail() const { return heap_tail_; } + uint64_t heap_capacity() const { return heap_size_; } + + uint64_t heap_used_bytes() const { + if (heap_size_ == 0) return 0; + return (heap_top_ + heap_size_ - heap_tail_) % heap_size_; + } + +private: + // --- Task Ring --- + PTO2TaskDescriptor *descriptors_ = nullptr; + int32_t window_size_ = 0; + int32_t window_mask_ = 0; + std::atomic *current_index_ptr_ = nullptr; + std::atomic *last_alive_ptr_ = nullptr; + + // --- Heap --- + void *heap_base_ = nullptr; + uint64_t heap_size_ = 0; + + // --- Local state (single-writer, no atomics needed) --- + int32_t local_task_id_ = 0; // Next task ID to allocate + uint64_t heap_top_ = 0; // Current heap allocation pointer + uint64_t heap_tail_ = 0; // Heap reclamation pointer (derived from consumed tasks) + int32_t last_alive_seen_ = 0; // last_task_alive at last heap_tail derivation + + // --- Shared --- + std::atomic *error_code_ptr_ = nullptr; + + // ========================================================================= + // Internal helpers + // ========================================================================= + + /** + * Commit a task slot: bump local counter and publish to shared memory. + * Must only be called after space check has passed. + */ + int32_t commit_task() { + int32_t task_id = local_task_id_++; + current_index_ptr_->store(local_task_id_, std::memory_order_release); + return task_id; + } + + /** + * Derive heap_tail_ from the last consumed task's packed_buffer_end. + * + * Every task has a valid packed_buffer_end (equal to packed_buffer_base + * for zero-size allocations), so the last consumed task always determines + * the correct heap_tail — no backward scan needed.
+ */ + void update_heap_tail(int32_t last_alive) { + if (last_alive <= last_alive_seen_) return; + last_alive_seen_ = last_alive; + + PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_]; + uint64_t old_tail = heap_tail_; + heap_tail_ = + static_cast(static_cast(desc.packed_buffer_end) - static_cast(heap_base_)); +#if PTO2_PROFILING + // Reclaim pointer moves forward monotonically in ring order; a decrease + // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at + // most one wrap per call). Report it so scope_stats can unroll. + if (is_scope_stats_enabled() && heap_tail_ < old_tail) { + scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM); + } +#else + (void)old_tail; +#endif + } + + /** + * Bump the heap pointer for the given allocation size. + * Returns the allocated pointer, or nullptr if insufficient space. + * When alloc_size == 0, returns current position without advancing. + */ + void *try_bump_heap(uint64_t alloc_size) { + uint64_t top = heap_top_; + if (alloc_size == 0) { + return static_cast(heap_base_) + top; + } + uint64_t tail = heap_tail_; + void *result; + + if (top >= tail) { + uint64_t space_at_end = heap_size_ - top; + if (space_at_end >= alloc_size) { + result = static_cast(heap_base_) + top; + heap_top_ = top + alloc_size; + } else if (tail > alloc_size) { + LOG_DEBUG( + "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail, + alloc_size + ); + result = heap_base_; + heap_top_ = alloc_size; +#if PTO2_PROFILING + // Allocation pointer just wrapped past heap_size_; report it so + // scope_stats can unroll the wrapping offset into a monotonic value. + // The collector attributes the wrap to the current scope's ring.
+ if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC); +#endif + } else { + LOG_DEBUG( + "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64 + ", heap_size=%" PRIu64, + top, tail, alloc_size, heap_size_ + ); + return nullptr; + } + } else { + if (tail - top > alloc_size) { + result = static_cast(heap_base_) + top; + heap_top_ = top + alloc_size; + } else { + LOG_DEBUG( + "try_bump_heap failed (topload(std::memory_order_acquire); + int32_t active_tasks = local_task_id_ - last_alive; + uint64_t htail = heap_tail_; + + LOG_ERROR("========================================"); + if (heap_blocked) { + LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted!"); + } else { + LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full!"); + } + LOG_ERROR("========================================"); + LOG_ERROR("No progress after %d spins.", PTO2_ALLOC_SPIN_LIMIT); + LOG_ERROR( + " Task ring: current=%d, last_alive=%d, active=%d/%d (%.1f%%)", local_task_id_, last_alive, active_tasks, + window_size_, 100.0 * active_tasks / window_size_ + ); + LOG_ERROR( + " Heap ring: top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", available=%" PRIu64, heap_top_, htail, + heap_size_, heap_available() + ); + if (heap_blocked) { + LOG_ERROR(" Requested: %d bytes", requested_output_size); + } + LOG_ERROR("Diagnosis:"); + LOG_ERROR(" last_task_alive is stuck at %d, meaning task %d", last_alive, last_alive); + LOG_ERROR(" cannot transition to CONSUMED. Possible causes:"); + LOG_ERROR(" 1. Task %d still executing (subtasks not complete)", last_alive); + LOG_ERROR(" 2. Task %d fanout not fully released (downstream not done)", last_alive); + LOG_ERROR(" 3. Scope reference not released (scope_end not called)"); + LOG_ERROR(" 4.
Orchestrator blocked here -> can't call scope_end -> circular wait"); + LOG_ERROR("Solution:"); + if (heap_blocked) { + LOG_ERROR( + " Increase heap size (current: %" PRIu64 ", recommended: %" PRIu64 ")", heap_size_, heap_size_ * 2 + ); + LOG_ERROR(" Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_HEAP= (e.g. %" PRIu64 ")", heap_size_ * 2); + } else { + LOG_ERROR(" Increase task window size (current: %d, recommended: %d)", window_size_, active_tasks * 2); + LOG_ERROR(" Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_TASK_WINDOW= (e.g. %d)", active_tasks * 2); + } + LOG_ERROR("========================================"); + if (error_code_ptr_) { + int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK; + error_code_ptr_->store(code, std::memory_order_release); + } + } +}; + +// ============================================================================= +// Fanin Spill Pool +// ============================================================================= + +/** + * Fanin spill pool structure + * + * True ring buffer for allocating spilled fanin entries. + * Entries are reclaimed when their consumer tasks become CONSUMED. + * + * Linear counters (top, tail) grow monotonically; the physical index + * is obtained via modulo: base[linear_index % capacity].
+ */ +struct PTO2FaninPool { + PTO2FaninSpillEntry *base; // Pool base address + int32_t capacity; // Total number of entries + int32_t top; // Linear next-allocation counter (starts from 1) + int32_t tail; // Linear first-alive counter (entries before this are dead) + int32_t high_water; // Peak concurrent usage (top - tail) + int32_t reclaim_task_cursor{0}; // Last task id scanned for reclaim on this pool + + std::atomic *error_code_ptr = nullptr; + + void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic *in_error_code_ptr) { + base = in_base; + capacity = in_capacity; + top = 1; + tail = 1; + high_water = 0; + reclaim_task_cursor = 0; + base[0].slot_state = nullptr; + error_code_ptr = in_error_code_ptr; + } + + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); + + bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); + + PTO2FaninSpillEntry *alloc() { /* Returns the entry at top % capacity, or nullptr when top - tail >= capacity. NOTE(review): the overflow hint and error code below name the dep-list pool (PTO2_DEP_LIST_POOL_SIZE, PTO2_RING_DEP_POOL, PTO2_ERROR_DEP_POOL_OVERFLOW) — confirm these are the intended knobs for the fanin spill pool. */ + int32_t used = top - tail; + if (used >= capacity) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Fanin Spill Pool Overflow!"); + LOG_ERROR("========================================"); + LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity); + LOG_ERROR(" - Pool top: %d (linear)", top); + LOG_ERROR(" - Pool tail: %d (linear)", tail); + LOG_ERROR(" - High water: %d", high_water); + LOG_ERROR("Solution:"); + LOG_ERROR(" Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2); + LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", capacity * 2); + LOG_ERROR("========================================"); + if (error_code_ptr) { + error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release); + } + return nullptr; + } + int32_t idx = top % capacity; + top++; + used++; + if (used > high_water) high_water = used; + return &base[idx]; + } + + void advance_tail(int32_t
new_tail) { + if (new_tail > tail) { + tail = new_tail; + } + } + + int32_t used() const { return top - tail; } + + int32_t available() const { return capacity - used(); } +}; + +template +using PTO2FaninCallbackResult = std::invoke_result_t; + +template +using PTO2FaninForEachReturn = std::conditional_t, void>, void, bool>; + +template +inline PTO2FaninForEachReturn for_each_fanin_storage( + InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn +) { /* Calls fn on up to PTO2_FANIN_INLINE_CAP inline slot states, then on the spilled entries starting at spill_start % capacity and wrapping to base[0]. A void fn visits everything; a bool fn stops the walk (and makes this return false) the first time it returns false. */ + using FaninCallbackResult = PTO2FaninCallbackResult; + static_assert( + std::is_same_v || std::is_same_v, + "fanin callback must return void or bool" + ); + + if constexpr (std::is_void_v) { + int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); + for (int32_t i = 0; i < inline_count; i++) { + fn(inline_slot_states[i]); + } + + int32_t spill_count = fanin_count - inline_count; + if (spill_count <= 0) { + return; + } + + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; + for (int32_t i = 0; i < first_count; i++) { + fn(first[i].slot_state); + } + + int32_t second_count = spill_count - first_count; + for (int32_t i = 0; i < second_count; i++) { + fn(spill_pool.base[i].slot_state); + } + return; + } else { + int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP); + for (int32_t i = 0; i < inline_count; i++) { + if (!fn(inline_slot_states[i])) { + return false; + } + } + + int32_t spill_count = fanin_count - inline_count; + if (spill_count <= 0) { + return true; + } + + int32_t start_idx = spill_start % spill_pool.capacity; + int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx); + PTO2FaninSpillEntry *first = spill_pool.base + start_idx; + for (int32_t i = 0; i < first_count; i++) { + if (!fn(first[i].slot_state)) { + return false; + } + } + + int32_t second_count =
spill_count - first_count; + for (int32_t i = 0; i < second_count; i++) { + if (!fn(spill_pool.base[i].slot_state)) { + return false; + } + } + return true; + } +} + +template +inline PTO2FaninForEachReturn for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) { + return for_each_fanin_storage( + payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start, + *payload.fanin_spill_pool, static_cast(fn) + ); +} + +// ============================================================================= +// Dependency List Pool +// ============================================================================= + +/** + * Dependency list pool structure + * + * True ring buffer for allocating linked list entries. + * Entries are reclaimed when their producer tasks become CONSUMED, + * as tracked by the orchestrator via dep_pool_mark per task. + * + * Linear counters (top, tail) grow monotonically; the physical index + * is obtained via modulo: base[linear_index % capacity].
+ */ +struct PTO2DepListPool { + PTO2DepListEntry *base; // Pool base address + int32_t capacity; // Total number of entries + int32_t top; // Linear next-allocation counter (starts from 1) + int32_t tail; // Linear first-alive counter (entries before this are dead) + int32_t high_water; // Peak concurrent usage (top - tail) + int32_t last_reclaimed{0}; // last_task_alive at last successful reclamation + + // Error code pointer for fatal error reporting (→ sm_header->orch_error_code) + std::atomic *error_code_ptr = nullptr; + + /** + * + * Initialize dependency list pool + * @param in_base Pool base address from shared memory + * @param in_capacity Total number of entries (in_error_code_ptr: where alloc() stores PTO2_ERROR_DEP_POOL_OVERFLOW) + */ + void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic *in_error_code_ptr) { + base = in_base; + capacity = in_capacity; + top = 1; // Start from 1, 0 means NULL/empty + tail = 1; // Match initial top (no reclaimable entries yet) + high_water = 0; + last_reclaimed = 0; + + // Initialize entry 0 as NULL marker + base[0].slot_state = nullptr; + base[0].next = nullptr; + + error_code_ptr = in_error_code_ptr; + } + + /** + * Reclaim dead entries based on scheduler's slot state dep_pool_mark. + * Safe to call multiple times — only advances tail forward. + * + * @param ring Ring header (for reading slot dep_pool_mark) + * @param sm_last_task_alive Current last_task_alive from shared memory + */ + void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive); + + /** + * Ensure dep pool for a specific ring has at least `needed` entries available. + * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
+ */ + bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed); + + /** + * Allocate a single entry from the pool (single-thread per pool instance) + * + * @return Pointer to allocated entry, or nullptr on fatal error + */ + PTO2DepListEntry *alloc() { + int32_t used = top - tail; + if (used >= capacity) { + LOG_ERROR("========================================"); + LOG_ERROR("FATAL: Dependency Pool Overflow!"); + LOG_ERROR("========================================"); + LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity); + LOG_ERROR(" - Pool top: %d (linear)", top); + LOG_ERROR(" - Pool tail: %d (linear)", tail); + LOG_ERROR(" - High water: %d", high_water); + LOG_ERROR("Solution:"); + LOG_ERROR(" Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2); + LOG_ERROR(" Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h"); + LOG_ERROR(" Runtime env: PTO2_RING_DEP_POOL=%d", capacity * 2); + LOG_ERROR("========================================"); + if (error_code_ptr) { + error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release); + } + return nullptr; + } + int32_t idx = top % capacity; + top++; + used++; + if (used > high_water) high_water = used; + return &base[idx]; + } + + /** + * Advance the tail pointer, reclaiming dead entries. + * Called by the orchestrator based on last_task_alive advancement. + */ + void advance_tail(int32_t new_tail) { + if (new_tail > tail) { + tail = new_tail; + } + } + + /** + * Prepend a task slot state to a dependency list + * + * O(1) operation: allocates new entry and links to current head.
+ * + * @param cur Current list head entry (becomes the new entry's next) + * @param slot_state Task slot state stored in the new entry + * @return New head entry, or nullptr if alloc() failed + */ + PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) { + PTO2DepListEntry *new_entry = alloc(); + if (!new_entry) return nullptr; + new_entry->slot_state = slot_state; + new_entry->next = cur; + return new_entry; + } + + int32_t used() const { return top - tail; } + + int32_t available() const { return capacity - used(); } +}; + +// ============================================================================= +// Ring Set (per-depth aggregate) +// ============================================================================= + +/** + * Groups a TaskAllocator and FaninPool into one per-depth unit. + * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth. + */ +struct PTO2RingSet { + PTO2TaskAllocator task_allocator; + PTO2FaninPool fanin_pool; +}; + +#endif // PTO_RING_BUFFER_H diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp new file mode 100644 index 000000000..263adec8d --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.cpp @@ -0,0 +1,287 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License.
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Main Implementation + * + * Implements the unified runtime API that combines orchestrator and scheduler. + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_runtime2.h" + +#include +#include +#include +#include + +#include + +#include "aicpu/device_time.h" +#include "common/unified_log.h" +#if PTO2_PROFILING +#include "aicpu/scope_stats_collector_aicpu.h" +#endif + +// Weak fallback for HOST .so builds (never called, but satisfies linker). +// The AICPU build links the strong symbol from platform/.../device_time.cpp. +// Hidden visibility prevents HOST .so from polluting global symbol table. +__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; } + +// ============================================================================= +// Orchestration Ops Table (function-pointer dispatch for orchestration .so) +// ============================================================================= + +static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args) { + return rt->orchestrator.submit_task(mixed_kernels, args); +} + +static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const L0TaskArgs &args) { + return rt->orchestrator.alloc_tensors(args); +} + +static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const L0TaskArgs &args) { + return rt->orchestrator.submit_dummy_task(args); +} + +void rt_scope_begin(PTO2Runtime *rt) { /* consumes rt->pending_scope_mode for this scope and resets it to AUTO before opening the scope */ + PTO2ScopeMode mode = rt->pending_scope_mode; + rt->pending_scope_mode = PTO2ScopeMode::AUTO; + rt->orchestrator.begin_scope(mode); +} + +void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); } + +void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); } + +static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; } + +void
rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) { /* a null or empty fmt forwards the error code with no message */ + va_list args; + va_start(args, fmt); + if (fmt == nullptr || fmt[0] == '\0') { + rt->orchestrator.report_fatal(error_code, func, nullptr); + } else { + char message[1024]; /* vsnprintf truncates longer messages to fit this buffer */ + vsnprintf(message, sizeof(message), fmt, args); + rt->orchestrator.report_fatal(error_code, func, "%s", message); + } + va_end(args); +} + +// Wait for all producers of this tensor to be safe for data access. +// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers). +// For reads: wait until each producer COMPLETED (done writing). +// For writes: also wait until all consumers done reading +// (fanout_refcount >= fanout_count - 1, excluding scope reference). +// Uses cycle-based timeout (checked every 1024 spins). +// Returns false on timeout (sets orch.fatal). +MAYBE_UNINITIALIZED_BEGIN +static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) { + PTO2TaskId owner = tensor.owner_task_id; + PTO2OrchestratorState &orch = rt->orchestrator; + + // Segmented wait: collect up to kSegmentCap producer slots, then flush by + // spinning on each. When the segment fills, we wait for the accumulated + // batch before continuing to gather more. Dedup is per-segment only; a + // producer that appears in two segments is waited on twice, which is + // idempotent (task_state is monotonic) and only adds one atomic load on + // the second encounter.
+ constexpr int kSegmentCap = 64; + const PTO2TaskSlotState *seg[kSegmentCap]; + int seg_count = 0; + bool signaled = false; + bool failed = false; + + auto wait_one_producer = [&](const PTO2TaskSlotState &slot) { + uint8_t ring_id = slot.ring_id; + int32_t local_id = static_cast(slot.task->task_id.local()); + uint64_t t0 = get_sys_cnt_aicpu(); + int32_t spin_count = 0; + while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) { + SPIN_WAIT_HINT(); + if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) { + orch.report_fatal( + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, + "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed", + (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id + ); + failed = true; + return; + } + } + }; + + auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) { + uint8_t ring_id = slot.ring_id; + int32_t local_id = slot.task->task_id.local(); + uint64_t t0 = get_sys_cnt_aicpu(); + int32_t spin_count = 0; + while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1) { + SPIN_WAIT_HINT(); + if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) { + orch.report_fatal( + PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, + "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done", + (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id + ); + failed = true; + return; + } + } + }; + + auto flush_segment = [&]() { + for (int i = 0; i < seg_count; i++) { + wait_one_producer(*seg[i]); + if (failed) return; + if (!wait_for_consumers) continue; + wait_one_consumers(*seg[i]); + if (failed) return; + } + seg_count = 0; + }; + + auto try_push = [&](const PTO2TaskSlotState &s) { + for (int j = 0; j < seg_count; j++) { + if (seg[j] == &s) return; // per-segment dedup + } + if (seg_count == kSegmentCap) { + flush_segment(); + if (failed) return; + } + seg[seg_count++] = &s; +
if (!signaled) { + orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release); + signaled = true; + } + }; + + auto do_wait = [&]() { + // Step A: creator retention — read owner directly from tensor metadata + if (owner.is_valid()) { + auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local()); + try_push(s); + if (failed) return; + } + + // Step B: modifier writer lookup (OverlapMap), direct callback + orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool { + PTO2TaskId pid = entry.producer_task_id; + auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local()); + try_push(s); + return !failed; + }); + if (failed) return; + flush_segment(); + }; + + do_wait(); + if (signaled) { + orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release); + } + return !failed; +} +MAYBE_UNINITIALIZED_END + +uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) { + if (tensor.buffer.addr == 0) { + unified_log_error( + __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). " + "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns." + ); + return 0; + } + + if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) { + return 0; + } + + uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims); + uint64_t elem_size = get_element_size(tensor.dtype); + const void *ptr = reinterpret_cast(tensor.buffer.addr + flat_offset * elem_size); + uint64_t result = 0; + /* NOTE(review): copies elem_size bytes into the 8-byte result; assumes get_element_size() never exceeds sizeof(uint64_t) — confirm. Both error paths above return 0, which a caller cannot tell apart from a stored 0. */ memcpy(&result, ptr, elem_size); + return result; +} + +void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) { + if (tensor.buffer.addr == 0) { + unified_log_error( + __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). " + "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
+ ); + return; + } + + // Wait for producer + all consumers before writing (WAW + WAR safety) + if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) { + return; + } + + uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims); + uint64_t elem_size = get_element_size(tensor.dtype); + void *ptr = reinterpret_cast(tensor.buffer.addr + flat_offset * elem_size); + /* NOTE(review): reads elem_size bytes out of the 8-byte value; assumes get_element_size() never exceeds sizeof(uint64_t) — confirm */ memcpy(ptr, &value, elem_size); +} + +// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the +// [ScopeStats] collector. The slot is always present in the struct to keep +// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration +// .so's null-check skips it. +#if PTO2_PROFILING +static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); } +#endif + +static const PTO2RuntimeOps s_runtime_ops = { + .submit_task = submit_task_impl, + .scope_begin = rt_scope_begin, + .scope_end = rt_scope_end, + .orchestration_done = rt_orchestration_done, + .is_fatal = is_fatal_impl, + .report_fatal = rt_report_fatal, + .log_error = unified_log_error, + .log_warn = unified_log_warn, + .log_debug = unified_log_debug, + .log_info_v = unified_log_info_v, + .get_tensor_data = get_tensor_data, + .set_tensor_data = set_tensor_data, + .alloc_tensors = alloc_tensors_impl, + .submit_dummy_task = submit_dummy_task_impl, +#if PTO2_PROFILING + .scope_set_site = scope_set_site_impl, +#else + .scope_set_site = nullptr, +#endif +}; + +// ============================================================================= +// Runtime Lifecycle (AICPU-only fixup) +// ============================================================================= +// +// Layout / init_data / wire / destroy live in +// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the +// prebuilt arena image.
The pieces below — wiring the ops table and the +// SPMD core counts — depend on the device-side s_runtime_ops global and the +// AICPU SchedulerContext respectively, so they remain in the AICPU build. + +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) { /* installs the ops table and records core counts: aic_count -> total_cluster_count, aiv_count -> total_aiv_count. NOTE(review): rt is dereferenced unchecked here, while runtime_set_mode() below tolerates nullptr. */ + rt->ops = &s_runtime_ops; + rt->orchestrator.total_cluster_count = aic_count; + rt->orchestrator.total_aiv_count = aiv_count; +} + +void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) { + if (rt) { + rt->mode = mode; + } +} diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.h new file mode 100644 index 000000000..db4af47ed --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2.h @@ -0,0 +1,291 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Main Interface + * + * This is the main header for the PTO Runtime2 system. + * It provides a unified API for task graph construction and execution.
+ * + * Key Features: + * - Ring buffer based memory management (zero allocation overhead) + * - Lazy invalidation TensorMap for dependency discovery + * - Scope-based buffer lifecycle management + * - Per-task spinlocks for concurrent fanout updates + * - Orchestrator-Scheduler decoupling via shared memory + * + * Usage: + * 1. Create runtime: PTO2Runtime create methods + * 2. Build task graph in orchestration function: + * - begin_scope() / end_scope() + * - submit_task() + * 3. Mark orchestration complete: mark_done() + * 4. Destroy runtime + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include "utils/device_arena.h" +#include "pto_runtime2_types.h" +#include "pto_submit_types.h" +#include "pto_shared_memory.h" +#include "pto_ring_buffer.h" +#include "pto_tensormap.h" +#include "scheduler/pto_scheduler.h" +#include "pto_orchestrator.h" +#include "aicore_completion_mailbox.h" + +// ============================================================================= +// Runtime Context +// ============================================================================= + +/** + * Runtime execution mode (the value held in PTO2Runtime::mode) + */ +enum PTO2RuntimeMode { + PTO2_MODE_EXECUTE = 0, // Execute tasks on workers + PTO2_MODE_SIMULATE = 1, // Simulate task execution with cycle counting + PTO2_MODE_GRAPH_ONLY = 2 // Build graph only, no execution +}; + +/** + * Function-pointer ops table for runtime operations. + * + * The orchestration .so calls runtime functions through this table + * (via pto_orchestration_api.h inline wrappers), so it has zero link + * dependencies on runtime .cpp files.
+ */ +typedef struct PTO2Runtime PTO2Runtime; // forward declare for ops signatures + +struct PTO2RuntimeOps { + TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const L0TaskArgs &args); + void (*scope_begin)(PTO2Runtime *rt); + void (*scope_end)(PTO2Runtime *rt); + void (*orchestration_done)(PTO2Runtime *rt); + bool (*is_fatal)(PTO2Runtime *rt); + void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...); + + // Logging (populated by runtime, called by orchestration) + void (*log_error)(const char *func, const char *fmt, ...); + void (*log_warn)(const char *func, const char *fmt, ...); + void (*log_debug)(const char *func, const char *fmt, ...); + // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside). + void (*log_info_v)(const char *func, int v, const char *fmt, ...); + + // Cross-layer data access (orchestration reads/writes tensor values via runtime) + // Placed after logging to avoid shifting hot-path field offsets. + uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]); + void (*set_tensor_data)( + PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value + ); + TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const L0TaskArgs &args); + TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const L0TaskArgs &args); + + // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats] + // collector. Always present to keep ops-table layout stable across + // PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0. + void (*scope_set_site)(const char *file, int line); +}; + +/** + * Layout descriptor for the prebuilt runtime arena. Holds all sub-region + * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header / + * AICore mailbox) plus the layout-defining capacities. 
Produced once on the + * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout + * and runtime_wire_arena_pointers. + */ +struct PTO2RuntimeArenaLayout { + size_t off_sm_handle{0}; + PTO2OrchestratorLayout orch; + PTO2SchedulerLayout sched; + size_t off_runtime{0}; + size_t off_mailbox{0}; + + // Cached parameters (re-used by init_data + wire stages). + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{}; + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{}; + int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{}; + + // Total arena byte size post-commit. Used by host to size the prebuilt + // image buffer and as the rtMemcpy length. + size_t arena_size{0}; +}; + +/** + * PTO Runtime2 context + * + * Contains all state for orchestration and scheduling. + * In simulated mode, runs in single process with shared address space. + */ +struct PTO2Runtime { + // Ops table (first field — used by orchestration .so via function pointers) + const PTO2RuntimeOps *ops; + PTO2ScopeMode pending_scope_mode; + + // Components + PTO2SharedMemoryHandle *sm_handle; + PTO2OrchestratorState orchestrator; + PTO2SchedulerState scheduler; + AICoreCompletionMailbox *aicore_mailbox; + + // GM Heap for output buffers + void *gm_heap; + uint64_t gm_heap_size; + bool gm_heap_owned; // True if we allocated it + + // Mode + PTO2RuntimeMode mode; + + // Statistics + int64_t total_cycles; + + // Prebuilt-arena fast path metadata. Carries every offset + // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct + // all arena-internal pointer fields without re-running init_data. The + // device base of the runtime arena travels separately on the host-side + // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it + // *before* dereferencing this image. Populated on host by + // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by + // aicpu_executor.cpp. 
+ PTO2RuntimeArenaLayout prebuilt_layout; +}; + +// ============================================================================= +// Runtime Lifecycle API +// ============================================================================= + +/** + * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator / + * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied + * arena. Pure arithmetic; does not touch device memory and may run on host. + * Returns the layout descriptor; caller commits/attaches the arena before + * Phase 2/3. + */ +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE +); +PTO2RuntimeArenaLayout runtime_reserve_layout( + DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH] +); + +/** + * Phase 2 — write the data half of the runtime arena: standalone fields, + * memset'd arena regions, sub-structure initializers, and SM-side device + * pointers. The arena must already be committed (or attached); writes go + * into arena.base() + sub-region offsets. + * + * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store + * them (never dereference). Safe to run on a host arena that owns a host + * mirror of the runtime image — the resulting buffer is rtMemcpy-ready. + * + * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena. + * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the + * AICore-side count fields are left untouched and must be filled by the + * AICPU at boot. 
+ */ +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, uint64_t heap_size +); +PTO2Runtime *runtime_init_data_from_layout( + DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size, + void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] +); + +/** + * Phase 3 — wire every arena-internal pointer field (rt->sm_handle, + * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler, + * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool, + * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on + * both host (writing host-mirror addresses) and AICPU (writing device + * addresses) sides. + */ +void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt); + +/** + * AICPU-only Phase 4 — fill in the few fields the host could not know at + * prebuilt-image build time: the ops table (s_runtime_ops is a device-side + * file-local global, host cannot resolve its device address) and the + * orchestrator's core counts (depend on the executor's scheduler context). + * Call once per boot after runtime_wire_arena_pointers. + */ +void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count); + +/** + * Destroy runtime. With the prebuilt-arena fast path the arena buffer is + * pooled across runs by DeviceRunner, so we never call arena.release() + * here — the destructor only forgets sub-structure pointers (idempotent + * cleanup). 
+ */ +void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena); + +/** + * Set execution mode + */ +void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode); + +// ============================================================================= +// Orchestration API (called by orchestration function) +// ============================================================================= + +/** + * Begin a new scope + * + * All tasks submitted within this scope will have their lifetime + * bounded by the scope. When scope_end() is called, the scope + * releases its reference to all enclosed tasks. + */ +void rt_scope_begin(PTO2Runtime *rt); + +/** + * End current scope + * + * Releases scope reference for all tasks submitted since scope_begin(). + * Tasks whose refcount reaches zero will have their buffers released. + */ +void rt_scope_end(PTO2Runtime *rt); + +/** + * Mark orchestration as complete + * + * Signals that no more tasks will be submitted. + */ +void rt_orchestration_done(PTO2Runtime *rt); + +/** + * Enter fatal state explicitly from orchestration. + */ +void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...); + +/** + * Cross-layer data access: read a tensor value by waiting for its producer. + */ +uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]); + +/** + * Cross-layer data access: write a value to a tensor at given indices. + * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap. + * See set_tensor_data in pto_orchestration_api.h for full documentation. + */ +void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value); + +/** + * Slim config struct exported by orchestration .so via aicpu_orchestration_config(). + * Shared definition with pto_orchestration_api.h (same layout, guarded). 
+ */ +#ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED +#define PTO2_ORCHESTRATION_CONFIG_DEFINED +struct PTO2OrchestrationConfig { + int expected_arg_count; +}; +#endif diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h new file mode 100644 index 000000000..f2715982b --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_runtime2_types.h @@ -0,0 +1,420 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Core Type Definitions + * + * This header defines all fundamental types used by the PTO Runtime2 system: + * - Configuration constants + * - Worker types and task states + * - Tensor regions and task parameters + * - Task descriptors with fanin/fanout tracking + * - Dependency list entries + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_ + +#include +#include +#include + +#include + +#include "profiling_config.h" +#include "pto_constants.h" +#include "pto_runtime_status.h" +#include "pto2_dispatch_payload.h" +#include "aicore_completion_mailbox.h" +#include "pto_submit_types.h" +#include "pto_task_id.h" +#include "pto_types.h" + +// Spin-wait hint for AICPU threads. On real hardware the AICPU has dedicated +// ARM A55 cores — no OS yield is needed, so the hint is a no-op. In simulation +// all threads share host CPU cores, so we yield to prevent starvation. +// This header is also compiled into the Host .so (for struct definitions only), +// where the hint is never called — the fallback no-op keeps Host builds clean. +#if __has_include("spin_hint.h") +#include "spin_hint.h" +#else +#define SPIN_WAIT_HINT() ((void)0) +#endif + +#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING +#include "aicpu/device_time.h" +#endif + +// ============================================================================= +// Configuration Constants +// ============================================================================= + +// Task management +// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value. +// Actual window size is passed at runtime to runtime_reserve_layout(). +// Use pto2_task_slot(sched, task_id) for slot calculation. 
+#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2) + +// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer) +// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1) +#define PTO2_MAX_RING_DEPTH 4 + +// Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH) +#define PTO2_HEAP_SIZE (256 * 1024 * 1024) // 256MB per ring (1GB total) +#define PTO2_DEP_LIST_POOL_SIZE 16384 // Per-ring dependency list pool entries +#define PTO2_TENSORMAP_POOL_SIZE (65536) // TensorMap entry pool +#define PTO2_TENSORMAP_NUM_BUCKETS 4096 // Power of 2 for fast hash (4096×8B=32KB fits L1) + +// Scope management +#define PTO2_MAX_SCOPE_DEPTH 64 // Maximum nesting depth +// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot +// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot +// is in flight, no more tasks can ever be pushed regardless of buffer size. +// scope_tasks_push fatals on overflow rather than growing the arena-owned +// buffer (which would be UB on the arena's malloc'd backing). +#define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH) + +// Ready queue +#define PTO2_READY_QUEUE_SIZE 65536 // Per-shape queue size + +// Wiring queue +#define PTO2_WRIRING_QUEUE_SIZE 1024 // Per-shape queue size + +// Fanin storage +#define PTO2_FANIN_INLINE_CAP 64 + +// TensorMap cleanup interval +#define PTO2_TENSORMAP_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks +#define PTO2_DEP_POOL_CLEANUP_INTERVAL 64 // Cleanup every N retired tasks + +// get_tensor_data/set_tensor_data spin wait timeout in cycles. +// ~10s on hardware (1.5 GHz counter), ~10s on simulation (chrono-based). 
+constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL; + +// ============================================================================= +// Task States +// ============================================================================= + +/** + * Task state enumeration + * + * State transitions: + * PENDING -> COMPLETED -> CONSUMED + * + * The slot stays in PENDING from submit through "ready in queue" and "running + * on a worker"; readiness and running-vs-idle are derived from fanin_refcount + * and per-core running_slot_state respectively, not from task_state itself. + * + * Conditions: + * PENDING->COMPLETED: all subtasks finish (set by scheduler) or task is a + * hidden alloc completed inline by the orchestrator + * COMPLETED->CONSUMED: fanout_refcount == fanout_count && state == COMPLETED + */ +typedef enum { + PTO2_TASK_PENDING = 0, // Submitted; awaiting fanin, queued, or dispatched + PTO2_TASK_COMPLETED = 1, // Execution finished, output may still be in use + PTO2_TASK_CONSUMED = 2 // Output fully consumed, buffers can be released +} PTO2TaskState; + +/** + * Result of a unified task allocation. + */ +struct PTO2TaskAllocResult { + int32_t task_id; // Absolute task ID (not wrapped) + int32_t slot; // task_id & (window_size - 1) + void *packed_base; // Heap allocation result (nullptr if failure) + void *packed_end; // packed_base + aligned output_size + + bool failed() const { return task_id < 0; } +}; + +struct PTO2OutputLayout { + uint64_t offsets[MAX_TENSOR_ARGS] = {}; + uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {}; + int32_t total_output_size = 0; +}; + +// ============================================================================= +// Dependency List Entry +// ============================================================================= + +/** + * Fanin spill entry + * Stored in the dedicated fanin spill ring buffer. 
+ */ +struct PTO2TaskSlotState; // Forward declaration +struct PTO2FaninPool; // Forward declaration +struct PTO2FaninSpillEntry { + PTO2TaskSlotState *slot_state; +}; +static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(uintptr_t)); + +/** + * Dependency list entry (singly-linked list node) + * Stored in DepListPool ring buffer. + */ +struct PTO2DepListEntry { + PTO2TaskSlotState *slot_state; // Consumer slot state (direct pointer) + PTO2DepListEntry *next; // next entry +}; + +// ============================================================================= +// Task Descriptor +// ============================================================================= + +/** + * Task descriptor structure (shared memory) + * + * Stored in the TaskDescriptor ring buffer in shared memory. + * Contains static identification and buffer pointers only. + * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState. + * + * Fields set by Orchestrator at submission, read by Scheduler for dispatch. + */ +struct PTO2TaskDescriptor { + // Mixed-task identification (encodes ring_id in upper 32 bits) + PTO2TaskId task_id; // raw: (ring_id << 32) | local_id + + // Per-slot kernel IDs (INVALID_KERNEL_ID = inactive) + int32_t kernel_id[PTO2_SUBTASK_SLOT_COUNT]; + + // Packed output buffer (all outputs packed into single contiguous buffer) + void *packed_buffer_base; // Start of packed buffer in GM Heap + void *packed_buffer_end; // End of packed buffer (for heap reclamation) +}; + +// ============================================================================= +// Per-Slot Scheduling State +// ============================================================================= + +/** + * Task payload data (cold path - only accessed during orchestration and dispatch) + * + * Layout: metadata + inline fanin packed in the first 9 cache lines, followed + * by bulk tensor and scalar data. Small fanins stay fully inline; larger + * fanins spill into a per-ring ring buffer slice. 
+ */ +struct PTO2TaskPayload { + // === Cache lines 0-8 (576B) — metadata + inline fanin === + int32_t tensor_count{0}; + int32_t scalar_count{0}; + int32_t fanin_actual_count{0}; // Actual fanin count (without the +1 redundancy) + int32_t fanin_spill_start{0}; // Linear start index in fanin spill pool (0 = no spill) + PTO2FaninPool *fanin_spill_pool{nullptr}; + PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP]; + // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) === + Tensor tensors[MAX_TENSOR_ARGS]; + // === Cache lines 73-74 (128B) — scalars === + uint64_t scalars[MAX_SCALAR_ARGS]; + + // Layout verification (size checks that don't need offsetof). + static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines"); + static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)"); + + /** + * Initialize payload: copy tensors, store scalars. + * + * For each param slot, the tensor source is determined by TensorArgType: + * - OUTPUT -> built from args.tensor(i).create_info() at alloc_result.packed_base + layout.offsets[i] + * - INPUT / INOUT -> copied from args.tensor(i).ref() + * + * @param args Task arguments (tensors + scalars) + * @param result Receives each materialized OUTPUT tensor; also supplies owner task_id + */ + void init( + const L0TaskArgs &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout + ) { + tensor_count = args.tensor_count(); + scalar_count = args.scalar_count(); + + // int32_t out_idx = 0; + for (int32_t i = 0; i < args.tensor_count(); i++) { + if (args.tag(i) != TensorArgType::OUTPUT) { + tensors[i].copy(args.tensor(i).ref()); + } else { + init_tensor_from_create_info( + tensors[i], args.tensor(i).create_info(), + reinterpret_cast(reinterpret_cast(alloc_result.packed_base) + layout.offsets[i]), + layout.buffer_sizes[i] + ); + tensors[i].owner_task_id = result.task_id(); + result.materialize_output(tensors[i]); + } + } + // Round up to cache line boundary.
Both arrays are 128B so no overrun. + // Eliminates branches; extra bytes within the same CL have zero additional cost. + memcpy(scalars, args.scalars(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64)); + } +}; + +// PTO2TaskPayload layout verification (offsetof requires complete type). +static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift"); +static_assert( + offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata" +); +static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)"); +static_assert( + offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor), + "scalars must immediately follow tensors" +); +static_assert( + sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t), + "PTO2TaskPayload size must stay on the baseline cache-line footprint" +); + +/** + * Per-task slot scheduling state (scheduler-private, NOT in shared memory) + * + * Consolidates all hot-path scheduling fields into a single cache-friendly + * structure (64 bytes = exactly one cache line). Accessing any field of a task's + * slot state brings all related fields into the same cache line.
+ * + * Concurrency notes: + * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock) + * - fanin_count set once at submission, read-only after (hot path for ready check) + * - task_state, fanin_refcount, fanout_refcount updated atomically + */ +struct alignas(64) PTO2TaskSlotState { + // Fanout lock + list (accessed together under lock in on_task_complete) + std::atomic fanout_lock; // Per-task spinlock (0=unlocked, 1=locked) + int32_t fanout_count; // 1 (owning scope) + number of consumers + + PTO2DepListEntry *fanout_head; // Pointer to first fanout entry (nullptr = empty) + + // Task state (completion, consumed check, ready check) + std::atomic task_state; // PENDING/COMPLETED/CONSUMED + + // Fanin (accessed together in release_fanin_and_check_ready) + std::atomic fanin_refcount; // Dynamic: counts completed producers + int32_t fanin_count; // Number of producer dependencies (set once by wiring) + + // Fanout refcount (accessed with fanout_count in check_and_handle_consumed) + std::atomic fanout_refcount; // Dynamic: counts released references + + // --- Per-slot constant, re-bound by orch::prepare_task each submit --- + // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]), + // but written here per-submit instead of in an O(window_size) init loop — + // these are the only "scale-dependent" pointers in this struct, so moving + // them out of init makes startup cost independent of task_window_size. + PTO2TaskPayload *payload; + PTO2TaskDescriptor *task; + + // --- Set per-submit (depend on task inputs) --- + ActiveMask active_mask; // Bitmask of active subtask slots (set once) + uint8_t ring_id; // Ring layer (immutable after init) + // Set by any subtask FIN that pushed deferred-completion CONDITIONs to + // the runtime mailbox; read by the last subtask FIN to decide MPSC vs + // inline completion. Mirrors a2a3; see that mirror for the full + // memory-order argument. 
Carved out of the padding byte between ring_id + // and dep_pool_mark to keep PTO2TaskSlotState at 64 bytes. + std::atomic any_subtask_deferred{false}; + uint8_t _async_pad{0}; + int32_t dep_pool_mark{0}; // Dep pool top after wiring (thread-0-only) + + std::atomic completed_subtasks{0}; // Each core completion increments by 1 + int16_t total_required_subtasks{0}; // = logical_block_num * popcount(active_mask) + int16_t logical_block_num{1}; // Total logical blocks (set by orchestrator) + int16_t next_block_idx{0}; // Next block to dispatch (scheduler state) + + /** + * Bind the slot-invariant ring id. Called once per slot during + * RingSchedState::init(); ring_id never changes across reuses. + */ + void bind_ring(uint8_t rid) { ring_id = rid; } + + /** + * Re-bind the per-slot payload/task pointers. Called by + * orch::prepare_task on every submit. Value is constant for a given + * slot, but we pay the cheap re-write each submit (both fields land on + * the same 64B slot_state cache line that prepare_task is already + * dirtying) to avoid the init-time per-slot loop. + */ + void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) { + payload = p; + task = t; + } + + /** + * Reset dynamic scheduling fields for slot reuse. + * Called by advance_ring_pointers() after a slot transitions to CONSUMED + * and last_task_alive advances past it, but before sync_to_sm() publishes + * the new last_task_alive to the orchestrator. + * + * Skips ring_id (bound once at init) and payload/task (re-bound by bind_buffers on each submit). + * Skips task_state: left as CONSUMED so that wait_for_tensor_ready() + * callers holding stale owner_task_id still observe a completed state. + * task_state is set to PENDING by the orchestrator when it reuses the slot.
+ */ + void reset_for_reuse() { + fanout_lock.store(0, std::memory_order_relaxed); + fanout_count = 1; + fanout_head = nullptr; + fanin_refcount.store(0, std::memory_order_relaxed); + fanout_refcount.store(0, std::memory_order_relaxed); + completed_subtasks.store(0, std::memory_order_relaxed); + next_block_idx = 0; + any_subtask_deferred.store(false, std::memory_order_relaxed); + } + + // === Per-task fanout spinlock === + // + // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST + // be held whenever reading or writing fanout_head / fanout_count, because + // the orchestrator adds consumers concurrently with the scheduler + // traversing the list after task completion. + +#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING + void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) { + uint64_t t0 = get_sys_cnt_aicpu(); + bool contended = false; + uint32_t atomic_ops = 0; + + for (;;) { + while (fanout_lock.load(std::memory_order_acquire) != 0) { + contended = true; + atomic_ops++; + SPIN_WAIT_HINT(); + } + int32_t expected = 0; + if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) { + atomic_ops++; + atomic_count += atomic_ops; + if (contended) { + wait_cycle += (get_sys_cnt_aicpu() - t0); + } + return; + } + contended = true; + atomic_ops++; + } + } +#endif + + void lock_fanout() { + for (;;) { + while (fanout_lock.load(std::memory_order_acquire) != 0) { + SPIN_WAIT_HINT(); + } + int32_t expected = 0; + if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) { + return; + } + } + } + + void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); } +}; + +static_assert(sizeof(PTO2TaskSlotState) == 64); + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h 
b/src/a5/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h new file mode 100644 index 000000000..cad5cec36 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_shared_memory.h @@ -0,0 +1,270 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Shared Memory Layout + * + * Defines the shared memory structure for Orchestrator-Scheduler communication. + * + * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1): + * +---------------------------+ + * | SharedMemoryHeader | (per-ring flow control + sync) + * +---------------------------+ + * | Ring 0: TaskDescriptor[] | + * | Ring 0: TaskPayload[] | + * | Ring 0: TaskSlotState[] | + * +---------------------------+ + * | Ring 1: TaskDescriptor[] | + * | Ring 1: TaskPayload[] | + * | Ring 1: TaskSlotState[] | + * +---------------------------+ + * | ... 
| + * +---------------------------+ + * + * Design principles: + * - Only data needed for Orchestrator<->Scheduler communication is here + * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory + * - Flow control via atomic counters/flags (no locks needed for single-word R/W) + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include "utils/device_arena.h" +#include "pto_runtime2_types.h" + +// ============================================================================= +// Shared Memory Header +// ============================================================================= + +struct PTO2SharedMemoryHandle; + +/** + * Per-ring flow control state in shared memory. + * Written/read by Orchestrator and Scheduler for synchronization. + */ +struct alignas(64) PTO2RingFlowControl { + // === Cache Line 0: Written by Orchestrator, Read by Scheduler === + alignas(64) std::atomic current_task_index; // Task ring head (next to allocate) + + // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) === + alignas(64) std::atomic last_task_alive; // Task ring tail (oldest active task) + + // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private + // local_task_id_ from initial_local_task_id (default 0 in production) + // *without* dereferencing current_task_index — it relies on this reset + // running on every AICPU boot so 0 stays in sync. If you ever change + // the initial fc value or the boot ordering, update the default in + // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or + // submit IDs will be off by the divergence. 
+ void init() { + current_task_index.store(0, std::memory_order_relaxed); + last_task_alive.store(0, std::memory_order_relaxed); + } + + bool validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const; +}; + +static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)"); + +/** + * Per-ring shared memory header section. + * + * Groups flow-control, layout info, and per-ring data pointers for a single ring. + * Pointers are host-side only (set by setup_pointers, invalid on device). + */ +struct alignas(64) PTO2SharedMemoryRingHeader { + PTO2RingFlowControl fc; + + // Layout metadata (set once at init) + uint64_t task_window_size; + int32_t task_window_mask; + uint64_t heap_size; + uint64_t task_descriptors_offset; // Offset from SM base, in bytes + + // Per-ring data pointers (host-side, set by setup_pointers) + PTO2TaskDescriptor *task_descriptors; + PTO2TaskPayload *task_payloads; + PTO2TaskSlotState *slot_states; + + int32_t get_slot_by_task_id(int32_t local_task_id) { return local_task_id & task_window_mask; } + + PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; } + + PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) { + return task_descriptors[get_slot_by_task_id(local_id)]; + } + + PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; } + + PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[get_slot_by_task_id(local_id)]; } + + PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; } + + PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { + return slot_states[get_slot_by_task_id(local_id)]; + } +}; + +/** + * Shared memory header structure + * + * Contains per-ring flow control and global layout information. 
+ */ +struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader { + // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) === + PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH]; + + // === GLOBAL FIELDS === + std::atomic orchestrator_done; // Flag: orchestration complete + + // Total shared memory size (for validation) + uint64_t total_size; + + // Graph output for copy-back (set by orchestrator when using packed buffer) + // Host finalize copies from this address instead of dev_ptr when non-zero + std::atomic graph_output_ptr; // Address where final output was written (packed buffer) + std::atomic graph_output_size; // Size in bytes + + // === ERROR REPORTING === + + // Orchestrator fatal error code (Orchestrator → Scheduler, AICPU → Host) + // Non-zero signals fatal error. Written by orchestrator, read by scheduler and host. + std::atomic orch_error_code; + + // Scheduler error state (Scheduler → Host, independent of orchestrator) + // Written by scheduler threads on timeout; read by orchestrator and host. + std::atomic sched_error_bitmap; // Bit X set = thread X had error + std::atomic sched_error_code; // Last scheduler error code (last-writer-wins) + std::atomic sched_error_thread; // Thread index of last error writer +}; + +static_assert( + (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096), + "PTO2SharedMemoryHeader should be reasonably sized" +); + +// ============================================================================= +// Shared Memory Handle +// ============================================================================= + +/** + * Handle for shared memory lifecycle management (create/destroy). + * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly. 
+ */ +struct PTO2SharedMemoryHandle { + void *sm_base; // Base address of shared memory + uint64_t sm_size; // Total size of shared memory + + PTO2SharedMemoryHeader *header; + + // Ownership flag + bool is_owner; // True if this handle allocated the memory + + // === Static helpers === + + static uint64_t calculate_size(uint64_t task_window_size); + static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); + + // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init + // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the + // arena is otherwise empty (the call performs the single commit). All + // memory is owned by the arena — caller must not call destroy(). + static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena); + + // === Instance methods === + + // In-place init for caller-provided wrapper storage (e.g. a region carved + // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and + // init_header. Returns false when `sm_size` is too small for the requested + // `task_window_size`. 
+ bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size); + bool init_per_ring( + void *sm_base, uint64_t sm_size, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], + const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] + ); + + void destroy(); + void print_layout(); + bool validate(); + +private: + void init_header(uint64_t task_window_size, uint64_t heap_size); + void init_header_per_ring( + const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] + ); + void setup_pointers(uint64_t task_window_size); + void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]); +}; + +// ============================================================================= +// SM Device Layout Helpers +// ============================================================================= +// +// When the host pre-builds a runtime-arena image, it needs the device-side +// addresses of several SM sub-fields (ring flow-control counters, +// task_descriptors arrays, orch_error_code) so it can wire them into the +// orchestrator / scheduler init_data path without dereferencing the SM — +// the SM lives in device memory and cannot be touched from host. +// +// These helpers compute those addresses by offset arithmetic on the SM +// device base. Pure pointer math, no loads/stores; safe to call from host. +// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's +// own setup_pointers), so values are guaranteed consistent across sides. 
+namespace pto2_sm_layout { + +inline std::atomic *orch_error_code_addr(void *sm_dev_base) noexcept { + return reinterpret_cast *>( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code) + ); +} + +inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast( + static_cast(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + + static_cast(ring_id) * sizeof(PTO2SharedMemoryRingHeader) + ); +} + +inline std::atomic *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, current_task_index) + ); +} + +inline std::atomic *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept { + return reinterpret_cast *>( + reinterpret_cast(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + + offsetof(PTO2RingFlowControl, last_task_alive) + ); +} + +// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring) +// to compute ring `ring_id`'s task_descriptors device address. Accepts a +// per-ring window-size array so the helper's signature mirrors +// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently +// disagree with the SM layout when (hypothetically) ring sizes diverge. 
+inline PTO2TaskDescriptor *ring_task_descriptors_addr( + void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id +) noexcept { + assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range"); + char *p = static_cast(sm_dev_base); + p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < ring_id; r++) { + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + return reinterpret_cast(p); +} + +} // namespace pto2_sm_layout diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_submit_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_submit_types.h new file mode 100644 index 000000000..fa5a5df02 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_submit_types.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Submit Types - Shared submit-contract definitions + * + * Header-only definitions shared by orchestration-facing and runtime-facing + * headers. 
Keeps orchestration slim (no dependency on pto_runtime2_types.h). + */ + +#pragma once + +#include + +inline constexpr int32_t INVALID_KERNEL_ID = -1; + +/** + * Subtask slot count: AIC, AIV0, AIV1 + */ +inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3; + +/** + * Subtask slot indices + */ +enum class PTO2SubtaskSlot : uint8_t { + AIC = 0, + AIV0 = 1, + AIV1 = 2, +}; + +/** + * Subtask mask bits (for ActiveMask) + */ +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0); // 0x1 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1); // 0x2 +inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2); // 0x4 +inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3); // 0x8: all blocks must launch atomically + +/** + * Resource shape — classifies a MixedKernels into one of 3 scheduling buckets. + * + * Multi-subtask tasks (2+ active slots) are all scheduled as MIX. Dispatch + * chooses one cluster, then uses active_mask to decide which cores in that + * cluster must be placed together: all used cores idle -> running placement; + * all used cores already running with free pending slots -> pending placement; + * mixed used-core state is rejected and retried later. + * + * DUMMY is a synthetic shape for dep-only tasks (no AICore dispatch). Tasks + * with an empty core_mask route to a dedicated DUMMY ready queue and are + * completed inline by the scheduler dispatch loop, bypassing core allocation. + */ +enum class PTO2ResourceShape : uint8_t { + AIC = 0, // Single AIC + AIV = 1, // Single AIV + MIX = 2, // Full cluster (dispatch uses active_mask) + DUMMY = 3, // Dependency-only (no AICore dispatch) +}; + +// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not +// allocate a per-shape ready_queue entry / local buffer — it lives in a +// dedicated queue inside PTO2SchedulerState. +inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3; + +/** + * Bitmask of active subtask slots + flags, sizeof == 1. 
+ */ +class ActiveMask { +public: + constexpr ActiveMask() = default; + constexpr explicit ActiveMask(uint8_t raw) : + raw_(raw) {} + + uint8_t raw() const { return raw_; } + + bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast(slot))) != 0; } + + uint8_t core_mask() const { return raw_ & 0x07u; } + + bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; } + + PTO2ResourceShape to_shape() const { + uint8_t cmask = core_mask(); + if (cmask == 0) return PTO2ResourceShape::DUMMY; + int bit_count = __builtin_popcount(cmask); + if (bit_count >= 2) return PTO2ResourceShape::MIX; + if (cmask & PTO2_SUBTASK_MASK_AIC) return PTO2ResourceShape::AIC; + return PTO2ResourceShape::AIV; + } + + void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; } + + bool operator==(ActiveMask other) const { return raw_ == other.raw_; } + bool operator!=(ActiveMask other) const { return raw_ != other.raw_; } + + ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); } + ActiveMask &operator|=(ActiveMask other) { + raw_ |= other.raw_; + return *this; + } + + ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); } + + bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; } + + explicit operator bool() const { return raw_ != 0; } + +private: + uint8_t raw_{0}; +}; + +static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte"); + +/** + * Mixed-task submit contract. + * + * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive). + * At least one slot must be valid. 
+ */ +struct MixedKernels { + int32_t aic_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv0_kernel_id{INVALID_KERNEL_ID}; + int32_t aiv1_kernel_id{INVALID_KERNEL_ID}; + + ActiveMask to_active_mask() const { + uint8_t mask = 0; + if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC; + if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0; + if (aiv1_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV1; + return ActiveMask(mask); + } +}; + +/** + * SPMD launch parameters carried inside Arg. + * + * Controls how many logical blocks (SPMD dimension) a single task + * is expanded into at dispatch time. Each block receives a unique + * block_idx in [0, core_num) via the per-dispatch LocalContext. + */ +class PTO2LaunchSpec { +public: + constexpr PTO2LaunchSpec() = default; + + int16_t core_num() const { return core_num_; } + void set_core_num(int16_t n) { core_num_ = n; } + + bool require_sync_start() const { return require_sync_start_; } + void set_require_sync_start(bool v) { require_sync_start_ = v; } + +private: + int16_t core_num_{1}; + bool require_sync_start_{false}; +}; diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_tensormap.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_tensormap.h new file mode 100644 index 000000000..30017fadd --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_tensormap.h @@ -0,0 +1,723 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - TensorMap Interface + * + * TensorMap provides producer lookup for dependency discovery: + * - Maps Tensor -> producer task ID + * - Used by pto_submit_task() to find dependencies + * + * Key design features: + * 1. Ring buffer pool for entries (no malloc/free) + * 2. Lazy invalidation (entries become stale when producer retires) + * 3. Per-task per-ring entry tracking for efficient cleanup + * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions + * + * Hash table with chaining: + * - buckets[] array of head offsets + * - Entries linked via next_in_bucket + * - Insert at head (newest first) for sorted chains + * + * CRITICAL: Hash only by base_ptr + * ============================== + * For overlap detection to work, ALL sub-regions of the same base tensor + * MUST be in the SAME hash bucket. This allows lookup to compare all + * potentially overlapping regions. + * + * Overlap detection: Two regions create a dependency if: + * 1. Same base_ptr (raw tensor pointer) + * 2. Byte ranges [offset, offset+size) intersect + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include "common.h" +#include "profiling_config.h" +#include "utils/device_arena.h" +#include "pto_runtime2_types.h" +#include "tensor.h" + +// Overlap geometry types. Relocated here from tensor.h: they are used only by +// the runtime's overlap-detection / dependency machinery, not by the +// wire/host-facing Tensor definition. 
+enum class OverlapStatus { + NO_OVERLAP, + COVERED, + OTHER, +}; + +struct Segment { + uint64_t begin; + uint64_t end; + + bool line_segment_intersection(const Segment &other) const { return end > other.begin && other.end > begin; } + bool contains(const Segment &other) const { return begin <= other.begin && other.end <= end; } +}; + +/** + * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the + * region offsets returned by DeviceArena::reserve() so init_from_layout() + * can fetch the matching pointers after the arena is committed. + * + * All offsets are relative to the arena's base. + */ +struct PTO2TensorMapLayout { + size_t off_buckets; + size_t off_entry_pool; + size_t off_free_entry_list; + size_t off_task_entry_heads[PTO2_MAX_RING_DEPTH]; + int32_t num_buckets; + int32_t pool_size; + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; +}; + +// TensorMap Lookup Profiling (must precede inline lookup/insert methods). +#if PTO2_TENSORMAP_PROFILING +extern uint64_t g_lookup_chain_total; +extern uint64_t g_lookup_count; +extern int32_t g_lookup_chain_max; +extern uint64_t g_lookup_overlap_checks; +extern uint64_t g_lookup_overlap_hits; +extern uint64_t g_insert_count; +#endif + +// ============================================================================= +// TensorMap Structure +// ============================================================================= + +/** + * TensorMap entry structure — cache-line optimized for lookup + * + * Cache line 1 (64B, lookup hot path) mirrors Tensor cache line 1 byte-for-byte + * from byte 16 onward, so that `memcpy(this, &tensor, 64)` populates everything + * we need for overlap checks. Bytes [0, 16) carry entry-only fields (hash + * bucket head + chain pointer) that overlap Tensor::buffer (addr in [0, 8) is + * the hash key, size in [8, 16) is unused by the entry — we repurpose it for + * `next_in_bucket`). 
+ * + * buffer_addr / next_in_bucket / producer_task_id — chain traversal + match + * start_offset — overlap byte range begin + * version, ndims, dtype, manual_dep, is_contiguous — overlap fast path + * shapes[5] — overlap comparison (line 1) + * + * Cache line 2 (64B, slow-path / non-contiguous overlap): + * prev_in_bucket / next_in_task / prev_in_task — chain manipulation + * bucket_index — bookkeeping + * extent_elem_cache — overlap byte range end + * strides[5] — reserved for L2 overlap (PR-2) + * + * When both entry & probe are `is_contiguous && start_offset == 0`, the overlap + * check derives `extent_elem = prod(shapes)` from cache line 1 alone. + * + * Entry size: 128B (2 cache lines), matches Tensor. + */ +struct alignas(64) PTO2TensorMapEntry { + // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 === + uint64_t buffer_addr; // 8B [0, 8): tensor base address (hash key, mirrors Tensor::buffer.addr) + PTO2TensorMapEntry *next_in_bucket; // 8B [8, 16): next entry in hash bucket chain (overlays Tensor::buffer.size) + PTO2TaskId producer_task_id; // 8B [16,24): mirrors Tensor::owner_task_id slot + uint64_t start_offset; // 8B [24,32): mirrors Tensor::start_offset (element offset) + int32_t version; // 4B [32,36): mirrors Tensor::version + uint32_t ndims; // 4B [36,40): mirrors Tensor::ndims + DataType dtype; // 1B [40,41): mirrors Tensor::dtype + bool manual_dep; // 1B [41,42): mirrors Tensor::manual_dep + bool is_contiguous; // 1B [42,43): mirrors Tensor::is_contiguous + uint8_t __padding1__; // 1B [43,44): mirrors Tensor padding + uint32_t shapes[MAX_TENSOR_DIMS]; // 20B [44,64): mirrors Tensor::shapes + + // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data === + PTO2TensorMapEntry *prev_in_bucket; // 8B [64, 72) + PTO2TensorMapEntry *next_in_task; // 8B [72, 80) + PTO2TensorMapEntry *prev_in_task; // 8B [80, 88) + int32_t bucket_index; // 4B [88, 92): -1 when unlinked + uint32_t __padding2__; // 4B [92, 
96) + uint64_t extent_elem_cache; // 8B [96,104): non-contiguous extent (mirrors Tensor) + uint32_t strides[MAX_TENSOR_DIMS]; // 20B [104,124): element strides, mirrors Tensor::strides + uint8_t __padding3__[4]; // 4B [124,128) + + /** + * Copy overlap-relevant fields from a Tensor into this entry. + * + * 64B memcpy of Tensor cache line 1 populates buffer_addr (byte [0,8)), + * producer_task_id, start_offset, version, ndims, dtype, manual_dep, + * is_contiguous and shapes[]. Byte [8,16) holds Tensor::buffer.size in + * the source and gets written into next_in_bucket; that's harmless + * because link_entry() overwrites next_in_bucket immediately after. + * + * Cache line 2 (stride / extent_elem_cache) is derived from line 1 when + * the source is canonically contiguous (is_contiguous && start_offset==0), + * so the producer Tensor's cache line 2 stays cold during insert. Only + * non-contiguous producers pay one extra line 2 read. + */ + void copy_from_tensor(const Tensor &tensor) { + memcpy(this, &tensor, 64); + if (tensor.is_contiguous && tensor.start_offset == 0) { + uint64_t numel = 1; + for (uint32_t i = 0; i < tensor.ndims; i++) + numel *= tensor.shapes[i]; + extent_elem_cache = numel; + uint32_t s = 1; + for (int32_t i = static_cast(tensor.ndims) - 1; i >= 0; i--) { + strides[i] = s; + s *= tensor.shapes[i]; + } + } else { + extent_elem_cache = tensor.extent_elem_cache; + for (uint32_t i = 0; i < tensor.ndims; i++) { + strides[i] = tensor.strides[i]; + } + } + } + + void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) { + memcpy(this, &tensor_create_info, 64); + buffer_addr = addr; + // Create-info outputs are always contiguous with start_offset = 0; + // extent_elem = prod(shapes); stride is row-major. 
+ uint64_t numel = 1; + for (uint32_t i = 0; i < tensor_create_info.ndims; i++) { + numel *= tensor_create_info.shapes[i]; + } + extent_elem_cache = numel; + uint32_t s = 1; + for (int32_t i = static_cast(tensor_create_info.ndims) - 1; i >= 0; i--) { + strides[i] = s; + s *= tensor_create_info.shapes[i]; + } + } + + /** + * Effective element extent of this entry. + * Contiguous-aligned views compute it from shapes alone (line 1 hit only); + * non-contiguous views read the cached value from line 2. + */ + uint64_t effective_extent_elem() const { + if (is_contiguous) { + uint64_t n = 1; + for (uint32_t i = 0; i < ndims; i++) + n *= shapes[i]; + return n; + } + return extent_elem_cache; + } + + /** + * Check overlap between input tensor and this entry (the producer output). + * + * Three-level cascade: + * L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP. + * L2 — O(ndims) hyper-rectangle precise check, eligible only when both + * sides share the same canonical row-major axis layout (same + * dtype/ndims/strides[], stride descends as integer multiples, + * start_offset decomposes cleanly under the reference shape). + * Yields NO_OVERLAP / COVERED / OTHER per-dim. + * L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice + * with step, etc): conservative OTHER. Exact enumeration via + * contiguous-segment merge is scheduled for a follow-up. + * + * COVERED is returned when `input` completely contains `entry` per-dim + * — dep_compute uses this to retire the now-redundant entry. 
+ */ + OverlapStatus check_overlap(const Tensor &input) const { + debug_assert(input.buffer.addr == buffer_addr); + debug_assert(input.version >= version); + if (input.version > version) { + return OverlapStatus::OTHER; + } + + // -------- L1: 1-D range intersection (O(1) fast reject) -------- + // The ranges are in element units (start_offset and the extents are element + // counts), despite the *_bytes variable names. + // NOTE(review): this runs before the dtype equality check below, so each side + // is measured in its own element size — confirm that views of one buffer with + // different element sizes cannot reach this point. + const uint64_t in_begin = input.start_offset; + const uint64_t in_end = input.start_offset + input.extent_elem(); + const uint64_t ent_begin = start_offset; + const uint64_t ent_end = start_offset + effective_extent_elem(); + Segment in_range_bytes{in_begin, in_end}; + Segment ent_range_bytes{ent_begin, ent_end}; + if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) { + return OverlapStatus::NO_OVERLAP; + } + + // -------- L2 prereqs: same axis layout? -------- + if (input.dtype != dtype || input.ndims != ndims || ndims == 0) { + return OverlapStatus::OTHER; + } + for (uint32_t i = 0; i < ndims; i++) { + if (input.strides[i] != strides[i]) return OverlapStatus::OTHER; + } + // strides[ndims-1] must be 1 and strides[i-1] must be an integer + // multiple of strides[i] for the row-major reference-shape derivation + // below to hold. This rejects slice-with-step (strides[d] != prev factor) + // and any view chain that scrambles the axis order. (strides is + // uint32_t with the > 0 invariant enforced at construction, so no + // sign check needed.) + if (strides[ndims - 1] != 1) return OverlapStatus::OTHER; + for (uint32_t i = 1; i < ndims; i++) { + if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER; + } + + // Derive reference shape A from stride. By construction stride is + // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So + // A[i] = strides[i-1] / strides[i] for i >= 1 + // A[0] = (buffer.size / dtype_bytes) / strides[0] + // input.buffer.size is the storage size; entry shares the same buffer + // (debug-asserted by buffer.addr equality at the top), so we read it + // from input rather than mirroring buffer.size into the entry. 
+ // + // Note on buffer padding: runtime allocators may over-allocate + // `buffer.size` (cache-line / 1024B alignment, ring-buffer slot + // rounding, etc). When that happens, `numel_storage` is larger than + // the true logical extent and `ref_shapes[0]` ends up generously over- + // sized. This is intentional: ref_shapes is only used as an *upper + // bound* in the in-bounds checks below; the actual overlap test (the + // per-dim line-segment intersection on the real start_offset / + // shapes / stride further down) is unaffected. A larger-than-truth + // ref_shapes[0] simply makes the bounds check more permissive — it + // can never cause a false NO_OVERLAP nor a false COVERED. + uint32_t ref_shapes[MAX_TENSOR_DIMS] = {}; + for (uint32_t i = 1; i < ndims; i++) { + ref_shapes[i] = strides[i - 1] / strides[i]; + } + const uint64_t elem_size = get_element_size(dtype); + if (elem_size == 0) return OverlapStatus::OTHER; + const uint64_t numel_storage = input.buffer.size / elem_size; + const uint32_t stride0 = strides[0]; // > 0 by Tensor invariant + if (numel_storage % stride0 != 0) return OverlapStatus::OTHER; + ref_shapes[0] = static_cast(numel_storage / stride0); + + // Decompose start_offset into row-major multi-dim offsets. By the same + // relation strides[i] = prod(ref_shapes[i+1..]) so dividing by strides[i] + // (no inner loop) yields each axis offset directly. + uint32_t in_offsets[MAX_TENSOR_DIMS] = {}; + uint32_t ent_offsets[MAX_TENSOR_DIMS] = {}; + uint64_t in_remain = input.start_offset; + uint64_t ent_remain = start_offset; + for (uint32_t i = 0; i < ndims; i++) { + const uint32_t s = strides[i]; + in_offsets[i] = static_cast(in_remain / s); + ent_offsets[i] = static_cast(ent_remain / s); + in_remain %= s; + ent_remain %= s; + } + if (in_remain != 0 || ent_remain != 0) return OverlapStatus::OTHER; + + // Validate that each side fits within ref_shapes (defense in depth — + // a well-formed view always satisfies this). 
+ for (uint32_t i = 0; i < ndims; i++) { + if (static_cast(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER; + if (static_cast(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER; + } + + // -------- L2 core: per-dim line-segment intersection -------- + bool input_contains_entry = true; + for (uint32_t i = 0; i < ndims; i++) { + Segment in_seg{in_offsets[i], static_cast(in_offsets[i]) + input.shapes[i]}; + Segment ent_seg{ent_offsets[i], static_cast(ent_offsets[i]) + shapes[i]}; + if (!in_seg.line_segment_intersection(ent_seg)) { + return OverlapStatus::NO_OVERLAP; + } + if (!in_seg.contains(ent_seg)) { + input_contains_entry = false; + } + } + return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER; + } +}; + +// Layout guards: copy_from_tensor() memcpy's the first 64 bytes of a Tensor over +// this entry, so every mirrored field must sit at the same offset as its Tensor +// counterpart, and the chain-manipulation fields must start on cache line 2. +static_assert(sizeof(PTO2TensorMapEntry) == 128, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"); +static_assert(offsetof(PTO2TensorMapEntry, buffer_addr) == offsetof(Tensor, buffer.addr)); +static_assert(offsetof(PTO2TensorMapEntry, producer_task_id) == offsetof(Tensor, owner_task_id)); +static_assert(offsetof(PTO2TensorMapEntry, start_offset) == offsetof(Tensor, start_offset)); +static_assert(offsetof(PTO2TensorMapEntry, version) == offsetof(Tensor, version)); +static_assert(offsetof(PTO2TensorMapEntry, ndims) == offsetof(Tensor, ndims)); +static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype)); +static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep)); +static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous)); +static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes)); +static_assert( + offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry::prev_in_bucket must start cache line 2 (offset 64)" +); + +// ============================================================================= +// TensorMap Lookup Chain Length Statistics (compile-time toggle) +// 
============================================================================= + +/** + * TensorMap structure + * + * Hash table with a fixed-capacity entry pool (bump allocation + free list) and lazy invalidation. + */ +struct PTO2TensorMap { + // Hash table buckets (fixed size, power of 2) + PTO2TensorMapEntry **buckets; // Array of bucket-chain head pointers (nullptr = empty) + int32_t num_buckets; // Must be a power of 2 (hash() derives its shift from __builtin_ctz) + + // Entry pool: never-used entries are bump-allocated, freed entries are recycled via free_entry_list + PTO2TensorMapEntry *entry_pool; // Backing array of entries (capacity pool_size) + PTO2TensorMapEntry **free_entry_list; // Stack of pointers to freed entries + int32_t pool_size; // Total pool capacity + int32_t next_entry_idx; // Index of the next never-allocated entry in entry_pool + int32_t free_num; // Number of entries currently on free_entry_list + + // Per-ring per-task entry tracking (for efficient bucket cleanup) + // Indexed by [ring_id][local_id & (task_window_sizes[ring_id] - 1)] + PTO2TensorMapEntry **task_entry_heads[PTO2_MAX_RING_DEPTH]; + int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]; // Per-ring task window size (for slot masking) + + // Per-ring validity threshold (for lazy invalidation) + int32_t last_task_alives[PTO2_MAX_RING_DEPTH]; // Cached from shared memory per ring + + // Per-ring cleanup progress (for periodic cleanup_retired) + int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{}; + + uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const { + return task_local_id & (task_window_sizes[ring_id] - 1); + } + + // Accessors read by scope_stats_collector. Declared unconditionally so the + // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional — + // setter symbols must export for host dlsym; the probe call sites that use + // these accessors stay gated by PTO2_PROFILING). 
+ int32_t current_used() const { return next_entry_idx - free_num; } + int32_t pool_capacity() const { return pool_size; } + + // new_entry pops a recycled entry from the free list, else bump-allocates from + // entry_pool (asserting the pool is not exhausted). It does not assign attributes. + PTO2TensorMapEntry *new_entry() { + if (free_num > 0) { + PTO2TensorMapEntry *res = free_entry_list[--free_num]; + debug_assert(res->bucket_index == -1); + return res; + } + always_assert(next_entry_idx < pool_size); + PTO2TensorMapEntry *res = &entry_pool[next_entry_idx++]; + debug_assert(res->bucket_index == -1); + return res; + } + + // free_entry unlinks `entry` from its bucket chain and pushes it on the free list. + // Neighbours on the task chain are NOT patched here; only the entry's own + // next_in_task / prev_in_task are cleared. + void free_entry(PTO2TensorMapEntry &entry) { + always_assert(entry.bucket_index != -1); // must still be in a bucket + + // Update predecessor's next pointer (O(1) via prev_in_bucket) + if (entry.prev_in_bucket == nullptr) { + // Entry is the head of its bucket chain, update bucket head + // The stored bucket_index locates the bucket; the address is not re-hashed + buckets[entry.bucket_index] = entry.next_in_bucket; + } else { + entry.prev_in_bucket->next_in_bucket = entry.next_in_bucket; + } + + // Update successor's prev pointer + if (entry.next_in_bucket != nullptr) { + entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket; + } + + free_entry_list[free_num++] = &entry; + entry.bucket_index = -1; + entry.next_in_bucket = nullptr; + entry.prev_in_bucket = nullptr; + entry.next_in_task = nullptr; + entry.prev_in_task = nullptr; + } + + // ============================================================================= + // TensorMap API + // ============================================================================= + + /** + * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring + * task_entry_heads) on the supplied arena. Records the resulting offsets in + * the returned layout descriptor. Must be called before the arena is + * committed. 
+ */ + static PTO2TensorMapLayout reserve_layout( + DeviceArena &arena, int32_t num_buckets, int32_t pool_size, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH] + ); + + /** + * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS, + * PTO2_TENSORMAP_POOL_SIZE). + */ + static PTO2TensorMapLayout + reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]); + + /** + * Phase 3a: write everything *except* arena-internal pointer fields + * (buckets, entry_pool, free_entry_list, task_entry_heads[r]). + * Uses arena.region_ptr to address the arena regions for data writes, + * but does not store those addresses in struct fields. Safe to call on + * a host arena that holds the prebuilt image. + */ + bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Phase 3b: write the arena-internal pointer fields. Idempotent; + * called once on the host arena and once on the AICPU after attach. + */ + void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena); + + /** + * Tear down state. Does not free memory — the arena owns the backing + * buffer. Pointers are set to nullptr so accidental reuse traps. + */ + void destroy(); + + /** + * Update validity threshold from shared memory + * Called periodically to refresh the lazy invalidation threshold. + * + * @param last_task_alive Current value from shared memory + */ + void sync_validity(int32_t ring_id, int32_t last_task_alive) { this->last_task_alives[ring_id] = last_task_alive; } + + /** + * Lookup producer for a tensor region + * + * Searches the hash table for matching regions and invokes the callback + * for each overlapping valid entry. + * Stale entries from different rings are skipped (not truncated). + * + * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should + * return true to continue iteration, false to stop early. 
It is safe for + * the callback to call remove_entry() on the current entry: next_in_bucket + * is latched before invocation. + * + * @param tensor Tensor to look up + * @param on_match Callback invoked for each overlapping entry + */ + template + void lookup(const Tensor &tensor, Fn &&on_match) { + uint32_t bucket_index = hash(tensor.buffer.addr); + PTO2TensorMapEntry *cur_entry = buckets[bucket_index]; + +#if PTO2_TENSORMAP_PROFILING + g_lookup_count++; + int32_t chain_len = 0; +#endif + + while (cur_entry != nullptr) { + PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket; + +#if PTO2_TENSORMAP_PROFILING + chain_len++; +#endif + // Skip stale entries (no chain truncation — entries from different + // rings can be interleaved, so a stale entry from one ring does NOT + // imply subsequent entries from other rings are also stale) + if (!entry_valid(*cur_entry)) { + cur_entry = next_entry; + continue; + } + + // Entry is valid - check if regions OVERLAP (not just exact match) + // Since we hash only by base_ptr, all entries in this bucket have + // potential to overlap. We must check actual byte-range overlap. + if (tensor.buffer.addr == cur_entry->buffer_addr) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_checks++; +#endif + auto overlap_status = cur_entry->check_overlap(tensor); + if (overlap_status != OverlapStatus::NO_OVERLAP) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_overlap_hits++; +#endif + if (!on_match(*cur_entry, overlap_status)) { +#if PTO2_TENSORMAP_PROFILING + g_lookup_chain_total += chain_len; + if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len; +#endif + return; + } + } + } + + // Move to next entry + cur_entry = next_entry; + } +#if PTO2_TENSORMAP_PROFILING + g_lookup_chain_total += chain_len; + if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len; +#endif + } + + /** + * Insert a new entry (called when task produces output) + * + * Allocates from ring buffer pool, may overwrite stale entries. 
+ * Inserts at head of hash bucket chain (maintains task_id ordering). + * + * @param tensor Tensor produced + * @param producer_task_id Task ID of producer + */ + void insert(const Tensor &tensor, PTO2TaskId producer_task_id) { + PTO2TensorMapEntry *entry = new_entry(); + entry->copy_from_tensor(tensor); + link_entry(entry, tensor.buffer.addr, producer_task_id); + } + + /** + * Cleanup stale entries for retired tasks + * + * Called periodically by Orchestrator when last_task_alive advances. + * Removes entries from bucket chains for tasks in [old, new) range. + * @param ring_id Ring whose retired tasks are cleaned (selects task_entry_heads[ring_id]) + * @param old_last_task_alive Previous threshold + * @param new_last_task_alive New threshold + */ + void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) { + // Iterate through retired tasks on this ring and remove their entries + for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) { + int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1); // mask used as modulo: presumes a power-of-two window size — TODO confirm + PTO2TensorMapEntry *cur_entry = task_entry_heads[ring_id][task_slot]; + + while (cur_entry != nullptr) { + PTO2TensorMapEntry *next_entry = cur_entry->next_in_task; // Save before clearing + // Every entry on this chain is expected to belong to the retiring task; + // that is checked by debug_assert only — the entry is freed either way. + debug_assert( + cur_entry->producer_task_id == + PTO2TaskId::make(static_cast(ring_id), static_cast(local_id)) + ); + free_entry(*cur_entry); + cur_entry = next_entry; + } + + // Clear task's entry head (slot will be reused by local_id + task_window_sizes[ring_id]) + task_entry_heads[ring_id][task_slot] = nullptr; + } + } + + // ============================================================================= + // Internal Helpers (exposed for testing) + // ============================================================================= + + /** + * Compute hash for tensor addr + * NOTE(review): presumes num_buckets is a power of two >= 2 (num_buckets == 1 would make the shift count 64) — TODO confirm + * Multiplicative hash using the golden-ratio constant.
Multiplication + * mixes ALL input bits into the high bits of the product, so aligned + * addresses (low bits all-zero) still distribute evenly. We extract + * the top log2(num_buckets) bits which carry the most entropy. + */ + uint32_t hash(uint64_t key) { + key *= 0x9E3779B97F4A7C15ULL; + return static_cast(key >> (64 - __builtin_ctz(num_buckets))); + } + + /** + * Link an initialized entry at the head of its hash-bucket chain and of its producer task's chain. + */ + void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) { +#if PTO2_TENSORMAP_PROFILING + g_insert_count++; +#endif + uint32_t bucket_index = hash(addr); + auto ring_id = producer_task_id.ring(); + auto local_id = producer_task_id.local(); + int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1); + + entry->producer_task_id = producer_task_id; + + // Insert at head of hash bucket + entry->bucket_index = bucket_index; + entry->next_in_bucket = buckets[bucket_index]; + if (entry->next_in_bucket != nullptr) { + entry->next_in_bucket->prev_in_bucket = entry; + } + buckets[bucket_index] = entry; + entry->prev_in_bucket = nullptr; + + // Push onto the head of the producer task's entry list + entry->next_in_task = task_entry_heads[ring_id][task_slot]; + entry->prev_in_task = nullptr; + if (entry->next_in_task != nullptr) { + entry->next_in_task->prev_in_task = entry; + } + task_entry_heads[ring_id][task_slot] = entry; + } + + /** + * Check if entry is valid: the producer's local id is at or above its ring's last_task_alives threshold + */ + bool entry_valid(const PTO2TensorMapEntry &entry) const { + return static_cast(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()]; + } + + void remove_entry(PTO2TensorMapEntry &entry) { // unlink from the task chain, then hand the entry to free_entry() + remove_from_task(entry); + free_entry(entry); + } + + /** + * Remove entry from its task chain (O(1) with prev pointer) + * Called by remove_entry() and during pool wrap-around to unlink reused entries.
+ */ + void remove_from_task(PTO2TensorMapEntry &entry) { + always_assert(entry.bucket_index != -1); // must still be in a bucket (-1 is the value rejected here) + // Update predecessor's next pointer (O(1) via prev_in_task) + if (entry.prev_in_task == nullptr) { + // Entry is the head of its task chain, update task_entry_heads + int32_t ring_id = entry.producer_task_id.ring(); + int32_t local_id = static_cast(entry.producer_task_id.local()); + int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1); + task_entry_heads[ring_id][task_slot] = entry.next_in_task; + } else { + entry.prev_in_task->next_in_task = entry.next_in_task; + } + + // Update successor's prev pointer + if (entry.next_in_task != nullptr) { + entry.next_in_task->prev_in_task = entry.prev_in_task; + } + + entry.next_in_task = nullptr; // clear the entry's own task links; its bucket links are not touched here + entry.prev_in_task = nullptr; + } + + // ============================================================================= + // Debug Utilities + // ============================================================================= + + /** + * Print TensorMap statistics + */ + void print_stats(); + + /** + * Get count of valid entries + */ + int32_t valid_count(); + + // ============================================================================= + // TensorMap Synchronization + // ============================================================================= + + /** + * Sync TensorMap validity threshold from shared memory + * + * Called periodically to refresh the lazy invalidation threshold. + * Also triggers cleanup if threshold has advanced significantly.
+ */ + void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive); +}; + +#if PTO2_TENSORMAP_PROFILING +struct PTO2TensorMapProfilingData { // fields presumably snapshot the g_lookup_* / g_insert_count counters bumped in lookup() and link_entry() — confirm in pto2_tensormap_get_profiling() + uint64_t lookup_chain_total; + uint64_t lookup_count; + int32_t lookup_chain_max; + uint64_t overlap_checks; + uint64_t overlap_hits; + uint64_t insert_count; +}; + +PTO2TensorMapProfilingData pto2_tensormap_get_profiling(); +#endif diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/pto_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/pto_types.h new file mode 100644 index 000000000..669771424 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/pto_types.h @@ -0,0 +1,602 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Orchestration Build Graph Types - Data structures for orchestration runtime extensions + * + * Standalone header defining orchestration-specific types for: + * - TaskOutputTensors: Return value from submit containing materialized output Tensors + * - Arg: Aggregated argument container for pto_submit_task API + * + * Tensor descriptor types (Tensor, PTOBufferHandle) are defined in tensor.h; + * TensorCreateInfo comes from tensor_create_info.h (see the includes below).
+ * + * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h + * without type conflicts (Handshake, TensorPair, HostApi). + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ // NOTE(review): guard names the tensormap_and_ringbuffer path although this file is added under fully_distributed_within_core — confirm the two headers never meet in one translation unit, or rename the guard + +#include +#include + +#include +#include +#include + +#if defined(__aarch64__) +#include +#endif + +#include "aicpu/dump_arg_selection.h" +#include "data_type.h" +#include "profiling_config.h" +#include "pto_submit_types.h" +#include "task_args.h" +#include "tensor.h" +#include "tensor_create_info.h" // runtime-only TensorCreateInfo + materialization helpers + +typedef enum { + ASYNC_ENGINE_SDMA = 0, + ASYNC_ENGINE_ROCE = 1, + ASYNC_ENGINE_URMA = 2, + ASYNC_ENGINE_CCU = 3, + NUM_ASYNC_ENGINES = 4, // count of the engines listed above, not an engine itself +} AsyncEngine; + +enum class CompletionType : int32_t { + COUNTER = 0, +}; + +// ============================================================================= +// Task Output Tensors (return value from submit) +// ============================================================================= + +enum class PTO2ScopeMode : uint8_t { + AUTO = 0, + MANUAL = 1, +}; + +/** + * TaskOutputTensors — returned by submit, holds materialized output Tensors. + * + * Only runtime-created outputs are stored here, indexed in add_output order. + * + * The underlying storage is uninitialized; only output_count elements are + * valid after submit returns. This avoids default-constructing Tensor[] + * on the hot path (2 KB of unnecessary zeroing per submit). + * + * Users must hold a named TaskOutputTensors variable and borrow via get_ref(); + * binding get_ref() on an rvalue is compile-time rejected to prevent dangling. + * + * LIFETIME — single-scope only: + * Internally this class stores pointers into the submitting task's payload + * (PTO2TaskPayload::tensors[]), which lives in a ring-buffer slot.
After + * scope_end the slot becomes eligible for reuse, and a later submit will + * overwrite the same Tensor storage in place. Therefore the + * TaskOutputTensors instance, the const Tensor& returned by get_ref(), and + * any pointer derived from either MUST NOT outlive the PTO2_SCOPE in which + * submit was called — do not move/copy them to outer-scope variables, do + * not capture references by std::reference_wrapper or raw pointers across + * scope boundaries. + * + * This invariant is intentionally not enforced at runtime: a reused slot + * simply carries a different but valid owner_task_id, so checking + * owner_task_id cannot distinguish "still mine" from "silently aliased to + * an unrelated task". Misuse manifests as a wrong-tensor read with no + * diagnostic. + */ +class TaskOutputTensors { +public: + TaskOutputTensors() : + task_id_(PTO2TaskId::invalid()), + output_count_(0) {} // tensors_[] is left uninitialized on purpose; only [0, output_count_) is ever read + + bool empty() const { return output_count_ == 0; } + uint32_t size() const { return output_count_; } + + /// Borrow a materialized output tensor by index (lvalue only); asserts index < size(). + const Tensor &get_ref(uint32_t index) const & { + always_assert(index < output_count_); + return *tensors_[index]; + } + const Tensor &get_ref(uint32_t index) const && = delete; // rvalue receiver rejected at compile time + + /// Runtime-internal: append one materialized output Tensor (stores its address, not a copy). + void materialize_output(const Tensor &tensor) { + always_assert(output_count_ < MAX_TENSOR_ARGS); + tensors_[output_count_++] = &tensor; + } + + void set_task_id(PTO2TaskId id) { task_id_ = id; } + + PTO2TaskId task_id() const { return task_id_; } + +private: + PTO2TaskId task_id_; + uint32_t output_count_; + // Upper bound: a task cannot have more outputs than total tensor args + // (every OUTPUT/OUTPUT_EXISTING slot is one of the Arg's tensor slots).
+ const Tensor *tensors_[MAX_TENSOR_ARGS]; +}; + +// ============================================================================= +// Argument Types (for pto_submit_task API) +// ============================================================================= + +// TensorArgType is defined in tensor.h (included via task_args.h above) + +/** + * Tagged reference to a single Arg slot — either a Tensor* or a + * TensorCreateInfo*. The active member is determined by the slot's + * TensorArgType tag (OUTPUT → create_info, else → tensor pointer). + * + * Minimal-permission: the union members are private; content is set only via + * operator=(ptr) and read via ref()/create_info()/refers_to(). Copy/move are deleted — a + * TensorRef is written in place inside an Arg's slot array, never passed by + * value. + */ +class TensorRef { + union { + const Tensor *ptr_; + const TensorCreateInfo *create_info_; + }; + +public: + TensorRef() : + ptr_(nullptr) {} + TensorRef(const TensorRef &) = delete; + TensorRef(TensorRef &&) = delete; + TensorRef &operator=(const TensorRef &) = delete; + TensorRef &operator=(TensorRef &&) = delete; + + TensorRef &operator=(const Tensor *p) { + ptr_ = p; + return *this; + } + TensorRef &operator=(const TensorCreateInfo *ci) { + create_info_ = ci; + return *this; + } + + const Tensor &ref() const { return *ptr_; } // no tag or null check here: the caller picks the accessor matching the slot's tag + const TensorCreateInfo &create_info() const { return *create_info_; } + bool refers_to(const Tensor *t) const { return ptr_ == t; } // address identity, not value equality + bool refers_to(const TensorCreateInfo *ci) const { return create_info_ == ci; } +}; + +/** + * Aggregated argument container for pto_submit_task + * + * Inherits storage from TaskArgsTpl. + * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo*) + * discriminated by the corresponding tag(). + * Tensors are dispatched first in kernel args, followed by scalars.
+ * + * Output arguments follow two distinct ownership models: + * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer + * and materializes a new Tensor, returned via TaskOutputTensors. + * - add_output(const Tensor&) / add_inout(const Tensor&): OUTPUT_EXISTING / INOUT — an existing Tensor is the write target. + * + * Example: + * Tensor x = make_tensor_external(dev_a, shapes, 2); + * TensorCreateInfo ci(shapes, 2); // must outlive submit + * Arg args; + * args.add_input(x); + * args.add_output(ci); + * args.add_scalar(some_value); + * TaskOutputTensors outs = rt_submit_aic_task(kernel_id, args); + * const Tensor& y = outs.get_ref(0); + */ +template +struct Arg : TaskArgsTpl { + using Base = TaskArgsTpl; + // Make dependent-base members visible for unqualified use (two-phase lookup + // does not search a dependent base in a class template). + using Base::scalar_count_; + using Base::scalars_; + using Base::tags_; + using Base::tensor_count_; + using Base::tensors_; + + // Minimal-permission: an Arg is built in place and consumed by reference; + // it is never copied/moved (it is a large object, and its TensorRef slots + // are non-copyable by design). + Arg() = default; + Arg(const Arg &) = delete; + Arg(Arg &&) = delete; + Arg &operator=(const Arg &) = delete; + Arg &operator=(Arg &&) = delete; + + bool has_error{false}; // sticky: set_error() records only the first failure; reset() clears it + const char *error_msg{nullptr}; // set together with has_error by set_error(); stored by pointer, not copied + PTO2LaunchSpec launch_spec; // SPMD launch parameters (block_num, etc.)
+ + void clear() { // drop args, dump selection and explicit deps; has_error, error_msg and launch_spec are left as they are + Base::clear(); +#if PTO2_PROFILING + dump_arg_selection_.clear(); +#endif + explicit_deps_ = nullptr; + explicit_dep_count_ = 0; + } + + void reset() { // clear() plus wiping the sticky error state + clear(); + has_error = false; + error_msg = nullptr; + } + + void set_error(const char *msg) { // first error wins; later messages are ignored + if (!has_error) { + has_error = true; + error_msg = msg; + } + } + + template + void dump(Args &&...args) { // select args for dumping; with no arguments, selects everything added so far; no-op unless PTO2_PROFILING +#if PTO2_PROFILING + static_assert( + (std::is_lvalue_reference_v && ...), + "dump: temporaries are not allowed — pass tensors/scalars already added to this Arg" + ); + static_assert( + (is_supported_dump_arg_v && ...), + "dump: all arguments must be Tensor, TensorCreateInfo, or scalar lvalues" + ); + if constexpr (sizeof...(Args) == 0) { + mark_all_dump_args(); + } else { + (mark_dump_arg(args), ...); + } +#else + ((void)args, ...); +#endif + } + +#if PTO2_PROFILING + uint64_t dump_arg_mask() const { return dump_arg_selection_.dump_arg_mask(); } + uint64_t dump_arg_index_ambiguous_mask() const { return dump_arg_selection_.dump_arg_index_ambiguous_mask(); } +#else + uint64_t dump_arg_mask() const { return 0; } + uint64_t dump_arg_index_ambiguous_mask() const { return 0; } +#endif + + template + void add_input(Args &&...args) { // append lvalue Tensors as INPUT slots (addresses are stored, not copies) + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) { + return; + } + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INPUT, tensor_count_++), ...); + } + + /// Batch add outputs — all Tensor or all TensorCreateInfo: + /// add_output(ci1, ci2) — runtime allocates buffers (OUTPUT) + /// add_output(t1, t2) — write-only existing tensors (OUTPUT_EXISTING) + template + void add_output(Args &&...args) { + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) return; + if constexpr ((std::is_same_v, TensorCreateInfo> && ...)) { + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::OUTPUT, tensor_count_++), ...); + } else { + ((tensors_[tensor_count_] = 
&args, tags_[tensor_count_] = TensorArgType::OUTPUT_EXISTING, tensor_count_++), + ...); + } + } + + template + void add_inout(Args &&...args) { // append lvalue Tensors as INOUT slots; a batch that does not fit adds nothing + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) { + return; + } + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::INOUT, tensor_count_++), ...); + } + + /// No-dependency existing tensor: skips OverlapMap lookup, depends on creator only. + template + void add_no_dep(Args &&...args) { + assert_add_tensor_args(); + if (!check_add_tensor_capacity(static_cast(sizeof...(Args)))) return; + ((tensors_[tensor_count_] = &args, tags_[tensor_count_] = TensorArgType::NO_DEP, tensor_count_++), ...); + } + + /** + * Attach an explicit dependency array. The Arg stores (ptr, count) without + * copying — the caller's array must outlive the submit (same lifetime rule + * as add_input/add_output, which also store pointers). + * + * count == 0 is a valid "set empty" — it clears any previously stored deps + * and returns. This lets callers that build the dep set conditionally pass + * the result through unguarded, including in the no-dep branch: + * PTO2TaskId deps[3]; + * uint32_t n = 0; + * if (have_prev) deps[n++] = prev; + * if (is_last) deps[n++] = alloc; + * args.set_dependencies(deps, n); // safe even if n == 0 + * + * For count > 0, the call is single-shot: a second non-empty call after + * deps are already set records an error via set_error() and keeps the first array. Use count == 0 first + * if you need to re-set.
+ */ + void set_dependencies(const PTO2TaskId *deps, uint32_t count) { + if (count == 0) { + explicit_deps_ = nullptr; + explicit_dep_count_ = 0; + return; + } + if (deps == nullptr) { + set_error("set_dependencies: deps must not be null when count > 0"); + return; + } + if (explicit_deps_ != nullptr) { + set_error("set_dependencies: may be called at most once per Arg"); + return; + } + explicit_deps_ = deps; + explicit_dep_count_ = count; + } + + uint32_t explicit_dep_count() const { return explicit_dep_count_; } + + PTO2TaskId explicit_dep(uint32_t index) const { + always_assert(index < explicit_dep_count_); + return explicit_deps_[index]; + } + + const PTO2TaskId *explicit_deps_data() const { return explicit_deps_; } + + /** + * Add scalar values (all-or-nothing: a batch that does not fit records an error and adds none). Types are deduced per argument; each value is + * bit-cast to uint64_t for storage. Mixed types are allowed: + * + * args.add_scalar(uint64_val); // single + * args.add_scalar(3.14f, int32_t(42), 7u); // mixed batch + */ + template + void add_scalar(Args &&...args) { + static_assert(sizeof...(Args) >= 1, "add_scalar: at least one argument required"); + static_assert((is_supported_scalar_arg_v && ...), "add_scalar: all types must be arithmetic or enum"); + if (scalar_count_ + sizeof...(Args) > MaxS) { + set_error(scalar_cap_msg()); + return; + } + (add_scalar_one(std::forward(args)), ...); + } + + void add_scalars(const uint64_t *values, int count) { // bulk-append count raw uint64 slots; a negative count is reported with the capacity message; values is not null-checked + if (count < 0 || scalar_count_ + count > MaxS) { + set_error(scalar_cap_msg()); + return; + } + memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t)); +#if PTO2_PROFILING + dump_arg_selection_.clear_scalar_metadata(scalar_count_, count); +#endif + scalar_count_ += count; + } + + /** + * Zero-extend int32 bit patterns into uint64 scalar slots. + * Negative values are treated as their unsigned 32-bit representation + * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF). + * Uses NEON to process 4 elements per iteration on aarch64.
+ */ + void add_scalars_i32(const int32_t *values, int count) { + if (count < 0 || scalar_count_ + count > MaxS) { + set_error(scalar_cap_msg()); + return; + } + uint64_t *dst = &scalars_[scalar_count_]; +#if defined(__aarch64__) + int i = 0; + for (; i + 4 <= count; i += 4) { + uint32x4_t v = vld1q_u32(reinterpret_cast(values + i)); + uint64x2_t lo = vmovl_u32(vget_low_u32(v)); // lanes 0-1 widened to 64 bit + uint64x2_t hi = vmovl_u32(vget_high_u32(v)); // lanes 2-3 widened to 64 bit + vst1q_u64(dst + i, lo); + vst1q_u64(dst + i + 2, hi); + } + for (; i < count; i++) { // scalar tail for the values left over after the 4-wide loop + dst[i] = static_cast(static_cast(values[i])); + } +#else + for (int i = 0; i < count; i++) { + dst[i] = static_cast(static_cast(values[i])); + } +#endif +#if PTO2_PROFILING + dump_arg_selection_.clear_scalar_metadata(scalar_count_, count); +#endif + scalar_count_ += count; + } + + /** + * Append src's scalars [src_offset, src_offset + count) to this Arg; a bad range or overflow records an error and copies nothing. + * Useful when multiple tasks share the same scalar data (e.g., block indices). + */ + void copy_scalars_from(const Arg &src, int src_offset, int count) { + if (src_offset < 0 || count < 0 || src_offset + count > src.scalar_count_) { + set_error("Source scalar range out of bounds in copy_scalars_from"); + return; + } + if (scalar_count_ + count > MaxS) { + set_error(scalar_cap_msg()); + return; + } + memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t)); +#if PTO2_PROFILING + dump_arg_selection_.copy_scalar_dtypes_from(src.dump_arg_selection_, scalar_count_, src_offset, count); +#endif + scalar_count_ += count; + } + +#if PTO2_PROFILING + const uint8_t *scalar_dtypes() const { return dump_arg_selection_.scalar_dtypes(); } +#else + const uint8_t *scalar_dtypes() const { return nullptr; } +#endif + +private: + // explicit_deps_ (declared below) is a caller-owned dependency array; lifetime must extend through submit.
+#if PTO2_PROFILING + DumpArgSelection dump_arg_selection_; +#endif + const PTO2TaskId *explicit_deps_{nullptr}; // borrowed from the caller of set_dependencies(); never freed by this class + uint32_t explicit_dep_count_{0}; +#if PTO2_PROFILING + template + static constexpr bool is_supported_dump_arg_v = + std::is_same_v, Tensor> || std::is_same_v, TensorCreateInfo> || + is_supported_scalar_arg_v; +#endif + + // Capacity-overflow messages — spell the actual limit (MaxS/MaxT, whatever + // the instantiation is) into the text via std::to_string. Built once into a + // function-local static so set_error() can hold the const char* safely. + static const char *scalar_cap_msg() { + static const std::string msg = "Too many scalar args (max " + std::to_string(MaxS) + ")"; + return msg.c_str(); + } + static const char *tensor_cap_msg() { + static const std::string msg = "Too many tensor args (max " + std::to_string(MaxT) + ")"; + return msg.c_str(); + } + + template + void add_scalar_one(T &&value) { // store one scalar; under PTO2_PROFILING also record its dtype and, for lvalues, its address + scalars_[scalar_count_] = to_u64(value); +#if PTO2_PROFILING + uintptr_t scalar_source_ptr = 0; // stays 0 when value is not an lvalue + if constexpr (std::is_lvalue_reference_v) { + scalar_source_ptr = reinterpret_cast(&value); + } + dump_arg_selection_.record_scalar_source( + scalar_count_, scalar_source_ptr, dtype_of>>() + ); +#endif + scalar_count_++; + } + +#if PTO2_PROFILING + // No-arg dump(): mark every arg already added to this Arg.
+ void mark_all_dump_args() { + if (tensor_count_ == 0 && scalar_count_ == 0) { + set_error("dump: no arguments added to this Arg"); + return; + } + dump_arg_selection_.mark_all(tensor_count_, scalar_count_); + } + + void mark_dump_arg(const Tensor &tensor) { // select the first non-OUTPUT slot whose stored pointer is &tensor (OUTPUT slots hold a TensorCreateInfo*) + for (int32_t i = 0; i < tensor_count_; i++) { + if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].refers_to(&tensor)) { + dump_arg_selection_.mark_index(i); + return; + } + } + set_error("dump: tensor is not part of this Arg"); + } + + void mark_dump_arg(const TensorCreateInfo &create_info) { // select the first OUTPUT slot whose stored pointer is &create_info + for (int32_t i = 0; i < tensor_count_; i++) { + if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].refers_to(&create_info)) { + dump_arg_selection_.mark_index(i); + return; + } + } + set_error("dump: TensorCreateInfo is not part of this Arg"); + } + + template + std::enable_if_t, void> mark_dump_arg(const T &scalar) { // scalars are looked up by address — presumably the one recorded in add_scalar_one(); confirm in DumpArgSelection + uintptr_t ptr = reinterpret_cast(&scalar); + if (dump_arg_selection_.mark_scalar_by_ptr(ptr, scalar_count_, tensor_count_)) { + return; + } + set_error("dump: scalar is not part of this Arg"); + } +#endif + + // Compile-time validation: arg count, value category (reject temporaries — + // a stored &arg would dangle after the call), and element type. Driven + // purely by Args, with no runtime state. + template + static void assert_add_tensor_args() { + static_assert(sizeof...(Args) >= 1, "at least one argument required"); + static_assert( + (std::is_lvalue_reference_v && ...), + "temporaries are not allowed — stored pointers would dangle after the call" + ); + if constexpr (is_output) { + static_assert( + (std::is_same_v, Tensor> && ...) || + (std::is_same_v, TensorCreateInfo> && ...), + "add_output: all arguments must be the same type (all Tensor or all TensorCreateInfo)" + ); + } else { + static_assert((std::is_same_v, Tensor> && ...), "all arguments must be Tensor"); + } + } + + // Runtime validation: tensor-before-scalar ordering + slot capacity.
Records + // an error and returns false on violation. + bool check_add_tensor_capacity(int32_t count) { + if (scalar_count_ != 0) { // also reached from add_no_dep(), which the message below does not name + set_error( + "add_input/add_output/add_inout called after add_scalar: " + "all tensors must be added before any scalars" + ); + return false; + } + if (tensor_count_ + count > static_cast(MaxT)) { // whole batch is refused: the callers add no slot when this returns false + set_error(tensor_cap_msg()); + return false; + } + return true; + } +}; + +// ============================================================================= +// Task-args layer aliases +// ============================================================================= +// +// L0TaskArgs — core-level container used to build and submit tasks inside +// orchestration (small, stack-friendly). +using L0TaskArgs = Arg; + +// L2TaskArgs — chip-level entry-arg holding the orchestration entry's +// already-allocated inputs (capacity matches ChipStorageTaskArgs). +// aicpu_orchestration_entry/config receive a const L2TaskArgs&. +struct L2TaskArgs : Arg { + // Build from the executor's ChipStorageTaskArgs: each input becomes a + // TensorRef pointing at src's Tensor, so `src` must outlive this (on the + // executor path src is runtime->orch_args_storage_, alive for the whole run). + void create_from_chip_args(const ChipStorageTaskArgs &src) { + reset(); + for (int32_t i = 0; i < src.tensor_count(); ++i) { + // Entry inputs are external submit-time tensors; the entry binds them + // by const Tensor& (replacing from_tensor_arg's old version/manual_dep + // reset), so this invariant is what keeps that binding behavior-preserving.
+ const Tensor &t = src.tensor(i); + debug_assert(!t.manual_dep && t.version == 0); + add_input(t); // stores &t (a pointer into src), so src must stay alive while this object is used + } + for (int32_t i = 0; i < src.scalar_count(); ++i) { + add_scalar(src.scalar(i)); + } + } +}; + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/runtime.h b/src/a5/runtime/fully_distributed_within_core/runtime/runtime.h new file mode 100644 index 000000000..4ac9c2db4 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/runtime.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Runtime Class - Device Execution and Handshake Control + * + * This class manages device-side execution through AICPU-AICore handshake + * protocol. Task graph construction is handled by PTO2Runtime; this class + * only handles: + * - Handshake buffers for AICPU-AICore communication + * - Execution parameters (block_dim, aicpu_thread_num) + * - Tensor pair management for host-device memory tracking + * - Device orchestration state (gm_sm_ptr_, orch_args_) + * - Function address mapping (func_id_to_addr_) + * NOTE(review): the paragraph below speaks of a scheduler, while the commit adding this file describes the AICPU as a thin stub with no scheduler — confirm it still applies to this runtime. + * Task dispatch uses a per-core PTO2DispatchPayload written by the scheduler.
+ * At dispatch time, build_payload() copies tensor pointers and scalars from + * the task payload into the per-core args[], populates SPMD context, then + * signals AICore via DATA_MAIN_BASE. + */ + +#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ +#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ + +#include +#include +#include // for fprintf, printf +#include // for memset + +#include + +#include "common/core_type.h" +#include "common/platform_config.h" +#include "pto2_dispatch_payload.h" +#include "task_args.h" + +// ============================================================================= +// Configuration Macros +// ============================================================================= + +#define RUNTIME_MAX_ARGS 128 +#define RUNTIME_MAX_WORKER 108 // 36 AIC + 72 AIV cores +#define RUNTIME_MAX_FUNC_ID 1024 +#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64 + +// Default ready queue shards: one shard per worker thread (total minus orchestrator) +constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; + +// ============================================================================= +// Data Structures +// ============================================================================= + +/** + * Handshake Structure - Shared between Host, AICPU, and AICore + * + * This structure facilitates communication and synchronization between + * AICPU and AICore during task execution. + * + * Protocol State Machine: + * 1. Initialization: AICPU sets aicpu_ready=1 + * 2. Acknowledgment: AICore sets aicore_done=core_id+1 + * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core payload + * 4. Task Execution: AICore reads the cached PTO2DispatchPayload and executes + * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion + * 6. 
Shutdown: AICPU sets control=1, AICore exits (NOTE(review): the Handshake struct below has no `control` field — confirm where the shutdown flag lives) + * + * Each AICore instance has its own handshake buffer to enable concurrent + * task execution across multiple cores. + */ + +/** + * Handshake buffer for AICPU-AICore communication + * + * Each AICore has its own handshake buffer for synchronization with AICPU. + * The structure is cache-line aligned (64 bytes) to prevent false sharing + * between cores and optimize cache coherency operations. + * + * Profiling state lives outside this struct: enablement bits and per-core + * ring/reg addresses travel through `KernelArgs::enable_profiling_flag` + + * `KernelArgs::aicore_* per-core address arrays`, which the AICore kernel entry + * forwards into platform-owned per-core slots + * (`aicore/aicore_profiling_state.h`). Adding a profiling sub-feature does + * not require touching this struct anymore. + * + * Field Access Patterns: + * - aicpu_ready: Written by AICPU, read by AICore + * - aicore_done: Written by AICore, read by AICPU + * - task: Written by AICPU, read by AICore (Init: PTO2DispatchPayload*; runtime: unused) + * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV) + * - physical_core_id: Written by AICore (Phase 2), read by AICPU + * - aicpu_regs_ready / aicore_regs_ready: handshake sequence flags + */ +struct Handshake { + volatile uint32_t aicpu_ready; // AICPU ready signal: 0=not ready, 1=ready + volatile uint32_t aicore_done; // AICore ready signal: 0=not ready, core_id+1=ready + volatile uint64_t task; // Init: PTO2DispatchPayload* (set before aicpu_ready); runtime: unused + volatile CoreType core_type; // Core type: CoreType::AIC or CoreType::AIV + volatile uint32_t physical_core_id; // Physical core ID + volatile uint32_t aicpu_regs_ready; // AICPU register init done: 0=pending, 1=done + volatile uint32_t aicore_regs_ready; // AICore ID reported: 0=pending, 1=done +} __attribute__((aligned(64))); + +/** + * Tensor pair for tracking host-device memory mappings.
/**
 * Tensor pair for tracking host-device memory mappings.
 * Used for copy-back during finalize.
 */
struct TensorPair {
    void *host_ptr;  // Host-side buffer
    void *dev_ptr;   // Device-side buffer paired with host_ptr
    size_t size;     // Buffer size (presumably in bytes, matching the copy_* callbacks — TODO confirm)
    // false for read-only INPUT tensors: they are never written by the kernel,
    // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown
    // keep the safe default of copying back.
    bool needs_copy_back = true;
};

/**
 * Host API function pointers for device memory operations.
 * Allows runtime to use pluggable device memory backends.
 */
struct HostApi {
    void *(*device_malloc)(size_t size);
    void (*device_free)(void *dev_ptr);
    int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
    int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
    // Set a device buffer to a byte value (device-side, no PCIe). Used to
    // zero-init pure OUTPUT buffers in lieu of an H2D copy-in. May be
    // null on backends that don't wire it; callers must fall back to
    // copy_to_device.
    int (*device_memset)(void *dev_ptr, int value, size_t size);
    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
    // memory, trb prebuilt runtime arena) as three independent device
    // allocations. `runtime_arena_size == 0` skips the third region (hbg
    // path: hbg has no prebuilt runtime arena). Idempotent on identical
    // sizes; returns 0 on success, -1 on allocation failure.
    int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
    // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
    // memory / prebuilt runtime arena. setup_static_arena must have already
    // committed the relevant region; the returned pointer is owned by the
    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
    // to device_free or record it in `tensor_pairs_`.
    //
    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
    // only committed when setup_static_arena was invoked with
    // runtime_arena_size > 0. Calling it on the hbg path
    // (setup_static_arena(...,0)) returns nullptr (not undefined).
    void *(*acquire_pooled_gm_heap)();
    void *(*acquire_pooled_gm_sm)();
    void *(*acquire_pooled_runtime_arena)();
    // Single-shot upload of the entire ChipCallable buffer. `callable` is a
    // `const ChipCallable *` (declared void* to avoid pulling task_interface
    // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
    // total byte size, allocates device GM once, fixes up each child's
    // resolved_addr_ in an internal host scratch (onboard: device addr; sim:
    // dlopen function pointer), H2D's once, and returns the device-side
    // address of the ChipCallable header. Pool-managed: identical buffer
    // contents (FNV-1a 64-bit) hit the dedup cache; all chip buffers are
    // bulk-freed in DeviceRunner::finalize(). Returns 0 on error or when
    // child_count() == 0. Caller computes child addrs as
    //   chip_dev + offsetof(ChipCallable, storage_) + child_offset(i)
    // and stores them via runtime->set_function_bin_addr(fid, child_dev).
    uint64_t (*upload_chip_callable_buffer)(const void *callable);
};

/**
 * Task structure - Compatibility stub for platform layer
 *
 * RT2 uses PTO2DispatchPayload instead of Task for task dispatch.
 * This stub exists only for API compatibility with device_runner.cpp.
 * Since get_task_count() returns 0, this struct is never actually used.
 */
struct Task {
    int func_id;                 // Kernel function id
    uint64_t function_bin_addr;  // Address of the kernel binary for func_id
};
+typedef void (*DistCoreMainFn)(void *runtime, int core_idx, int core_type); + +// ============================================================================= +// Runtime Class +// ============================================================================= + +/** + * Runtime class for device execution and handshake control + * + * This class manages AICPU-AICore communication through handshake buffers. + * Task graph construction is handled by PTO2Runtime; this class only handles + * execution control and device orchestration state. + */ +class Runtime { +public: + // Handshake buffers for AICPU-AICore communication + Handshake workers[RUNTIME_MAX_WORKER]; // Worker (AICore) handshake buffers + int worker_count; // Number of active workers + + // Execution parameters for AICPU scheduling. + // + // aicpu_thread_num is the *total* AICPU thread count launched on this run + // (= orch + schedulers). AicpuExecutor splits this into one orchestrator + // thread (highest idx, runs aicpu_orchestration_entry) and the remaining + // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore. + // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set. + int aicpu_thread_num; + int ready_queue_shards; // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1) + + // Filter-style affinity gate input (a5 onboard). Host fills before + // launch from device-side OCCUPY + DSMI CPU_TOPO via + // pto::a5::compute_allowed_cpus. The on-device gate keeps threads whose + // sched_getcpu() lands on one of these cpu_ids; exec_idx = position in + // this array drives sched/orch role assignment. Indices 0..count-2 are + // scheduler slots, index count-1 is the orchestrator slot. Sized to + // PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH for headroom — current + // policy is 4 sched + 1 orch = 5 active. + int32_t aicpu_allowed_cpus[16]; + int32_t aicpu_allowed_cpu_count; + // Actual AICPU thread launch count for this run. 
Host sets from + // popcount(OCCUPY) via the topology probe. See the matching field in + // src/a5/runtime/host_build_graph/runtime/runtime.h for rationale. + int32_t aicpu_launch_count; + + // PTO2 integration: kernel_id -> GM function_bin_addr mapping + // NOTE: Made public for direct access from aicore code + uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; + + // Orchestrator-to-scheduler transition control + // When true, orchestrator threads convert to scheduler threads after orchestration completes. + // When false (default), orchestrator threads exit after orchestration without dispatching tasks. + // Controlled via PTO2_ORCH_TO_SCHED environment variable. + bool orch_to_sched; + + // ---- fully_distributed_within_core handoff (SPMD-on-core) ---- + // The AICPU orchestrator thread does dlopen/arena setup, then hands the + // resolved orchestration entry + per-core engine off to the AICore worker + // threads through these fields instead of running orchestration/scheduling + // itself. Each AICore worker invokes core_main_fn(runtime, idx, core_type) + // once `go` is set, then increments `done_count` when finished. See + // runtime/dist_engine.* and docs/fully_distributed_within_core.md. 
+ struct DistHandoff { + volatile uint64_t core_main_fn; // DistCoreMainFn (in AICPU .so) + volatile uint32_t go; // 1 once engine wired and cores may start + volatile int32_t num_workers; // number of AICore workers participating + volatile int32_t done_count; // workers atomically increment when done + } dist; + +private: + // Kernel binary tracking for cleanup + int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; + int registered_kernel_count_; + + void *gm_sm_ptr_; // GM pointer to PTO2 shared memory (device) + void *gm_heap_ptr_; // GM heap for orchestrator output buffers (device) + void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) + ChipStorageTaskArgs orch_args_storage_; // Copy of args for device + + // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing + // Runtime to device; AICPU reads them in the boot path to skip + // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer + // (already populated by runtime_init_data_from_layout + wire on host). + void *prebuilt_arena_base_; + size_t prebuilt_runtime_offset_; + + // Device orchestration SO (for dlopen on AICPU thread 3). + // The SO bytes themselves live in a separately-allocated device buffer + // owned by DeviceRunner; only the metadata below travels inside Runtime. + uint64_t dev_orch_so_addr_; + uint64_t dev_orch_so_size_; + // Per-callable_id dispatch. AICPU dispatches via + // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_` + // signals whether the host is delivering a freshly-registered + // callable_id (write+dlopen) or reusing an already-loaded one. 
+ int32_t active_callable_id_; + bool register_new_callable_id_; + char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; + char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; + +public: + /** + * Constructor - zero-initialize all arrays + */ + Runtime(); + + // ========================================================================= + // Performance Profiling + // ========================================================================= + + // ========================================================================= + // Device orchestration (for AICPU thread 3) + // ========================================================================= + + void *get_gm_sm_ptr() const; + void *get_gm_heap_ptr() const; + const ChipStorageTaskArgs &get_orch_args() const; + void set_gm_sm_ptr(void *p); + void set_gm_heap(void *p); + void set_slot_states_ptr(void *p); + void set_orch_args(const ChipStorageTaskArgs &args); + + // Prebuilt-arena fast path (trb only). Set by host's + // bind_callable_to_runtime_impl; consumed by AICPU at boot to attach a + // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at + // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on + // first construction (Runtime() ctor zeros them) so a non-prebuilt boot + // path can still detect "no prebuilt image set" via nullptr. + void set_prebuilt_arena(void *arena_base, size_t runtime_off); + void *get_prebuilt_arena_base() const; + size_t get_prebuilt_runtime_offset() const; + + // Device orchestration SO binary (for dlopen on AICPU thread 3) + void set_dev_orch_so(uint64_t dev_addr, uint64_t size); + uint64_t get_dev_orch_so_addr() const; + uint64_t get_dev_orch_so_size() const; + // Per-callable_id dispatch. callable_id must be in + // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU + // whether to (re)load the orch SO into orch_so_table_[callable_id] or + // reuse the cached entry. 
+ void set_active_callable_id(int32_t callable_id, bool is_new); + int32_t get_active_callable_id() const; + bool register_new_callable_id() const; + void set_device_orch_func_name(const char *name); + const char *get_device_orch_func_name() const; + void set_device_orch_config_name(const char *name); + const char *get_device_orch_config_name() const; + + uint64_t get_function_bin_addr(int func_id) const; + void set_function_bin_addr(int func_id, uint64_t addr); + /** + * Replay a previously-uploaded kernel address onto a fresh Runtime + * without recording it in registered_kernel_func_ids_. Used by + * DeviceRunner::bind_callable_to_runtime so prepared kernel + * binaries are not freed by validate_runtime_impl across runs. + */ + void replay_function_bin_addr(int func_id, uint64_t addr); + + int get_registered_kernel_count() const; + int get_registered_kernel_func_id(int index) const; + void clear_registered_kernels(); + + // ========================================================================= + // Deprecated API (for platform compatibility, always returns 0/nullptr) + // Task graph is now managed by PTO2Runtime, not Runtime + // ========================================================================= + + /** @deprecated Task count is now in PTO2 shared memory */ + int get_task_count() const { return 0; } + + /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */ + Task *get_task(int) { return nullptr; } + + // ========================================================================= + // Host API (host-only, not copied to device) + // ========================================================================= + + // Host API function pointers for device memory operations + // NOTE: Placed at end of class to avoid affecting device memory layout + HostApi host_api; + + // Host-side tensor ledger for D2H copy-back at finalize. Populated by + // runtime_maker.cpp from orch_args at bind time, then iterated in + // validate_runtime_impl. 
Not read by AICPU/AICore — the device-side + // Runtime image carries the std::vector control block as harmless + // garbage, identical to host_api above. No fixed cap — grows with the + // chip-level entry-tensor count. + std::vector tensor_pairs_; +}; + +#endif // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_ diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp new file mode 100644 index 000000000..4b7484bc9 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - Scheduler Implementation + * + * Implements scheduler state management, ready queues, and task lifecycle. + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_scheduler.h" +#include +#include +#include "common/unified_log.h" + +#if PTO2_PROFILING +// Weak fallbacks for host/UT builds that don't link the scope_stats collector. 
+extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; } +extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {} +#endif + +// ============================================================================= +// Scheduler Profiling Counters +// ============================================================================= + +#if PTO2_SCHED_PROFILING +#include "common/platform_config.h" + +uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {}; +uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {}; + +PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) { + PTO2SchedProfilingData d; + d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0); + d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0); + d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0); + d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0); + d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0); + d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0); + d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0); + d.lock_atomic_count 
= std::exchange(g_sched_lock_atomic_count[thread_idx], 0); + d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0); + d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0); + d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0); + d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0); + d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0); + return d; +} +#endif + +// ============================================================================= +// Debug Utilities +// ============================================================================= + +void PTO2SchedulerState::print_stats() { + PTO2SchedulerState *sched = this; + LOG_INFO_V0("=== Scheduler Statistics ==="); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (sched->ring_sched_states[r].last_task_alive > 0) { + LOG_INFO_V0("Ring %d:", r); + LOG_INFO_V0(" last_task_alive: %d", sched->ring_sched_states[r].last_task_alive); + auto &dp = sched->ring_sched_states[r].dep_pool; + if (dp.top > 0) { + LOG_INFO_V0( + " dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail, + dp.high_water, dp.capacity + ); + } + } + } +#if PTO2_SCHED_PROFILING + LOG_INFO_V0("tasks_completed: %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed)); + LOG_INFO_V0("tasks_consumed: %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed)); +#endif + LOG_INFO_V0("============================"); +} + +void PTO2SchedulerState::print_queues() { + PTO2SchedulerState *sched = this; + LOG_INFO_V0("=== Ready Queues ==="); + + const char *shape_names[] = {"AIC", "AIV", "MIX"}; + + for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + LOG_INFO_V0(" %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size()); + } + LOG_INFO_V0(" DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size()); + + LOG_INFO_V0("===================="); +} diff --git 
a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h new file mode 100644 index 000000000..6413917f0 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/pto_scheduler.h @@ -0,0 +1,1267 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * PTO Runtime2 - Scheduler Interface + * + * The Scheduler is responsible for: + * 1. Maintaining per-resource-shape ready queues + * 2. Tracking task state (PENDING -> COMPLETED -> CONSUMED) + * 3. Managing fanin/fanout refcounts for dependency resolution + * 4. Advancing last_task_alive for heap reclamation + * 5. 
Two-stage mixed-task completion (subtask done bits → mixed-task complete) + * + * The Scheduler runs on Device AI_CPU and processes: + * - Task state transitions based on fanin_refcount + * - Buffer lifecycle based on fanout_refcount + * - Ring pointer advancement for flow control + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#pragma once + +#include + +#include "common/core_type.h" +#include "utils/device_arena.h" +#include "pto_async_wait.h" +#include "pto_ring_buffer.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" + +#if PTO2_SCHED_PROFILING +#include "aicpu/device_time.h" +#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1 +#define PTO2_SCHED_CYCLE_LAP(acc) \ + do { \ + _st1 = get_sys_cnt_aicpu(); \ + acc += (_st1 - _st0); \ + _st0 = _st1; \ + } while (0) +#endif + +// ============================================================================= +// Ready Queue (Lock-free bounded MPMC — Vyukov design) +// ============================================================================= + +/** + * Per-slot entry: sequence counter for ABA safety + task payload + */ +struct PTO2ReadyQueueSlot { + std::atomic sequence; + PTO2TaskSlotState *slot_state; +}; + +/** + * Thread-local ready buffer for local-first dispatch optimization. + * + * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1). + * Initialized once before the scheduling loop; must be empty at + * the start of each iteration (verified by always_assert). + * + * Phase 1 fills per-CoreType buffers via on_task_complete(). + * The dispatch stage drains them local-first via get_ready_tasks_batch, + * with any remaining tasks pushed to the global ready queue. 
+ */ +// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1) +static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2; + +struct PTO2LocalReadyBuffer { + PTO2TaskSlotState **slot_states = nullptr; + int count = 0; + int capacity = 0; + + void reset(PTO2TaskSlotState **buf, int cap) { + slot_states = buf; + count = 0; + capacity = cap; + } + + bool try_push(PTO2TaskSlotState *s) { + if (slot_states && count < capacity) { + slot_states[count++] = s; + return true; + } + return false; + } + + PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; } +}; + +/** + * Lock-free bounded MPMC queue (Dmitry Vyukov design) + * + * Key properties: + * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing) + * - Per-slot sequence counter prevents ABA problem + * - Empty queue pop returns immediately (single atomic load, no lock) + * - CAS contention is split: producers only touch enqueue_pos, + * consumers only touch dequeue_pos + */ +struct alignas(64) PTO2ReadyQueue { + PTO2ReadyQueueSlot *slots; + uint64_t capacity; + uint64_t mask; // capacity - 1 + char _pad0[64 - 24]; // Pad to own cache line + + std::atomic enqueue_pos; + char _pad1[64 - sizeof(std::atomic)]; // Own cache line + + std::atomic dequeue_pos; + char _pad2[64 - sizeof(std::atomic)]; // Own cache line + + uint64_t size() { + uint64_t e = enqueue_pos.load(std::memory_order_relaxed); + uint64_t d = dequeue_pos.load(std::memory_order_relaxed); + return (e >= d) ? 
(e - d) : 0; + } + + bool push(PTO2TaskSlotState *slot_state) { + uint64_t pos; + PTO2ReadyQueueSlot *slot; + while (true) { + pos = enqueue_pos.load(std::memory_order_relaxed); + slot = &slots[pos & mask]; + int64_t seq = slot->sequence.load(std::memory_order_acquire); + int64_t diff = seq - static_cast(pos); + if (diff == 0) { + if (enqueue_pos.compare_exchange_weak( + pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed + )) { + break; + } + } else if (diff < 0) { + return false; // Queue full + } + } + + slot->slot_state = slot_state; + slot->sequence.store(static_cast(pos + 1), std::memory_order_release); + return true; + } + + // Batch push: reserve count slots with a single CAS after confirming + // every target slot is available under the usual Vyukov sequence check. + void push_batch(PTO2TaskSlotState **items, int count) { + if (count == 0) return; + + uint64_t pos; + while (true) { + pos = enqueue_pos.load(std::memory_order_relaxed); + bool ready = true; + for (int i = 0; i < count; i++) { + PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask]; + int64_t seq = slot->sequence.load(std::memory_order_acquire); + int64_t diff = seq - static_cast(pos + i); + if (diff != 0) { + ready = false; + break; + } + } + if (!ready) { + continue; + } + if (enqueue_pos.compare_exchange_weak( + pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed + )) { + break; + } + } + + for (int i = 0; i < count; i++) { + PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask]; + slot->slot_state = items[i]; + slot->sequence.store(static_cast(pos + i + 1), std::memory_order_release); + } + } + +#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING + bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) { + uint64_t pos; + PTO2ReadyQueueSlot *slot; + uint64_t t0 = get_sys_cnt_aicpu(); + bool contended = false; + uint32_t atomic_ops = 0; + while (true) { + pos = enqueue_pos.load(std::memory_order_relaxed); + slot = &slots[pos & mask]; 
+ int64_t seq = slot->sequence.load(std::memory_order_acquire); + int64_t diff = seq - static_cast(pos); + atomic_ops += 2; // enqueue_pos.load + sequence.load + if (diff == 0) { + if (enqueue_pos.compare_exchange_weak( + pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed + )) { + atomic_ops++; // successful CAS + break; + } + contended = true; + atomic_ops++; // failed CAS + } else if (diff < 0) { + return false; // Queue full + } else { + contended = true; // diff > 0: slot not yet released, spin + } + } + atomic_ops++; // final sequence.store + atomic_count += atomic_ops; + if (contended) { + wait_cycle += (get_sys_cnt_aicpu() - t0); + } + + slot->slot_state = slot_state; + slot->sequence.store(static_cast(pos + 1), std::memory_order_release); + return true; + } +#endif + + PTO2TaskSlotState *pop() { + // Fast-path: skip slot load when queue is clearly empty + uint64_t d = dequeue_pos.load(std::memory_order_relaxed); + uint64_t e = enqueue_pos.load(std::memory_order_relaxed); + if (d >= e) { + return nullptr; + } + + uint64_t pos; + PTO2ReadyQueueSlot *slot; + while (true) { + pos = dequeue_pos.load(std::memory_order_relaxed); + slot = &slots[pos & mask]; + int64_t seq = slot->sequence.load(std::memory_order_acquire); + int64_t diff = seq - static_cast(pos + 1); + if (diff == 0) { + if (dequeue_pos.compare_exchange_weak( + pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed + )) + break; + } else if (diff < 0) { + return nullptr; // Queue empty + } + } + + PTO2TaskSlotState *result = slot->slot_state; + slot->sequence.store(static_cast(pos + mask + 1), std::memory_order_release); + return result; + } + +#if PTO2_SCHED_PROFILING + PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) { + // Fast-path: skip slot load when queue is clearly empty + uint64_t d = dequeue_pos.load(std::memory_order_relaxed); + uint64_t e = enqueue_pos.load(std::memory_order_relaxed); + atomic_count += 2; // dequeue_pos.load + 
enqueue_pos.load + if (d >= e) { + return nullptr; + } + + uint64_t pos; + PTO2ReadyQueueSlot *slot; + uint64_t t0 = get_sys_cnt_aicpu(); + bool contended = false; + uint32_t atomic_ops = 0; + while (true) { + pos = dequeue_pos.load(std::memory_order_relaxed); + slot = &slots[pos & mask]; + int64_t seq = slot->sequence.load(std::memory_order_acquire); + int64_t diff = seq - static_cast(pos + 1); + atomic_ops += 2; // dequeue_pos.load + sequence.load + if (diff == 0) { + if (dequeue_pos.compare_exchange_weak( + pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed + )) { + atomic_ops++; // successful CAS + break; + } + contended = true; + atomic_ops++; // failed CAS + } else if (diff < 0) { + atomic_count += atomic_ops; + return nullptr; // Queue empty + } else { + contended = true; + } + } + atomic_ops++; // final sequence.store + atomic_count += atomic_ops; + if (contended) { + wait_cycle += (get_sys_cnt_aicpu() - t0); + } + + PTO2TaskSlotState *result = slot->slot_state; + slot->sequence.store(static_cast(pos + mask + 1), std::memory_order_release); + return result; + } +#endif + + // Batch pop: reserve a contiguous run of ready slots with a single CAS. + // Returns actual number of items popped (may be less than max_count). 
+ int pop_batch(PTO2TaskSlotState **out, int max_count) { + uint64_t pos; + int count; + while (true) { + pos = dequeue_pos.load(std::memory_order_relaxed); + count = 0; + while (count < max_count) { + PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask]; + int64_t seq = slot->sequence.load(std::memory_order_acquire); + int64_t diff = seq - static_cast(pos + count + 1); + if (diff == 0) { + count++; + continue; + } + if (diff < 0) { + break; + } + count = -1; + break; + } + if (count == 0) return 0; + if (count < 0) continue; + if (dequeue_pos.compare_exchange_weak( + pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed + )) { + break; + } + } + + for (int i = 0; i < count; i++) { + PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask]; + out[i] = slot->slot_state; + slot->sequence.store(static_cast(pos + i + mask + 1), std::memory_order_release); + } + return count; + } + +#if PTO2_SCHED_PROFILING + int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) { + uint64_t pos; + int count; + uint64_t t0 = get_sys_cnt_aicpu(); + bool contended = false; + uint32_t atomic_ops = 0; + while (true) { + pos = dequeue_pos.load(std::memory_order_relaxed); + atomic_ops++; // dequeue_pos.load + count = 0; + while (count < max_count) { + PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask]; + int64_t seq = slot->sequence.load(std::memory_order_acquire); + int64_t diff = seq - static_cast(pos + count + 1); + atomic_ops++; // sequence.load + if (diff == 0) { + count++; + continue; + } + if (diff < 0) { + break; + } + contended = true; + count = -1; + break; + } + if (count == 0) { + atomic_count += atomic_ops; + return 0; + } + if (count < 0) { + continue; + } + if (dequeue_pos.compare_exchange_weak( + pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed + )) { + atomic_ops++; // successful CAS + break; + } + contended = true; + atomic_ops++; // failed CAS + } + + for (int i = 0; i < count; i++) { + 
PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask]; + out[i] = slot->slot_state; + slot->sequence.store(static_cast(pos + i + mask + 1), std::memory_order_release); + atomic_ops++; // sequence.store + } + atomic_count += atomic_ops; + if (contended) { + wait_cycle += (get_sys_cnt_aicpu() - t0); + } + return count; + } +#endif +}; + +// Cold-path ready queue operations (defined in pto_scheduler.cpp). Declared +// as non-member so PTO2ReadyQueue stays a POD-like struct with cache-line +// alignment. Storage is owned by the caller-supplied arena. +// reserve_layout: declare the slots[] region on the arena (must precede commit) +// init_from_layout: bind slots pointer from arena.region_ptr(off) and +// initialize sequence counters +// destroy: forget the slots pointer (arena owns the buffer) +size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity); +// Writes everything *except* the arena-internal `slots` pointer field +// (sequences/positions on the slot array, capacity, mask). Uses +// arena.region_ptr(slots_off) only to address the slot array for writes; +// does NOT store the pointer in `queue->slots`. Call +// `ready_queue_wire_arena_pointers` afterwards to set the field itself. +bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity); +// Stores queue->slots = arena.region_ptr(slots_off). Idempotent. 
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off); +void ready_queue_destroy(PTO2ReadyQueue *queue); + +// ============================================================================= +// SPSC Queue (Single-Producer Single-Consumer, wait-free) +// ============================================================================= +// +// Bounded ring buffer optimized for the wiring queue use case: +// - Producer: orchestrator thread (push) +// - Consumer: scheduler thread 0 (pop_batch) +// +// Design based on Rigtorp's cached-index technique: each side caches +// the other's index locally, avoiding cross-core cache line bouncing +// on the hot path. Only when the local cache says "full" or "empty" +// does the thread issue an acquire load on the remote index. +// +// Memory layout: 5 cache-line-aligned fields ensure zero false sharing. + +struct alignas(64) PTO2SpscQueue { + // --- Producer cache lines (orchestrator thread) --- + alignas(64) std::atomic head_{0}; + alignas(64) uint64_t tail_cached_{0}; + + // --- Consumer cache lines (scheduler thread 0) --- + alignas(64) std::atomic tail_{0}; + alignas(64) uint64_t head_cached_{0}; + + // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) --- + alignas(64) PTO2TaskSlotState **buffer_{nullptr}; + uint64_t mask_{0}; + + // Padding to exactly 5 cache lines + char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)]; + + // Reserve the backing buffer region on the supplied arena. Returns the + // region offset, to be passed to init_from_layout() after the arena is + // committed. Cache-line aligned: the buffer is shared between the + // orchestrator (push) and scheduler thread 0 (pop_batch), so its base + // must not false-share with neighboring regions. 
+ static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) { + return arena.reserve(capacity * sizeof(uintptr_t), PTO2_ALIGN_SIZE); + } + + // Writes everything except the arena-internal `buffer_` pointer field + // (zeros the slot pointer array, mask/head/tail). The host pre-builds the + // image without storing a host address in buffer_; the AICPU wires + // buffer_ at boot via wire_arena_pointers(). + bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) { + if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false; + auto *buf = static_cast(arena.region_ptr(buffer_off)); + // calloc'd-equivalent: zero the slot pointers so spurious early pops + // observe nullptr. + for (uint64_t i = 0; i < capacity; i++) + buf[i] = nullptr; + mask_ = capacity - 1; + head_.store(0, std::memory_order_relaxed); + tail_.store(0, std::memory_order_relaxed); + tail_cached_ = 0; + head_cached_ = 0; + return true; + } + + // Wire the arena-internal pointer. Called by both host (with host arena) + // and AICPU (with device arena attached to the prebuilt image). + void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) { + buffer_ = static_cast(arena.region_ptr(buffer_off)); + } + + // Arena owns the buffer; here we only forget our pointer. + void destroy() { buffer_ = nullptr; } + + // Push one item (producer only). Returns false if queue is full. + // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the + // effective usable capacity is capacity-1 (one slot is wasted as a + // sentinel to distinguish full from empty). uint64_t wrapping is safe + // since head and tail are monotonically increasing and subtraction + // wraps correctly. 
+ bool push(PTO2TaskSlotState *item) { + uint64_t h = head_.load(std::memory_order_relaxed); + uint64_t next_h = h + 1; + if (next_h - tail_cached_ > mask_) { + tail_cached_ = tail_.load(std::memory_order_acquire); + if (next_h - tail_cached_ > mask_) { + return false; + } + } + buffer_[h & mask_] = item; + head_.store(next_h, std::memory_order_release); + return true; + } + + // Pop up to max_count items (consumer only). Returns actual count. + int pop_batch(PTO2TaskSlotState **out, int max_count) { + uint64_t t = tail_.load(std::memory_order_relaxed); + uint64_t avail = head_cached_ - t; + if (avail < static_cast(max_count)) { + head_cached_ = head_.load(std::memory_order_acquire); + avail = head_cached_ - t; + if (avail == 0) return 0; + } + int count = (avail < static_cast(max_count)) ? static_cast(avail) : max_count; + for (int i = 0; i < count; i++) { + out[i] = buffer_[(t + i) & mask_]; + } + tail_.store(t + count, std::memory_order_release); + return count; + } + + // Approximate size (used for backoff decisions, not exact). + uint64_t size() const { + uint64_t h = head_.load(std::memory_order_acquire); + uint64_t t = tail_.load(std::memory_order_acquire); + return h - t; + } +}; + +static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)"); +// ============================================================================= + +/** + * Statistics returned by mixed-task completion processing + */ +struct CompletionStats { + int32_t fanout_edges; // Number of fanout edges traversed (notify consumers) + int32_t tasks_enqueued; // Number of consumers that became READY + int32_t fanin_edges; // Number of fanin edges traversed (release producers) + bool mixed_task_completed; // True only when this callback completed a mixed task +}; + +/** + * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). 
Holds
+ * the arena offsets of every sub-region the scheduler needs plus the
+ * capacities used at layout time (init_data_from_layout reuses them).
+ */
+struct PTO2SchedulerLayout {
+    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];  // one slots[] region per ready queue
+    size_t off_dummy_ready_queue_slots;                      // slots[] region of the dummy ready queue
+    size_t off_dep_pool_entries[PTO2_MAX_RING_DEPTH];        // dep_pool entries, one region per ring
+    size_t off_wiring_spsc_buffer;                           // backing buffer of the wiring SPSC queue
+    uint64_t ready_queue_capacity;
+    uint64_t spsc_capacity;
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+};
+
+/**
+ * Scheduler state structure
+ *
+ * Contains dynamic state updated during task execution.
+ * Separated from shared memory for cache efficiency.
+ * Hot-path methods are defined inline (implicitly inline as member functions).
+ */
+struct PTO2SchedulerState {
+    // Shared memory access
+    PTO2SharedMemoryHeader *sm_header;
+
+    // Per-ring state
+    struct alignas(64) RingSchedState {
+        // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) ---
+        PTO2SharedMemoryRingHeader *ring;
+        int32_t last_task_alive;
+        std::atomic advance_lock;  // multi-thread CAS
+
+        // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
+        alignas(64) PTO2DepListPool dep_pool;
+#if PTO2_PROFILING
+        // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly.
+        alignas(64) std::atomic dep_pool_snapshot_tail;
+        std::atomic dep_pool_snapshot_top;
+#endif
+
+        // Initialize arena-internal data + arena-external pointers; does NOT
+        // store dep_pool.base (that lives in the runtime arena and is wired
+        // by SchedulerState::wire_arena_pointers).  The `ring` field stores
+        // the device address of the SM ring header — computed via offset
+        // arithmetic, no SM dereference.
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
+        void destroy();
+
+        // Publish the local last_task_alive to the shared-memory ring header
+        // (release store).
+        void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
+
+#if PTO2_PROFILING
+        // Copy dep_pool.tail / dep_pool.top into the atomic snapshot fields.
+        void publish_dep_pool_snapshot() {
+            dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release);
+            dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release);
+        }
+
+        // Read the snapshot pair.  The two loads are not one atomic unit, so
+        // tail is clamped to top to keep the returned pair consistent.
+        void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const {
+            top = dep_pool_snapshot_top.load(std::memory_order_acquire);
+            tail = dep_pool_snapshot_tail.load(std::memory_order_acquire);
+            if (tail > top) tail = top;
+        }
+#endif
+
+        // Advance last_task_alive over the leading run of CONSUMED tasks (up
+        // to current_task_index), reset the reclaimed slots, then publish the
+        // new value.  The callers in this struct take advance_lock first.
+        void advance_ring_pointers() {
+            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
+            int32_t old_last_task_alive = last_task_alive;
+
+            while (last_task_alive < current_task_index) {
+                PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
+                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) {
+                    break;
+                }
+                last_task_alive++;
+            }
+
+            // Eager reset: prepare reclaimed slots for reuse while still hot in cache.
+            // Safe because last_task_alive has advanced past these slots but
+            // sync_to_sm has not yet published — the orchestrator cannot reuse
+            // them until the release store below.
+            // Skips payload, task, ring_id — immutable after RingSchedState::init().
+            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) {
+                ring->get_slot_state_by_task_id(id).reset_for_reuse();
+            }
+
+            sync_to_sm();
+        }
+    } ring_sched_states[PTO2_MAX_RING_DEPTH];
+
+    // Ready queues remain global (scheduling is ring-agnostic)
+    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
+
+    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
+    // the dispatch loop and completed inline -- never goes to AICore.
+    PTO2ReadyQueue dummy_ready_queue;
+
+    // Wiring subsystem — groups all wiring-related state for cache-line isolation.
+    //
+    // Three cache-line regions by writer:
+    //   1. batch_* / backoff — thread 0 exclusive (local batch buffer)
+    //   2. queue — SPSC: orchestrator push, thread 0 pop
+    //   3. orch_needs_drain — orchestrator write, thread 0 read
+    struct alignas(64) WiringState {
+        static constexpr uint64_t BATCH_SIZE = 30;
+        static constexpr int BACKOFF_LIMIT = 32;
+
+        // --- Thread 0 exclusive: local batch buffer + backoff ---
+        int batch_count = 0;
+        int batch_index = 0;
+        int backoff_counter = 0;
+        PTO2TaskSlotState *batch[BATCH_SIZE];
+
+        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
+        alignas(64) PTO2SpscQueue queue;
+
+        // --- Orchestrator write, thread 0 read ---
+        alignas(64) std::atomic orch_needs_drain{false};
+    } wiring;
+
+    static_assert(
+        offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue"
+    );
+    static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)");
+
+    alignas(64) AsyncWaitList async_wait_list;
+
+    // Statistics (cold path, isolated from hot-path fields)
+#if PTO2_SCHED_PROFILING
+    alignas(64) std::atomic tasks_completed;
+    std::atomic tasks_consumed;
+#endif
+    // =========================================================================
+    // Inline hot-path methods
+    // =========================================================================
+
+    /**
+     * Drain wiring queue: pop submitted tasks and wire their fanout edges.
+     * Called by scheduler thread 0 each loop iteration.  Sets fanin_count,
+     * acquires fanout_lock per producer, allocates dep_pool entries, and
+     * pushes ready tasks to the appropriate ready queue.
+     *
+     * Tasks left in the local batch (dep_pool exhausted) are retried first on
+     * the next call, before anything new is popped from the SPSC queue.
+     *
+     * @param force_drain Skip the "wait for a full batch" backoff.
+     * @return Number of tasks wired this call.
+     */
+
+    int drain_wiring_queue(bool force_drain = false) {
+        int wired = 0;
+
+        // Refill local batch buffer when exhausted.
+        if (wiring.batch_index >= wiring.batch_count) {
+            // Backoff: defer pop when queue holds fewer than a full batch,
+            // unless force_drain, orch_needs_drain, or backoff limit reached.
+            if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) {
+                if (!wiring.orch_needs_drain.load(std::memory_order_acquire) &&
+                    wiring.backoff_counter < WiringState::BACKOFF_LIMIT) {
+                    wiring.backoff_counter++;
+                    return 0;
+                }
+            }
+            wiring.backoff_counter = 0;
+            wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE);
+            wiring.batch_index = 0;
+            if (wiring.batch_count == 0) return 0;
+        }
+
+        // Process tasks from local buffer in strict FIFO order.
+        while (wiring.batch_index < wiring.batch_count) {
+            PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index];
+            int ring_id = ws->ring_id;
+            auto &rss = ring_sched_states[ring_id];
+            int32_t wfanin = ws->payload->fanin_actual_count;
+
+            // One dep_pool entry may be needed per fanin edge; try a reclaim
+            // before giving up on this task.
+            if (wfanin > 0 && rss.dep_pool.available() < wfanin) {
+                rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive);
+                if (rss.dep_pool.available() < wfanin) {
+#if PTO2_PROFILING
+                    if (is_scope_stats_enabled()) {
+                        rss.publish_dep_pool_snapshot();
+                    }
+#endif
+                    break;  // not enough dep_pool space — keep remainder for next call
+                }
+            }
+
+            wiring.batch_index++;
+            wire_task(rss, ws, wfanin);
+            wired++;
+        }
+
+        return wired;
+    }
+
+    // Route a ready slot to the right global queue.  Dummy tasks (empty
+    // active_mask) live in dummy_ready_queue; everything else goes to the
+    // per-shape ready_queues[].  Used by paths that do not have a thread-local
+    // ready buffer (e.g. wiring).  See push_ready_routed_local for the
+    // dispatch-time fast path.
+    void push_ready_routed(PTO2TaskSlotState *slot_state) {
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        if (shape == PTO2ResourceShape::DUMMY) {
+            dummy_ready_queue.push(slot_state);
+        } else {
+            ready_queues[static_cast(shape)].push(slot_state);
+        }
+    }
+
+    /**
+     * Wire fanout edges for a single task.  Sets fanin_count, acquires each
+     * producer's fanout_lock, allocates dep_pool entries for live producers,
+     * pushes the task to the ready queue once its fanin refcount is satisfied.
+     *
+     * fanin_count is wfanin + 1: the extra unit is contributed by this
+     * function itself (the "+ 1" in init_rc / the lone fetch_add), so the task
+     * cannot become ready through producer completions alone before wiring
+     * has finished.
+     */
+    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
+        PTO2TaskPayload *wp = ws->payload;
+        ws->fanin_count = wfanin + 1;
+
+        if (wfanin != 0) {
+            int32_t early_finished = 0;
+            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
+                producer->lock_fanout();
+                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
+                if (pstate >= PTO2_TASK_COMPLETED) {
+                    // Producer already finished: it will never walk its
+                    // fanout list again, so count the edge as satisfied here.
+                    early_finished++;
+                } else {
+                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
+                }
+                producer->unlock_fanout();
+            });
+
+            int32_t init_rc = early_finished + 1;
+            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
+            if (new_rc >= ws->fanin_count) {
+                push_ready_routed(ws);
+            }
+        } else {
+            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
+            push_ready_routed(ws);
+        }
+
+        // NOTE(review): this store happens after the task may already have
+        // been pushed to a ready queue — confirm nothing can recycle the slot
+        // or read dep_pool_mark before it lands.
+        ws->dep_pool_mark = rss.dep_pool.top;
+#if PTO2_PROFILING
+        if (is_scope_stats_enabled()) {
+            rss.publish_dep_pool_snapshot();
+        }
+#endif
+    }
+
+    // If every fanout reference has been released (fanout_refcount ==
+    // fanout_count), move the task COMPLETED -> CONSUMED and opportunistically
+    // advance its ring.  Only the thread whose CAS succeeds proceeds, so the
+    // transition happens once per task.
+    void check_and_handle_consumed(PTO2TaskSlotState &slot_state) {
+        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;
+
+        PTO2TaskState expected = PTO2_TASK_COMPLETED;
+        if (!slot_state.task_state.compare_exchange_strong(
+                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
+            )) {
+            return;
+        }
+
+#if PTO2_SCHED_PROFILING
+        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
+#endif
+
+        int32_t ring_id = slot_state.ring_id;
+        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
+        int32_t expected_lock = 0;
+        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
+                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
+            )) {
+            ring_sched_states[ring_id].advance_ring_pointers();
+            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+        }
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    // Profiling twin of check_and_handle_consumed: same state transitions,
+    // additionally adds the number of operations performed to atomic_count.
+    void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        int32_t fc = slot_state.fanout_count;
+        int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire);
+
+        atomic_count += 2;  // fanout_count.load + fanout_refcount.load
+
+        if (rc != fc) return;
+
+        PTO2TaskState expected = PTO2_TASK_COMPLETED;
+        if (!slot_state.task_state.compare_exchange_strong(
+                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
+            )) {
+            atomic_count += 1;  // failed CAS
+            return;
+        }
+
+        atomic_count += 1;  // successful CAS
+
+#if PTO2_SCHED_PROFILING
+        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
+#endif
+
+        int32_t ring_id = slot_state.ring_id;
+        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
+        int32_t expected_lock = 0;
+        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
+                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
+            )) {
+            ring_sched_states[ring_id].advance_ring_pointers();
+            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+            atomic_count += 2;  // try-lock CAS + unlock store
+        } else {
+            atomic_count += 1;  // failed try-lock CAS
+        }
+    }
+#endif
+
+    // Drop one fanout reference on a producer, then check whether that made
+    // it CONSUMED.
+    void release_producer(PTO2TaskSlotState &slot_state) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        check_and_handle_consumed(slot_state);
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    // Profiling twin of release_producer.
+    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        atomic_count += 1;  // fanout_refcount.fetch_add
+        check_and_handle_consumed(slot_state, atomic_count);
+    }
+#endif
+
+    // Credit one satisfied fanin edge to a consumer.  Returns true when this
+    // call brought fanin_refcount to fanin_count, in which case the consumer
+    // has also been enqueued (local buffer first when one is supplied and has
+    // room, otherwise the global queue); returns false otherwise.
+    bool release_fanin_and_check_ready(PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr) {
+        // Atomically increment fanin_refcount and check if all producers are done
+        // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's
+        // init release, making fanin_count visible — plain load suffices.
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Local-first: try per-CoreType thread-local buffer before global queue
+            // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1]
+            // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES);
+            // dummy slots bypass the local fast path and go straight to dummy_ready_queue.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state);
+            } else if (!local_bufs || !local_bufs[static_cast(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast(shape)].push(&slot_state);
+            }
+            return true;
+        }
+        return false;
+    }
+
+#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
+    // Profiling twin of release_fanin_and_check_ready; forwards atomic_count /
+    // push_wait to the profiling-aware queue push.
+    bool release_fanin_and_check_ready(
+        PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait,
+        PTO2LocalReadyBuffer *local_bufs = nullptr
+    ) {
+        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+        atomic_count += 1;  // fanin_refcount.fetch_add
+
+        if (new_refcount == slot_state.fanin_count) {
+            // Local-first: try per-CoreType thread-local buffer before global queue.
+            // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES)
+            // and go straight to dummy_ready_queue; use the profiling-aware push so
+            // atomic_count / push_wait stay consistent with the non-dummy path.
+            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
+            if (shape == PTO2ResourceShape::DUMMY) {
+                dummy_ready_queue.push(&slot_state, atomic_count, push_wait);
+            } else if (!local_bufs || !local_bufs[static_cast(shape)].try_push(&slot_state)) {
+                ready_queues[static_cast(shape)].push(&slot_state, atomic_count, push_wait);
+            }
+            return true;
+        }
+        return false;
+    }
+#endif
+
+    // Collect up to max_count ready tasks for `shape` into out[]: the
+    // thread-local buffer is drained first (most recently pushed entry
+    // first), then the remainder is popped from the global ready queue.
+    // Returns the number of entries written.
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
+    ) {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) {
+            out[count++] = local_buf.slot_states[--local_buf.count];
+        }
+        int remaining = max_count - count;
+        if (remaining > 0) {
+            count += ready_queues[static_cast(shape)].pop_batch(out + count, remaining);
+        }
+        return count;
+    }
+
+#if PTO2_SCHED_PROFILING
+    // Profiling twin of get_ready_tasks_batch.
+    int get_ready_tasks_batch(
+        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count,
+        uint64_t &atomic_count, uint64_t &wait_cycle
+    ) {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) {
+            out[count++] = local_buf.slot_states[--local_buf.count];
+        }
+        int remaining = max_count - count;
+        if (remaining > 0) {
+            count +=
+                ready_queues[static_cast(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle);
+        }
+        return count;
+    }
+#endif
+
+    // Release one fanout reference on each of the `count` given tasks,
+    // prefetching the next slot state while the current one is processed.
+    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) {
+#if PTO2_ORCH_PROFILING
+        extern uint64_t g_orch_scope_end_atomic_count;
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t i = 0; i < count; i++) {
+            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
+            release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count);
+        }
+#else
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t i = 0; i < count; i++) {
+            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
+            release_producer(*task_slot_states[i]);
+        }
+#endif
+    }
+
+    /**
+     * Subtask completion: atomic counter model.
+     * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block.
+     * Atomically increments completed_subtasks and checks whether all subtasks
+     * across all blocks are done.
+     *
+     * @return true if this was the last subtask, completing the entire task.
+     */
+    bool on_subtask_complete(PTO2TaskSlotState &slot_state) {
+        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        return (prev + 1) == slot_state.total_required_subtasks;
+    }
+
+    /**
+     * Two-stage completion: second stage.
+     * Called exactly once when all subtasks of a mixed task are done
+     * (i.e., on_subtask_complete returned true).
+     * Marks the task COMPLETED under its fanout_lock and notifies every
+     * consumer on its fanout list.  Releasing this task's own producers and
+     * the self-consumption check are done separately by on_task_release().
+     *
+     * With PTO2_SCHED_PROFILING the function takes an extra thread_idx and
+     * returns CompletionStats; otherwise it returns void.
+     */
+#if PTO2_SCHED_PROFILING
+    CompletionStats
+#else
+    void
+#endif
+    on_task_complete(
+        PTO2TaskSlotState &slot_state,
+#if PTO2_SCHED_PROFILING
+        int thread_idx,
+#endif
+
+        PTO2LocalReadyBuffer *local_bufs = nullptr
+    ) {
+#if PTO2_SCHED_PROFILING
+        CompletionStats stats = {0, 0, 0, true};
+#endif
+#if PTO2_SCHED_PROFILING
+        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
+        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
+        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
+        uint64_t lock_atomics = 0, lock_wait = 0;
+        PTO2_SCHED_CYCLE_START();
+#endif
+
+        // The state flip and the fanout_head snapshot happen under the same
+        // lock wire_task() takes, so a consumer being wired concurrently
+        // either lands on the list read here or sees the COMPLETED state.
+#if PTO2_SCHED_PROFILING
+        slot_state.lock_fanout(lock_atomics, lock_wait);
+#else
+        slot_state.lock_fanout();
+#endif
+        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
+        slot_state.unlock_fanout();
+
+#if PTO2_SCHED_PROFILING
+        lock_atomics += 2;  // state.store + unlock.store
+        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
+        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
+#endif
+
+        // Fanout: notify consumers
+#if PTO2_SCHED_PROFILING
+        uint64_t fanout_atomics = 0, push_wait = 0;
+#endif
+        while (current != nullptr) {
+            PTO2TaskSlotState &consumer_slot = *current->slot_state;
+#if PTO2_SCHED_PROFILING
+            stats.fanout_edges++;
+            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs)) {
+                stats.tasks_enqueued++;
+            }
+#else
+            release_fanin_and_check_ready(consumer_slot, local_bufs);
+#endif
+            current = current->next;
+        }
+
+#if PTO2_SCHED_PROFILING
+        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
+        g_sched_push_wait_cycle[thread_idx] += push_wait;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
+        return stats;
+#endif
+    }
+
+    /**
+     * Cold path: release producers (fanin traversal) + check self for CONSUMED.
+     * Returns fanin edge count for profiling.
+     *
+     * NOTE(review): the returned payload->fanin_actual_count is read after
+     * check_and_handle_consumed(), which may already have advanced the ring
+     * past this slot — confirm the payload cannot be rewritten before the read.
+     */
+
+#if PTO2_SCHED_PROFILING
+    int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) {
+        PTO2_SCHED_CYCLE_START();
+        extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[];
+        extern uint64_t g_sched_self_atomic_count[];
+        extern uint64_t g_sched_self_consumed_cycle[];
+        extern uint64_t g_sched_complete_count[];
+        uint64_t fanin_atomics = 0;
+#else
+    int32_t on_task_release(PTO2TaskSlotState &slot_state) {
+#endif
+        PTO2TaskPayload *payload = slot_state.payload;
+        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
+#if PTO2_SCHED_PROFILING
+            release_producer(*producer_slot_state, fanin_atomics);
+#else
+            release_producer(*producer_slot_state);
+#endif
+        });
+#if PTO2_SCHED_PROFILING
+        g_sched_fanin_atomic_count[thread_idx] += fanin_atomics;
+        PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]);
+#endif
+
+        // Self consumed check
+#if PTO2_SCHED_PROFILING
+        uint64_t self_atomics = 0;
+        check_and_handle_consumed(slot_state, self_atomics);
+        g_sched_self_atomic_count[thread_idx] += self_atomics;
+        PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]);
+        g_sched_complete_count[thread_idx]++;
+#else
+        check_and_handle_consumed(slot_state);
+#endif
+        return payload->fanin_actual_count;
+    }
+
+    // === Cold-path API (defined in pto_scheduler.cpp) ===
+
+    // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
+    // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
+    // Capacities are baked into the returned layout; init_data_from_layout uses
+    // the same values.
+    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
+    static PTO2SchedulerLayout
+    reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]);
+
+    // Phase 3a: write everything *except* arena-internal pointer fields.
+    // `sm_dev_base` is the device address of the SM (only stored, never
+    // dereferenced here).  Safe to call on a host arena that holds the
+    // prebuilt image buffer.  (The orchestrator counterpart takes
+    // task_window_size for ring task_descriptors address arithmetic; the
+    // scheduler only needs the SM header / ring header base addresses,
+    // both window-size-independent.)
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
+
+    // Phase 3b: write the arena-internal pointer fields
+    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
+    // ring, wiring.queue.buffer_).  Called on both host and device sides.
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
+
+    // Forget per-region pointers; arena owns the backing memory.
+    void destroy();
+    void print_stats();
+    void print_queues();
+};
+
+// Scheduler cold-path API is declared as PTO2SchedulerState member functions:
+// see reserve_layout()/init_data_from_layout()/wire_arena_pointers()/destroy()/
+// print_stats()/print_queues() at the end of the struct definition above.
+
+// Short-circuit NotDeferred completions seen during drain so they don't grow
+// entries[]. Mirrors the a2a3 impl; see that mirror for the rationale.
+// Completes slot_state immediately via on_task_complete() and parks it in the
+// caller's deferred-release array.  When that array is already full, every
+// parked entry is released through on_task_release() first to make room.
+// Always returns true.
+inline bool
+AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) {
+#if PTO2_SCHED_PROFILING
+    sink.sched->on_task_complete(slot_state, sink.thread_idx, sink.local_bufs);
+#else
+    sink.sched->on_task_complete(slot_state, sink.local_bufs);
+#endif
+    if (*sink.deferred_release_count >= sink.deferred_release_capacity) {
+        while (*sink.deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+            (void)sink.sched->on_task_release(
+                *sink.deferred_release_slot_states[--(*sink.deferred_release_count)], sink.thread_idx
+            );
+#else
+            sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
+#endif
+        }
+    }
+    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
+    sink.inline_completed++;
+    return true;
+}
+
+// Poll every pending async-wait entry once and complete those whose
+// conditions are all satisfied.
+//
+// Returns immediately with an empty result when the list lock cannot be taken
+// (try_lock).  Otherwise it first drains the AICore completion mailbox, then
+// walks entries[] from the back; a finished entry is removed by moving the
+// current last entry into its place.  Completed tasks are appended to
+// deferred_release_slot_states (flushed through on_task_release() when full).
+// On a drain error or a FAILED condition the lock is released and the result
+// carries the error code (plus failed_slot_state for a FAILED condition).
+template
+inline AsyncPollResult AsyncWaitList::poll_and_complete(
+    AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
+    PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity
+#if PTO2_SCHED_PROFILING
+    ,
+    int thread_idx
+#endif
+) {
+    AsyncPollResult result;
+    if (!try_lock()) return result;
+
+    AsyncWaitList::DrainCompletionSink sink{};
+    sink.sched = sched;
+    sink.local_bufs = local_bufs;
+    sink.deferred_release_slot_states = deferred_release_slot_states;
+    sink.deferred_release_count = &deferred_release_count;
+    sink.deferred_release_capacity = deferred_release_capacity;
+#if PTO2_SCHED_PROFILING
+    sink.thread_idx = thread_idx;
+#endif
+
+    int32_t drain_err = PTO2_ERROR_NONE;
+    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
+    if (drain_err != PTO2_ERROR_NONE) {
+        result.error_code = drain_err;
+        unlock();
+        return result;
+    }
+    result.completed += sink.inline_completed;
+
+    // Reverse iteration keeps the swap-with-last removal below from skipping
+    // entries that have not been visited yet.
+    for (int32_t i = count - 1; i >= 0; --i) {
+        AsyncWaitEntry &entry = entries[i];
+        // Last cache line invalidated for this entry; avoids invalidating the
+        // same line again for consecutive conditions that share it.
+        uintptr_t last_invalidated_counter_line = static_cast(-1);
+        for (int32_t c = 0; c < entry.condition_count; c++) {
+            CompletionCondition &cond = entry.conditions[c];
+            if (cond.satisfied) continue;
+            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) {
+                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
+                if (counter_line != last_invalidated_counter_line) {
+                    cache_invalidate_range(reinterpret_cast(counter_line), sizeof(uint32_t));
+                    last_invalidated_counter_line = counter_line;
+                }
+            }
+            CompletionPollResult poll = cond.test();
+            if (poll.state == CompletionPollState::FAILED) {
+                result.error_code = poll.error_code;
+                result.failed_slot_state = entry.slot_state;
+                unlock();
+                return result;
+            }
+            if (poll.state == CompletionPollState::READY) {
+                cond.satisfied = true;
+                cond.retire();
+                entry.waiting_completion_count--;
+            }
+        }
+
+        if (entry.normal_done && entry.waiting_completion_count <= 0) {
+#if PTO2_SCHED_PROFILING
+            sched->on_task_complete(*entry.slot_state, thread_idx, local_bufs);
+#else
+            sched->on_task_complete(*entry.slot_state, local_bufs);
+#endif
+            // Deferred-release array full: flush it before parking this task.
+            if (deferred_release_count >= deferred_release_capacity) {
+                while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                    (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                    sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+                }
+            }
+            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
+            result.completed++;
+
+            // Swap-remove: overwrite this entry with the last one and shrink.
+            int32_t last = count - 1;
+            if (i != last) entries[i] = entries[last];
+            count = last;
+        }
+    }
+
+    unlock();
+    return result;
+}
+
+// =============================================================================
+// Scheduler Profiling Data
+// =============================================================================
+
+#if PTO2_SCHED_PROFILING
+struct PTO2SchedProfilingData {
+    // Sub-phase cycle breakdown within on_task_complete
+    uint64_t lock_cycle;           // lock_fanout + state store + unlock
+    uint64_t fanout_cycle;         // fanout traversal
+    uint64_t fanin_cycle;          // fanin traversal
+    uint64_t self_consumed_cycle;  // self check_and_handle_consumed
+
+    // Wait times
+    uint64_t lock_wait_cycle;  // spin-wait in fanout_lock
+    uint64_t push_wait_cycle;  // CAS contention in push()
+    uint64_t pop_wait_cycle;   // CAS contention in pop()
+
+    // Atomic counts per sub-phase
+    uint64_t lock_atomic_count;
+    uint64_t fanout_atomic_count;
+    uint64_t fanin_atomic_count;
+    uint64_t self_atomic_count;
+    uint64_t pop_atomic_count;
+
+    int64_t complete_count;
+};
+
+/**
+ * Get and reset scheduler profiling data for a specific thread.
+ * Returns accumulated profiling data and resets counters.
+ */
+PTO2SchedProfilingData scheduler_get_profiling(int thread_idx);
+#endif
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp
new file mode 100644
index 000000000..5e09042a1
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_cold_path.cpp
@@ -0,0 +1,1096 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * ----------------------------------------------------------------------------------------------------------- + */ +#include "scheduler_context.h" + +#include +#include + +#include "common/unified_log.h" +#include "aicpu/device_time.h" +#include "aicpu/l2_swimlane_collector_aicpu.h" +#include "aicpu/platform_regs.h" +#include "aicpu/pmu_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" +#include "common/memory_barrier.h" +#include "common/l2_swimlane_profiling.h" +#include "common/platform_config.h" +#include "pto_runtime2.h" +#include "pto_shared_memory.h" +#include "runtime.h" +#include "spin_hint.h" + +// ============================================================================= +// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache) +// ============================================================================= + +static void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) { + if (header == nullptr || error_code == PTO2_ERROR_NONE) { + return; + } + // The first error code/thread pair wins; the bitmap cumulatively records all reporting threads. Only thread indices 0..31 are recorded in the bitmap. + int32_t expected = PTO2_ERROR_NONE; + if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) { + header->sched_error_thread.store(thread_idx, std::memory_order_release); + } + if (thread_idx >= 0 && thread_idx < 32) { + header->sched_error_bitmap.fetch_or(1U << static_cast(thread_idx), std::memory_order_acq_rel); + } +} + +LoopAction SchedulerContext::handle_orchestrator_exit( + int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count +) { // Returns BREAK_LOOP once the run is complete or a fatal orch/sched error is latched, else NONE. + if (completed_.load(std::memory_order_acquire)) { + return LoopAction::BREAK_LOOP; + } + int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); + if (orch_err != PTO2_ERROR_NONE) { + LOG_ERROR( + "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. 
" + "completed_tasks=%d, total_tasks=%d", + thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_ + ); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { // only the thread that flips completed_ triggers the shutdown + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire); + if (sched_err != PTO2_ERROR_NONE) { + LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + + bool orch_done = orchestrator_done_; + if (!orch_done) return LoopAction::NONE; + + task_count = total_tasks_; // out-param: final task total, written only after orchestration is done + if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) { + completed_.store(true, std::memory_order_release); + LOG_INFO_V0( + "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed), + task_count + ); + return LoopAction::BREAK_LOOP; + } + return LoopAction::NONE; +} + +LoopAction SchedulerContext::handle_core_transition(bool &cores_released) { // Once a transition is requested, wait (spinning) until cores are reassigned or the run completes, then set cores_released. 
+ if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE; + if (!reassigned_.load(std::memory_order_acquire)) { + wait_reassign_.fetch_add(1, std::memory_order_release); + while (!reassigned_.load(std::memory_order_acquire)) { + if (completed_.load(std::memory_order_acquire)) { + return LoopAction::BREAK_LOOP; + } + SPIN_WAIT_HINT(); + } + } + cores_released = true; + return LoopAction::NONE; +} + +LoopAction +SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) { // Idle-path check: BREAK_LOOP if completed or a fatal orch/sched error is latched, else NONE. + if (completed_.load(std::memory_order_acquire)) { + return LoopAction::BREAK_LOOP; + } + int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire); + if (orch_err != PTO2_ERROR_NONE) { + LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all 
cores", thread_idx, orch_err); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire); + if (sched_err != PTO2_ERROR_NONE) { + LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + return LoopAction::BREAK_LOOP; + } + return LoopAction::NONE; +} + +// ============================================================================= +// Stall diagnostic log format. +// +// Every line is self-contained — when scheduler threads emit concurrently and +// device_log interleaves their output, each line still carries enough context +// to identify which thread / iteration / object it belongs to. +// +// Prefix on every line: +// [STALL thread=N idle_iterations=K] CATEGORY ... +// +// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL +// together, so lines with the same idle_iterations belong to one diagnostic +// round; grep "idle_iterations=N" groups one round's output. +// +// Categories (and which thread emits them): +// SUMMARY — completed / total counts and scan totals (thread 0 only) +// TASK — one per non-completed task scanned from shared rings (thread 0 only) +// - state=RUNNING: includes running_on=[...] cross-ref +// - state=READY: fanin satisfied but no idle core yet +// - state=WAIT: includes missing_deps=N +// CLUSTER — one per cluster owned by this thread (every thread) +// - busy slot shows kernel + task_id + cond_reg_state; +// ANOMALY suffix when COND register is fin while software +// still has the slot marked busy. +// +// Reader workflow: +// 1. grep SUMMARY -> overall completion status +// 2. grep "idle_iterations=N TASK" -> stuck RUNNING task and which +// core/thread it is on +// 3. 
grep "idle_iterations=N CLUSTER.*task=" -> cross-check via the +// cluster line (or just +// read running_on in step 2) +// ============================================================================= + +namespace { + +// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines. +// Layout (idle): coreN(idle) +// Layout (busy): coreN(busy kernel=K task=T cond_reg_state=ack) +// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY) +// +// Healthy busy: COND register reports ack (AICore still executing). fin means +// AICore wrote completion but AICPU hasn't recycled the running slot yet — +// either a completion-poll bug or the diagnostic raced the recycle. +void format_core_status( + char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond +) { + if (idle) { + snprintf(buf, buf_size, "core%d(idle)", core_id); + return; + } + int32_t kernel = -1; + int64_t task_id_raw = -1; + if (core_state && core_state->running_slot_state) { + int32_t subslot = static_cast(core_state->running_subslot); + kernel = core_state->running_slot_state->task->kernel_id[subslot]; + task_id_raw = static_cast(core_state->running_slot_state->task->task_id.raw); + } + uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND); + int32_t hw_state = EXTRACT_TASK_STATE(cond_reg); + const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? 
"ack" : "fin"; + if (hw_state == TASK_ACK_STATE) { + snprintf( + buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw, + cond_reg_state_str + ); + } else { + snprintf( + buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s ANOMALY)", core_id, kernel, + task_id_raw, cond_reg_state_str + ); + } +} + +} // namespace + +int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const { + for (int32_t t = 0; t < aicpu_thread_num_; t++) { + const int32_t *ids = core_trackers_[t].core_ids(); + int32_t n = core_trackers_[t].core_num(); + for (int32_t i = 0; i < n; i++) { + if (ids[i] == core_id) return t; + } + } + return -1; +} + +bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const { + const int32_t *cores = core_trackers_[thread_idx].core_ids(); + int32_t core_num = core_trackers_[thread_idx].core_num(); + for (int32_t i = 0; i < core_num; i++) { + if (core_exec_states_[cores[i]].running_slot_state != nullptr) { + return true; + } + } + return false; +} + +bool SchedulerContext::no_thread_owns_running_task() const { + for (int32_t t = 0; t < aicpu_thread_num_; t++) { + if (self_owns_running_task(t)) return false; + } + return true; +} + +void SchedulerContext::log_stall_diagnostics( + int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count +) { + CoreTracker &tracker = core_trackers_[thread_idx]; + + // T0 owns the shared-ring scan; printing it from other threads would + // produce identical TASK lines once per scheduler thread. 
+ if (thread_idx == 0) { + int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring; + int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed); + submitted_in_ring += ring_task_count; + for (int32_t si = 0; si < ring_task_count; si++) { + PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si); + PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed); + int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed); + int32_t fi = slot_state.fanin_count; + int32_t kid_aic = slot_state.task->kernel_id[0]; + int32_t kid_aiv0 = slot_state.task->kernel_id[1]; + int32_t kid_aiv1 = slot_state.task->kernel_id[2]; + int64_t task_id = static_cast(slot_state.task->task_id.raw); + if (st >= PTO2_TASK_COMPLETED) continue; + // task_state has no intermediate ready/running value — it + // stays PENDING until the worker stores COMPLETED. Classify + // by the ground truth instead: a slot is RUNNING iff some + // core has it as running_slot_state. A task occupies at most + // 3 cores (one cluster), all under the same owner thread by + // construction of assign_cores_to_threads. + char running_on[192] = {0}; + int32_t owner = -1; + int32_t pos = 0; + bool is_running = false; + for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) { + if (core_exec_states_[cid].running_slot_state != &slot_state) continue; + is_running = true; + if (owner < 0) owner = find_core_owner_thread(cid); + const char *sname = subslot_name(core_exec_states_[cid].running_subslot); + int32_t written = snprintf( + running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? 
"" : " ", cid, sname + ); + if (written > 0) pos += written; + } + + if (is_running) { + cnt_running++; + if (cnt_running > STALL_DUMP_READY_MAX) continue; + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64 + " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] " + "running_on=[owner_thread=%d cores=[%s]]", + thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on + ); + continue; + } + if (rc >= fi) { + cnt_ready++; + if (cnt_ready > STALL_DUMP_READY_MAX) continue; + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64 + " state=READY fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]", + thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1 + ); + continue; + } + cnt_waiting++; + if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue; + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64 + " state=WAIT fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d", + thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc + ); + } + } + int32_t effective_total = task_count > 0 ? task_count : submitted_in_ring; + int32_t c = completed_tasks_.load(std::memory_order_relaxed); + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d " + "scan_ready=%d scan_waiting=%d scan_running=%d", + thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running + ); + } + + // CLUSTER lines: one per cluster this thread owns. + // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the + // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads. + int32_t ast = active_sched_threads_ > 0 ? 
active_sched_threads_ : aicpu_thread_num_; + for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) { + int32_t offset = cli * 3; + int32_t aic_id = tracker.get_aic_core_id(offset); + int32_t aiv0_id = tracker.get_aiv0_core_id(offset); + int32_t aiv1_id = tracker.get_aiv1_core_id(offset); + bool aic_idle = tracker.is_aic_core_idle(offset); + bool aiv0_idle = tracker.is_aiv0_core_idle(offset); + bool aiv1_idle = tracker.is_aiv1_core_idle(offset); + int32_t cluster_id = cli * ast + thread_idx; + char aic_buf[128], aiv0_buf[128], aiv1_buf[128]; + format_core_status( + aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr + ); + format_core_status( + aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id], + core_exec_states_[aiv0_id].reg_addr + ); + format_core_status( + aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id], + core_exec_states_[aiv1_id].reg_addr + ); + LOG_INFO_V9( + "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx, + idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf + ); + } +} + +void SchedulerContext::log_shutdown_stall_snapshot( + int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count +) { + LOG_WARN( + "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] " + "dumping all scheduler threads before emergency shutdown", + trigger_thread_idx, trigger_idle_iterations + ); + int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_; + if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) { + LOG_ERROR( + "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx, + thread_count, MAX_AICPU_THREADS + ); + thread_count = thread_count < 0 ? 
0 : MAX_AICPU_THREADS; + } + for (int32_t t = 0; t < thread_count; t++) { + log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count); + } +} + +int32_t SchedulerContext::handle_timeout_exit( + int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations, + int32_t last_progress_count +#if PTO2_PROFILING + , + uint64_t sched_start_ts +#endif +) { + LOG_ERROR( + "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d", thread_idx, idle_iterations, + idle_iterations + ); + latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT); + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count); +#if PTO2_PROFILING + // Capture the in-flight kernels' partial output before signalling the + // cores to exit, so the dump reflects the live stuck state. + if (is_dump_args_enabled()) { + dump_running_task_outputs( + thread_idx, cores_total_num_, + [this](int32_t cid) { + return core_exec_states_[cid].running_slot_state; + }, + [](ActiveMask active_mask, int raw_subtask_id) { + return active_mask.subtask_active(static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif + emergency_shutdown(runtime); + } +#if PTO2_PROFILING + uint64_t sched_timeout_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9( + "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx, + static_cast(sched_start_ts), static_cast(sched_timeout_ts), + cycles_to_us(sched_timeout_ts - sched_start_ts) + ); +#endif + return -PTO2_ERROR_SCHEDULER_TIMEOUT; +} + +#if PTO2_PROFILING +void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) { + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; + uint64_t sched_end_ts = get_sys_cnt_aicpu(); + LOG_INFO_V9( + "Thread %d: sched_start=%" PRIu64 " 
sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx, + static_cast(l2_swimlane.sched_start_ts), static_cast(sched_end_ts), + cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts) + ); + + uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle + + l2_swimlane.sched_scan_cycle + l2_swimlane.sched_dispatch_cycle + + l2_swimlane.sched_idle_cycle; + if (sched_total == 0) sched_total = 1; + +#if PTO2_SCHED_PROFILING + { + PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx); + uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle; + uint64_t complete_poll = + (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ? + (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) : + 0; + uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle > + l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ? + (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle - + l2_swimlane.sched_dispatch_setup_cycle) : + 0; + + LOG_INFO_V9( + "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx, + cycles_to_us(sched_total), cur_thread_completed + ); + + // fanout / fanin per-thread aggregates live in + // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges + // × core_to_thread). + LOG_INFO_V9( + "Thread %d: complete : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle), + l2_swimlane.sched_complete_cycle * 100.0 / sched_total + ); + + uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1; + uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ? + (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) : + 0; + double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ? 
+ l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count : + 0.0; + LOG_INFO_V9( + "Thread %d: poll : %.3fus (%.1f%%) hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%", + thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent, + static_cast(l2_swimlane.complete_hit_count), static_cast(complete_miss_count), + complete_hit_rate + ); + LOG_INFO_V9( + "Thread %d: otc_lock : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent, + cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle), + static_cast(sp.lock_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: otc_fanout : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent, + cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle), + static_cast(sp.fanout_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: otc_fanin : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent, + static_cast(sp.fanin_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: otc_self : %.3fus (%.1f%%) atomics=%" PRIu64 "", thread_idx, + cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent, + static_cast(sp.self_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: perf : %.3fus (%.1f%%)", thread_idx, + cycles_to_us(l2_swimlane.sched_complete_perf_cycle), + l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent + ); + + LOG_INFO_V9( + "Thread %d: dispatch : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle), + l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total + ); + + uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? 
l2_swimlane.sched_dispatch_cycle : 1; + LOG_INFO_V9( + "Thread %d: poll : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll), + dispatch_poll * 100.0 / d_parent + ); + LOG_INFO_V9( + "Thread %d: pop : %.3fus (%.1f%%) work=%.3fus wait=%.3fus atomics=%" PRIu64 "", thread_idx, + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent, + cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle), + static_cast(sp.pop_atomic_count) + ); + LOG_INFO_V9( + "Thread %d: setup : %.3fus (%.1f%%)", thread_idx, + cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle), + l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent + ); + + LOG_INFO_V9( + "Thread %d: scan : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_scan_cycle), + l2_swimlane.sched_scan_cycle * 100.0 / sched_total + ); + +#if PTO2_SCHED_PROFILING + LOG_INFO_V9( + "Thread %d: wiring : %.3fus (%.1f%%) tasks=%d", thread_idx, + cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total, + l2_swimlane.phase_wiring_count + ); +#else + LOG_INFO_V9( + "Thread %d: wiring : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle), + l2_swimlane.sched_wiring_cycle * 100.0 / sched_total + ); +#endif + + LOG_INFO_V9( + "Thread %d: idle : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle), + l2_swimlane.sched_idle_cycle * 100.0 / sched_total + ); + + if (cur_thread_completed > 0) { + LOG_INFO_V9( + "Thread %d: avg/complete : %.3fus", thread_idx, + cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed + ); + } + } +#endif + LOG_INFO_V9( + "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx, + cycles_to_us(sched_total), static_cast(l2_swimlane.sched_loop_count), cur_thread_completed + ); +} +#endif + +// 
============================================================================= +// Shutdown: deinit AICore regs for this thread's cores. +// Orchestrator threads have core_trackers_[thread_idx].core_num() == 0 -> no-op. +// platform_deinit_aicore_regs is idempotent; safe to call after early completion. Returns 0, or -1 if any core's deinit timed out. +// ============================================================================= +int32_t SchedulerContext::shutdown(int32_t thread_idx) { + const int32_t *cores = core_trackers_[thread_idx].core_ids(); + int32_t core_num = core_trackers_[thread_idx].core_num(); + if (core_num == 0) return 0; + +#if PTO2_PROFILING + // Restore PMU CTRL registers for this thread's cores before AICore shutdown + if (is_pmu_enabled()) { + pmu_aicpu_finalize(cores, core_num); + } +#endif + + LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, core_num); + int32_t rc = 0; + for (int32_t i = 0; i < core_num; i++) { + int32_t core_id = cores[i]; + uint64_t reg_addr = core_exec_states_[core_id].reg_addr; + if (reg_addr != 0) { + // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores. + if (platform_deinit_aicore_regs(reg_addr) != 0) { + LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id); + rc = -1; + } + } else { + LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); // NOTE(review): logged as an error but rc is left unchanged, so the caller still sees success - confirm intended. + } + } + LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx); + return rc; +} + +// ============================================================================= +// Handshake with all AICore workers; discover core type and reg address. 
+// ============================================================================= +int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { // Returns 0 on success, -1 on an invalid core count or an out-of-range physical core id. + Handshake *all_handshakes = reinterpret_cast(runtime->workers); + cores_total_num_ = runtime->worker_count; + + // Validate cores_total_num_ before using as array index (non-positive counts are rejected, matching the "expected 1-N" message) + if (cores_total_num_ <= 0 || cores_total_num_ > RUNTIME_MAX_WORKER) { + LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER); + return -1; + } + + aic_count_ = 0; + aiv_count_ = 0; + + LOG_INFO_V0("Handshaking with %d cores", cores_total_num_); + + // Step 1: Write per-core payload addresses and send handshake signal. + // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before + // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. + for (int32_t i = 0; i < cores_total_num_; i++) { + all_handshakes[i].task = reinterpret_cast(&payload_per_core_[i][0]); + OUT_OF_ORDER_STORE_BARRIER(); + all_handshakes[i].aicpu_ready = 1; + } + OUT_OF_ORDER_STORE_BARRIER(); + + // Get platform physical cores count for validation + uint32_t max_physical_cores_count = platform_get_physical_cores_count(); + + // Step 2: Wait for all cores to respond, collect core type and register addresses + bool handshake_failed = false; + for (int32_t i = 0; i < cores_total_num_; i++) { + Handshake *hank = &all_handshakes[i]; + + while (hank->aicore_regs_ready == 0) { + SPIN_WAIT_HINT(); + } + + uint32_t physical_core_id = hank->physical_core_id; + + if (physical_core_id >= max_physical_cores_count) { + LOG_ERROR( + "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, + max_physical_cores_count + ); + handshake_failed = true; + continue; + } + + uint64_t *regs = reinterpret_cast(regs_); + uint64_t reg_addr = regs[physical_core_id]; + + // Initialize AICore registers after discovery (first round) + platform_init_aicore_regs(reg_addr); + OUT_OF_ORDER_STORE_BARRIER(); + 
hank->aicpu_regs_ready = 1; + + OUT_OF_ORDER_STORE_BARRIER(); + + while (hank->aicore_done == 0) { + SPIN_WAIT_HINT(); + } + + CoreType type = hank->core_type; + + core_exec_states_[i].reg_addr = reg_addr; + core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); + +#if PTO2_PROFILING + physical_core_ids_[i] = physical_core_id; +#endif + +#if !PTO2_PROFILING + core_exec_states_[i].worker_id = i; + core_exec_states_[i].physical_core_id = physical_core_id; + core_exec_states_[i].core_type = type; +#endif + + if (type == CoreType::AIC) { + aic_worker_ids_[aic_count_++] = i; + LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%" PRIx64, i, physical_core_id, reg_addr); + } else { + aiv_worker_ids_[aiv_count_++] = i; + LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%" PRIx64, i, physical_core_id, reg_addr); + } + } + + if (handshake_failed) { + emergency_shutdown(runtime); + return -1; + } + + LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); + return 0; +} + +// ============================================================================= +// Assign discovered cores to scheduler threads (cluster-aligned round-robin). +// ============================================================================= +bool SchedulerContext::assign_cores_to_threads() { + // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_. + // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together. NOTE(review): assumes aiv_count_ >= 2 * aic_count_ and active_sched_threads_ > 0; neither is validated here. + active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; + int32_t cluster_count = aic_count_; + + // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_). 
+ int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_; + int32_t thread_cores_num = max_clusters_per_thread * 3; + + if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) { + LOG_ERROR("Can't assign %d cores to a single scheduler thread (exceeds MAX_CORE_PER_THREAD)", thread_cores_num); + return false; + } + + LOG_INFO_V0( + "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count, + active_sched_threads_, aic_count_, aiv_count_ + ); + + for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { + core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; + core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; + } + + // Count clusters per thread first (round-robin may distribute unevenly) + int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; + for (int32_t ci = 0; ci < cluster_count; ci++) { + clusters_per_thread[ci % active_sched_threads_]++; + } + for (int32_t i = 0; i < active_sched_threads_; i++) { + core_trackers_[i].init(clusters_per_thread[i]); + } + + int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; + + for (int32_t ci = 0; ci < cluster_count; ci++) { + int32_t t = ci % active_sched_threads_; + + int32_t aic_wid = aic_worker_ids_[ci]; + int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; + int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; + + core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid); + + LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid); + } + + for (int32_t t = 0; t < aicpu_thread_num_; t++) { + LOG_INFO_V0( + "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(), + core_trackers_[t].get_cluster_count() + ); + } + + LOG_INFO_V0( + "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num + ); + return true; +} + +// ============================================================================= +// Reassign all cores across all threads (sched + 
orchestrator) after orchestration. +// ============================================================================= +void SchedulerContext::reassign_cores_for_all_threads() { + LOG_INFO_V0( + "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_ + ); + + // Collect running worker_ids from all current trackers + bool running_cores[RUNTIME_MAX_WORKER] = {}; + for (int32_t i = 0; i < aicpu_thread_num_; i++) { + auto all_running = core_trackers_[i].get_all_running_cores(); + int32_t bp; + while ((bp = all_running.pop_first()) >= 0) { + running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true; + } + } + + // Count clusters per thread (round-robin across all threads) + int32_t cluster_count = aic_count_; + int32_t clusters_per_thread[MAX_AICPU_THREADS] = {}; + for (int32_t ci = 0; ci < cluster_count; ci++) { + clusters_per_thread[ci % aicpu_thread_num_]++; + } + + // Re-init all trackers and reset core counts + for (int32_t i = 0; i < aicpu_thread_num_; i++) { + core_trackers_[i].init(clusters_per_thread[i]); + } + + // Assign clusters round-robin and restore running state + int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {}; + for (int32_t ci = 0; ci < cluster_count; ci++) { + int32_t t = ci % aicpu_thread_num_; + + int32_t aic_wid = aic_worker_ids_[ci]; + int32_t aiv0_wid = aiv_worker_ids_[2 * ci]; + int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1]; + + int32_t cl_idx = cluster_idx_per_thread[t]++; + core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid); + + // init() marks all idle; toggle cores that were running and restore pending_occupied + if (running_cores[aic_wid]) { + core_trackers_[t].change_core_state(cl_idx * 3); + core_trackers_[t].set_pending_occupied(cl_idx * 3); + } + if (running_cores[aiv0_wid]) { + core_trackers_[t].change_core_state(cl_idx * 3 + 1); + core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1); + } + if (running_cores[aiv1_wid]) { + 
core_trackers_[t].change_core_state(cl_idx * 3 + 2); + core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2); + } + } + + // Log final distribution + LOG_INFO_V0("Core reassignment complete:"); + for (int32_t t = 0; t < aicpu_thread_num_; t++) { + int32_t aic_running = core_trackers_[t].get_running_count(); + int32_t aiv_running = core_trackers_[t].get_running_count(); + LOG_INFO_V0( + " Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(), + core_trackers_[t].get_cluster_count(), aic_running, aiv_running + ); + } + active_sched_threads_ = aicpu_thread_num_; +} + +// ============================================================================= +// Emergency shutdown: broadcast exit signal to every handshake'd core and +// deinit their AICore register blocks. Idempotent. +// ============================================================================= +void SchedulerContext::emergency_shutdown(Runtime *runtime) { + LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores"); + Handshake *all_handshakes = reinterpret_cast(runtime->workers); + int32_t timeout_count = 0; + for (int32_t i = 0; i < cores_total_num_; i++) { + Handshake *hank = &all_handshakes[i]; + OUT_OF_ORDER_STORE_BARRIER(); + hank->aicpu_regs_ready = 1; + if (core_exec_states_[i].reg_addr != 0) { + if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) { + timeout_count++; + } + } + } + if (timeout_count > 0) { + LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", timeout_count); + } + LOG_WARN("Emergency shutdown complete"); +} + +// ============================================================================= +// Lifecycle: init / deinit +// ============================================================================= +int32_t SchedulerContext::init( + Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base +) { + always_assert(runtime != nullptr); + + // 
Zero all per-core execution state before handshake + memset(core_exec_states_, 0, sizeof(core_exec_states_)); + + // Wire thread/transition configuration that handshake/assign need to read. + aicpu_thread_num_ = aicpu_thread_num; + sched_thread_num_ = sched_thread_num; + orch_to_sched_ = orch_to_sched; + regs_ = regs_base; + +#if PTO2_PROFILING + // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory + // header — must be called BEFORE the orchestrator thread caches the level + // via rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level() in + // AicpuExecutor::run(). Otherwise the cached value would still be DISABLED + // (only the binary enable bit has been seeded by kernel.cpp at this point), + // and the CYCLE_COUNT_START() gate in pto_orchestrator.cpp would suppress + // all ORCH_PHASES records. Reset the cached level on disabled runs so a + // prior enabled launch's level can't leak into the phase-record gates in + // scheduler_dispatch (`>= SCHED_PHASES`). + if (is_l2_swimlane_enabled()) { + l2_swimlane_aicpu_init(runtime->worker_count); + l2_swimlane_level_ = get_l2_swimlane_level(); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + // When orchestrator phases merge into scheduler threads + // (PTO2_ORCH_TO_SCHED=1), phase records flow through + // aicpu_thread_num_ pools — matches the same branch in + // dump_args_init (scheduler_dispatch.cpp). + // Sched phase pool count = number of scheduler threads. + // sched_thread_num_ <= 0 is the "use all AICPU threads as + // scheduler threads" sentinel (see assign_cores_to_threads' + // active_sched_threads_ normalization). Without this + // normalization here, init_phase would prime zero sched pools + // and all sched_phase emits would silently drop. + const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; + const int sched_phase_threads = orch_to_sched_ ? 
aicpu_thread_num_ : active_sched; + // Orch phase is a single instance (PR #971 design), so the orch + // pool count is always 1 regardless of orch_to_sched mode. + const int orch_phase_threads = 1; + l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads); + } + } else { + l2_swimlane_level_ = L2SwimlaneLevel::DISABLED; + } +#endif + + // Discover cores and assign to scheduler threads. + int32_t rc = handshake_all_cores(runtime); + if (rc != 0) { + LOG_ERROR("handshake_all_cores failed"); + return rc; + } + if (!assign_cores_to_threads()) { + return -1; + } + + // Initialize task counters. Task count comes from PTO2 shared memory. + if (runtime->get_gm_sm_ptr()) { + auto *header = static_cast(runtime->get_gm_sm_ptr()); + // Read at one-time boot init, before the SM is reset for the run, so a + // ring not yet written holds uninitialized memory (0xbe... under ASAN's + // malloc-fill). Sum in int64 and only count rings whose value is a + // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold + // more than the scope cap. This rejects any garbage pattern (negative + // or positive), so uninitialized rings contribute 0 (the correct boot + // count) while valid counts still add up, with no signed overflow. + int64_t task_count = 0; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire); + if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) task_count += ring_tasks; + } + total_tasks_ = static_cast(task_count); + } else { + total_tasks_ = 0; + } + completed_tasks_.store(0, std::memory_order_release); + + // Device orchestration: the orchestrator thread flips this when the graph is built. 
+ orchestrator_done_ = false; + + // Clear per-core dispatch payloads + memset(payload_per_core_, 0, sizeof(payload_per_core_)); + memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_)); + + // Initialize per-core GlobalContext (sub_block_id) based on cluster position. + // This is done once at startup and never modified afterwards. Iterate active_sched_threads_ (normalized by assign_cores_to_threads above), not the raw sched_thread_num_: with the "<= 0 means all AICPU threads" sentinel the raw value would skip every tracker and leave all AIV1 cores at sub_block_id 0. + for (int32_t t = 0; t < active_sched_threads_; t++) { + CoreTracker &tracker = core_trackers_[t]; + for (int32_t c = 0; c < tracker.get_cluster_count(); c++) { + int32_t cluster_offset = c * 3; // Each cluster = 1 AIC + 2 AIV + auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset)); + auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset)); + payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0; + payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0; + payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1; + payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1; + } + } + + func_id_to_addr_ = runtime->func_id_to_addr_; + + return 0; +} + +void SchedulerContext::deinit() { + // Reset all per-core execution state + for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) { + core_exec_states_[i] = {}; + core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID; + core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID; + } + + // Clear per-core dispatch payloads + memset(payload_per_core_, 0, sizeof(payload_per_core_)); + memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_)); + + // Reset sync-start drain coordination — a previous run that aborted mid-drain + // would otherwise leave dirty pending/elected/ack state for the next reuse.
+ drain_state_.sync_start_pending.store(0, std::memory_order_release); + drain_state_.drain_worker_elected.store(0, std::memory_order_release); + drain_state_.drain_ack_mask.store(0, std::memory_order_release); + drain_state_.pending_task.store(nullptr, std::memory_order_release); + + // Reset task counters and orchestrator state + completed_tasks_.store(0, std::memory_order_release); + total_tasks_ = 0; + orchestrator_done_ = false; + init_claimed_.store(false, std::memory_order_release); + init_complete_.store(false, std::memory_order_release); + + // Reset core transition state + transition_requested_.store(false, std::memory_order_release); + wait_reassign_.store(0, std::memory_order_release); + reassigned_.store(false, std::memory_order_release); + completed_.store(false, std::memory_order_release); + + // Reset core discovery and assignment state + aic_count_ = 0; + aiv_count_ = 0; + cores_total_num_ = 0; + aicpu_thread_num_ = 0; + sched_thread_num_ = 0; + orch_to_sched_ = false; + active_sched_threads_ = 0; + for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) { + core_trackers_[t] = CoreTracker{}; + } + + regs_ = 0; + sched_ = nullptr; + rt_ = nullptr; + func_id_to_addr_ = nullptr; +} + +void SchedulerContext::wait_init_complete() const { + while (!init_complete_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } +} + +void SchedulerContext::bind_runtime(PTO2Runtime *rt) { + rt_ = rt; + sched_ = &rt->scheduler; +} + +// ============================================================================= +// Post-orchestration bookkeeping. Runs on the orchestrator thread once the +// build phase finishes; folds inline-completed tasks, flips orchestrator_done_, +// and drives the orchestrator → scheduler core transition (or fatal shutdown). 
+// ============================================================================= +void SchedulerContext::on_orchestration_done( + Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks +) { +#if PTO2_PROFILING + if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { + // Flush orchestrator's phase record buffer (orch pool, ordinal 0) + l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx); + } +#endif + + total_tasks_ = total_tasks; + + // Fold tasks completed inline during orchestration + int32_t inline_completed = static_cast(rt->orchestrator.inline_completed_tasks); + if (inline_completed > 0) { + completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed); +#if PTO2_SCHED_PROFILING + rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed); +#endif + } + orchestrator_done_ = true; + + // Check for fatal error from orchestration; if so, shut down immediately. + int32_t orch_err = 0; + if (sched_->sm_header) { + orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed); + } + if (orch_err != PTO2_ERROR_NONE) { + if (!completed_.exchange(true, std::memory_order_acq_rel)) { + emergency_shutdown(runtime); + } + } + + // Skip core transition on fatal error — cores already shut down above. 
+ if (completed_.load(std::memory_order_acquire)) { + // Signal transition to unblock scheduler threads waiting at core transition + transition_requested_.store(true, std::memory_order_release); + reassigned_.store(true, std::memory_order_release); + } else if (orch_to_sched_) { + LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx); + transition_requested_.store(true, std::memory_order_release); + + // Wait for scheduler threads to acknowledge transition request + while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) { + if (completed_.load(std::memory_order_acquire)) { + break; + } + SPIN_WAIT_HINT(); + } + if (!completed_.load(std::memory_order_acquire)) { + reassign_cores_for_all_threads(); + reassigned_.store(true, std::memory_order_release); + } + } + +#if PTO2_PROFILING + // Write core-to-thread mapping AFTER reassignment so the profiling data + // reflects the final distribution (all active_sched_threads_, including + // former orchestrator threads when orch_to_sched_ is enabled). + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_init_core_assignments(cores_total_num_); + for (int32_t t = 0; t < active_sched_threads_; t++) { + l2_swimlane_aicpu_write_core_assignments_for_thread( + t, core_trackers_[t].core_ids(), core_trackers_[t].core_num() + ); + } + } +#endif +} diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp new file mode 100644 index 000000000..7d83249ab --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_completion.cpp @@ -0,0 +1,514 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#include "scheduler_context.h" + +#include "common/unified_log.h" +#include "aicpu/device_time.h" +#include "aicpu/platform_regs.h" +#include "common/l2_swimlane_profiling.h" +#include "common/memory_barrier.h" +#include "common/platform_config.h" +#include "pto_runtime2.h" +#include "runtime.h" +#include "spin_hint.h" + +// Performance profiling headers +#include "aicpu/l2_swimlane_collector_aicpu.h" +#include "aicpu/pmu_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" + +// ============================================================================= +// Dual-slot state machine helpers +// ============================================================================= + +namespace { +inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256; +} + +// Pure function: read register result -> SlotTransition (no side effects). 
+SlotTransition SchedulerContext::decide_slot_transition(
+    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id
+) {
+    // Inputs: reg_task_id / reg_state are the task id and state decoded from a
+    // core's condition register; running_id / pending_id are the register task
+    // ids recorded for that core's running and pending slots
+    // (AICPU_TASK_INVALID when a slot is empty).
+    // Output: a SlotTransition whose flags tell the caller what to complete
+    // (running_done / pending_done) and which slots to release
+    // (running_freed / pending_freed). Flags not assigned below keep whatever
+    // value SlotTransition's default initialization gives them -- presumably
+    // false; confirm against the SlotTransition definition.
+    SlotTransition t;
+    if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id) {
+        t.matched = true;
+        t.running_done = true;  // Serial execution: pending event implies running done
+        t.running_freed = true;
+        t.pending_freed = true;
+        if (reg_state == TASK_FIN_STATE) {
+            t.pending_done = true;  // Case 1: pending FIN
+        }
+        // else: Case 2: pending ACK (pending_done stays false)
+    } else if (reg_task_id == running_id) {
+        if (reg_state == TASK_FIN_STATE) {
+            if (pending_id == AICPU_TASK_INVALID) {
+                // Case 3.2: running FIN, no pending -> core goes idle
+                t.matched = true;
+                t.running_done = true;
+                t.running_freed = true;
+            }
+            // Case 3.1: running FIN, pending exists -> skip (transient state).
+            // Case 1/2 (pending ACK/FIN) will complete running implicitly via running_done=true.
+        } else {
+            // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
+            t.matched = true;
+            t.pending_freed = true;
+        }
+    }
+    // If neither branch assigned t.matched, the register shows a task id that
+    // belongs to neither slot (or the transient Case 3.1); the caller tests
+    // t.matched and skips the core for this poll.
+    return t;
+}
+
+// Complete one slot's task: subtask counting, mixed completion, deferred release, profiling.
+//
+// Flow, as implemented below:
+//   1. When the slot has a payload, read the per-core deferred-completion slab
+//      selected by (expected_reg_task_id & 1). A non-zero slab error code, or a
+//      count above MAX_COMPLETIONS_PER_TASK, is written to sched_error_code with
+//      a CAS from PTO2_ERROR_NONE (so an already-recorded error is kept),
+//      completed_ is latched, and the function returns immediately -- skipping
+//      subtask accounting and the profiling / PMU sections at the bottom.
+//   2. Every slab entry is pushed to the AICoreCompletionMailbox, spinning (and
+//      bumping mpsc_skipped_count) until each push succeeds.
+//   3. on_subtask_complete() reports whether the whole task is now complete.
+//      A complete task that had deferred subtasks is handed to the mailbox via
+//      try_push_normal_done(); otherwise it is completed inline with
+//      on_task_complete() and queued for a batched on_task_release().
+//   4. Under PTO2_PROFILING, the swimlane and PMU records are written.
+//
+// Parameters:
+//   slot_state                    task slot whose subtask just finished.
+//   expected_reg_task_id          register task id recorded for the slot; its low
+//                                 bit selects the slab and it is the profiling key.
+//   subslot                       only read to index kernel_id for the PMU record.
+//   thread_idx                    scheduler thread index (profiling, logging, and
+//                                 the PTO2_SCHED_PROFILING overloads).
+//   core_id                       index into deferred_slab_per_core_ and hank.
+//   hank                          handshake array; only hank[core_id].core_type is
+//                                 read, and only under PTO2_PROFILING.
+//   completed_this_turn           incremented when the task is completed inline.
+//   deferred_release_slot_states,
+//   deferred_release_count        caller-provided release batch; assumed to hold at
+//                                 least PTO2_DEFERRED_RELEASE_CAP entries.
+//   local_bufs                    forwarded to on_task_complete().
+//   dispatch_ts, finish_ts        (PTO2_PROFILING only) forwarded to
+//                                 l2_swimlane_aicpu_complete_task().
+//
+// NOTE(review): `mailbox` is nullptr when rt_ is nullptr, yet it is dereferenced
+// without a check whenever a slab reports entries or a deferred task completes.
+// Confirm rt_ / rt_->aicore_mailbox are always bound before completion polling.
+// NOTE(review): the static_cast / reinterpret_cast calls in this function have
+// no template argument in this copy of the patch; it looks like the `<...>`
+// text was lost in extraction -- confirm against the upstream file.
+void SchedulerContext::complete_slot_task(
+    PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot,
+    int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
+    PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs
+#if PTO2_PROFILING
+    ,
+    uint64_t dispatch_ts, uint64_t finish_ts
+#endif
+) {
+#if PTO2_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#else
+    (void)hank;
+#endif
+    // MPSC fast-path: see a2a3 mirror for the full design narrative. The
+    // any_subtask_deferred flag on slot_state discriminates non-deferred
+    // tasks (inline complete in parallel on FIN thread) from deferred ones
+    // (route through the lock-free AICoreCompletionMailbox).
+    AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
+    bool defer_completion_to_consumer = false;
+
+    if (slot_state.payload != nullptr) {
+        // The slab is indexed by the low bit of the register task id.
+        volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
+        int32_t slab_err = deferred_slab->error_code;
+        if (slab_err != PTO2_ERROR_NONE) {
+            // Record the slab's error only if no scheduler error is stored yet,
+            // then latch completed_ and abandon this completion.
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;
+        }
+
+        uint32_t cond_count = deferred_slab->count;
+        if (cond_count > MAX_COMPLETIONS_PER_TASK) {
+            // An entry count above the per-task limit is reported as a registration failure.
+            int32_t expected = PTO2_ERROR_NONE;
+            sched_->sm_header->sched_error_code.compare_exchange_strong(
+                expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire
+            );
+            completed_.store(true, std::memory_order_release);
+            return;
+        }
+
+        if (cond_count > 0) {
+            slot_state.any_subtask_deferred.store(true, std::memory_order_release);
+
+            const PTO2TaskId token = slot_state.task->task_id;
+            for (uint32_t i = 0; i < cond_count; ++i) {
+                volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
+                // Retry until the mailbox accepts the entry; each failed attempt is counted.
+                while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) {
+                    sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                    SPIN_WAIT_HINT();
+                }
+            }
+        }
+    }
+
+    bool task_complete = sched_->on_subtask_complete(slot_state);
+
+    if (task_complete && slot_state.payload != nullptr &&
+        slot_state.any_subtask_deferred.load(std::memory_order_acquire)) {
+        // Deferred task: hand final completion to the mailbox instead of completing inline.
+        while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast(&slot_state))) {
+            sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+            SPIN_WAIT_HINT();
+        }
+        defer_completion_to_consumer = true;
+    }
+
+    if (task_complete && !defer_completion_to_consumer) {
+#if PTO2_PROFILING
+        if (is_dump_args_enabled()) {
+            dump_args_for_task(
+                thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
+                [](ActiveMask active_mask, int raw_subtask_id) {
+                    return active_mask.subtask_active(static_cast(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
+#if PTO2_SCHED_PROFILING
+        // SCHED_PROFILING variant takes thread_idx for its per-thread atomic
+        // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed
+        // by the otc_* log lines). Its return value is unused.
+        (void)sched_->on_task_complete(slot_state, thread_idx, local_bufs);
+#else
+        sched_->on_task_complete(slot_state, local_bufs);
+#endif
+#if PTO2_PROFILING
+        l2_swimlane.phase_complete_count++;
+#endif
+        if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        } else {
+            // Batch is full: release every queued slot (last queued first), then
+            // start a new batch with the slot that was just completed.
+            LOG_INFO_V9("Thread %d: release", thread_idx);
+            while (deferred_release_count > 0) {
+#if PTO2_SCHED_PROFILING
+                // SCHED_PROFILING variant takes thread_idx for the per-thread
+                // atomic counter side-effects. The return value is unused.
+                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
+#else
+                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+#endif
+            }
+            deferred_release_slot_states[deferred_release_count++] = &slot_state;
+        }
+        completed_this_turn++;
+    }
+
+#if PTO2_PROFILING
+    // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries
+    // {start, end, task_token_raw}, host resolves func_id/core_type from
+    // dep_gen / per-core mapping, and AICPU has nothing to write. Only at
+    // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish
+    // timestamps via complete_task.
+    if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
+#if PTO2_SCHED_PROFILING
+        uint64_t t_perf_start = get_sys_cnt_aicpu();
+#endif
+
+        if (l2_swimlane_aicpu_complete_task(
+                core_id, thread_idx, static_cast(expected_reg_task_id), dispatch_ts, finish_ts
+            ) != 0) {
+            LOG_ERROR(
+                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
+                static_cast(slot_state.task->task_id.raw)
+            );
+        }
+#if PTO2_SCHED_PROFILING
+        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
+#endif
+    }
+#endif
+
+#if PTO2_PROFILING
+    if (is_pmu_enabled()) {
+        // Slot key must be the 32-bit register token AICore wrote into
+        // dual_issue_slots[task_id & 1].task_id (= DATA_MAIN_BASE value).
+        // task_id.raw is the full PTO2 (ring_id<<32|local_id) encoding —
+        // matching on that would never hit. Pass the PTO2 id separately
+        // for the PmuRecord.
+        pmu_aicpu_complete_record(
+            core_id, thread_idx, static_cast(expected_reg_task_id), slot_state.task->task_id.raw,
+            slot_state.task->kernel_id[static_cast(subslot)], hank[core_id].core_type
+        );
+    }
+#endif
+}
+
+// Promote pending slot data to running slot. Clears pending fields.
+void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
+    // Copies slot state, register task id, subslot and (under PTO2_PROFILING)
+    // the dispatch timestamp from the pending slot into the running slot.
+    core.running_slot_state = core.pending_slot_state;
+    core.running_reg_task_id = core.pending_reg_task_id;
+    core.running_subslot = core.pending_subslot;
+#if PTO2_PROFILING
+    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
+#endif
+    // Only the slot-state pointer and register task id are reset; pending_subslot
+    // and pending_dispatch_timestamp keep their previous values.
+    core.pending_slot_state = nullptr;
+    core.pending_reg_task_id = AICPU_TASK_INVALID;
+}
+
+// Clear running slot (core becomes idle).
+void SchedulerContext::clear_running_slot(CoreExecState &core) {
+    // Only the slot-state pointer and register task id are reset. An INVALID
+    // running_reg_task_id is what check_running_cores_for_completion() tests to
+    // decide the core is idle.
+    core.running_slot_state = nullptr;
+    core.running_reg_task_id = AICPU_TASK_INVALID;
+}
+
+// Poll every core that this thread's tracker reports as running, decode the
+// core's condition register, and apply the resulting SlotTransition: complete
+// finished slots, promote / clear the slot bookkeeping, and update the tracker
+// bits.
+//
+// Parameters:
+//   thread_idx             selects core_trackers_[thread_idx] (and, under
+//                          PTO2_SCHED_PROFILING, the per-thread counters).
+//   hank                   forwarded to complete_slot_task().
+//   completed_this_turn    forwarded to complete_slot_task(), which updates it.
+//   cur_thread_completed   incremented once per complete_slot_task() call made
+//                          here, i.e. once per finished slot.
+//   made_progress          set to true when a running slot finished
+//                          (t.running_done); never reset here.
+//   deferred_release_slot_states,
+//   deferred_release_count forwarded to complete_slot_task().
+//   local_bufs             forwarded to complete_slot_task().
+//
+// NOTE(review): the static_cast on the register read has no template argument
+// in this copy of the patch; it looks like the `<...>` text was lost in
+// extraction -- confirm against the upstream file.
+void SchedulerContext::check_running_cores_for_completion(
+    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
+    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
+    PTO2LocalReadyBuffer *local_bufs
+) {
+#if PTO2_SCHED_PROFILING
+    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
+#endif
+    CoreTracker &tracker = core_trackers_[thread_idx];
+    auto running_core_states = tracker.get_all_running_cores();
+    while (running_core_states.has_value()) {
+        // pop_first() removes the entry before any `continue`, so every core in
+        // the snapshot is visited exactly once per call.
+        int32_t bit_pos = running_core_states.pop_first();
+        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+        CoreExecState &core = core_exec_states_[core_id];
+
+        // --- Judgment phase: read register, derive transition ---
+        // Use the precomputed cond_ptr (resolved once in handshake) to skip
+        // the reg_offset switch and reg_addr addition on every poll.
+        uint64_t reg_val = static_cast(*core.cond_ptr);
+        // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the
+        // rmb() pins any AICore-published cacheable reads downstream of the
+        // FIN observation. Replaces the post-`__sync_synchronize` that the
+        // old read_reg() helper carried implicitly.
+        rmb();
+        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
+        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled) {
+            l2_swimlane.complete_probe_count++;
+        }
+#endif
+
+        SlotTransition t =
+            decide_slot_transition(reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id);
+        // Nothing actionable for this core on this poll.
+        if (!t.matched) continue;
+
+#if PTO2_SCHED_PROFILING
+        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
+            l2_swimlane.complete_hit_count++;
+        }
+#endif
+
+#if PTO2_PROFILING
+        // Capture finish_ts at the FIN observation point — right after rmb()
+        // pinned cacheable AICore reads downstream of the register load, and
+        // BEFORE any fanin / deferred-release work. Anything later would
+        // charge AICPU completion-processing cost to (end → finish).
+        uint64_t finish_ts = 0;
+        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) {
+            finish_ts = get_sys_cnt_aicpu();
+        }
+#endif
+
+        // --- Apply phase: execute actions based on transition ---
+
+        // 1. Complete finished tasks (capture pointers before modifying core state)
+        // When both flags are set, the pending slot is completed before the
+        // running slot; both calls share the same finish_ts.
+        if (t.pending_done) {
+            complete_slot_task(
+                *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.pending_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;
+        }
+        if (t.running_done) {
+            complete_slot_task(
+                *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank,
+                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
+#if PTO2_PROFILING
+                ,
+                core.running_dispatch_timestamp, finish_ts
+#endif
+            );
+            cur_thread_completed++;
+        }
+
+        // 2. Update slot data
+        if (t.running_freed) {
+            if (core.pending_slot_state != nullptr && !t.pending_done) {
+                promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
+            } else {
+                clear_running_slot(core);  // Case 1 or Case 3 (no pending)
+                if (t.pending_done) {
+                    // Case 1: pending FIN observed directly -- clear stale pending fields.
+                    // Without this, pending_reg_task_id retains a stale value that blocks
+                    // clear_pending_occupied and permanently degrades pipelining.
+                    core.pending_slot_state = nullptr;
+                    core.pending_reg_task_id = AICPU_TASK_INVALID;
+                }
+            }
+        }
+
+        // 3. Update tracker bitmap
+        bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
+        if (is_idle) {
+            tracker.change_core_state(bit_pos);  // Mark idle
+            tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
+        } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) {
+            // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only
+            // when no pending task is currently held. Otherwise pending slot is occupied
+            // by a pre-loaded task and must stay protected.
+            tracker.clear_pending_occupied(bit_pos);
+        }
+
+        // 4. Progress signal (only when running task completes)
+        if (t.running_done) {
+            made_progress = true;
+        }
+    }
+}
+
+// =============================================================================
+// sync_start drain protocol
+// =============================================================================
+
+// Take ownership of slot_state and signal all threads to enter drain mode.
+// Returns true if this thread won the CAS and owns the drain slot.
+// Returns false if another thread already holds drain; caller must re-push slot_state.
+//
+// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store task and
+// reset election flag, then release-store block_num. Other threads acquire-load
+// sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible.
+bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
+    // slot_state: task to park in drain_state_.pending_task.
+    // block_num:  value published in sync_start_pending once setup is done.
+    // NOTE(review): handle_drain_mode() treats a published value of 0 as "no
+    // drain pending", so block_num is presumably always > 0 here -- confirm at
+    // the call sites.
+    //
+    // The claiming CAS is relaxed on both success and failure; ordering for
+    // readers comes from the release store of block_num at the end.
+    int32_t expected = 0;
+    if (!drain_state_.sync_start_pending.compare_exchange_strong(
+            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed
+        )) {
+        return false;  // Another thread already holds the drain slot.
+    }
+    // We own the drain slot. Store the task and reset election flag before making it visible.
+    drain_state_.pending_task.store(slot_state, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    // Release store: all stores above are now visible to any thread that
+    // acquire-loads sync_start_pending and sees block_num > 0.
+    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
+    return true;
+}
+
+// Count total available resources across all scheduler threads for a given shape.
+int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape, uint8_t core_mask) {
+    // Sums over the first active_sched_threads_ trackers. core_mask is consulted
+    // only for the MIX shape; every other shape adds count() of the tracker's
+    // idle-core offset set for that shape.
+    // NOTE(review): for MIX the figure comes from count_mix_running_clusters();
+    // its exact meaning of "available" is defined by CoreTracker -- confirm there.
+    int32_t total = 0;
+    for (int32_t t = 0; t < active_sched_threads_; t++) {
+        if (shape == PTO2ResourceShape::MIX) {
+            total += core_trackers_[t].count_mix_running_clusters(core_mask);
+        } else {
+            total += core_trackers_[t].get_idle_core_offset_states(shape).count();
+        }
+    }
+    return total;
+}
+
+// Drain worker: dispatch all blocks in one pass across all threads' trackers.
+// Called only when global resources >= block_num, so one pass always suffices.
+// All other threads are spinning -- the drain worker has exclusive tracker access.
+void SchedulerContext::drain_worker_dispatch(Runtime *runtime, int32_t block_num) {
+    // runtime:   forwarded to dispatch_block().
+    // block_num: number of blocks the parked task must have dispatched; the
+    //            loops stop once slot_state->next_block_idx reaches it.
+    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+    if (!slot_state) {
+        // No task is parked: reopen the gate and leave. drain_ack_mask and
+        // drain_worker_elected are not reset on this path.
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        return;
+    }
+    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+    uint8_t core_mask = slot_state->active_mask.core_mask();
+
+    // Walk the trackers in thread order, dispatching one block per offset the
+    // tracker returns for this shape, until block_num blocks have been issued.
+    for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++) {
+        auto valid = (shape == PTO2ResourceShape::MIX) ?
+                         core_trackers_[t].get_mix_running_cluster_offset_states(core_mask) :
+                         core_trackers_[t].get_idle_core_offset_states(shape);
+        while (valid.has_value() && slot_state->next_block_idx < block_num) {
+            // NOTE(review): the meaning of the literal `false` argument is defined
+            // by dispatch_block(), which is not visible here.
+            dispatch_block(runtime, t, valid.pop_first(), *slot_state, shape, false, slot_state->next_block_idx);
+            slot_state->next_block_idx++;
+        }
+    }
+
+    // NOTE(review): nothing here re-checks next_block_idx == block_num; the
+    // drain state is cleared regardless. That relies on handle_drain_mode()
+    // calling this only after it verified available >= block_num.
+    // All blocks dispatched -- clear drain state.
+    // Release fence ensures tracker mutations are visible to threads that
+    // acquire-load sync_start_pending == 0 and resume normal operation.
+    std::atomic_thread_fence(std::memory_order_release);
+    drain_state_.pending_task.store(nullptr, std::memory_order_release);
+    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+    drain_state_.sync_start_pending.store(0, std::memory_order_release);
+}
+
+// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
+//
+// Protocol (single-stage ack barrier):
+// 1. Ack barrier: all threads signal they've stopped dispatch, then spin
+//    until all ack bits are set.
+//    If this thread's bit gets cleared while waiting, a reset occurred -- return.
+// 2. Election: one thread wins the CAS and becomes the drain worker.
+//    If resources are insufficient, reset ack/election fields and return --
+//    all threads resume completion polling to free running cores, then retry.
+// 3.
Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed). +// Non-elected threads spin-wait until sync_start_pending == 0. +// During dispatch the elected thread has exclusive tracker access. +void SchedulerContext::handle_drain_mode(Runtime *runtime, int32_t thread_idx) { + // Every spin in this function honors is_completed(): once the run latches + // completed_ (all tasks done, or a fatal error raised elsewhere), peers leave + // the dispatch loop and stop participating in the drain. A thread parked in a + // drain spin would then wait forever for acks / a gate-open that can no longer + // arrive -- the AICPU watchdog never fires here because these spins live + // outside the dispatch loop's wall-clock budget, so the hang escalates straight + // to the 3 s STARS op-exec timeout (507018) and poisons the device. Bailing on + // completed_ is always safe: any pending sync_start task is either already + // dispatched (a stale re-popped slot) or moot under teardown, and deinit() + // resets drain_state_ before the next run, so leaving it dirty is harmless. + // Spin until drain is fully initialized (sentinel -1 -> block_num > 0). + int32_t block_num; + do { + if (is_completed()) return; + block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire); + } while (block_num < 0); + if (block_num == 0) return; + + uint32_t all_acked = (1u << active_sched_threads_) - 1; + + // Ack barrier -- signal this thread has stopped dispatch. + drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release); + + // Spin until all threads have acked. + // If our bit is cleared while waiting, elected reset due to insufficient resources. + while (true) { + if (is_completed()) return; + uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire); + if ((ack & all_acked) == all_acked) break; + if ((ack & (1u << thread_idx)) == 0) return; + SPIN_WAIT_HINT(); + } + + // Election -- exactly one thread wins the CAS. 
+ int32_t expected = 0; + drain_state_.drain_worker_elected.compare_exchange_strong( + expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed + ); + + if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) { + // Non-elected: spin-wait for drain completion or resource-insufficient reset. + while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + if (is_completed()) return; + if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return; + SPIN_WAIT_HINT(); + } + return; + } + + // Elected: check if global resources are sufficient. + PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire); + if (slot_state == nullptr) { + // pending_task is observed null only when a concurrent drain completion + // already cleared it (drain_worker_dispatch nulls it before reopening the + // gate). That drain is done and this is a stale-elected thread, so just + // release the election lock and return. Do NOT clear drain_ack_mask or + // sync_start_pending: a *new* drain run may already be active and + // accumulating acks, and zeroing them would corrupt it into a hang. + drain_state_.drain_worker_elected.store(0, std::memory_order_release); + return; + } + PTO2ResourceShape shape = slot_state->active_mask.to_shape(); + int32_t available = count_global_available(shape, slot_state->active_mask.core_mask()); + + if (available < block_num) { + // Insufficient resources -- reset drain fields so threads can resume + // completion polling to free running cores, then retry. + drain_state_.drain_ack_mask.store(0, std::memory_order_release); + drain_state_.drain_worker_elected.store(0, std::memory_order_release); + return; + } + + // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access. 
+ drain_worker_dispatch(runtime, block_num); +} diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h new file mode 100644 index 000000000..8aa8d0034 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_context.h @@ -0,0 +1,387 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#ifndef SCHEDULER_CONTEXT_H +#define SCHEDULER_CONTEXT_H + +#include "common/l2_swimlane_profiling.h" +#include "common/unified_log.h" +#include "scheduler_types.h" + +#include "scheduler/pto_scheduler.h" + +#include "aicore_completion_mailbox.h" + +// These macros are defined in runtime.h, but we cannot include it here +// (it pulls in Handshake which we only forward-declare). Mirror the +// authoritative values so the class layout compiles standalone. +#ifndef RUNTIME_MAX_WORKER +#define RUNTIME_MAX_WORKER 108 +#endif +#ifndef RUNTIME_MAX_FUNC_ID +#define RUNTIME_MAX_FUNC_ID 1024 +#endif + +// Forward declarations — avoid pulling in full headers for pointer/reference params. +class Runtime; +struct Handshake; +struct PTO2Runtime; + +/** + * SchedulerContext: owns all scheduler-side state and methods. 
+ * + * Held as a member of AicpuExecutor (sched_ctx_). The single public entry + * point is resolve_and_dispatch(), called once per scheduler thread. + * + * All dispatch/completion/drain/cold-path logic is implemented as private + * member methods, split across three .cpp files by responsibility: + * - scheduler_completion.cpp (completion polling, drain protocol) + * - scheduler_cold_path.cpp (exit checks, stall diagnostics, profiling) + * - scheduler_dispatch.cpp (task dispatch loop and helpers) + */ +class SchedulerContext { +public: + // ========================================================================= + // Lifecycle + // ========================================================================= + + // Initialize scheduler state from the given runtime and thread layout. + // - Discovers cores via handshake_all_cores() + // - Assigns cores to scheduler threads + // - Resets task counters, payloads, per-core GlobalContext + // - Binds func_id_to_addr_ / initial sched_ (if rt is already known) + // - Captures AICore-register base (consumed by handshake_all_cores()) + // Returns 0 on success, negative on failure (handshake / assignment error). + int32_t + init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base); + + // Reset all SchedulerContext-owned state to its post-construction defaults. + // Called by AicpuExecutor::deinit() during per-run teardown. + void deinit(); + + // ========================================================================= + // Per-thread execution entry points (called by AicpuExecutor::run) + // ========================================================================= + + // Main scheduler thread entry: poll completion + dispatch ready tasks. + int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx); + + // Shutdown AICore registers for this thread's assigned cores. + // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled. 
+ // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op. + int32_t shutdown(int32_t thread_idx); + + // Run all post-orchestration scheduler bookkeeping: + // - publishes core assignments to the perf collector (PTO2_PROFILING) + // - latches submitted task count from PTO2 shared memory + // - folds inline_completed_tasks into completed_tasks_ + // - flips orchestrator_done_ and triggers core transition + // (skipped on fatal error — emergency_shutdown runs instead) + // Callers must invoke rt_orchestration_done(rt) before this — that + // step belongs to the orchestrator lifecycle, not the scheduler. + void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks); + + // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration + // mode where rt is created by the orchestrator thread after init(). + void bind_runtime(PTO2Runtime *rt); + + // ========================================================================= + // State queries / external synchronization points + // ========================================================================= + + int32_t aic_count() const { return aic_count_; } + int32_t aiv_count() const { return aiv_count_; } + bool is_completed() const { return completed_.load(std::memory_order_acquire); } + int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); } + + // Block until the first scheduler thread has finished one-time PTO2 init. + // Called by the orchestrator thread in device-orch mode. 
+ void wait_init_complete() const; + +private: + // ========================================================================= + // State + // ========================================================================= + + // --- Scheduler binding & per-core runtime state --- + alignas(64) PTO2SchedulerState *sched_{nullptr}; + PTO2Runtime *rt_{nullptr}; + + // Per-core execution state, indexed by core_id (= worker_id) + CoreExecState core_exec_states_[RUNTIME_MAX_WORKER]; + + // Cluster-ordered core trackers, one per scheduler thread + CoreTracker core_trackers_[MAX_AICPU_THREADS]; + + // Per-core dispatch payload storage: dual-buffer for pipelining. + // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically. + PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2]; + + // Per-core deferred-completion software registration storage. This has + // the same runtime lifetime as payload_per_core_, but is kept out of the + // dispatch payload so normal task dispatch layout and cache footprint stay + // unchanged. + DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2]; + + // sync_start drain coordination + SyncStartDrainState drain_state_; + +#if PTO2_PROFILING + SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS]; + // Cached once at init() from get_l2_swimlane_level(), AFTER + // l2_swimlane_aicpu_init has promoted the level from the shared-memory header. + L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED}; +#endif + + // --- Task-execution tracking --- + std::atomic completed_tasks_{0}; + int32_t total_tasks_{0}; + // Device orchestration: set by last orchestrator when graph is built; schedulers poll it. + // volatile prevents the compiler from hoisting the load out of spin loops. + // NOTE(review): volatile only constrains the compiler; in C++ it provides neither + // atomicity nor inter-thread ordering, so data written before the flag (e.g. + // total_tasks_) is not guaranteed visible to a poller that sees the flag set. + // Confirm the writer/readers fence explicitly, or make this an atomic flag + // like completed_.
+ volatile bool orchestrator_done_{false}; + std::atomic completed_{false}; + uint64_t *func_id_to_addr_{nullptr}; + + // --- Core-transition coordination --- + std::atomic transition_requested_{false}; + std::atomic wait_reassign_{0}; + std::atomic reassigned_{false}; + + // --- Thread/core configuration --- + int32_t active_sched_threads_{0}; + int32_t sched_thread_num_{0}; + bool orch_to_sched_{false}; + int32_t aicpu_thread_num_{0}; + int32_t cores_total_num_{0}; + + // Cluster-ordered worker_id lists, populated by handshake_all_cores(). + int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{}; + int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{}; + int32_t aic_count_{0}; + int32_t aiv_count_{0}; + +#if PTO2_PROFILING + // Physical core ids keyed by logical worker id. Populated by + // handshake_all_cores() and handed to pmu_aicpu_init() so the platform + // can resolve per-core PMU MMIO bases. Only needed when PTO2_PROFILING=1 + // — without it, PMU is compiled out and core_exec_states_ already + // carries the field. + uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{}; +#endif + + // Platform AICore-register base array (set by AicpuExecutor before init()). + uint64_t regs_{0}; + + // --- One-time init coordination --- + std::atomic init_claimed_{false}; + std::atomic init_complete_{false}; + + // ========================================================================= + // Core management (scheduler_cold_path.cpp) + // ========================================================================= + + // Handshake with all AICore workers; populates core_exec_states_, worker id lists. + int32_t handshake_all_cores(Runtime *runtime); + + // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads. + bool assign_cores_to_threads(); + + // Re-distribute all cores across all threads after orchestration completes. 
+ void reassign_cores_for_all_threads(); + + // Emergency shutdown: broadcast exit signal to every handshake'd core and + // deinit their AICore register blocks. Idempotent. + void emergency_shutdown(Runtime *runtime); + + // ========================================================================= + // Dispatch (scheduler_dispatch.cpp) + // ========================================================================= + + static const char *shape_name(PTO2ResourceShape shape); + + // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs. + // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field + // convention already established in the stall log family. + static inline const char *subslot_name(PTO2SubtaskSlot s) { + switch (s) { + case PTO2SubtaskSlot::AIC: + return "aic"; + case PTO2SubtaskSlot::AIV0: + return "aiv0"; + case PTO2SubtaskSlot::AIV1: + return "aiv1"; + } + return "?"; + } + + int pop_ready_tasks_batch( + PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, + int max_count + ); + + void build_payload( + PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, + const AsyncCtx &async_ctx, int32_t block_idx + ); + + void dispatch_subtask_to_core( + Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, + PTO2SubtaskSlot subslot, bool to_pending, int32_t block_idx + ); + + void dispatch_mix_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, bool to_pending, + int32_t block_idx + ); + + void dispatch_block( + Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, + PTO2ResourceShape shape, bool to_pending, int32_t block_idx + ); + + void dispatch_shape( + Runtime *runtime, int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, + PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, 
bool &entered_drain, bool &made_progress, + bool &try_pushed + ); + + // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch + // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then + // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly + // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are + // skipped for the whole pass but MIX-PENDING still runs. + // + // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the + // current pass only. The next loop iteration re-evaluates after Phase 1 + // completion polling and the global MIX queue draining (here or on any + // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput, + // not unbounded — once mix completes on at least one cluster, the next + // pass either drains the residual or admits AIC/AIV. + void dispatch_ready_tasks( + Runtime *runtime, int32_t thread_idx, CoreTracker &tracker, + PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress, + bool &try_pushed + ); + + // Returns true if any *other* scheduler thread currently has an idle core + // matching `shape`. Used as a scheduling hint on the PENDING dispatch path + // — see the implementation in scheduler_dispatch.cpp for the hint-semantics + // rationale and the safety argument against the drain worker. + bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const; + + // True if mix tasks remain anywhere this thread could see them: the caller's + // MIX local LIFO stack or the global MIX ready queue. Approximate — + // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue + // positions with std::memory_order_relaxed and may interleave with concurrent + // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire + // loads — that one isn't on this path. 
A stale read here causes at most one + // extra/missed AIC/AIV skip and self-corrects on the next loop iteration. + bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const { + return mix_local_buf.count > 0 || sched_->ready_queues[static_cast(PTO2ResourceShape::MIX)].size() > 0; + } + + // ========================================================================= + // Completion & drain (scheduler_completion.cpp) + // ========================================================================= + + static SlotTransition + decide_slot_transition(int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id); + + void complete_slot_task( + PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx, + int32_t core_id, Handshake *hank, int32_t &completed_this_turn, + PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, + PTO2LocalReadyBuffer *local_bufs +#if PTO2_PROFILING + , + uint64_t dispatch_ts, uint64_t finish_ts +#endif + ); + + static void promote_pending_to_running(CoreExecState &core); + static void clear_running_slot(CoreExecState &core); + + void check_running_cores_for_completion( + int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed, + bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, + PTO2LocalReadyBuffer *local_bufs + ); + + bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num); + int32_t count_global_available(PTO2ResourceShape shape, uint8_t core_mask); + void drain_worker_dispatch(Runtime *runtime, int32_t block_num); + void handle_drain_mode(Runtime *runtime, int32_t thread_idx); + + // ========================================================================= + // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp) + // ========================================================================= + + 
__attribute__((noinline, cold)) LoopAction + handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count); + + __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released); + + __attribute__((noinline, cold)) LoopAction + check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime); + + __attribute__((noinline, cold)) void + log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count); + + __attribute__((noinline, cold)) void log_shutdown_stall_snapshot( + int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count + ); + + // Reverse lookup: given a global core_id, find which scheduler thread's + // tracker owns it. Returns -1 if not found. Linear scan — only used on + // the cold diagnostic path. + int32_t find_core_owner_thread(int32_t core_id) const; + + // Does this thread own any core with a RUNNING task (running_slot_state set)? + // Gates the scheduler timeout fatal latch: a thread without an owned + // RUNNING task has no first-hand evidence of a stuck dispatch and must + // not declare global fatal on its own idle observation. The thread that + // does own the stuck task will reach the budget on its own polls and + // latch with valid evidence (or recover when the COND register flips). + bool self_owns_running_task(int32_t thread_idx) const; + + // Does *any* scheduler thread own a RUNNING task? Used as the second + // fatal-latch condition: if the wall-clock budget elapsed AND no thread + // owns RUNNING work AND tasks remain incomplete, the system is in a + // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the + // ownerless idle threads are the only observers — let one of them latch. 
+ bool no_thread_owns_running_task() const; + + __attribute__((noinline, cold)) int32_t handle_timeout_exit( + int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations, + int32_t last_progress_count +#if PTO2_PROFILING + , + uint64_t sched_start_ts +#endif + ); + +#if PTO2_PROFILING + __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed); +#endif + + // ========================================================================= + // Small inline helpers + // ========================================================================= + + // Look up the address stored for func_id in func_id_to_addr_. + // Returns 0 (after logging an error) when the table pointer is null or + // func_id is outside [0, RUNTIME_MAX_FUNC_ID); callers must treat 0 as + // "no address" and must not dereference it. + uint64_t get_function_bin_addr(int func_id) const { + if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID); + return 0; + } + return func_id_to_addr_[func_id]; + } +}; + +#endif // SCHEDULER_CONTEXT_H diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp new file mode 100644 index 000000000..d3fbbde5d --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_dispatch.cpp @@ -0,0 +1,1020 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License.
+ * ----------------------------------------------------------------------------------------------------------- + */ +#include "scheduler_context.h" + +#include +#include +#include + +#include "common.h" // debug_assert +#include "common/unified_log.h" +#include "aicpu/device_time.h" +#include "aicpu/platform_regs.h" +#include "callable.h" +#include "common/l2_swimlane_profiling.h" +#include "common/memory_barrier.h" +#include "common/platform_config.h" +#include "pto_runtime2.h" +#include "runtime.h" +#include "spin_hint.h" + +// Performance profiling headers +#include "aicpu/l2_swimlane_collector_aicpu.h" +#include "aicpu/pmu_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" + +// ============================================================================= +// Dispatch helpers +// ============================================================================= + +namespace { +inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256; +} + +// Static upper-case label for a resource shape ("AIC", "AIV", "MIX", "DUMMY"). +// Falls through to "UNKNOWN" for a value that matches none of the cases. +const char *SchedulerContext::shape_name(PTO2ResourceShape shape) { + switch (shape) { + case PTO2ResourceShape::AIC: + return "AIC"; + case PTO2ResourceShape::AIV: + return "AIV"; + case PTO2ResourceShape::MIX: + return "MIX"; + case PTO2ResourceShape::DUMMY: + return "DUMMY"; + } + return "UNKNOWN"; +} + +bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const { + // Cross-thread read of peer trackers without explicit synchronization. The + // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees + // single-copy atomicity for an 8-byte aligned load, so no torn read. The + // value is consumed only as a scheduling *hint* — a stale read at worst + // causes one missed/extra pending dispatch, corrected on the next iteration. + // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack + // barrier (all peers spin out of the dispatch path before any tracker + // mutation), so this routine is never racing the drain worker.
+ for (int32_t t = 0; t < active_sched_threads_; t++) { + if (t == self_thread_idx) continue; + if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) { + return true; + } + } + return false; +} + +int SchedulerContext::pop_ready_tasks_batch( + PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count +) { +#if PTO2_PROFILING + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; +#if PTO2_SCHED_PROFILING + extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[]; + uint64_t t_pop_start = get_sys_cnt_aicpu(); + int count = sched_->get_ready_tasks_batch( + shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx] + ); + l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start); +#else + int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count); +#endif + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + if (count > 0) { + l2_swimlane.pop_hit += count; + } else { + l2_swimlane.pop_miss++; + } + } +#else + (void)thread_idx; + int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count); +#endif + return count; +} + +void SchedulerContext::build_payload( + PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, + const AsyncCtx &async_ctx, int32_t block_idx +) { + int32_t slot_idx = static_cast(subslot); + uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]); + const CoreCallable *callable = reinterpret_cast(callable_addr); + dispatch_payload.function_bin_addr = callable->resolved_addr(); + auto &payload = *slot_state.payload; + int n = 0; + for (int32_t i = 0; i < payload.tensor_count; i++) { + dispatch_payload.args[n++] = reinterpret_cast(&payload.tensors[i]); + } + for (int32_t i = 0; i < payload.scalar_count; i++) { + dispatch_payload.args[n++] = payload.scalars[i]; + } + 
dispatch_payload.local_context.s_block_idx = block_idx; + dispatch_payload.local_context.s_block_num = slot_state.logical_block_num; + dispatch_payload.local_context.async_ctx = async_ctx; + dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast(&dispatch_payload.local_context); + dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast(&dispatch_payload.global_context); +} + +void SchedulerContext::dispatch_subtask_to_core( + Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, + bool to_pending, int32_t block_idx +) { + CoreTracker &tracker = core_trackers_[thread_idx]; + auto core_id = tracker.get_core_id_by_offset(core_offset); + (void)runtime; + CoreExecState &core_exec_state = core_exec_states_[core_id]; + core_exec_state.dispatch_seq++; + uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK; + static_assert( + (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity" + ); + if (reg_task_id >= AICORE_EXIT_SIGNAL) { + core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1); + reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK; + } + + uint32_t buf_idx = reg_task_id & 1u; + PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx]; + DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx]; + deferred_slab->count = 0; + deferred_slab->error_code = PTO2_ERROR_NONE; + AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab); + build_payload(payload, slot_state, subslot, async_ctx, block_idx); + + if (to_pending) { + core_exec_state.pending_subslot = subslot; + core_exec_state.pending_slot_state = &slot_state; + core_exec_state.pending_reg_task_id = static_cast(reg_task_id); + } else { + core_exec_state.running_subslot = subslot; + core_exec_state.running_slot_state = &slot_state; + core_exec_state.running_reg_task_id = 
static_cast(reg_task_id); + tracker.change_core_state(core_offset); + } + + LOG_DEBUG( + "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to" + " core_offset=%d core_id=%d reg_task_id=%u", + thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot), + static_cast(slot_state.task->task_id.raw), slot_state.task->kernel_id[0], + slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num, + core_offset, core_id, reg_task_id + ); + + // AICore buffer rotation lives on the dispatch path: count this dispatch + // and rotate before write_reg when we're about to cross a BUFFER_SIZE + // boundary. The completion-before-dispatch invariant makes this race-free. +#if PTO2_PROFILING + if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) { + l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx); + } +#endif + + // Publish task data (slot_state / args writes done above) before AICore + // can observe the dispatched task_id. ARM64 needs an explicit store-store + // fence across Normal-cacheable -> Device-nGnRnE; the old write_reg() + // helper provided this implicitly via __sync_synchronize. + wmb(); + + // Capture dispatch timestamp at the latest possible moment — after wmb, + // immediately before the DATA_MAIN_BASE write. 
+#if PTO2_PROFILING + if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) { + uint64_t dispatch_ts = get_sys_cnt_aicpu(); + if (to_pending) { + core_exec_state.pending_dispatch_timestamp = dispatch_ts; + } else { + core_exec_state.running_dispatch_timestamp = dispatch_ts; + } + } +#endif + + write_reg(core_exec_state.reg_addr, RegId::DATA_MAIN_BASE, static_cast(reg_task_id)); + tracker.set_pending_occupied(core_offset); +} + +/* + * Dispatch one block of a MIX task to the cluster at cluster_offset. + * For every subtask bit set in the task's core mask (checked in the order + * AIC, AIV0, AIV1) the subtask is dispatched to the cluster's matching core; + * all of them share the same to_pending flag and block_idx. + */ +void SchedulerContext::dispatch_mix_block_to_cluster( + Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state, bool to_pending, + int32_t block_idx +) { + CoreTracker &tracker = core_trackers_[thread_idx]; + uint8_t cmask = slot_state.active_mask.core_mask(); + if (cmask & PTO2_SUBTASK_MASK_AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aic_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIC, + to_pending, block_idx + ); + } + if (cmask & PTO2_SUBTASK_MASK_AIV0) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv0_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV0, + to_pending, block_idx + ); + } + if (cmask & PTO2_SUBTASK_MASK_AIV1) { + dispatch_subtask_to_core( + runtime, thread_idx, tracker.get_aiv1_core_offset(cluster_offset), slot_state, PTO2SubtaskSlot::AIV1, + to_pending, block_idx + ); + } +} + +/* + * Dispatch block block_idx of slot_state for the given shape. + * For MIX, core_offset is passed on as a cluster offset and the block fans + * out through dispatch_mix_block_to_cluster(); AIC goes to the AIC subslot + * and every other shape to the AIV0 subslot of the core at core_offset. + * With PTO2_PROFILING the task arguments are dumped first when dumping is + * enabled, and the per-thread dispatch counter is advanced afterwards. + */ +void SchedulerContext::dispatch_block( + Runtime *runtime, int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, + bool to_pending, int32_t block_idx +) { +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + dump_args_for_task( + thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH, + [](ActiveMask active_mask, int raw_subtask_id) { + return active_mask.subtask_active(static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif + if (shape == PTO2ResourceShape::MIX) { + dispatch_mix_block_to_cluster(runtime, thread_idx, 
core_offset, slot_state, to_pending, block_idx); + } else if (shape == PTO2ResourceShape::AIC) { + dispatch_subtask_to_core( + runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx + ); + } else { + dispatch_subtask_to_core( + runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx + ); + } +#if PTO2_PROFILING + sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(slot_state.active_mask.core_mask()); +#endif +} + +/* + * Dispatch ready tasks of one shape for one phase (IDLE or PENDING) onto the + * cores of `tracker`. No-op when entered_drain is already set or no core + * (cluster, for MIX) is dispatchable. + * + * Each round pops a batch sized to the dispatchable cores, then per task: + * claims a range of blocks, pushes the task back to the shape's ready queue + * if blocks remain unclaimed, and dispatches the claimed blocks. Tasks that + * cannot be placed are pushed back to the ready queue. A sync_start task is + * never dispatched in the PENDING phase; if it cannot get all of its blocks + * at once, drain mode is requested, the rest of the batch is re-queued and + * entered_drain is set. + * + * made_progress and try_pushed are set once a task has been claimed for + * dispatch; they are never cleared here. + */ +void SchedulerContext::dispatch_shape( + Runtime *runtime, int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, + PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed +) { +#if PTO2_SCHED_PROFILING + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; +#endif + if (entered_drain) return; + + bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING); + bool is_mix = (shape == PTO2ResourceShape::MIX); + auto cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase); + if (!cores.has_value()) return; + + while (cores.has_value() && !entered_drain) { + int want = cores.count(); + PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3]; + int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want); + if (got == 0) break; + + bool dispatched_any = false; + for (int bi = 0; bi < got; bi++) { + PTO2TaskSlotState *slot_state = batch[bi]; + CoreTracker::BitStates selected_mix_clusters(0ULL); + + if (is_mix) { + auto candidates = cores; + uint8_t cmask = slot_state->active_mask.core_mask(); + auto wanted = is_pending ? 
CoreTracker::MixPlacement::PENDING : CoreTracker::MixPlacement::RUNNING; + while (candidates.has_value()) { + int32_t cluster_offset = candidates.pop_first(); + if (tracker.classify_mix_cluster(cluster_offset, cmask) == wanted) { + selected_mix_clusters |= CoreTracker::BitStates(1ULL << cluster_offset); + } + } + if (!selected_mix_clusters.has_value()) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + continue; + } + } + + if (slot_state->active_mask.requires_sync_start()) { + if (is_pending) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + continue; + } + int32_t available = is_mix ? selected_mix_clusters.count() : cores.count(); + if (available < slot_state->logical_block_num) { + if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + } + for (int rem = bi + 1; rem < got; rem++) { + sched_->ready_queues[static_cast(shape)].push(batch[rem]); + } + entered_drain = true; + break; + } + } + + if (!cores.has_value()) { + sched_->ready_queues[static_cast(shape)].push_batch(&batch[bi], got - bi); + break; + } + + dispatched_any = true; + try_pushed = true; +#if PTO2_SCHED_PROFILING + uint64_t t_setup_start = get_sys_cnt_aicpu(); +#endif + // Claim a contiguous range of blocks, hand the slot back to the + // ready queue immediately, then perform the expensive dispatches. + // This lets other schedulers concurrently claim and dispatch the + // remaining blocks of the same SPMD task instead of spinning while + // this thread fills all its own cores. Only local `start + b` is + // read after the push -- `next_block_idx` may already be advanced + // by another scheduler that popped the slot. + int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx; + int32_t available = is_mix ? 
selected_mix_clusters.count() : cores.count(); + int32_t claim = std::min(available, remaining); + int32_t start = slot_state->next_block_idx; + slot_state->next_block_idx += claim; + + if (slot_state->next_block_idx < slot_state->logical_block_num) { + sched_->ready_queues[static_cast(shape)].push(slot_state); + } + + for (int32_t b = 0; b < claim; b++) { + auto core_offset = is_mix ? selected_mix_clusters.pop_first() : cores.pop_first(); + if (is_mix) { + cores.clear_bit(core_offset); + } + dispatch_block(runtime, thread_idx, core_offset, *slot_state, shape, is_pending, start + b); + } + made_progress = true; +#if PTO2_SCHED_PROFILING + l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start); +#endif + } + + if (!dispatched_any) break; + + if (!cores.has_value()) { + cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase); + } + } +} + +void SchedulerContext::dispatch_ready_tasks( + Runtime *runtime, int32_t thread_idx, CoreTracker &tracker, + PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress, bool &try_pushed +) { + using Phase = CoreTracker::DispatchPhase; + constexpr int32_t MIX_I = static_cast(PTO2ResourceShape::MIX); + + // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle + // through this 2-elem array, with order toggled by thread parity for + // shape-level load balancing across threads. 
+ static constexpr PTO2ResourceShape kAicAivOrder[2][2] = { + {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV}, + {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC}, + }; + const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1]; + + auto flush_local_bufs = [&]() { + for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) { + auto &lb = local_bufs[s]; + if (lb.count > 0) { + sched_->ready_queues[s].push_batch(lb.slot_states, lb.count); + lb.count = 0; + } + } + }; + // Every return path below must flush; wrap in RAII so we cannot forget. + // The mid-function flush between IDLE and PENDING is still called + // explicitly — guard only covers exit. + struct FlushGuard { + decltype(flush_local_bufs) &flush_fn; + ~FlushGuard() { flush_fn(); } + } flush_guard{flush_local_bufs}; + + bool entered_drain = false; + + // ===== IDLE stage ===== + dispatch_shape( + runtime, thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, + made_progress, try_pushed + ); + if (entered_drain) return; + + // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass. + // MIX-PENDING below still runs — that is the core of "mix strict priority": + // pending slots are spent on mix before AIC/AIV get any chance. + bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]); + + if (!skip_aic_aiv) { + for (int i = 0; i < 2; i++) { + PTO2ResourceShape s = aic_aiv[i]; + dispatch_shape( + runtime, thread_idx, s, Phase::IDLE, local_bufs[static_cast(s)], tracker, entered_drain, + made_progress, try_pushed + ); + if (entered_drain) return; + } + } + + // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any + // peer-thread reads see the IDLE-stage release_fanin output. 
+ flush_local_bufs(); + + if (pmu_active) return; + + // ===== PENDING stage ===== + // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that + // peer's next IDLE-MIX iteration will pull the mix task from the global + // queue (already flushed above) at lower latency than us pre-loading a + // pending slot here. Forward progress for MIX is preserved: at least one + // thread will run MIX-IDLE next pass and consume the residual. + // + // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain + // via pending slots on this thread when no peer is idle. + if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) { + dispatch_shape( + runtime, thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain, + made_progress, try_pushed + ); + if (entered_drain) return; + } + + // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave + // it set; otherwise, escalate iff PENDING-MIX left residual. + if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) { + skip_aic_aiv = true; + } + + // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin + // during in-flight completions; flush_guard ensures these don't carry + // across to the next iteration's IDLE stage. + if (skip_aic_aiv) return; + + // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer + // will pull from the global queue on its next IDLE pass. 
+ for (int i = 0; i < 2; i++) { + PTO2ResourceShape s = aic_aiv[i]; + if (has_idle_in_other_threads(thread_idx, s)) continue; + dispatch_shape( + runtime, thread_idx, s, Phase::PENDING, local_bufs[static_cast(s)], tracker, entered_drain, + made_progress, try_pushed + ); + if (entered_drain) return; + } +} + +// ============================================================================= +// Main scheduler dispatch loop +// ============================================================================= + +int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) { + CoreTracker &tracker = core_trackers_[thread_idx]; + LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx); + + PTO2SharedMemoryHeader *header = sched_->sm_header; + if (!header) { + LOG_ERROR("PTO2 dispatch: header is null"); + return -1; + } + LOG_INFO_V0( + "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast(header), + static_cast(header->rings[0].task_descriptors_offset), + static_cast(header->rings[0].task_window_size) + ); + + Handshake *hank = static_cast(runtime->workers); + LOG_INFO_V0( + "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast(hank), + static_cast(header->rings[0].task_window_size) + ); + + // One-time init: assign perf buffers (one thread does it; others wait). + // l2_swimlane_aicpu_init / l2_swimlane_aicpu_init_phase already ran eagerly in + // SchedulerContext::init() so the orchestrator thread can read the + // promoted g_l2_swimlane_level before caching it on rt->orchestrator. Only + // dump_tensor / pmu init remain dispatch-time because they depend on + // handshake-derived core IDs / counts. + if (!init_claimed_.exchange(true, std::memory_order_acq_rel)) { + LOG_INFO_V0("Thread %d: doing one-time init", thread_idx); + +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + dump_args_init(orch_to_sched_ ? 
aicpu_thread_num_ : sched_thread_num_); + } + if (is_pmu_enabled()) { + pmu_aicpu_init(physical_core_ids_, cores_total_num_); + LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_); + } +#endif + + LOG_INFO_V0("Thread %d: one-time init done", thread_idx); + init_complete_.store(true, std::memory_order_release); + } else { + while (!init_complete_.load(std::memory_order_acquire)) { + SPIN_WAIT_HINT(); + } + } + + LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, tracker.core_num()); + int32_t cur_thread_completed = 0; + // Non-zero once a scheduler-hang timeout latches; returned in place of the + // completed count so the caller still sees the negative error rc while the + // shared end-of-loop flush below runs. + int32_t timeout_rc = 0; + int32_t idle_iterations = 0; + int32_t last_progress_count = 0; +#if PTO2_PROFILING + auto &l2_swimlane = sched_l2_swimlane_[thread_idx]; + l2_swimlane.reset(); + l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED); +#endif + + constexpr int LOCAL_READY_CAP_PER_TYPE = 64; + PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE]; + PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES]; + for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) { + local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE); + } + PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP]; + int32_t deferred_release_count = 0; + + bool cores_released = false; + +#if PTO2_PROFILING + l2_swimlane.sched_start_ts = get_sys_cnt_aicpu(); +#endif + +#if PTO2_PROFILING + // Queue-depth snapshot carried across the iteration boundary: each phase + // emit consumes (phase_start_*) and refreshes them with its own end snapshot + // so the next phase's "at_start" equals the previous phase's "at_end". + // + // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX. 
+ // + // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer) + // is a single int read on a register-cached stack — free. Shared depth + // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines + // that all peer sched threads also write to (enqueue_pos and dequeue_pos + // bounce on every flush_local_bufs + every pop). With both phases emitting + // per iter that's 12 cross-core loads × thousands of iters per run, a + // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared + // snapshot, refreshed at most once per iteration. The complete-emit and + // dispatch-emit in the same iter both reuse the same shared sample; the + // big transitions (local→shared flush) still show up across iter boundaries. + static_assert( + L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES, + "queue snapshot width must match runtime resource shape count" + ); + int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0}; + int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0}; + int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0}; + bool iter_shared_sampled = false; + auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) { + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + local_out[s] = static_cast(local_bufs[s].count); + } + }; + auto get_or_sample_shared = [&]() -> const int16_t * { + if (!iter_shared_sampled) { + // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE + // is in the low thousands today but could grow with platform + // scaling — without clamp, sizes above 32767 wrap to negatives + // and silently corrupt the snapshot. 
+ constexpr size_t kMax = static_cast(std::numeric_limits::max()); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + const size_t qsize = sched_->ready_queues[s].size(); + iter_shared_snapshot[s] = static_cast(std::min(qsize, kMax)); + } + iter_shared_sampled = true; + } + return iter_shared_snapshot; + }; + auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES], + int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) { + capture_local_snapshot(local_out); + const int16_t *shared_cached = get_or_sample_shared(); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) + shared_out[s] = shared_cached[s]; + }; + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + capture_phase_end(phase_start_local, phase_start_shared); + } +#endif + + // Wall-clock timestamp of the last completed task on this thread. + // Updated on made_progress; consulted to decide whether the wall-clock + // budget for declaring a scheduler hang has elapsed. Initialized to + // "now" so the first budget cycle starts when this thread does, not at + // an undefined value. + uint64_t last_progress_ts = get_sys_cnt_aicpu(); + + while (true) { + if (completed_.load(std::memory_order_acquire)) { + break; + } + bool made_progress = false; +#if PTO2_PROFILING + CYCLE_COUNT_START(); + l2_swimlane.sched_loop_count++; + uint64_t _t0_phase = _t0; + // Per-iter lazy shared-queue snapshot: first phase emit in this iter + // pays the atomic-load cost, subsequent emits in the same iter reuse + // the cached value. Reset here so we re-sample exactly once per iter + // (or skip entirely on iters with no phase emit). 
+ iter_shared_sampled = false; +#endif + int32_t task_count = 0; + if (!tracker.has_any_running_cores()) { + LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count); + if (action == LoopAction::BREAK_LOOP) break; + } + + if (!cores_released && orch_to_sched_) { + LoopAction action = handle_core_transition(cores_released); + if (action == LoopAction::BREAK_LOOP) break; + } + +#if PTO2_PROFILING + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); +#endif + + // Phase 1: Check running cores for completion + int32_t completed_this_turn = 0; + + bool try_completed = tracker.has_any_running_cores(); + if (try_completed) { + check_running_cores_for_completion( + thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, + deferred_release_slot_states, deferred_release_count, local_bufs + ); + } + if (completed_this_turn > 0) { +#if PTO2_SCHED_PROFILING + sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed); +#endif + int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed); + int32_t new_total = prev + completed_this_turn; + last_progress_count = new_total; + if (thread_idx == 0 && task_count > 0) { + if (new_total <= PROGRESS_VERBOSE_THRESHOLD || + new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) { + LOG_INFO_V9( + "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count, + 100.0 * new_total / task_count + ); + } + } + } + + if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && + (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) { + AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete( + rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count, + PTO2_DEFERRED_RELEASE_CAP +#if PTO2_SCHED_PROFILING + , + thread_idx +#endif + ); + if (poll_result.error_code != PTO2_ERROR_NONE) { + int32_t expected = PTO2_ERROR_NONE; + 
header->sched_error_code.compare_exchange_strong( + expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire + ); + completed_.store(true, std::memory_order_release); + break; + } + if (poll_result.completed > 0) { +#if PTO2_SCHED_PROFILING + sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed); +#endif + int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed); + int32_t new_total = prev + poll_result.completed; + last_progress_count = new_total; + made_progress = true; + } + } + +#if PTO2_PROFILING + if (!try_completed) { + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); + } else { + CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_complete_count > 0) { + // Local depth is cheap (this thread's own buffer counter). + // Shared depth is NOT sampled here: complete's release_fanin + // pushes to local_bufs in the fast path (try_push succeeds + // until cap=64). Shared only changes on dispatch's flush + // path. Carrying phase_start_shared forward as end_shared + // is the right answer 99% of the time AND skips three + // contended atomic loads per emit. 
+ int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_local_snapshot(phase_end_local); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_complete_count, /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local, + phase_start_shared, phase_end_local, phase_start_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + // phase_start_shared unchanged — carried forward + } + _t0_phase = _t1; + l2_swimlane.phase_complete_count = 0; + } + } +#endif + + bool try_pushed = false; + + // Phase 2 drain check + if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) { + handle_drain_mode(runtime, thread_idx); + continue; + } + + // Phase 3: Drain wiring queue (thread 0 only) + int wired = 0; + if (thread_idx == 0) { + wired = sched_->drain_wiring_queue(orchestrator_done_); + if (wired > 0) { + made_progress = true; +#if PTO2_SCHED_PROFILING + l2_swimlane.phase_wiring_count += wired; +#endif + } + } +#if PTO2_PROFILING + CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle); + // Wire outer phase: emit one bar covering this iter's drain_wiring_queue + // pass when it wired any tasks. tasks_processed = wired count. + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && wired > 0) { + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_local_snapshot(phase_end_local); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Wire, _t0_phase, _t1, l2_swimlane.sched_loop_count, + static_cast(wired), /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local, phase_start_shared, + phase_end_local, phase_start_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + } + _t0_phase = _t1; + } +#endif + + // Phase 3b: Drain dummy ready queue (thread 0 only). 
+ // + // Dependency-only tasks bypass AICore dispatch: they go through the + // scheduler so fanin/fanout edges stay consistent, but completion is + // signalled inline here. Pinned to thread 0 to avoid cross-thread + // races and to keep cache hot near the wiring drain above. + if (thread_idx == 0) { + constexpr int DUMMY_DRAIN_BATCH = 16; + PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH]; + int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH); +#if PTO2_PROFILING + // Dummy outer phase: covers handling of all dummies popped this + // iter. tasks_processed = dummy_got. + uint64_t dummy_outer_t0 = + (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0; +#endif + for (int di = 0; di < dummy_got; di++) { + PTO2TaskSlotState &dummy_slot = *dummy_batch[di]; +#if PTO2_SCHED_PROFILING + sched_->on_task_complete(dummy_slot, thread_idx, local_bufs); +#else + sched_->on_task_complete(dummy_slot, local_bufs); +#endif + // Dummy tasks have no subtasks to retire and no fanout pre-conditions + // beyond their own producers; release self-reference so the slot can + // reach CONSUMED once all consumers drain. 
+ deferred_release_slot_states[deferred_release_count++] = &dummy_slot; + if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) { + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sched_->on_task_release( + *deferred_release_slot_states[--deferred_release_count], thread_idx + ); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + } + int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed); + last_progress_count = prev + 1; + cur_thread_completed++; + } + if (dummy_got > 0) { + made_progress = true; + } +#if PTO2_PROFILING + if (dummy_outer_t0 != 0) { + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_local_snapshot(phase_end_local); + uint64_t dummy_outer_t1 = get_sys_cnt_aicpu(); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Dummy, dummy_outer_t0, dummy_outer_t1, + l2_swimlane.sched_loop_count, static_cast(dummy_got), /*pop_hit=*/0, + /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + } + _t0_phase = dummy_outer_t1; + } +#endif + } + + // Phase 4: MIX-strict-priority dispatch with phase-split and + // cross-thread idle gating. See dispatch_ready_tasks for the policy. + const bool pmu_active = is_pmu_enabled(); + dispatch_ready_tasks(runtime, thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed); + +#if PTO2_PROFILING + if (!try_pushed) { + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); + } else { + CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) { + // Final-drain at loop end emits the trailing-idle tail so + // sum-of-deltas == run-cumulative. 
+ uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; + // L2SwimlaneAicpuPhaseRecord's extras are uint32 — a delta that overflows means + // an emit was missed for ~4 billion pops, which is well outside any + // realistic dispatch cadence and silently truncates without this guard. + debug_assert(pop_hit_delta < (1ULL << 32)); + debug_assert(pop_miss_delta < (1ULL << 32)); + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_phase_end(phase_end_local, phase_end_shared); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, _t1, l2_swimlane.sched_loop_count, + l2_swimlane.phase_dispatch_count, static_cast(pop_hit_delta), + static_cast(pop_miss_delta), phase_start_local, phase_start_shared, phase_end_local, + phase_end_shared + ); + for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) { + phase_start_local[s] = phase_end_local[s]; + phase_start_shared[s] = phase_end_shared[s]; + } + _t0_phase = _t1; + l2_swimlane.phase_dispatch_count = 0; + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; + } + } +#endif + +#if !PTO2_PROFILING + (void)try_completed; + (void)try_pushed; +#endif + + if (made_progress) { + idle_iterations = 0; + last_progress_ts = get_sys_cnt_aicpu(); + } else { + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + idle_iterations++; + + if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) { + LoopAction action = check_idle_fatal_error(thread_idx, header, runtime); + if (action == LoopAction::BREAK_LOOP) break; + } + + if (idle_iterations % 
STALL_LOG_INTERVAL == 0) { + log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count); + } + // Wall-clock budget gate, with two fatal-latch branches: + // + // 1. Self owns a RUNNING task — first-hand evidence the + // dispatch is stuck. Latch. + // 2. No thread anywhere owns a RUNNING task AND tasks remain + // unfinished — the system is in a pre-dispatch / WAIT-only + // deadlock (e.g. dependency cycle). Ownerless idle threads + // are the only observers; let this one latch on the global + // evidence (`completed_tasks_ < total_tasks_` and + // `no_thread_owns_running_task()`). + // + // Otherwise: a sibling thread owns a RUNNING task but hasn't + // hit its own budget yet (typical distributed startup-skew + // case) — refresh last_progress_ts and keep spinning. The + // STALL diagnostic above still fires periodically so + // observability is preserved. + if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) { + bool self_owns = self_owns_running_task(thread_idx); + bool global_stuck = !self_owns && total_tasks_ > 0 && + completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && + no_thread_owns_running_task(); + if (self_owns || global_stuck) { + // Latch the error + emergency_shutdown, then break to the + // shared end-of-loop cleanup so the diagnostic buffers get + // flushed to the host. An early return here would strand the + // stuck task's already-dumped inputs and every completed + // task's in/out records in the unflushed per-thread dump + // buffer — exactly the state we need to triage the hang. 
+ timeout_rc = handle_timeout_exit( + thread_idx, header, runtime, idle_iterations, last_progress_count +#if PTO2_PROFILING + , + l2_swimlane.sched_start_ts +#endif + ); + break; + } + last_progress_ts = get_sys_cnt_aicpu(); + } + SPIN_WAIT_HINT(); +#if PTO2_PROFILING + CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle); + // No Idle sched-phase record is emitted (only Complete/Wire/Dummy/Dispatch); idle gaps + // are reconstructed at post-process time from sched record spacing. + (void)_t0_phase; +#endif + } + } + + // Drain any entries left in the deferred-release batch. The in-loop flush + // only fires on idle iterations and on buffer-full; a loop exit while the + // last iteration made progress can leave entries un-released. Drop them + // here so every consumed producer slot completes its on_task_release + // regardless of which loop-exit path fired. + while (deferred_release_count > 0) { +#if PTO2_SCHED_PROFILING + (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx); +#else + sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]); +#endif + } + +#if PTO2_PROFILING + // Final-drain: emit any pop_hit / pop_miss accrued since the last + // dispatch emit (typically the trailing idle loops while waiting for + // orchestrator_done_) as a zero-duration synthetic dispatch record so + // sum(record.pop_*) reconciles with the run-cumulative counter. + // Gate on SCHED_PHASES — at lower levels the phase buffer is never + // flushed (see below), so writing this record would be wasted work. 
+ if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit; + uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit; + debug_assert(final_pop_hit_delta < (1ULL << 32)); + debug_assert(final_pop_miss_delta < (1ULL << 32)); + if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) { + uint64_t t_now = get_sys_cnt_aicpu(); + int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES]; + int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES]; + capture_phase_end(phase_end_local, phase_end_shared); + l2_swimlane_aicpu_record_sched_phase( + thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0, + static_cast(final_pop_hit_delta), static_cast(final_pop_miss_delta), + phase_end_local, phase_end_shared, phase_end_local, phase_end_shared + ); + l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit; + l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss; + } + } + log_l2_swimlane_summary(thread_idx, cur_thread_completed); +#endif + +#if PTO2_PROFILING + if (l2_swimlane.l2_swimlane_enabled) { + l2_swimlane_aicpu_flush( + thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num() + ); + if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { + l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx); + } + } +#endif +#if PTO2_PROFILING + if (is_dump_args_enabled()) { + dump_args_flush(thread_idx); + } +#endif +#if PTO2_PROFILING + if (is_pmu_enabled()) { + pmu_aicpu_flush_buffers( + thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num() + ); + } +#endif + + return timeout_rc != 0 ? 
timeout_rc : cur_thread_completed; +} diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h new file mode 100644 index 000000000..c23a547af --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/scheduler/scheduler_types.h @@ -0,0 +1,464 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +#ifndef SCHEDULER_TYPES_H +#define SCHEDULER_TYPES_H + +#include +#include + +#include "common/core_type.h" +#include "common/platform_config.h" +#include "pto2_dispatch_payload.h" +#include "pto_runtime2_types.h" +#include "spin_hint.h" + +// ============================================================================= +// Profiling macros (compile-time gated) +// ============================================================================= + +#if PTO2_PROFILING +#include "aicpu/device_time.h" +// Accumulate raw get_sys_cnt_aicpu() tick deltas per sub-step (counter units, not converted to ns) +#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1 +#define CYCLE_COUNT_LAP(acc) \ + do { \ + _t1 = get_sys_cnt_aicpu(); \ + acc += (_t1 - _t0); \ + _t0 = _t1; \ + } while (0) +#else +#define CYCLE_COUNT_START() +#define CYCLE_COUNT_LAP(acc) +#endif + +// ============================================================================= +// Scheduler constants +// ============================================================================= + +constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; + +// Periodic cadence (in idle iterations) for emitting the per-thread STALL +// diagnostic while no progress is being made. Purely an observability knob, +// independent of the wall-clock timeout below: small enough to fire a few times +// before the budget expires, large enough not to flood device_log. +constexpr int32_t STALL_LOG_INTERVAL = 480000; +constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024; // Check orchestrator error every N idle iters + +// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces +// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS +// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread +// diagnostic cadence. 
+// +// Using wall-clock here is load-bearing for distributed runs: with per-thread +// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in +// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the +// same iteration count. The fast spinner racing ahead and latching fatal +// kills the slower-but-correct poller mid-poll — see the distributed +// startup-skew scenario in issue #897. +// +// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h) +// because the safe value differs per variant: onboard trims it to 2 s so the +// AICPU detects a hang and flushes its diagnostics (tensor dump, in-flight +// partial output) before STARS reaps the op and poisons the context (chain: +// this < op-exec < host stream-sync, platform_config.h); sim has no STARS to +// race and keeps the full 5 s #897 headroom. See spin_hint.h for the per-variant +// rationale. +constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS; +constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES = + static_cast(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000); +constexpr int32_t STALL_DUMP_READY_MAX = 8; +constexpr int32_t STALL_DUMP_WAIT_MAX = 4; +constexpr int32_t STALL_DUMP_CORE_MAX = 8; +constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks +constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold + +// ============================================================================= +// Control flow signal from cold-path helpers back to the main dispatch loop. 
+// ============================================================================= + +enum class LoopAction : int8_t { + NONE, // cold path did not trigger; proceed normally + BREAK_LOOP, // equivalent to 'break' from the while(true) loop +}; + +// ============================================================================= +// Per-core state: one cache line per core to eliminate false sharing +// and co-locate all hot-path fields for minimal cache misses. +// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup). +// ============================================================================= + +struct alignas(64) CoreExecState { + // --- Hot fields (completion + dispatch, every iteration) --- + uint64_t reg_addr; // offset 0: register base address (set once in handshake) + PTO2TaskSlotState *running_slot_state; // offset 8: slot state for running task (nullptr = empty) + PTO2TaskSlotState *pending_slot_state; // offset 16: slot state for pending task (nullptr = empty) + int32_t running_reg_task_id; // offset 24: register task ID (AICPU_TASK_INVALID = idle) + int32_t pending_reg_task_id; // offset 28: pending register task ID (AICPU_TASK_INVALID = none) + uint32_t dispatch_seq; // offset 32: monotonic dispatch counter + PTO2SubtaskSlot running_subslot; // offset 36: which subtask slot is running + PTO2SubtaskSlot pending_subslot; // offset 37: which subtask slot is pending + uint8_t pad0_[2]; // offset 38: alignment padding + // Precomputed COND register pointer; resolved once in handshake so the + // hot completion poll does a single volatile load instead of recomputing + // reg_base + reg_offset(COND) on every iteration. 
+ volatile uint32_t *cond_ptr; // offset 40: precomputed pointer to COND register +#if PTO2_PROFILING + // --- Profiling fields (dispatch path, compile-time gated) --- + uint64_t running_dispatch_timestamp; // offset 48: AICPU dispatch timestamp for running task + uint64_t pending_dispatch_timestamp; // offset 56: AICPU dispatch timestamp for pending task +#else + // --- Cold fields (init/diagnostics only, never in hot path) --- + int32_t worker_id; // offset 48: index in runtime.workers[] + uint32_t physical_core_id; // offset 52: hardware physical core ID + CoreType core_type; // offset 56: AIC or AIV (enum class : int32_t) + uint8_t pad2_[4]; // offset 60: pad to 64 bytes +#endif +}; +static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line"); + +// ============================================================================= +// CoreTracker: cluster-based bitmask tracker for idle/running core state. +// +// core_states_ encodes per-cluster core idle/running in 3 bits per cluster: +// bit i*3 = AIC of cluster i (1 = idle, 0 = running) +// bit i*3+1 = AIV0 of cluster i +// bit i*3+2 = AIV1 of cluster i +// Max 21 clusters per tracker (63 bits in uint64_t). 
+// =============================================================================
+
+class alignas(64) CoreTracker {
+public:
+    static constexpr int32_t MAX_CORE_PER_THREAD = 63;  // fixed capacity: one state bit per core in a uint64_t
+    static constexpr int32_t MAX_CLUSTERS = MAX_CORE_PER_THREAD / 3;  // 3 cores (AIC, AIV0, AIV1) per cluster
+
+public:
+    CoreTracker() = default;
+
+    class BitStates {  // thin value wrapper around a 64-bit mask
+    public:
+        BitStates() = default;
+
+        explicit BitStates(uint64_t states) :
+            states_(states) {}
+        void init() { states_ = 0; }
+
+        BitStates operator~() const { return BitStates(~states_); }
+        BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); }
+        BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); }
+        BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); }
+        BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); }
+        BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); }
+        void operator&=(const BitStates &other) { states_ &= other.states_; }
+        void operator|=(const BitStates &other) { states_ |= other.states_; }
+        void operator^=(const BitStates &other) { states_ ^= other.states_; }
+
+        bool has_value() const { return states_ > 0; }  // true when at least one bit is set
+        int32_t count() const { return __builtin_popcountll(states_); }  // number of set bits
+        void clear_bit(int32_t offset) { states_ &= ~(1ULL << offset); }
+
+        // Extract the lowest set bit from mask, clear it, and return its position.
+        // Returns -1 if mask is empty.
+        int32_t pop_first() {
+            if (states_ == 0) return -1;  // guard: __builtin_ctzll(0) is undefined
+            int32_t pos = __builtin_ctzll(states_);
+            states_ &= states_ - 1;  // clears the lowest set bit
+            return pos;
+        }
+
+    private:
+        uint64_t states_{0};
+    };
+
+public:
+    void init(int32_t cluster_count) {  // callers must keep cluster_count <= MAX_CLUSTERS (shifts below go up to i * 3 + 2)
+        cluster_count_ = cluster_count;
+        aic_mask_.init();
+        aiv_mask_.init();
+        pending_occupied_.init();
+        for (int32_t i = 0; i < cluster_count; i++) {
+            aic_mask_ |= BitStates(1ULL << (i * 3));  // bit i*3: AIC of cluster i
+            aiv_mask_ |= BitStates(6ULL << (i * 3));  // 0b110: AIV0 and AIV1 of cluster i
+        }
+        core_states_ = aic_mask_ | aiv_mask_;  // every tracked core starts idle (bit = 1)
+    }
+
+    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) {
+        core_id_map_[cluster_idx * 3] = aic_wid;
+        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
+        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
+    }
+
+    int32_t get_cluster_count() const { return cluster_count_; }
+
+    // --- Running core queries ---
+
+    template <CoreType CT>
+    bool has_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).has_value();
+        } else {
+            return ((~core_states_) & aiv_mask_).has_value();
+        }
+    }
+
+    bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); }
+
+    template <CoreType CT>
+    int32_t get_running_count() const {
+        if constexpr (CT == CoreType::AIC) {
+            return ((~core_states_) & aic_mask_).count();
+        } else {
+            return ((~core_states_) & aiv_mask_).count();
+        }
+    }
+
+    // Return an opaque bitmask for iterating running cores of a given type.
+    // Use pop_first() to extract core bit offsets one at a time.
+    template <CoreType CT>
+    BitStates get_running_cores() const {
+        if constexpr (CT == CoreType::AIC) {
+            return (~core_states_) & aic_mask_;
+        } else {
+            return (~core_states_) & aiv_mask_;
+        }
+    }
+
+    BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); }
+    BitStates get_cluster_offset_states() const { return aic_mask_; }  // one bit per cluster, at its AIC position
+
+    // --- Cluster matching ---
+
+    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const {  // results sit at AIC bit positions
+        switch (shape) {
+        case PTO2ResourceShape::AIC:
+            return core_states_ & aic_mask_;
+        case PTO2ResourceShape::AIV:
+            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;  // either AIV of the cluster is idle
+        case PTO2ResourceShape::MIX:
+            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;  // all three cores idle
+        case PTO2ResourceShape::DUMMY:
+            // DUMMY tasks never reach the core-tracker dispatch path; they are
+            // completed inline by resolve_and_dispatch via dummy_ready_queue.
+            return BitStates(0ULL);
+        }
+        return BitStates(0ULL);
+    }
+
+    int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; }
+    int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; }
+    int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; }
+
+    int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; }
+    int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; }
+    int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; }
+
+    bool is_aic_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv0_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv1_core_idle(int32_t cluster_offset) const {
+        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
+    }
+
+    // --- State mutation ---
+
+    // Toggle bit at the given bit offset (running <-> idle)
+    void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); }
+
+    // --- Pending-occupied tracking ---
+    // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK).
+    // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed.
+
+    void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); }
+    void clear_pending_occupied(int32_t bit_offset) {
+        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));  // XOR with the masked bit: clears it only if set
+    }
+
+    // --- Two-phase dispatch queries ---
+
+    // Idle dispatch: returns bit offsets of idle cores for the given shape.
+    // For AIC: 1 bit per cluster (core offset == cluster offset).
+    // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions).
+    // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1)
+    // always have pending_occupied=0, so AIV/MIX need no extra filtering.
+    // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core
+    // would incorrectly block AIV idle dispatch on the same cluster.
+    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::AIC) {
+            return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
+        }
+        if (shape == PTO2ResourceShape::AIV) {
+            return core_states_ & aiv_mask_;
+        }
+        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
+    }
+
+    // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch.
+    // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions).
+    // Runtime MIX dispatch uses classify_mix_cluster() so the decision follows the task's active_mask.
+    enum class MixPlacement : uint8_t { RUNNING, PENDING, REJECT };
+
+    // A MIX block must place all cores named by active_mask the same way:
+    // all idle means running placement, all running means pending placement,
+    // and any mixed state is retried later.
+    MixPlacement classify_mix_cluster(int32_t cluster_offset, uint8_t core_mask) const {
+        BitStates used(0ULL);  // bits of the cores this task would occupy
+        if (core_mask & PTO2_SUBTASK_MASK_AIC) {
+            used |= BitStates(1ULL << cluster_offset);
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV0) {
+            used |= BitStates(1ULL << (cluster_offset + 1));
+        }
+        if (core_mask & PTO2_SUBTASK_MASK_AIV1) {
+            used |= BitStates(1ULL << (cluster_offset + 2));
+        }
+        if (!used.has_value() || (pending_occupied_ & used).has_value()) {
+            return MixPlacement::REJECT;  // empty mask, or a needed core already holds a pending payload
+        }
+
+        BitStates idle = core_states_ & used;
+        if (idle.count() == used.count()) {
+            return MixPlacement::RUNNING;
+        }
+        if (!idle.has_value()) {
+            return MixPlacement::PENDING;
+        }
+        return MixPlacement::REJECT;
+    }
+
+    BitStates get_mix_running_cluster_offset_states(uint8_t core_mask) const {
+        BitStates result(0ULL);
+        BitStates candidates = get_cluster_offset_states();
+        while (candidates.has_value()) {
+            int32_t cluster_offset = candidates.pop_first();
+            if (classify_mix_cluster(cluster_offset, core_mask) == MixPlacement::RUNNING) {
+                result |= BitStates(1ULL << cluster_offset);
+            }
+        }
+        return result;
+    }
+
+    int32_t count_mix_running_clusters(uint8_t core_mask) const {
+        return get_mix_running_cluster_offset_states(core_mask).count();
+    }
+
+    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const {
+        if (shape == PTO2ResourceShape::MIX) {
+            // Shape-level query kept conservative for legacy callers/tests.
+            // The real MIX dispatch path applies active_mask in classify_mix_cluster().
+            // Any core without a pending payload can accept a dispatch (idle or running).
+            BitStates available = ~pending_occupied_;
+            BitStates mix_available =
+                (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
+            // Pending MIX can only reuse a fully-running cluster. Partially-running clusters
+            // could split one MIX block across immediate and pending placement.
+            BitStates running = ~core_states_;
+            BitStates cluster_all_running =
+                (running & aic_mask_) & ((running >> 1) & aic_mask_) & ((running >> 2) & aic_mask_);
+            return mix_available & cluster_all_running;
+        }
+        if (shape == PTO2ResourceShape::AIC) {
+            return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
+        }
+        // AIV
+        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
+    }
+
+    // --- Two-phase dispatch unified query ---
+
+    enum class DispatchPhase : uint8_t { IDLE, PENDING };
+
+    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const {
+        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) :
+                                                get_pending_core_offset_states(shape);
+    }
+
+    // --- Bit offset <-> worker_id mapping ---
+
+    int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; }
+
+    const int32_t *core_ids() const { return core_id_map_; }
+    int32_t core_num() const { return cluster_count_ * 3; }
+
+private:
+    int32_t cluster_count_;
+    BitStates aic_mask_;
+    BitStates aiv_mask_;
+    BitStates core_states_;
+    BitStates pending_occupied_;
+    int32_t core_id_map_[MAX_CORE_PER_THREAD];  // bit_position -> worker_id, max 21 clusters * 3
+};
+
+// =============================================================================
+// SlotTransition: pure event signals from a single register poll.
+// true = event occurred, false = no-op (maintain current state).
+// =============================================================================
+
+struct SlotTransition {
+    bool running_done = false;   // running task completed
+    bool pending_done = false;   // pending task completed
+    bool running_freed = false;  // running slot data should be released
+    bool pending_freed = false;  // pending_occupied can be cleared
+    bool matched = false;        // some case was hit (otherwise skip apply)
+};
+
+// =============================================================================
+// Profiling counters (compile-time gated)
+// =============================================================================
+
+#if PTO2_PROFILING
+struct alignas(64) SchedL2SwimlaneCounters {
+    bool l2_swimlane_enabled{false};
+    uint64_t sched_start_ts{0};
+    uint64_t sched_scan_cycle{0};
+    uint64_t sched_complete_cycle{0};
+    uint64_t sched_dispatch_cycle{0};
+    uint64_t sched_wiring_cycle{0};
+    uint64_t sched_idle_cycle{0};
+    uint64_t sched_loop_count{0};
+    uint32_t phase_complete_count{0};
+    uint32_t phase_dispatch_count{0};
+    // Per-emit delta is (current - *_at_last_emit). Accumulated only when
+    // l2_swimlane_level_ >= SCHED_PHASES.
+    uint64_t pop_hit{0};
+    uint64_t pop_miss{0};
+    uint64_t pop_hit_at_last_emit{0};
+    uint64_t pop_miss_at_last_emit{0};
+#if PTO2_SCHED_PROFILING
+    uint32_t phase_wiring_count{0};
+    uint64_t complete_probe_count{0};
+    uint64_t complete_hit_count{0};
+    uint64_t sched_complete_perf_cycle{0};
+    uint64_t sched_dispatch_pop_cycle{0};
+    uint64_t sched_dispatch_setup_cycle{0};
+#endif  // PTO2_SCHED_PROFILING
+    void reset() { *this = SchedL2SwimlaneCounters{}; }  // restore every field to its default member initializer
+};
+#endif  // PTO2_PROFILING
+
+// =============================================================================
+// sync_start drain coordination
+// =============================================================================
+
+// When sync_start_pending != 0, all scheduler threads skip dispatch
+// (only process completions) until the drain worker finishes launching all blocks.
+struct alignas(64) SyncStartDrainState {
+    std::atomic sync_start_pending{0};    // 0=normal; -1=initializing; >0=active (value=block_num)
+    std::atomic drain_worker_elected{0};  // 0=none; >0: elected thread's (thread_idx+1)
+    std::atomic drain_ack_mask{0};        // bit per thread; all-set = all threads reached ack barrier
+    std::atomic pending_task{nullptr};    // held task (not re-queued)
+    int32_t _pad[10];                     // explicit padding; total size is asserted to be 64 bytes below
+};
+static_assert(sizeof(SyncStartDrainState) == 64);  // must fill exactly one alignas(64) unit
+
+#endif  // SCHEDULER_TYPES_H
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp
new file mode 100644
index 000000000..f98c56cb6
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_runtime2_init.cpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
+ *
+ * Lives under runtime/shared/ so it is included in both the host_runtime.so
+ * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
+ * build (AICPU runs wire_arena_pointers + destroy after attach).
+ * The device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
+ * (ops table, scope/submit/dispatch business logic, profiling) stay in their
+ * original files and the aicpu build only.
+ */
+
+#include
+#include
+
+#include
+
+#include "pto_orchestrator.h"
+#include "pto_runtime2.h"
+#include "pto_ring_buffer.h"
+#include "pto_shared_memory.h"
+#include "pto_tensormap.h"
+#include "scheduler/pto_scheduler.h"
+
+static bool sum_ring_heap_sizes(const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], uint64_t *total) {
+    uint64_t sum = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (heap_sizes[r] > std::numeric_limits::max() - sum) {  // adding heap_sizes[r] would wrap the running sum
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return false;  // *total is left untouched on failure
+        }
+        sum += heap_sizes[r];
+    }
+    *total = sum;
+    return true;
+}
+
+// =============================================================================
+// Ready queue
+// =============================================================================
+
+size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
+    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
+    // first slot cannot false-share with whatever region sits in front of us
+    // (e.g. orchestrator tensormap heads written by the orch thread).
+    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
+}
+
+bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
+    // Address the slots region for data writes without storing the pointer in
+    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;  // NOTE(review): only a valid index mask if capacity is a power of two; not checked here
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+
+    for (uint64_t i = 0; i < capacity; i++) {
+        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);  // slot i starts with sequence == i
+        slots_arena[i].slot_state = nullptr;
+    }
+
+    return true;  // this function has no failure path; it always returns true
+}
+
+void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
+    queue->slots = static_cast(arena.region_ptr(slots_off));
+}
+
+void ready_queue_destroy(PTO2ReadyQueue *queue) {
+    // Arena owns the slots[] buffer; just forget the pointer.
+    queue->slots = nullptr;
+}
+
+// =============================================================================
+// Scheduler
+// =============================================================================
+
+bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
+    // ring stores the device address of the SM ring header — pure offset
+    // arithmetic, no SM load.
+    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+    last_task_alive = 0;
+    advance_lock.store(0, std::memory_order_relaxed);
+#if PTO2_PROFILING
+    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);
+    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
+#endif
+
+    // Per-slot SM-side initialization (bind_ring + reset_for_reuse +
+    // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle::
+    // init_header_per_ring so the AICPU performs it during SM reset; host
+    // prebuilt-arena init skips SM access here.
+
+    return true;
+}
+
+void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
+
+PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        dep_pool_capacities[r] = dep_pool_capacity;  // same capacity for every ring
+    }
+    return reserve_layout(arena, dep_pool_capacities);
+}
+
+PTO2SchedulerLayout
+PTO2SchedulerState::reserve_layout(DeviceArena &arena, const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]) {
+    PTO2SchedulerLayout layout{};
+    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    }
+    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        // Force a cache-line base so writes from scheduler thread 0 (sole
+        // writer of this ring's dep_pool) do not invalidate adjacent
+        // multi-threaded regions like ready_queue.slots.
+        layout.off_dep_pool_entries[r] =
+            arena.reserve(static_cast(dep_pool_capacities[r]) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
+    }
+    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+    return layout;
+}
+
+bool PTO2SchedulerState::init_data_from_layout(
+    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
+) {
+    PTO2SchedulerState *sched = this;
+    sched->sm_header = reinterpret_cast(sm_dev_base);
+#if PTO2_SCHED_PROFILING
+    sched->tasks_completed.store(0, std::memory_order_relaxed);
+    sched->tasks_consumed.store(0, std::memory_order_relaxed);
+#endif
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
+            return false;
+        }
+    }
+
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        if (!ready_queue_init_data_from_layout(
+                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
+            )) {
+            return false;
+        }
+    }
+    if (!ready_queue_init_data_from_layout(
+            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
+        )) {
+        return false;
+    }
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto *dep_entries = static_cast(arena.region_ptr(layout.off_dep_pool_entries[r]));
+        memset(dep_entries, 0, static_cast(layout.dep_pool_capacities[r]) * sizeof(PTO2DepListEntry));
+        sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacities[r], orch_err);
+    }
+
+    if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
+        return false;
+    }
+    sched->wiring.batch_count = 0;
+    sched->wiring.batch_index = 0;
+    sched->wiring.backoff_counter = 0;
+
+    return true;
+}
+
+void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
+    PTO2SchedulerState *sched = this;
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
+    }
+    ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        sched->ring_sched_states[r].dep_pool.base =
+            static_cast(arena.region_ptr(layout.off_dep_pool_entries[r]));
+    }
+    sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
+}
+
+void PTO2SchedulerState::destroy() {  // only drops pointers; nothing is freed in this function
+    PTO2SchedulerState *sched = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        sched->ring_sched_states[r].destroy();
+        sched->ring_sched_states[r].dep_pool.base = nullptr;
+    }
+    sched->wiring.queue.destroy();
+    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
+        ready_queue_destroy(&sched->ready_queues[i]);
+    }
+    ready_queue_destroy(&sched->dummy_ready_queue);
+}
+
+// =============================================================================
+// Orchestrator
+// =============================================================================
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
+) {
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        dep_pool_capacities[r] = dep_pool_capacity;  // same capacity for every ring
+    }
+    return reserve_layout(arena, task_window_sizes, dep_pool_capacities);
+}
+
+PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
+    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2OrchestratorLayout layout{};
+    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        const size_t fanin_pool_bytes =
+            PTO2_ALIGN_UP(static_cast(dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
+        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
+
+        always_assert(task_window_sizes[r] > 0 && (task_window_sizes[r] & (task_window_sizes[r] - 1)) == 0);  // positive power of two
+        const size_t seen_epoch_bytes =
+            PTO2_ALIGN_UP(static_cast(task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE);
+        layout.off_fanin_seen_epoch[r] = arena.reserve(seen_epoch_bytes, PTO2_ALIGN_SIZE);
+    }
+    layout.off_scope_tasks =
+        arena.reserve(static_cast(layout.scope_tasks_cap) * sizeof(uintptr_t), alignof(PTO2TaskSlotState *));
+    layout.off_scope_begins =
+        arena.reserve(static_cast(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
+    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+    return layout;
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
+    uint64_t task_window_size
+) {
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        heap_sizes[r] = heap_size;
+        task_window_sizes[r] = task_window_size;
+    }
+    return init_data_from_layout(layout, arena, sm_dev_base, gm_heap, heap_sizes, task_window_sizes);
+}
+
+bool PTO2OrchestratorState::init_data_from_layout(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap,
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    auto *orch = this;
+    *orch = PTO2OrchestratorState{};  // value-initialize the whole state before populating it
+
+    orch->sm_header = reinterpret_cast(sm_dev_base);
+    orch->gm_heap_base = gm_heap;
+    uint64_t total_heap_size = 0;
+    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
+        return false;
+    }
+    orch->gm_heap_size = total_heap_size;
+    orch->fatal = false;
+
+    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+    uint64_t heap_offset = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        void *ring_heap_base = reinterpret_cast(gm_heap) + heap_offset;
+        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+
+        orch->rings[r].task_allocator.init(
+            task_descs_dev, static_cast(task_window_sizes[r]), cur_idx_dev, last_alive_dev, ring_heap_base,
+            heap_sizes[r], orch_err
+        );
+        heap_offset += heap_sizes[r];  // ring r+1's heap starts right after ring r's
+
+        const size_t fanin_pool_bytes = PTO2_ALIGN_UP(
+            static_cast(layout.dep_pool_capacities[r]) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE
+        );
+        auto *fanin_entries = static_cast(arena.region_ptr(layout.off_fanin_pool[r]));
+        memset(fanin_entries, 0, fanin_pool_bytes);
+        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacities[r], orch_err);
+
+        const size_t seen_epoch_bytes = PTO2_ALIGN_UP(
+            static_cast(layout.tensor_map.task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE
+        );
+        auto *seen_epoch = static_cast(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
+        memset(seen_epoch, 0, seen_epoch_bytes);
+        orch->fanin_seen_epoch[r] = seen_epoch;
+    }
+
+    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
+        return false;
+    }
+
+    orch->scope_tasks_size = 0;
+    orch->scope_tasks_capacity = layout.scope_tasks_cap;
+    orch->scope_stack_top = -1;
+    orch->scope_stack_capacity = layout.scope_stack_capacity;
+    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+    return true;
+}
+
+void PTO2OrchestratorState::wire_arena_pointers(
+    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
+) {
+    auto *orch = this;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        orch->rings[r].fanin_pool.base = static_cast(arena.region_ptr(layout.off_fanin_pool[r]));
+        orch->fanin_seen_epoch[r] = static_cast(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
+    }
+    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+    orch->scope_tasks = static_cast(arena.region_ptr(layout.off_scope_tasks));
+    orch->scope_begins = static_cast(arena.region_ptr(layout.off_scope_begins));
+    orch->scheduler = scheduler_arg;
+}
+
+void PTO2OrchestratorState::destroy() {  // only drops pointers; nothing is freed in this function
+    auto *orch = this;
+    orch->tensor_map.destroy();
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        orch->rings[r].fanin_pool.base = nullptr;
+        orch->fanin_seen_epoch[r] = nullptr;
+    }
+    orch->scope_tasks = nullptr;
+    orch->scope_begins = nullptr;
+}
+
+void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
+
+// =============================================================================
+// Top-level runtime arena
+// =============================================================================
+
+PTO2RuntimeArenaLayout
+runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = 0;  // this scalar overload records zero heap sizes in the layout
+        dep_pool_capacities[r] = dep_pool_capacity;
+    }
+    return runtime_reserve_layout(arena, task_window_sizes, heap_sizes, dep_pool_capacities);
+}
+
+PTO2RuntimeArenaLayout runtime_reserve_layout(
+    DeviceArena &arena, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH], const int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2RuntimeArenaLayout layout{};
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        layout.task_window_sizes[r] = task_window_sizes[r];
+        layout.heap_sizes[r] = heap_sizes[r];
+        layout.dep_pool_capacities[r] = dep_pool_capacities[r];
+    }
+
+    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    int32_t task_window_sizes_i32[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes_i32[r] = static_cast(task_window_sizes[r]);  // NOTE(review): uint64_t copied into int32_t with no range check here
+    }
+    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities);
+    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities);
+    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+
+    layout.arena_size = arena.total_size();  // captured after every reservation above
+    return layout;
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
+) {
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        heap_sizes[r] = heap_size;
+    }
+    return runtime_init_data_from_layout(arena, layout, mode, sm_dev_base, 0, gm_heap_dev_base, heap_sizes);  // sm_size is unnamed/unused in the callee
+}
+
+PTO2Runtime *runtime_init_data_from_layout(
+    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
+    uint64_t /*sm_size*/, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    PTO2Runtime *rt = static_cast(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+
+    auto *sm_wrap = static_cast(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+
+    // rt->ops is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    uint64_t total_heap_size = 0;
+    if (!sum_ring_heap_sizes(heap_sizes, &total_heap_size)) {
+        return nullptr;
+    }
+    rt->gm_heap_size = total_heap_size;
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+
+    if (!rt->orchestrator.init_data_from_layout(
+            layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes, layout.task_window_sizes
+        )) {
+        return nullptr;
+    }
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
+        return nullptr;
+    }
+
+    auto *mailbox = static_cast(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;  // arena-internal pointers are assigned separately by runtime_wire_arena_pointers()
+}
+
+void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
+    rt->sm_handle = static_cast(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
+    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
+    if (!rt) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->aicore_mailbox = nullptr;
+    rt->sm_handle = nullptr;
+}
diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp
new file mode 100644
index 000000000..d704bd85d
--- /dev/null
+++ b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_shared_memory.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details.
+ * You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * PTO Runtime2 - Shared Memory Implementation
+ *
+ * Implements shared memory allocation, initialization, and management for Orchestrator-Scheduler communication.
+ *
+ * Based on: docs/RUNTIME_LOGIC.md
+ */
+
+#include "pto_shared_memory.h"
+#include
+#include
+#include
+#include "common/unified_log.h"
+
+// =============================================================================
+// Size Calculation
+// =============================================================================
+
+uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;  // same window size for every ring
+    }
+    return calculate_size_per_ring(task_window_sizes);
+}
+
+uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    uint64_t size = 0;
+
+    // Header (aligned to cache line)
+    size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+    // Per-ring task descriptors, payloads, and slot states (same order as setup_pointers_per_ring)
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+
+    return size;  // NOTE(review): the multiplications and the sum are not overflow-checked
+}
+
+// =============================================================================
+// Creation and Destruction
+// =============================================================================
+
+void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
+    char *ptr = (char *)sm_base;  // byte cursor advanced through the region section by section
+
+    // Header
+    header = (PTO2SharedMemoryHeader *)ptr;
+    ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+    // Per-ring task descriptors, payloads, and slot states
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        auto &ring = header->rings[r];
+        ring.task_descriptors = (PTO2TaskDescriptor *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+
+        ring.task_payloads = (PTO2TaskPayload *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+
+        ring.slot_states = (PTO2TaskSlotState *)ptr;
+        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+    }
+}
+
+void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;  // same window size for every ring
+    }
+    setup_pointers_per_ring(task_window_sizes);
+}
+
+bool PTO2SharedMemoryHandle::init(
+    void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size
+) {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        task_window_sizes[r] = task_window_size;
+        heap_sizes[r] = heap_size;
+    }
+    return init_per_ring(sm_base_arg, sm_size_arg, task_window_sizes, heap_sizes);  // uniform values fanned out to the per-ring form
+}
+
+bool PTO2SharedMemoryHandle::init_per_ring(
+    void *sm_base_arg, uint64_t sm_size_arg, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH],
+    const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
+) {
+    if (!sm_base_arg || sm_size_arg == 0) return false;
+
if (sm_size_arg < calculate_size_per_ring(task_window_sizes)) return false; + + sm_base = sm_base_arg; + sm_size = sm_size_arg; + is_owner = false; + setup_pointers_per_ring(task_window_sizes); + init_header_per_ring(task_window_sizes, heap_sizes); + return true; +} + +PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) { + const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE); + const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle)); + const size_t off_buffer = arena.reserve(static_cast(buffer_size), PTO2_ALIGN_SIZE); + if (arena.commit() == nullptr) return nullptr; + + auto *handle = static_cast(arena.region_ptr(off_handle)); + memset(handle, 0, sizeof(*handle)); + void *buffer = arena.region_ptr(off_buffer); + memset(buffer, 0, static_cast(buffer_size)); + if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr; + return handle; +} + +void PTO2SharedMemoryHandle::destroy() { + // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release(); + // calling destroy on them is a no-op so existing callers stay safe. 
+ if (is_owner && sm_base) { + free(sm_base); + free(this); + } +} + +// ============================================================================= +// Initialization +// ============================================================================= +// +// no need init data in pool, init pool data when used +void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) { + uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]; + uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_window_sizes[r] = task_window_size; + heap_sizes[r] = heap_size; + } + init_header_per_ring(task_window_sizes, heap_sizes); +} + +void PTO2SharedMemoryHandle::init_header_per_ring( + const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH] +) { + // Per-ring flow control (start at 0) + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + header->rings[r].fc.init(); + } + + header->orchestrator_done.store(0, std::memory_order_relaxed); + + // Per-ring layout info + uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + header->rings[r].task_window_size = task_window_sizes[r]; + header->rings[r].task_window_mask = static_cast(task_window_sizes[r] - 1); + header->rings[r].heap_size = heap_sizes[r]; + header->rings[r].task_descriptors_offset = offset; + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE); + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE); + offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE); + } + + header->total_size = sm_size; + header->graph_output_ptr.store(0, std::memory_order_relaxed); + header->graph_output_size.store(0, std::memory_order_relaxed); + + // Error reporting + header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); + 
header->sched_error_bitmap.store(0, std::memory_order_relaxed); + header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed); + header->sched_error_thread.store(-1, std::memory_order_relaxed); + + // Per-ring slot_states reset. Previously lived in + // PTO2SchedulerState::RingSchedState::init(), but it writes into + // ring->slot_states[] which is SM-side storage — keeping it here lets + // host-side prebuilt-arena init skip all SM dereferences. + // bind_ring() pins the ring_id (slot-invariant after this point); + // reset_for_reuse() prepares dynamic fanout/refcount fields so the first + // submit doesn't need an explicit reset. + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto &ring = header->rings[r]; + for (uint64_t i = 0; i < task_window_sizes[r]; i++) { + ring.slot_states[i].bind_ring(static_cast(r)); + ring.slot_states[i].reset_for_reuse(); + ring.slot_states[i].fanin_count = 0; + ring.slot_states[i].active_mask = ActiveMask{}; + } + } +} + +// ============================================================================= +// Debug Utilities +// ============================================================================= + +void PTO2SharedMemoryHandle::print_layout() { + if (!header) return; + + PTO2SharedMemoryHeader *h = header; + + LOG_INFO_V0("=== PTO2 Shared Memory Layout ==="); + LOG_INFO_V0("Base address: %p", sm_base); + LOG_INFO_V0("Total size: %" PRIu64 " bytes", h->total_size); + LOG_INFO_V0("Ring depth: %d", PTO2_MAX_RING_DEPTH); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + LOG_INFO_V0("Ring %d:", r); + LOG_INFO_V0(" task_window_size: %" PRIu64, h->rings[r].task_window_size); + LOG_INFO_V0(" heap_size: %" PRIu64 " bytes", h->rings[r].heap_size); + LOG_INFO_V0( + " descriptors_off: %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset, + h->rings[r].task_descriptors_offset + ); + LOG_INFO_V0(" current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire)); + LOG_INFO_V0(" 
last_task_alive: %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire)); + } + LOG_INFO_V0("orchestrator_done: %d", h->orchestrator_done.load(std::memory_order_acquire)); + LOG_INFO_V0("Error state:"); + LOG_INFO_V0(" orch_error_code: %d", h->orch_error_code.load(std::memory_order_relaxed)); + LOG_INFO_V0(" sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed)); + LOG_INFO_V0(" sched_error_code: %d", h->sched_error_code.load(std::memory_order_relaxed)); + LOG_INFO_V0(" sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed)); + LOG_INFO_V0("================================"); +} + +bool PTO2SharedMemoryHandle::validate() { + if (!sm_base) return false; + if (!header) return false; + + PTO2SharedMemoryHeader *h = header; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + if (!h->rings[r].fc.validate(this, r)) return false; + } + + return true; +} + +bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const { + if (!handle) return false; + if (!handle->header) return false; + if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false; + + const PTO2SharedMemoryHeader *h = handle->header; + + // Check that offsets are within bounds + if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false; + + // Check pointer alignment + if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false; + + // Check flow control pointer sanity + int32_t current = current_task_index.load(std::memory_order_acquire); + int32_t last_alive = last_task_alive.load(std::memory_order_acquire); + if (current < 0) return false; + if (last_alive < 0) return false; + + return true; +} diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp new file mode 100644 index 000000000..b99c67233 --- /dev/null +++ 
b/src/a5/runtime/fully_distributed_within_core/runtime/shared/pto_tensormap.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * PTO Runtime2 - TensorMap Implementation + * + * Implements TensorMap with ring buffer pool, lazy invalidation, + * and chain truncation optimization. + * + * Key features: + * 1. O(1) insert at bucket head + * 2. O(valid_entries) lookup with chain truncation + * 3. Automatic stale entry cleanup during lookup + * 4. 
Periodic explicit cleanup for long chains + * + * Based on: docs/RUNTIME_LOGIC.md + */ + +#include "pto_tensormap.h" + +#include +#include + +#include "common.h" +#include "common/unified_log.h" + +// ============================================================================= +// TensorMap Lookup Chain Length Statistics (compile-time toggle) +// ============================================================================= +#if PTO2_TENSORMAP_PROFILING +uint64_t g_lookup_chain_total = 0; +uint64_t g_lookup_count = 0; +int32_t g_lookup_chain_max = 0; +uint64_t g_lookup_overlap_checks = 0; +uint64_t g_lookup_overlap_hits = 0; +uint64_t g_insert_count = 0; +#endif + +// ============================================================================= +// Initialization and Destruction +// ============================================================================= + +PTO2TensorMapLayout PTO2TensorMap::reserve_layout( + DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size, + const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH] +) { + // num_buckets must be a power of two for the hash truncation to work. 
+ always_assert((new_num_buckets & (new_num_buckets - 1)) == 0); + + PTO2TensorMapLayout layout{}; + layout.num_buckets = new_num_buckets; + layout.pool_size = new_pool_size; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + layout.task_window_sizes[r] = new_task_window_sizes[r]; + } + + layout.off_buckets = arena.reserve( + static_cast(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *) + ); + layout.off_entry_pool = + arena.reserve(static_cast(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry)); + layout.off_free_entry_list = + arena.reserve(static_cast(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + layout.off_task_entry_heads[r] = arena.reserve( + static_cast(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *) + ); + } + return layout; +} + +PTO2TensorMapLayout +PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) { + return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes); +} + +bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) { + num_buckets = layout.num_buckets; + pool_size = layout.pool_size; + + // Address arena regions for data writes; do not store these in struct + // fields (wire_arena_pointers does that). + auto *buckets_arena = static_cast(arena.region_ptr(layout.off_buckets)); + auto *entry_pool_arena = static_cast(arena.region_ptr(layout.off_entry_pool)); + auto *free_list_arena = static_cast(arena.region_ptr(layout.off_free_entry_list)); + + // buckets[]: empty == nullptr. + for (int32_t i = 0; i < num_buckets; i++) { + buckets_arena[i] = nullptr; + } + + // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...). 
+ // The pool's persistent invariant after init is "bucket_index == -1 means + // not linked", set explicitly below. + memset(entry_pool_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry)); + for (int32_t i = 0; i < pool_size; i++) { + entry_pool_arena[i].bucket_index = -1; + entry_pool_arena[i].next_in_bucket = nullptr; + entry_pool_arena[i].prev_in_bucket = nullptr; + entry_pool_arena[i].next_in_task = nullptr; + entry_pool_arena[i].prev_in_task = nullptr; + entry_pool_arena[i].producer_task_id = PTO2TaskId{}; + } + + // free_entry_list: zeroed (was calloc'd before); contents become meaningful + // only after entries are freed back, so the body of the array stays as 0. + memset(free_list_arena, 0, static_cast(pool_size) * sizeof(PTO2TensorMapEntry *)); + + next_entry_idx = 0; + free_num = 0; + + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + auto *heads_arena = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) { + heads_arena[i] = nullptr; + } + task_window_sizes[r] = layout.task_window_sizes[r]; + last_task_alives[r] = 0; + last_cleanup[r] = 0; + } + + return true; +} + +void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) { + buckets = static_cast(arena.region_ptr(layout.off_buckets)); + entry_pool = static_cast(arena.region_ptr(layout.off_entry_pool)); + free_entry_list = static_cast(arena.region_ptr(layout.off_free_entry_list)); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = static_cast(arena.region_ptr(layout.off_task_entry_heads[r])); + } +} + +void PTO2TensorMap::destroy() { + // Arena owns the backing memory; here we only forget our pointers so any + // stray post-destroy access trips a nullptr dereference instead of reading + // a recycled allocation. 
+ buckets = nullptr; + entry_pool = nullptr; + free_entry_list = nullptr; + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + task_entry_heads[r] = nullptr; + } +} + +// ============================================================================= +// Debug Utilities +// ============================================================================= + +void PTO2TensorMap::print_stats() { + int32_t valid = 0; + int32_t stale = 0; + int32_t empty_buckets = 0; + int32_t max_chain = 0; + int64_t total_chain = 0; + int32_t non_empty_buckets = 0; + + // Count entries + for (int32_t i = 0; i < pool_size; i++) { + if (entry_pool[i].bucket_index != -1) { + if (entry_valid(entry_pool[i])) { + valid++; + } else { + stale++; + } + } + } + + // Count bucket stats + for (int32_t b = 0; b < num_buckets; b++) { + int32_t chain_len = 0; + auto cur_entry = buckets[b]; + + while (cur_entry != nullptr) { + chain_len++; + cur_entry = cur_entry->next_in_bucket; + } + + if (chain_len == 0) { + empty_buckets++; + } else { + non_empty_buckets++; + total_chain += chain_len; + if (chain_len > max_chain) { + max_chain = chain_len; + } + } + } + + LOG_INFO_V0("=== TensorMap Statistics ==="); + LOG_INFO_V0("Pool size: %d", pool_size); + LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx); + LOG_INFO_V0("Pool free_num: %d", free_num); + LOG_INFO_V0("Num buckets: %d", num_buckets); + LOG_INFO_V0("Valid entries: %d", valid); + LOG_INFO_V0("Stale entries: %d", stale); + LOG_INFO_V0("Empty buckets: %d", empty_buckets); + LOG_INFO_V0("Max chain len: %d", max_chain); + LOG_INFO_V0("Avg chain len: %.2f", non_empty_buckets > 0 ? 
(float)total_chain / non_empty_buckets : 0); + for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) { + LOG_INFO_V0("Last task alive[%d]: %d", r, last_task_alives[r]); + } + LOG_INFO_V0("============================"); +} + +int32_t PTO2TensorMap::valid_count() { + int32_t count = 0; + + for (int32_t i = 0; i < pool_size; i++) { + if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) { + count++; + } + } + + return count; +} + +void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) { + auto ring_id = task_id.ring(); + auto local_id = task_id.local(); + sync_validity(ring_id, sm_last_task_alive); + + // Only attempt cleanup when last_task_alive has actually advanced; + // otherwise cleanup_retired would empty-loop and we'd spin forever. + auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]); + if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) { + cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive); + last_cleanup[ring_id] = sm_last_task_alive; + } +} + +// ============================================================================= +// TensorMap Lookup Profiling +// ============================================================================= +#if PTO2_TENSORMAP_PROFILING +PTO2TensorMapProfilingData pto2_tensormap_get_profiling() { + PTO2TensorMapProfilingData d; + d.lookup_chain_total = g_lookup_chain_total; + d.lookup_count = g_lookup_count; + d.lookup_chain_max = g_lookup_chain_max; + d.overlap_checks = g_lookup_overlap_checks; + d.overlap_hits = g_lookup_overlap_hits; + d.insert_count = g_insert_count; + + // Reset + g_lookup_chain_total = 0; + g_lookup_count = 0; + g_lookup_chain_max = 0; + g_lookup_overlap_checks = 0; + g_lookup_overlap_hits = 0; + g_insert_count = 0; + return d; +} +#endif diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp 
b/src/a5/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp new file mode 100644 index 000000000..1683ac323 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/shared/runtime.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Runtime Class - Implementation + * + * Device execution and handshake control. + * Task graph construction is handled by PTO2Runtime. + */ + +#include "runtime.h" + +#include "common/unified_log.h" +#include "pto_runtime2_types.h" +#include "pto_shared_memory.h" + +// ============================================================================= +// Constructor +// ============================================================================= + +Runtime::Runtime() { + // NOTE: host_api is initialized in InitRuntime() (host-only code) + // because the CApi functions don't exist when compiled for device. 
+ + // Initialize handshake buffers + memset(workers, 0, sizeof(workers)); + worker_count = 0; + aicpu_thread_num = 1; + ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS; + memset(aicpu_allowed_cpus, 0, sizeof(aicpu_allowed_cpus)); + aicpu_allowed_cpu_count = 0; + aicpu_launch_count = 0; + orch_to_sched = false; + + // fully_distributed_within_core handoff fields + dist.core_main_fn = 0; + dist.go = 0; + dist.num_workers = 0; + dist.done_count = 0; + + // Initialize profiling state + + // Initialize device orchestration state + gm_sm_ptr_ = nullptr; + gm_heap_ptr_ = nullptr; + slot_states_ptr_ = nullptr; + orch_args_storage_.clear(); + prebuilt_arena_base_ = nullptr; + prebuilt_runtime_offset_ = 0; + + // Initialize device orchestration SO binary + dev_orch_so_addr_ = 0; + dev_orch_so_size_ = 0; + active_callable_id_ = -1; + register_new_callable_id_ = false; + device_orch_func_name_[0] = '\0'; + device_orch_config_name_[0] = '\0'; + + // Initialize kernel binary tracking + registered_kernel_count_ = 0; + + // Initialize function address mapping + for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) { + func_id_to_addr_[i] = 0; + } +} + +// ============================================================================= +// Device orchestration +// ============================================================================= + +void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; } +void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; } +const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; } +void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; } +void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } +void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; } +void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; } + +void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) { + prebuilt_arena_base_ = arena_base; + prebuilt_runtime_offset_ = runtime_off; +} 
+void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; } +size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; } + +// Device orchestration SO metadata (bytes live in a separate device buffer +// owned by DeviceRunner; only the address/size travels in Runtime). +void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) { + dev_orch_so_addr_ = dev_addr; + dev_orch_so_size_ = size; +} + +uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; } + +uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; } + +void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) { + active_callable_id_ = callable_id; + register_new_callable_id_ = is_new; +} + +int32_t Runtime::get_active_callable_id() const { return active_callable_id_; } + +bool Runtime::register_new_callable_id() const { return register_new_callable_id_; } + +void Runtime::set_device_orch_func_name(const char *name) { + if (name == nullptr) { + device_orch_func_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; } + +void Runtime::set_device_orch_config_name(const char *name) { + if (name == nullptr) { + device_orch_config_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; } + +uint64_t Runtime::get_function_bin_addr(int func_id) const { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; + return func_id_to_addr_[func_id]; +} + +void Runtime::set_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { 
+ LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); + return; + } + if (addr != 0 && func_id_to_addr_[func_id] == 0) { + if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) { + registered_kernel_func_ids_[registered_kernel_count_++] = func_id; + } else { + LOG_ERROR( + "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID, + func_id + ); + } + } + func_id_to_addr_[func_id] = addr; +} + +void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) { + if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) { + LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID); + return; + } + func_id_to_addr_[func_id] = addr; +} + +int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; } + +int Runtime::get_registered_kernel_func_id(int index) const { + if (index < 0 || index >= registered_kernel_count_) return -1; + return registered_kernel_func_ids_[index]; +} + +void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; } diff --git a/src/a5/runtime/fully_distributed_within_core/runtime/tensor_create_info.h b/src/a5/runtime/fully_distributed_within_core/runtime/tensor_create_info.h new file mode 100644 index 000000000..912839a34 --- /dev/null +++ b/src/a5/runtime/fully_distributed_within_core/runtime/tensor_create_info.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * TensorCreateInfo — submit-time create-info for runtime-allocated outputs. + * + * Runtime-only: this header (and the materialization helpers below) are NOT + * part of the wire/host-facing Tensor in src/common/task_interface/tensor.h. + * It carries the metadata required to materialize a fresh contiguous output: + * dtype, ndims, shapes, manual_dep, and an optional initial value fill. Its + * 64B layout mirrors Tensor cache line 1 so init_tensor_from_create_info() can + * copy the whole line with a single memcpy. + */ + +#pragma once + +#include +#include +#include + +#include "data_type.h" +#include "tensor.h" + +class alignas(64) TensorCreateInfo { +public: + TensorCreateInfo( + const uint32_t shapes_in[], uint32_t ndims_in, DataType dtype_in = DataType::FLOAT32, bool manual_dep_in = false + ) : + initial_value(0), + has_initial_value(false), + __pad2__(0), + start_offset(0), // mirrors Tensor::start_offset; pre-zeroed for create-info outputs + version(0), + ndims(ndims_in), + dtype(dtype_in), + manual_dep(manual_dep_in), + is_contiguous(true), // mirrors Tensor::is_contiguous; pre-set for create-info outputs + __pad_flags__(0) { + // Bound the write below: shapes[] holds MAX_TENSOR_DIMS, and ndims_in + // comes from user-submitted output shapes — guard before the loop so an + // oversized rank can't overrun the fixed array. 
+ always_assert(ndims_in > 0 && ndims_in <= MAX_TENSOR_DIMS); + for (uint32_t i = 0; i < ndims_in; i++) { + shapes[i] = shapes_in[i]; + } + } + + void copy(const TensorCreateInfo &other) { memcpy(this, &other, sizeof(other)); } + + template + void set_initial_value(T value) { + has_initial_value = true; + initial_value = to_u64(value); + } + + uint64_t buffer_size_bytes() const { + uint64_t total = 1; + for (uint32_t i = 0; i < ndims; i++) { + total *= shapes[i]; + } + return total * get_element_size(dtype); + } + +public: + // --- Bytes [0, 32): TensorCreateInfo-only fields --- + // These occupy the same positions as Tensor::buffer, Tensor::owner_task_id, + // and Tensor::start_offset. The runtime overwrites owner metadata after the + // memcpy and recomputes start_offset / stride during payload materialization. + uint64_t initial_value; + bool has_initial_value; + uint8_t __pad1__[7]; + uint64_t __pad2__; // → Tensor::owner_task_id (overwritten post-memcpy) + uint64_t start_offset; // mirrors Tensor::start_offset; always 0 for create-info outputs + + // --- Bytes [32, 64): Matches Tensor cache line 1 layout --- + int32_t version; // Always 0 for create-info outputs + uint32_t ndims; + DataType dtype; + bool manual_dep; + bool is_contiguous; // Always true for create-info outputs + uint8_t __pad_flags__; // → Tensor::child_memory (always 0 for create-info outputs) + uint32_t shapes[MAX_TENSOR_DIMS]; // → Tensor::shapes + + TensorCreateInfo() = default; +}; + +// TensorCreateInfo layout must match Tensor cacheline 1 for memcpy optimization +static_assert(sizeof(TensorCreateInfo) == 64, "TensorCreateInfo must match Tensor cacheline 1 size (64 bytes)"); +static_assert(offsetof(TensorCreateInfo, start_offset) == offsetof(Tensor, start_offset)); +static_assert(offsetof(TensorCreateInfo, version) == offsetof(Tensor, version)); +static_assert(offsetof(TensorCreateInfo, ndims) == offsetof(Tensor, ndims)); +static_assert(offsetof(TensorCreateInfo, dtype) == 
offsetof(Tensor, dtype));
+static_assert(offsetof(TensorCreateInfo, manual_dep) == offsetof(Tensor, manual_dep));
+static_assert(offsetof(TensorCreateInfo, is_contiguous) == offsetof(Tensor, is_contiguous));
+static_assert(offsetof(TensorCreateInfo, __pad_flags__) == offsetof(Tensor, child_memory));
+static_assert(offsetof(TensorCreateInfo, shapes) == offsetof(Tensor, shapes));
+
+// ============================================================================
+// Materialization helpers — operate on a Tensor& through its public members.
+// Factored out of Tensor (which now lives in the wire/host-facing common
+// header) so the create-info dependency stays runtime-only.
+// ============================================================================
+
+/// Fill the entire backing buffer of `t` with `initial_value` (doubling memcpy).
+///
+/// The first block (at most 64 bytes) is seeded element by element from the
+/// leading `get_element_size(t.dtype)` bytes of `initial_value`; the rest of the
+/// buffer is filled by repeatedly copying the already-filled prefix onto the
+/// region that follows it.
+///
+/// Aborts via always_assert when the buffer address is null, or when the
+/// buffer is non-empty and the element size is zero. An empty buffer is a no-op.
+inline void fill_tensor_initial_value(Tensor &t, uint64_t initial_value) {
+    always_assert(reinterpret_cast<void *>(t.buffer.addr) != nullptr);
+    const uint64_t total = t.buffer.size;
+    if (total == 0) return;
+    const uint64_t elem_size = get_element_size(t.dtype);
+    // A zero step would keep the seed loop below from ever advancing.
+    always_assert(elem_size > 0);
+    char *dst = reinterpret_cast<char *>(t.buffer.addr);
+    constexpr uint64_t blk_size = 64;
+    const uint64_t blk = (total < blk_size) ? total : blk_size;
+    for (uint64_t b = 0; b < blk; b += elem_size) {
+        // Clamp the last element so a seed block that is not a multiple of
+        // elem_size cannot be written past its end.
+        const uint64_t n = ((blk - b) < elem_size) ? (blk - b) : elem_size;
+        memcpy(dst + b, &initial_value, n);
+    }
+    uint64_t filled = blk;
+    while (filled < total) {
+        // Source [0, copy_size) and destination [filled, ...) never overlap
+        // because copy_size <= filled.
+        const uint64_t copy_size = ((total - filled) < filled) ? (total - filled) : filled;
+        memcpy(dst + filled, dst, copy_size);
+        filled += copy_size;
+    }
+}
+
+/// Materialize a TensorCreateInfo into `t` (fresh contiguous output).
+/// Single 64B memcpy covers cache line 1; `ci` pre-initialises start_offset (=0)
+/// and is_contiguous (=true) in its line-1 slots so they need no reset here.
+/// Cache line 2 (stride/extent) is computed from `ci.shapes` in a single reverse pass.
+inline void init_tensor_from_create_info(Tensor &t, const TensorCreateInfo &ci, void *addr, uint64_t buffer_size) { + always_assert(ci.ndims > 0 && ci.ndims <= MAX_TENSOR_DIMS); + memcpy(&t, &ci, 64); + t.buffer = {reinterpret_cast(addr), buffer_size}; + t.owner_task_id = PTO2TaskId::invalid(); // caller (orchestrator) overwrites with actual task_id + uint32_t s = 1; + for (int32_t i = static_cast(t.ndims) - 1; i >= 0; --i) { + t.strides[i] = s; + s *= t.shapes[i]; + } + t.extent_elem_cache = s; + if (ci.has_initial_value) { + fill_tensor_initial_value(t, ci.initial_value); + } +} diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aic/kernel_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aic/kernel_matmul.cpp new file mode 100644 index 000000000..9a3a1c337 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aic/kernel_matmul.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Matrix Multiplication Kernel (Cube Core) + * + * Computes: C = A @ B (TILE x TILE x TILE matmul) + * Uses TMATMUL instruction + * + * Args (Tensor*): + * args[0] = A (INPUT) - TILE x TILE + * args[1] = B (INPUT) - TILE x TILE + * args[2] = C (OUTPUT) - TILE x TILE + */ + +#include +#include +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +AICORE constexpr inline T CeilAlign(T num_1, T num_2) { + if (num_2 == 0) { + return 0; + } + return (num_1 + num_2 - 1) / num_2 * num_2; +} + +static __aicore__ inline int get_num_tiles(__gm__ Tensor *tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void matmul_impl(__gm__ float *input_a, __gm__ float *input_b, __gm__ float *output) { + constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float); + constexpr int M = CeilAlign(TILE, 16); + constexpr int K = CeilAlign(TILE, blockAlign); + constexpr int N = CeilAlign(TILE, blockAlign); + + using GlobalDataA = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataB = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + using GlobalDataC = + GlobalTensor, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>; + + GlobalDataA src0Global(input_a); + GlobalDataB src1Global(input_b); + GlobalDataC dstGlobal(output); + + using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 
0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + TLOAD(aMatTile, src0Global); + TLOAD(bMatTile, src1Global); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(dstGlobal, cTile); + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(input_a, TILE_ELEMS); + + __gm__ float *base_a = reinterpret_cast<__gm__ float *>(input_a->buffer.addr) + input_a->start_offset; + __gm__ float *base_b = reinterpret_cast<__gm__ float *>(input_b->buffer.addr) + input_b->start_offset; + __gm__ float *base_c = reinterpret_cast<__gm__ float *>(output->buffer.addr) + output->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float *a_ptr = base_a + (tile_idx * TILE_ELEMS); + __gm__ float *b_ptr = base_b + (tile_idx * TILE_ELEMS); + __gm__ float *c_ptr = base_c + (tile_idx * TILE_ELEMS); + + matmul_impl<128>(a_ptr, b_ptr, c_ptr); + } +} diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..d542c38b3 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/aiv/kernel_add.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Element-wise Tensor Addition Kernel + * + * Implements: out[i] = src0[i] + src1[i] + * Tile size: ROWS x COLS + * + * Args (Tensor*): + * args[0] = src0 (INPUT) - ROWS x COLS + * args[1] = src1 (INPUT) - ROWS x COLS + * args[2] = out (OUTPUT) - ROWS x COLS + */ + +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +static __aicore__ inline int get_num_tiles(__gm__ Tensor *tensor, uint64_t tile_elems) { + uint64_t total_elems = tensor->shapes[0]; + return static_cast(total_elems / tile_elems); +} + +template +static __aicore__ void add_impl(__gm__ float *src0, __gm__ float *src1, __gm__ float *out) { + using DynShapeDim5 = Shape<1, 1, 1, ROWS, COLS>; + using DynStridDim5 = Stride<1, 1, 1, COLS, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(ROWS, COLS); + TileData src1Tile(ROWS, COLS); + TileData dstTile(ROWS, COLS); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, 
PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(dstGlobal, dstTile); + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *src0_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *src1_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *out_tensor = reinterpret_cast<__gm__ Tensor *>(args[2]); + + constexpr uint64_t TILE_ELEMS = 128 * 128; + int num_tiles = get_num_tiles(src0_tensor, TILE_ELEMS); + + __gm__ float *base_src0 = reinterpret_cast<__gm__ float *>(src0_tensor->buffer.addr) + src0_tensor->start_offset; + __gm__ float *base_src1 = reinterpret_cast<__gm__ float *>(src1_tensor->buffer.addr) + src1_tensor->start_offset; + __gm__ float *base_out = reinterpret_cast<__gm__ float *>(out_tensor->buffer.addr) + out_tensor->start_offset; + + for (int tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + __gm__ float *src0_ptr = base_src0 + (tile_idx * TILE_ELEMS); + __gm__ float *src1_ptr = base_src1 + (tile_idx * TILE_ELEMS); + __gm__ float *out_ptr = base_out + (tile_idx * TILE_ELEMS); + + add_impl<128, 128>(src0_ptr, src1_ptr, out_ptr); + } +} diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp new file mode 100644 index 000000000..d08f7645b --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Alternating Matmul-Add Orchestration Function (tensormap_and_ringbuffer Runtime) + * + * Submits independent matmul and add tasks per batch. + * + * Configuration read from scalar args: + * - batch: Number of batches + * - M: Number of matmul tasks per batch + * - N: Number of add tasks per batch + * - matmul_batch: Number of matmul tiles per task group + * - add_batch: Number of add tiles per task group + * + * Task pattern: interleaved [matmul_0, add_0, matmul_1, add_1, ...] + * All tasks are completely independent (no dependencies). 
+ * + * Arg layout: [A, B, C, X, Y, Z, batch, M_val, N_val, matmul_batch, add_batch] + */ + +#include +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_MATMUL 0 +#define FUNC_ADD 1 + +static constexpr uint64_t MATMUL_ELEMS = 128 * 128; +static constexpr uint64_t ADD_ELEMS = 128 * 128; + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; // NOLINT(readability/casting) + return PTO2OrchestrationConfig{ + .expected_arg_count = 11, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + // Tensor args + const Tensor &ext_A = orch_args.tensor(0).ref(); + const Tensor &ext_B = orch_args.tensor(1).ref(); + const Tensor &ext_C = orch_args.tensor(2).ref(); + const Tensor &ext_X = orch_args.tensor(3).ref(); + const Tensor &ext_Y = orch_args.tensor(4).ref(); + const Tensor &ext_Z = orch_args.tensor(5).ref(); + + // Scalar config args + int batch = static_cast(orch_args.scalar(0)); + int M = static_cast(orch_args.scalar(1)); + int N = static_cast(orch_args.scalar(2)); + int matmul_batch = static_cast(orch_args.scalar(3)); + int add_batch = static_cast(orch_args.scalar(4)); + + LOG_INFO_V0( + "[alternating_orch] Batch: %d, M: %d, N: %d, matmul_batch: %d, add_batch: %d", batch, M, N, matmul_batch, + add_batch + ); + + int total_matmul_tasks = batch * M; + int total_add_tasks = batch * N; + int num_matmul_groups = total_matmul_tasks / matmul_batch; + int num_add_groups = total_add_tasks / add_batch; + + int total_matmul = 0; + int total_add = 0; + + int max_groups = num_matmul_groups > num_add_groups ? 
num_matmul_groups : num_add_groups; + + // Interleaved submit: matmul and add groups alternate + for (int group_idx = 0; group_idx < max_groups; group_idx++) { + if (group_idx < num_matmul_groups) { + int start_task_idx = group_idx * matmul_batch; + uint64_t offset = static_cast(start_task_idx) * MATMUL_ELEMS; + uint64_t group_size = static_cast(matmul_batch) * MATMUL_ELEMS; + + uint32_t matmul_group_shapes[1] = {static_cast(group_size)}; + uint32_t view_offsets[1] = {static_cast(offset)}; + + Tensor A_view = ext_A.view(matmul_group_shapes, view_offsets); + Tensor B_view = ext_B.view(matmul_group_shapes, view_offsets); + Tensor C_view = ext_C.view(matmul_group_shapes, view_offsets); + + L0TaskArgs params_matmul; + params_matmul.add_input(A_view); + params_matmul.add_input(B_view); + params_matmul.add_output(C_view); + rt_submit_aic_task(FUNC_MATMUL, params_matmul); + total_matmul++; + } + + if (group_idx < num_add_groups) { + int start_task_idx = group_idx * add_batch; + uint64_t offset = static_cast(start_task_idx) * ADD_ELEMS; + uint64_t group_size = static_cast(add_batch) * ADD_ELEMS; + + uint32_t add_group_shapes[1] = {static_cast(group_size)}; + uint32_t view_offsets[1] = {static_cast(offset)}; + + Tensor X_view = ext_X.view(add_group_shapes, view_offsets); + Tensor Y_view = ext_Y.view(add_group_shapes, view_offsets); + Tensor Z_view = ext_Z.view(add_group_shapes, view_offsets); + + L0TaskArgs params_add; + params_add.add_input(X_view); + params_add.add_input(Y_view); + params_add.add_output(Z_view); + rt_submit_aiv_task(FUNC_ADD, params_add); + total_add++; + } + } + + LOG_INFO_V9("[alternating_orch] Submitted %d matmul groups and %d add groups", total_matmul, total_add); +} + +} // extern "C" diff --git a/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/test_alternating_matmul_add.py b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/test_alternating_matmul_add.py new file mode 100644 index 000000000..70051d7ce --- 
/dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/alternating_matmul_add/test_alternating_matmul_add.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Alternating matmul + add: interleaved AIC (matmul 128x128) and AIV (add 128x128) tasks. + +Tests AIC+AIV mixed execution with scalar parameters and batched task submission. +C[b,m] = A[b,m] @ B[b,m], Z[b,n] = X[b,n] + Y[b,n]. 
+""" + +import ctypes + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestAlternatingMatmulAdd(SceneTestCase): + """Alternating matmul + add with scalar parameters.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/alternating_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/kernel_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"batch": 1, "M": 1, "N": 1, "matmul_batch": 1, "add_batch": 1}, + }, + { + "name": "Case1", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"batch": 500, "M": 4, "N": 4, "matmul_batch": 4, "add_batch": 4}, + "manual": True, + }, + { + "name": "Case2", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"batch": 512, "M": 2, "N": 5, "matmul_batch": 4, "add_batch": 5}, + "manual": True, + }, + ] + + def generate_args(self, params): + batch = params["batch"] + M = params["M"] + N = params["N"] + matmul_batch = params.get("matmul_batch", 1) + add_batch = params.get("add_batch", 1) + matmul_size = 128 + add_rows = 128 + add_cols = 128 + + torch.manual_seed(42) + A = torch.randn(batch, M, matmul_size, matmul_size, dtype=torch.float32) * 0.01 + B = torch.randn(batch, M, matmul_size, matmul_size, dtype=torch.float32) * 0.01 + C = torch.zeros(batch, M, matmul_size, matmul_size, dtype=torch.float32) + X = 
torch.randn(batch, N, add_rows, add_cols, dtype=torch.float32) * 0.01 + Y = torch.randn(batch, N, add_rows, add_cols, dtype=torch.float32) * 0.01 + Z = torch.zeros(batch, N, add_rows, add_cols, dtype=torch.float32) + + return TaskArgsBuilder( + Tensor("A", A.flatten()), + Tensor("B", B.flatten()), + Tensor("C", C.flatten()), + Tensor("X", X.flatten()), + Tensor("Y", Y.flatten()), + Tensor("Z", Z.flatten()), + Scalar("batch", ctypes.c_int64(batch)), + Scalar("M_val", ctypes.c_int64(M)), + Scalar("N_val", ctypes.c_int64(N)), + Scalar("matmul_batch", ctypes.c_int64(matmul_batch)), + Scalar("add_batch", ctypes.c_int64(add_batch)), + ) + + def compute_golden(self, args, params): + batch = params["batch"] + M = params["M"] + N = params["N"] + matmul_size = 128 + add_rows = 128 + add_cols = 128 + + A = args.A.reshape(batch, M, matmul_size, matmul_size) + B = args.B.reshape(batch, M, matmul_size, matmul_size) + C = args.C.reshape(batch, M, matmul_size, matmul_size) + X = args.X.reshape(batch, N, add_rows, add_cols) + Y = args.Y.reshape(batch, N, add_rows, add_cols) + Z = args.Z.reshape(batch, N, add_rows, add_cols) + + for b in range(batch): + for m in range(M): + C[b, m] = torch.matmul(A[b, m], B[b, m]) + for b in range(batch): + for n in range(N): + Z[b, n] = X[b, n] + Y[b, n] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp new file mode 100644 index 000000000..825665b70 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +// Batched PV Matmul Kernel: for each batch b, pij(M, K) @ vj(K, N) -> oi_new(M, N) +// +// Processes batch_count batches in a single kernel invocation. +// Per-batch addresses are computed from global tensor bases + block_table lookup. +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128) -> (16, 128) +// Case2: (64, 64) @ ( 64, 128) -> (64, 128) +// +// Template: M=q_tile, K=block_size, N=head_dim + +#include +#include + +#include "tensor.h" + +// NOLINTNEXTLINE(build/namespaces) +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +template +static __aicore__ void pv_matmul_batch_impl( + __gm__ Tensor *pij_batch, __gm__ Tensor *value_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *oi_new_batch, + uint64_t batch_count, uint64_t block_idx, uint64_t block_num, uint64_t batch_start +) { + __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_batch->buffer.addr); + __gm__ bfloat16_t *val_base = reinterpret_cast<__gm__ bfloat16_t *>(value_cache->buffer.addr); + __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_new_batch->buffer.addr); + __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr); + + using GlobalA = GlobalTensor, Stride>; + using GlobalB = GlobalTensor, Stride>; + using GlobalOut = GlobalTensor, Stride>; + 
+ using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + for (uint64_t b = 0; b < batch_count; b++) { + __gm__ bfloat16_t *pij_addr = pij_base + b * M * K; + int32_t phys_block = bt[(batch_start + b) * block_num + block_idx]; + __gm__ bfloat16_t *vj_addr = val_base + static_cast(phys_block) * K * N; + __gm__ float *oi_addr = oi_base + b * M * N; + + GlobalA pijGlobal(pij_addr); + GlobalB vjGlobal(vj_addr); + GlobalOut oiGlobal(oi_addr); + + TLOAD(aMatTile, pijGlobal); + TLOAD(bMatTile, vjGlobal); + + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + + TMOV(aTile, aMatTile); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(oiGlobal, cTile); + + if (b + 1 < batch_count) { + pipe_barrier(PIPE_ALL); + } + } + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *pij_batch = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *value_cache = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *oi_new_batch = reinterpret_cast<__gm__ Tensor *>(args[3]); + uint64_t batch_count = static_cast(args[4]); + uint64_t block_idx = static_cast(args[5]); + uint64_t block_num = static_cast(args[6]); + uint64_t batch_start = static_cast(args[7]); + + uint64_t q_tile_size = static_cast(pij_batch->shapes[0] / batch_count); + uint64_t block_size = static_cast(pij_batch->shapes[1]); + + if (q_tile_size == 16 && 
block_size <= 16) { + pv_matmul_batch_impl<16, 16, 16>( + pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start + ); + } else if (q_tile_size == 16) { + pv_matmul_batch_impl<16, 128, 128>( + pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start + ); + } else { + pv_matmul_batch_impl<64, 64, 128>( + pij_batch, value_cache, block_table_t, oi_new_batch, batch_count, block_idx, block_num, batch_start + ); + } +} diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp new file mode 100644 index 000000000..0bf394f93 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +// Batched QK Matmul Kernel: for each batch b, qi(M, K) @ kj.T(K, N) -> sij(M, N) +// +// Processes batch_count batches in a single kernel invocation. +// Per-batch addresses are computed from global tensor bases + block_table lookup. 
+// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) @ (128, 128).T -> (16, 128) +// Case2: (64, 128) @ (128, 64).T -> (64, 64) +// +// Template: M=q_tile, K=head_dim, N=block_size + +#include +#include + +#include "tensor.h" + +// NOLINTNEXTLINE(build/namespaces) +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +template +static __aicore__ void qk_matmul_batch_impl( + __gm__ Tensor *query, __gm__ Tensor *key_cache, __gm__ Tensor *block_table_t, __gm__ Tensor *sij_batch, + uint64_t batch_count, uint64_t block_idx, uint64_t q_offset, uint64_t block_num, uint64_t num_heads, + uint64_t batch_start +) { + __gm__ bfloat16_t *query_base = reinterpret_cast<__gm__ bfloat16_t *>(query->buffer.addr); + __gm__ bfloat16_t *key_base = reinterpret_cast<__gm__ bfloat16_t *>(key_cache->buffer.addr); + __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_batch->buffer.addr); + __gm__ int32_t *bt = reinterpret_cast<__gm__ int32_t *>(block_table_t->buffer.addr); + + using GlobalA = GlobalTensor, Stride>; + using GlobalB = GlobalTensor, Stride, Layout::DN>; + using GlobalOut = GlobalTensor, Stride>; + + using TileMatA = Tile; + using TileMatB = Tile; + + using LeftTile = TileLeft; + using RightTile = TileRight; + using AccTile = TileAcc; + + TileMatA aMatTile; + TileMatB bMatTile; + TASSIGN(aMatTile, 0x0); + TASSIGN(bMatTile, 0x20000); + + LeftTile aTile; + RightTile bTile; + AccTile cTile; + TASSIGN(aTile, 0x0); + TASSIGN(bTile, 0x0); + TASSIGN(cTile, 0x0); + + for (uint64_t b = 0; b < batch_count; b++) { + __gm__ bfloat16_t *qi_addr = query_base + ((batch_start + b) * num_heads + q_offset) * K; + int32_t phys_block = bt[(batch_start + b) * block_num + block_idx]; + __gm__ bfloat16_t *kj_addr = key_base + static_cast(phys_block) * N * K; + __gm__ float *sij_addr = sij_base + b * M * N; + + GlobalA qiGlobal(qi_addr); 
+ GlobalB kjGlobal(kj_addr); + GlobalOut sijGlobal(sij_addr); + + TLOAD(aMatTile, qiGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TLOAD(bMatTile, kjGlobal); + set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0); + TMOV(aTile, aMatTile); + wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1); + TMOV(bTile, bMatTile); + + set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0); + + TMATMUL(cTile, aTile, bTile); + + set_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0); + + TSTORE(sijGlobal, cTile); + + if (b + 1 < batch_count) { + pipe_barrier(PIPE_ALL); + } + } + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *query = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *key_cache = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *block_table_t = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *sij_batch = reinterpret_cast<__gm__ Tensor *>(args[3]); + uint64_t batch_count = static_cast(args[4]); + uint64_t block_idx = static_cast(args[5]); + uint64_t q_offset = static_cast(args[6]); + uint64_t block_num = static_cast(args[7]); + uint64_t num_heads = static_cast(args[8]); + uint64_t batch_start = static_cast(args[9]); + + uint64_t q_tile_size = static_cast(sij_batch->shapes[0] / batch_count); + uint64_t block_size = static_cast(sij_batch->shapes[1]); + + if (q_tile_size == 16 && block_size <= 16) { + qk_matmul_batch_impl<16, 16, 16>( + query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads, + batch_start + ); + } else if (q_tile_size == 16) { + qk_matmul_batch_impl<16, 128, 128>( + query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads, + batch_start + ); + } else { + qk_matmul_batch_impl<64, 128, 64>( + query, key_cache, block_table_t, sij_batch, batch_count, block_idx, q_offset, block_num, num_heads, + 
batch_start + ); + } +} diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_online_update.cpp new file mode 100644 index 000000000..b8955c3b5 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_online_update.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Batched Online Softmax Update + Normalize Kernel (AIV) +// +// Processes batch_count batches in a single kernel invocation. +// For each batch b, updates accumulators mi/li/oi with new block's mij/lij/oi_new. +// On is_last, normalizes and writes to the output tensor at the correct batch offset. +// +// Supports two tile configurations via runtime dispatch: +// Case1: (16, 128) -- q_tile=16, head_dim=128 +// Case2: (64, 128) -- q_tile=64, head_dim=128 +// +// Scalar layout strategy: +// M scalar floats stored contiguously in GM can be loaded as either: +// - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops +// - DN (kAlignedRows, 1) ColMajor for row-broadcast ops +// Conversion between layouts uses TRESHAPE (UB-internal, zero GM access). 
+ +#include +#include + +#include "tensor.h" + +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +template +static __aicore__ void online_update_batch_impl( + __gm__ Tensor *mij_batch, __gm__ Tensor *lij_batch, __gm__ Tensor *oi_new_batch, __gm__ Tensor *mi_batch, + __gm__ Tensor *li_batch, __gm__ Tensor *oi_batch, __gm__ Tensor *out, uint64_t is_first, uint64_t is_last, + uint64_t batch_count, uint64_t q_offset, uint64_t num_heads, uint64_t batch_start +) { + __gm__ float *mij_base = reinterpret_cast<__gm__ float *>(mij_batch->buffer.addr); + __gm__ float *lij_base = reinterpret_cast<__gm__ float *>(lij_batch->buffer.addr); + __gm__ float *oi_new_base = reinterpret_cast<__gm__ float *>(oi_new_batch->buffer.addr); + __gm__ float *mi_base = reinterpret_cast<__gm__ float *>(mi_batch->buffer.addr); + __gm__ float *li_base = reinterpret_cast<__gm__ float *>(li_batch->buffer.addr); + __gm__ float *oi_base = reinterpret_cast<__gm__ float *>(oi_batch->buffer.addr); + __gm__ float *out_base = reinterpret_cast<__gm__ float *>(out->buffer.addr); + + constexpr int kScalarCols = 32 / sizeof(float); + constexpr int kScalarRows = M / kScalarCols; + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalScalarND = + GlobalTensor, Stride<1, 1, 1, kScalarCols, 1>>; + + using TileDataMxN = Tile; + using TileScalarND = + Tile; + using TileScalarDN = Tile; + + constexpr int kDataBytes = M * N * sizeof(float); + constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float); + + TileDataMxN oiNewTile; + TileDataMxN oiTile; + + TileScalarND mijND, lijND, miND, liND; + TileScalarND miNewND, alphaND, betaND, tmpND; + + TileScalarDN alphaDN, betaDN, liDN; + + TASSIGN(oiNewTile, 0); + TASSIGN(oiTile, kDataBytes); + TASSIGN(mijND, 2 * kDataBytes); + TASSIGN(lijND, 2 * 
kDataBytes + kScalarNDBytes); + TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes); + TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes); + TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes); + TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes); + TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes); + TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes); + + for (uint64_t b = 0; b < batch_count; b++) { + __gm__ float *mij_ptr = mij_base + b * M; + __gm__ float *lij_ptr = lij_base + b * M; + __gm__ float *oi_new_ptr = oi_new_base + b * M * N; + __gm__ float *mi_ptr = mi_base + b * M; + __gm__ float *li_ptr = li_base + b * M; + __gm__ float *oi_ptr = oi_base + b * M * N; + __gm__ float *dst_ptr = out_base + ((batch_start + b) * num_heads + q_offset) * N; + + GlobalDataMxN oiNewGlobal(oi_new_ptr); + GlobalDataMxN oiGlobal(oi_ptr); + GlobalDataMxN dstGlobal(dst_ptr); + + GlobalScalarND mijGlobalND(mij_ptr); + GlobalScalarND lijGlobalND(lij_ptr); + GlobalScalarND miGlobalND(mi_ptr); + GlobalScalarND liGlobalND(li_ptr); + + if (is_first) { + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, mijND); + TSTORE(liGlobalND, lijND); + TSTORE(oiGlobal, oiNewTile); + + if (is_last) { + TRESHAPE(liDN, lijND); + set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1); + TROWEXPANDDIV(oiNewTile, oiNewTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiNewTile); + } + } else { + TLOAD(oiNewTile, oiNewGlobal); + TLOAD(oiTile, oiGlobal); + TLOAD(mijND, mijGlobalND); + TLOAD(lijND, lijGlobalND); + TLOAD(miND, miGlobalND); + TLOAD(liND, liGlobalND); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + TMAX(miNewND, miND, 
mijND); + pipe_barrier(PIPE_V); + TSUB(alphaND, miND, miNewND); + pipe_barrier(PIPE_V); + TEXP(alphaND, alphaND); + pipe_barrier(PIPE_V); + TSUB(betaND, mijND, miNewND); + pipe_barrier(PIPE_V); + TEXP(betaND, betaND); + pipe_barrier(PIPE_V); + TMUL(liND, alphaND, liND); + pipe_barrier(PIPE_V); + TMUL(tmpND, betaND, lijND); + pipe_barrier(PIPE_V); + TADD(liND, liND, tmpND); + + TRESHAPE(alphaDN, alphaND); + TRESHAPE(betaDN, betaND); + if (is_last) { + TRESHAPE(liDN, liND); + } + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(miGlobalND, miNewND); + TSTORE(liGlobalND, liND); + + TROWEXPANDMUL(oiTile, oiTile, alphaDN); + TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN); + pipe_barrier(PIPE_V); + TADD(oiTile, oiTile, oiNewTile); + + if (is_last) { + pipe_barrier(PIPE_V); + TROWEXPANDDIV(oiTile, oiTile, liDN); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(dstGlobal, oiTile); + } else { + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(oiGlobal, oiTile); + } + } + + if (b + 1 < batch_count) { + pipe_barrier(PIPE_ALL); + } + } + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *mij_batch = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *lij_batch = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *oi_new_batch = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *mi_batch = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ Tensor *li_batch = reinterpret_cast<__gm__ Tensor *>(args[4]); + __gm__ Tensor *oi_batch = reinterpret_cast<__gm__ Tensor *>(args[5]); + __gm__ Tensor *out = reinterpret_cast<__gm__ Tensor *>(args[6]); + uint64_t is_first = static_cast(args[7]); + uint64_t is_last = static_cast(args[8]); + uint64_t batch_count = static_cast(args[9]); + uint64_t q_offset = static_cast(args[10]); + uint64_t num_heads = static_cast(args[11]); + uint64_t 
batch_start = static_cast(args[12]); + + uint64_t q_tile_size = static_cast(mij_batch->shapes[0] / batch_count); + uint64_t head_dim = static_cast(oi_new_batch->shapes[1]); + + if (q_tile_size == 16 && head_dim <= 16) { + online_update_batch_impl<16, 16>( + mij_batch, lij_batch, oi_new_batch, mi_batch, li_batch, oi_batch, out, is_first, is_last, batch_count, + q_offset, num_heads, batch_start + ); + } else if (q_tile_size == 16) { + online_update_batch_impl<16, 128>( + mij_batch, lij_batch, oi_new_batch, mi_batch, li_batch, oi_batch, out, is_first, is_last, batch_count, + q_offset, num_heads, batch_start + ); + } else { + online_update_batch_impl<64, 128>( + mij_batch, lij_batch, oi_new_batch, mi_batch, li_batch, oi_batch, out, is_first, is_last, batch_count, + q_offset, num_heads, batch_start + ); + } +} diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp new file mode 100644 index 000000000..3ce77eaa0 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +// Batched Softmax Preparation Kernel (AIV) +// +// Processes batch_count batches in a single kernel invocation. +// For each batch b at block_idx bn: +// valid_len = min(N, context_lens[b] - bn * N) +// sij_masked = pad(sij[b], valid_len, -inf) +// sij_scale = sij_masked * scale +// mij[b] = row_max(sij_scale) +// pij[b] = exp(sij_scale - mij[b]) (truncated to bf16 then back) +// lij[b] = row_sum(pij[b]) +// +// Supports three tile configurations (M, N) via runtime dispatch: +// Case1: (16, 128) -- q_tile=16, block_size=128; (16, 16) is used instead when block_size <= 16 +// Case2: (64, 64) -- q_tile=64, block_size=64 (fallback for every other q_tile) + +#include +#include + +#include "tensor.h" + +// NOLINTNEXTLINE(build/namespaces) +using namespace pto; + +#include "pipe_sync.h" + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] // NOLINT(whitespace/braces) +#endif + +template +static __aicore__ void softmax_prepare_batch_impl( + __gm__ Tensor *sij_batch, __gm__ Tensor *context_lens_t, __gm__ Tensor *pij_batch, __gm__ Tensor *mij_batch, + __gm__ Tensor *lij_batch, float scale_value, uint64_t batch_count, uint64_t block_idx, uint64_t batch_start +) { + __gm__ float *sij_base = reinterpret_cast<__gm__ float *>(sij_batch->buffer.addr); + __gm__ bfloat16_t *pij_base = reinterpret_cast<__gm__ bfloat16_t *>(pij_batch->buffer.addr); + __gm__ float *mij_base = reinterpret_cast<__gm__ float *>(mij_batch->buffer.addr); + __gm__ float *lij_base = reinterpret_cast<__gm__ float *>(lij_batch->buffer.addr); + __gm__ int32_t *ctx_lens = reinterpret_cast<__gm__ int32_t *>(context_lens_t->buffer.addr); + + constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float)); + + using GlobalDataMxN = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalDataMxN_bf16 = GlobalTensor, Stride<1, 1, 1, N, 1>>; + using GlobalScalarDN = GlobalTensor, Stride<1, 1, 1, 1, 1>, Layout::DN>; + + using
TileSijDyn = Tile; + using TileSijPad = Tile; + + using TileVecMxN = Tile; + using TileVecMxN_bf16 = Tile; + using TileScalarDN = Tile; + + TileVecMxN sijTile; + TileSijPad sijPadTile; + TileVecMxN pijTile; + TileVecMxN tmpTile; + TileScalarDN maxTile; + TileScalarDN sumTile; + TileVecMxN_bf16 pijBf16Tile; + + TASSIGN(sijTile, 0x0); + TASSIGN(sijPadTile, 0x0); + TASSIGN(pijTile, M * N * sizeof(float)); + TASSIGN(tmpTile, 2 * M * N * sizeof(float)); + TASSIGN(maxTile, 3 * M * N * sizeof(float)); + TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float)); + TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float)); + + for (uint64_t b = 0; b < batch_count; b++) { + int32_t cur_seq = ctx_lens[batch_start + b]; + uint64_t start = block_idx * N; + uint64_t valid_len = 0; + if (start < static_cast(cur_seq)) { + uint64_t remaining = static_cast(cur_seq) - start; + valid_len = (remaining < N) ? remaining : N; + } + + __gm__ float *sij_addr = sij_base + b * M * N; + __gm__ bfloat16_t *pij_addr = pij_base + b * M * N; + __gm__ float *mij_addr = mij_base + b * M; + __gm__ float *lij_addr = lij_base + b * M; + + GlobalDataMxN sijGlobal(sij_addr); + GlobalDataMxN_bf16 pijGlobal(pij_addr); + GlobalScalarDN mijGlobal(mij_addr); + GlobalScalarDN lijGlobal(lij_addr); + + if (valid_len == 0) { + // Block entirely beyond sequence: write mij=-1e30, lij=0, pij=0 + // Use -1e30 instead of -inf to avoid NaN in online_update (exp(-inf - (-inf)) = NaN) + constexpr float NEG_LARGE = -1e30f; + for (int i = 0; i < kAlignedRows; i++) { + maxTile.SetValue(i, NEG_LARGE); + sumTile.SetValue(i, 0.0f); + } + for (int i = 0; i < M * N; i++) { + pijBf16Tile.SetValue(i, static_cast(0.0f)); + } + + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(mijGlobal, maxTile); + TSTORE(lijGlobal, sumTile); + TSTORE(pijGlobal, pijBf16Tile); + + if (b + 1 < batch_count) { + pipe_barrier(PIPE_ALL); + } + continue; + } + + 
TLOAD(sijTile, sijGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + + TileSijDyn sijDynTile(static_cast(valid_len)); + TASSIGN(sijDynTile, 0x0); + TFILLPAD_INPLACE(sijPadTile, sijDynTile); + pipe_barrier(PIPE_V); + + TMULS(sijTile, sijTile, scale_value); + pipe_barrier(PIPE_V); + TROWMAX(maxTile, sijTile, tmpTile); + pipe_barrier(PIPE_V); + TROWEXPANDSUB(pijTile, sijTile, maxTile); + pipe_barrier(PIPE_V); + TEXP(pijTile, pijTile); + pipe_barrier(PIPE_V); + // Truncate pij to bf16 first, then compute lij from truncated values (matches golden) + TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + pipe_barrier(PIPE_V); + TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND); + pipe_barrier(PIPE_V); + TROWSUM(sumTile, pijTile, tmpTile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(pijGlobal, pijBf16Tile); + TSTORE(mijGlobal, maxTile); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1); + TSTORE(lijGlobal, sumTile); + + if (b + 1 < batch_count) { + pipe_barrier(PIPE_ALL); + } + } + + pipe_sync(); +} + +extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) { + __gm__ Tensor *sij_batch = reinterpret_cast<__gm__ Tensor *>(args[0]); + __gm__ Tensor *context_lens_t = reinterpret_cast<__gm__ Tensor *>(args[1]); + __gm__ Tensor *pij_batch = reinterpret_cast<__gm__ Tensor *>(args[2]); + __gm__ Tensor *mij_batch = reinterpret_cast<__gm__ Tensor *>(args[3]); + __gm__ Tensor *lij_batch = reinterpret_cast<__gm__ Tensor *>(args[4]); + union { + uint64_t u; + float f; + } scale_conv; + scale_conv.u = static_cast(args[5]); + float scale_value = scale_conv.f; + uint64_t batch_count = static_cast(args[6]); + uint64_t block_idx = static_cast(args[7]); + uint64_t batch_start = static_cast(args[8]); + + uint64_t q_tile_size = static_cast(sij_batch->shapes[0] / batch_count); + uint64_t block_size = static_cast(pij_batch->shapes[1]); + + if (q_tile_size == 16 
&& block_size <= 16) { + softmax_prepare_batch_impl<16, 16>( + sij_batch, context_lens_t, pij_batch, mij_batch, lij_batch, scale_value, batch_count, block_idx, batch_start + ); + } else if (q_tile_size == 16) { + softmax_prepare_batch_impl<16, 128>( + sij_batch, context_lens_t, pij_batch, mij_batch, lij_batch, scale_value, batch_count, block_idx, batch_start + ); + } else { + softmax_prepare_batch_impl<64, 64>( + sij_batch, context_lens_t, pij_batch, mij_batch, lij_batch, scale_value, batch_count, block_idx, batch_start + ); + } +} diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp new file mode 100644 index 000000000..1717ebc48 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Batch Paged Attention Orchestration Function - Production Scale + * + * Chunked batched architecture: the full batch is split into chunks of + * IN_CORE_BATCH size. 
Each chunk's QK/SF/PV/UP tasks are independent + * and can be scheduled to different cores in parallel. + * + * Task count = num_chunks * (1 + max_bn * 4), where + * num_chunks = ceil(batch / IN_CORE_BATCH) + * + * For batch <= IN_CORE_BATCH, behavior is identical to the non-chunked version. + * + * Memory Layout: + * Query: (batch * num_heads, head_dim) bf16 + * Key: (total_blocks, block_size, head_dim) bf16 (stored as K^T for QK) + * Value: (total_blocks, block_size, head_dim) bf16 + * + * Per-chunk intermediate tensors (contiguous across chunk_bc dimension): + * sij: (chunk_bc * q_tile, block_size) fp32 + * pij: (chunk_bc * q_tile, block_size) bf16 + * mij/lij: (chunk_bc * q_tile) fp32 + * oi_new: (chunk_bc * q_tile, head_dim) fp32 + * oi: (chunk_bc * q_tile, head_dim) fp32 accumulator + * mi/li: (chunk_bc * q_tile) fp32 accumulator + * + * Kernels receive global tensors + scalar metadata (including batch_start) + * and compute per-batch addresses internally. + */ + +#include +#include + +#include +#include + +#include "pto_orchestration_api.h" + +#define FUNC_QK_MATMUL 0 +#define FUNC_SOFTMAX_PREPARE 1 +#define FUNC_PV_MATMUL 2 +#define FUNC_ONLINE_UPDATE 3 +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 7, + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + // Read dimensions from tensor metadata + uint64_t batch = orch_args.tensor(0).ref().shapes[0]; + uint64_t num_heads = orch_args.tensor(0).ref().shapes[1]; + uint64_t head_dim = orch_args.tensor(0).ref().shapes[2]; + DataType data_type = orch_args.tensor(0).ref().dtype; + + uint64_t block_size = orch_args.tensor(1).ref().shapes[1]; + uint64_t block_num = orch_args.tensor(3).ref().shapes[1]; + + uint64_t scale_value = orch_args.scalar(0); + + uint64_t q_tile = std::min(num_heads, 
static_cast(128)); + uint64_t q_loop = (num_heads + q_tile - 1) / q_tile; + uint64_t elem_size = get_element_size(data_type); + + LOG_INFO_V0("batch_paged_attention: batch=%" PRIu64 ", num_heads=%" PRIu64, batch, num_heads); + + void *query_ptr = orch_args.tensor(0).ref().data_as(); + void *kc_ptr = orch_args.tensor(1).ref().data_as(); + void *vc_ptr = orch_args.tensor(2).ref().data_as(); + void *out_ptr = orch_args.tensor(5).ref().data_as(); + + uint32_t bt_shapes[2] = {static_cast(batch), static_cast(block_num)}; + Tensor block_table = + make_tensor_external(orch_args.tensor(3).ref().data_as(), bt_shapes, 2, DataType::INT32, false); + + uint32_t cl_shapes[1] = {static_cast(batch)}; + Tensor context_lens = + make_tensor_external(orch_args.tensor(4).ref().data_as(), cl_shapes, 1, DataType::INT32, false); + + uint64_t max_bn = 0; + for (uint64_t b = 0; b < batch; b++) { + uint32_t cl_idx[1] = {static_cast(b)}; + uint64_t cur_seq = static_cast(get_tensor_data(context_lens, 1, cl_idx)); + uint64_t bn_b = (cur_seq + block_size - 1) / block_size; + if (bn_b > max_bn) max_bn = bn_b; + } + + uint32_t query_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + uint64_t total_blocks_count = orch_args.tensor(1).ref().shapes[0]; + uint64_t kv_total_rows = total_blocks_count * block_size; + uint32_t key_cache_shapes[2] = {static_cast(kv_total_rows), static_cast(head_dim)}; + uint32_t value_cache_shapes[2] = {static_cast(kv_total_rows), static_cast(head_dim)}; + uint32_t out_shapes[2] = {static_cast(batch * num_heads), static_cast(head_dim)}; + + Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type); + Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type); + Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type); + Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32, true); + + constexpr uint64_t IN_CORE_BATCH = 16; + uint64_t num_chunks = (batch + IN_CORE_BATCH 
- 1) / IN_CORE_BATCH; + + for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) { + uint64_t q_offset = q_idx * q_tile; + + for (uint64_t chunk_idx = 0; chunk_idx < num_chunks; chunk_idx++) { + uint64_t chunk_bc = batch - chunk_idx * IN_CORE_BATCH; + if (chunk_bc > IN_CORE_BATCH) chunk_bc = IN_CORE_BATCH; + uint64_t batch_start = chunk_idx * IN_CORE_BATCH; + + PTO2_SCOPE() { + uint32_t oi_acc_shapes[2] = {static_cast(chunk_bc * q_tile), static_cast(head_dim)}; + uint32_t scalar_acc_shapes[1] = {static_cast(chunk_bc * q_tile)}; + TensorCreateInfo oi_batch_ci(oi_acc_shapes, 2, DataType::FLOAT32); + TensorCreateInfo scalar_acc_ci(scalar_acc_shapes, 1, DataType::FLOAT32); + TaskOutputTensors alloc_outs = alloc_tensors(oi_batch_ci, scalar_acc_ci, scalar_acc_ci); + const Tensor &oi_batch = alloc_outs.get_ref(0); + const Tensor &li_batch = alloc_outs.get_ref(1); + const Tensor &mi_batch = alloc_outs.get_ref(2); + + // Inner-loop create infos: shapes are loop-invariant, hoist out of bn loop + uint32_t sij_shapes[2] = {static_cast(chunk_bc * q_tile), static_cast(block_size)}; + uint32_t vec_shapes[1] = {static_cast(chunk_bc * q_tile)}; + uint32_t oi_new_shapes[2] = {static_cast(chunk_bc * q_tile), static_cast(head_dim)}; + TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32); + TensorCreateInfo pij_ci(sij_shapes, 2, data_type); + TensorCreateInfo vec_ci(vec_shapes, 1, DataType::FLOAT32); + TensorCreateInfo oi_new_ci(oi_new_shapes, 2, DataType::FLOAT32); + + for (uint64_t bn = 0; bn < max_bn; bn++) { + PTO2_SCOPE() { + L0TaskArgs params_qk; + params_qk.add_input(query); + params_qk.add_input(key_cache); + params_qk.add_input(block_table); + params_qk.add_output(sij_ci); + params_qk.add_scalar(chunk_bc); + params_qk.add_scalar(bn); + params_qk.add_scalar(q_offset); + params_qk.add_scalar(block_num); + params_qk.add_scalar(num_heads); + params_qk.add_scalar(batch_start); + TaskOutputTensors qk_outs = rt_submit_aic_task(FUNC_QK_MATMUL, params_qk); + const Tensor &sij_b = 
qk_outs.get_ref(0); + + L0TaskArgs params_sf; + params_sf.add_input(sij_b); + params_sf.add_input(context_lens); + params_sf.add_output(pij_ci); + params_sf.add_output(vec_ci); + params_sf.add_output(vec_ci); + params_sf.add_scalar(scale_value); + params_sf.add_scalar(chunk_bc); + params_sf.add_scalar(bn); + params_sf.add_scalar(batch_start); + TaskOutputTensors sf_outs = rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf); + const Tensor &pij_b = sf_outs.get_ref(0); + const Tensor &mij_b = sf_outs.get_ref(1); + const Tensor &lij_b = sf_outs.get_ref(2); + + L0TaskArgs params_pv; + params_pv.add_input(pij_b); + params_pv.add_input(value_cache); + params_pv.add_input(block_table); + params_pv.add_output(oi_new_ci); + params_pv.add_scalar(chunk_bc); + params_pv.add_scalar(bn); + params_pv.add_scalar(block_num); + params_pv.add_scalar(batch_start); + TaskOutputTensors pv_outs = rt_submit_aic_task(FUNC_PV_MATMUL, params_pv); + const Tensor &oi_new_b = pv_outs.get_ref(0); + + uint64_t is_first = (bn == 0) ? 1 : 0; + uint64_t is_last = (bn == max_bn - 1) ? 
1 : 0; + L0TaskArgs params_up; + params_up.add_input(mij_b); + params_up.add_input(lij_b); + params_up.add_input(oi_new_b); + params_up.add_inout(mi_batch); + params_up.add_inout(li_batch); + params_up.add_inout(oi_batch); + params_up.add_inout(out); + params_up.add_scalar(is_first); + params_up.add_scalar(is_last); + params_up.add_scalar(chunk_bc); + params_up.add_scalar(q_offset); + params_up.add_scalar(num_heads); + params_up.add_scalar(batch_start); + rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up); + } + } + } + } + } + + LOG_INFO_V0( + "batch_paged_attention: %" PRIu64 " tasks (batch=%" PRIu64 ", max_bn=%" PRIu64 ", chunks=%" PRIu64 + ", IN_CORE_BATCH=%" PRIu64 ")", + static_cast(num_chunks * (1 + max_bn * 4)), batch, max_bn, num_chunks, IN_CORE_BATCH + ); +} + +} // extern "C" diff --git a/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/test_batch_paged_attention.py b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/test_batch_paged_attention.py new file mode 100644 index 000000000..f36391d77 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/batch_paged_attention/test_batch_paged_attention.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""Batch paged attention: batched online softmax with AIC/AIV subgraph splitting (bfloat16).""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden +from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestBatchPagedAttention(SceneTestCase): + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "name": "QK", + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "name": "SF", + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 2, + "name": "PV", + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 3, + "name": "UP", + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "Case1", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case2", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + 
"num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "Case3", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "manual": True, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 256, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseSmall1", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 9}, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseSmall2", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 9}, + "manual": True, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 31, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseSmall3", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 9}, + "manual": True, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 128, + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseVarSeq2", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 9}, + "manual": True, + "params": { + "batch": 2, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 33, + "context_lens_list": [33, 17], + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + { + "name": "CaseVarSeq4", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 9}, + "manual": True, + "params": { + "batch": 4, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + 
"context_len": 128, + "context_lens_list": [33, 64, 128, 15], + "max_model_len": 256, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + result = _pa_generate_inputs(params) + specs = [] + for name, value in result: + if isinstance(value, torch.Tensor): + specs.append(Tensor(name, value)) + else: + specs.append(Scalar(name, value)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/kernels/orchestration/chain_barrier_orch.cpp b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/kernels/orchestration/chain_barrier_orch.cpp new file mode 100644 index 000000000..eb9340bf8 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/kernels/orchestration/chain_barrier_orch.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Many-to-one barrier via explicit set_dependencies — exercises the dep_gen + * overflow chain wire format. + * + * Submits N producers each writing X[0] = 42.0, then a dummy_T whose only + * dependency surface is set_dependencies({all N producer ids}, N), then a + * consumer that explicit-depends on the barrier and copies X[0] -> Y[0]. + * + * Picking N > DEP_GEN_MAX_EXPLICIT_DEPS (=64) forces the dep_gen capture to + * spill into one or more DepGenOverflowRecord slots; picking N to span the + * 64 + k*326 boundaries exercises both single- and multi-overflow chains. + * + * Args layout: [X, Y, scalar(N)] + * - X: every producer writes it (tensormap auto-deps the chain so the + * SENTINEL is preserved); consumer reads it. + * - Y: consumer writes it; host checks Y[0] == SENTINEL. + * + * Scalar: N (1 .. MAX_PRODUCERS). + */ + +#include + +#include "pto_orchestration_api.h" // NOLINT(build/include_subdir) + +#define FUNC_WRITE_CONST 0 +#define FUNC_COPY_FIRST 1 + +// Stack room for producer_ids[]. 500 covers everything we expect to test; +// PTO2_DEP_LIST_POOL_SIZE (16384) is the real ceiling on a per-ring basis. 
+static constexpr int32_t MAX_PRODUCERS = 500; + +extern "C" { + +__attribute__((visibility("default"))) PTO2OrchestrationConfig aicpu_orchestration_config(const L2TaskArgs &orch_args) { + (void)orch_args; + return PTO2OrchestrationConfig{ + .expected_arg_count = 3, // X, Y, scalar(N) + }; +} + +__attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2TaskArgs &orch_args) { + const Tensor &ext_X = orch_args.tensor(0).ref(); + const Tensor &ext_Y = orch_args.tensor(1).ref(); + + uint64_t n_raw = orch_args.scalar(0); + int32_t n = static_cast(n_raw); + if (n < 1 || n > MAX_PRODUCERS) { + rt_report_fatal(PTO2_ERROR_INVALID_ARGS, "chain_barrier_orch: invalid n=%d", n); + return; + } + + PTO2TaskId producer_ids[MAX_PRODUCERS]; + + // N producers each INOUT X. tensormap auto-deps them in a chain, so X[0] + // stays at SENTINEL through all of them — the host only checks the final + // value, which proves the barrier waited for every producer to finish. + for (int32_t i = 0; i < n; i++) { + L0TaskArgs args; + args.add_inout(ext_X); + producer_ids[i] = rt_submit_aic_task(FUNC_WRITE_CONST, args).task_id(); + } + + // Dummy barrier with explicit deps on ALL N producers. dc=n > 64 forces + // the dep_gen writer to emit base + overflow chain. + PTO2TaskId barrier_id; + { + L0TaskArgs args; + args.set_dependencies(producer_ids, n); + barrier_id = rt_submit_dummy_task(args).task_id(); + } + + // Consumer: explicit dep on barrier only, reads X, writes Y. 
+ { + L0TaskArgs args; + PTO2TaskId consumer_deps[] = {barrier_id}; + args.set_dependencies(consumer_deps, 1); + args.add_input(ext_X); + args.add_inout(ext_Y); + rt_submit_aic_task(FUNC_COPY_FIRST, args); + } +} + +} // extern "C" diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen.py b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen.py new file mode 100644 index 000000000..7377b545c --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""dep_gen capture + replay sim test. + +Re-runs the ``vector_example`` orchestration with ``--enable-dep-gen``. +Verifies the end-to-end dep_gen pipeline on a2a3sim: + + ``/deps.json`` is produced by the host replay + (PTO2TensorMap replay → JSON edge list), and contains exactly the + 6 edges documented in example_orchestration.cpp. The capture path + (host collector drains the device ring buffer into memory and feeds + the replay directly — no submit_trace.bin on disk) is exercised + implicitly: if it broke, deps.json would be empty or wrong. 
+ +deps.json is now the sole source of truth for fanout edges — the device +hot path no longer records L2SwimlaneAicpuTaskRecord::fanout[], so there is no +"fanout ⊆ deps" cross-check to run. swimlane_converter.py joins +deps.json into the Perfetto trace at post-process time. + +Compute correctness is delegated to the upstream ``vector_example`` test — +this case re-uses the same orchestration to keep coverage focused on the +capture+replay+validation pipeline. +""" + +import json +import shutil +import subprocess +import sys + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename + +KERNELS_BASE = "../../../../../../examples/a2a3/tensormap_and_ringbuffer/vector_example/kernels" + + +def _task_id(ring: int, local: int) -> int: + """Encode (ring_id, local_id) → 64-bit raw matching ``PTO2TaskId::raw`` — + keeps the bit layout (``(ring << 32) | local``) in one place rather than + repeating ``1 << 32`` arithmetic at every call site. 
+ """ + return (ring << 32) | local + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestDepGen(SceneTestCase): + """Vector example, run with dep_gen enabled, then verify deps.json.""" + + CALLABLE = { + "orchestration": { + "source": f"{KERNELS_BASE}/orchestration/example_orchestration.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{KERNELS_BASE}/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{KERNELS_BASE}/aiv/kernel_add_scalar.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{KERNELS_BASE}/aiv/kernel_mul.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {}, + }, + ] + + def generate_args(self, params): + SIZE = 128 * 128 + return TaskArgsBuilder( + Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)), + Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)), + Tensor("f", torch.zeros(SIZE, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b + 1) * (args.a + args.b + 2) + (args.a + args.b) + + def test_run(self, st_platform, st_worker, request): + # Run the standard scene-test loop, then assert dep_gen output for the + # cases that actually ran on this platform. Without this override, the + # pytest path silently passes when dep_gen is disabled in the AICPU + # build (the trace ring stays empty and deps.json is just `{"edges":[]}`) + # — the bug that prompted #742. Use the framework helper so the + # rounds-guard stays consistent with SceneTestCase.test_run (super() + # already warned, so warn=False here). 
+ super().test_run(st_platform, st_worker, request) + if not self._effective_enable_dep_gen(request): + return + for case in self.CASES: + if st_platform in case.get("platforms", []): + self._post_validate(case) + + def _post_validate(self, case): + """Skips if no per-case output_prefix dir exists (e.g. selector + skipped this case at pytest level). When the dir + deps.json are + present, assert that deps.json contains the 6 edges documented in + example_orchestration.cpp. + """ + case_name = case["name"] + safe_label = _sanitize_for_filename(f"TestDepGen_{case_name}") + outputs = _outputs_dir() + matches = sorted(outputs.glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime) + if not matches: + # No output_prefix dir — dep_gen flag wasn't on for this run; nothing + # to validate. Don't fail the test (the case itself already passed). + return + out_dir = matches[-1] + + # ---- deps.json (host replay output — sole dep_gen artifact on disk) ---- + # We only reach here with --enable-dep-gen on and rounds<=1 (the + # test_run gate via _effective_enable_dep_gen) AND an output dir present + # (the case actually ran). deps.json MUST therefore have been produced; + # its absence means the capture->reconcile->replay pipeline silently + # produced nothing (reconcile drops or replay failure) — exactly the + # regression this test exists to catch (#742). Fail loudly, don't skip. + deps_path = out_dir / "deps.json" + assert deps_path.exists(), ( + f"--enable-dep-gen is on and {out_dir} exists, but deps.json was not produced " + f"— capture/reconcile/replay pipeline regression" + ) + with deps_path.open() as f: + deps = json.load(f) + # Strided-Tensor schema: annotated edges with tasks[] / tensors[] + # sidecars carrying strided slice descriptors (start_offset + + # stride[]). Project annotated edges down to a (pred, succ) set for + # the existing structural checks; the annotation sanity check below + # verifies the tensor metadata path. 
+ raw_edges = deps.get("edges", []) + deps_edges = set() + for e in raw_edges: + assert isinstance(e, dict), f"deps.json edge must be an object, got {type(e).__name__}: {e!r}" + pred, succ = e.get("pred"), e.get("succ") + if pred is None or succ is None: + continue + deps_edges.add((int(pred), int(succ))) + + # example_orchestration.cpp comment block (verified by tracing the source): + # t0: ring 0, local 0 + # t1..t4: ring 1, local 0..3 (inner manual scope → ring 1) + # Edges: t0->t1, t0->t2, t1->t3, t2->t3, t0->t4, t3->t4 + t0 = _task_id(0, 0) + t1 = _task_id(1, 0) + t2 = _task_id(1, 1) + t3 = _task_id(1, 2) + t4 = _task_id(1, 3) + expected_edges = {(t0, t1), (t0, t2), (t1, t3), (t2, t3), (t0, t4), (t3, t4)} + missing = expected_edges - deps_edges + assert not missing, f"deps.json missing expected edges: {missing} (got {deps_edges})" + # Allow extra edges (creator-retention may add owner edges that don't appear + # in the comment's logical-dep view), but flag anything outside the task set. + valid_ids = {t0, t1, t2, t3, t4} + bad = {e for e in deps_edges if e[0] not in valid_ids or e[1] not in valid_ids} + assert not bad, f"deps.json contains edges referencing unknown task ids: {bad}" + + # ---- Annotated-edge sanity ---- + # Replay always emits the tensor-info sidecar; the differential check + # inside the replay would have failed the run before we got here if + # the annotated pass disagreed with compute_task_fanin. These + # assertions just confirm the schema actually carries the expected + # blocks (so e.g. a future "always write empty arrays" bug would + # surface here, not silently in a downstream viewer). + tasks = deps.get("tasks", []) + tensors = deps.get("tensors", []) + task_ids = {int(t["task_id"]) for t in tasks if "task_id" in t} + assert valid_ids <= task_ids, f"tasks[] missing expected ids: {valid_ids - task_ids}" + # Every non-explicit edge should reference a tensor_id present in + # tensors[]. EXPLICIT edges legitimately omit it. 
+ tensor_ids = {int(t["tensor_id"]) for t in tensors if "tensor_id" in t} + for e in raw_edges: + if not isinstance(e, dict): + continue + source = e.get("source") + if source == "explicit": + continue + tid = e.get("tensor_id") + assert tid is not None and int(tid) in tensor_ids, ( + f"edge {e.get('pred')}->{e.get('succ')} (source={source}) " + f"references tensor_id {tid} absent from tensors[]" + ) + # Annotated edges must carry consumer-side strided slice info. + assert "consumer_shape" in e and "consumer_start_offset" in e and "consumer_strides" in e, ( + f"edge {e.get('pred')}->{e.get('succ')} (source={source}) missing consumer_shape/start_offset/strides" + ) + + # ---- Tool smoke: deps_viewer (text) ---- + # scene_test auto-generates deps_viewer.txt via _graph_case_dep_gen; + # smoke verifies it was produced and has the expected sections. + out_txt = out_dir / "deps_viewer.txt" + assert out_txt.exists(), f"scene_test auto-hook did not produce {out_txt}" + text = out_txt.read_text() + assert "SUMMARY" in text and "TASK INDEX" in text, "text deps graph missing expected sections" + + for extra in (["--direction", "LR"], ["--engine", "dot"]): + bad = subprocess.run( + [ + sys.executable, + "-m", + "simpler_setup.tools.deps_viewer", + str(deps_path), + "--format", + "text", + *extra, + ], + check=False, + timeout=60, + capture_output=True, + text=True, + ) + assert bad.returncode != 0, f"text mode should reject {' '.join(extra)}" + assert "only valid with --format html" in bad.stderr + + if shutil.which("dot"): + out_html = out_dir / "_smoke_deps.html" + subprocess.run( + [ + sys.executable, + "-m", + "simpler_setup.tools.deps_viewer", + str(deps_path), + "--format", + "html", + "-o", + str(out_html), + ], + check=True, + timeout=60, + ) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen_chain.py 
b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen_chain.py new file mode 100644 index 000000000..774ca0470 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/dfx/dep_gen/test_dep_gen_chain.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""dep_gen overflow chain regression — submits with >64 explicit deps. + +A submit with explicit_dep_count > DEP_GEN_MAX_EXPLICIT_DEPS (=64) spills the +extra deps into one or more DepGenOverflowRecord slots that overlay the same +buffer ring. Before the chain wire format, dep_gen would silently truncate +the tail in deps.json; this test verifies every explicit dep edge survives +the round-trip writer → host collector → replay → deps.json. + +Test shape (chain_barrier_orch.cpp): N producers each INOUT X, then a dummy +barrier `set_dependencies({all N producer ids})`, then a consumer +`set_dependencies({barrier_id})` reading X and writing Y. 
With N in +{64, 65, 200, 391} (straddling the 64/65 and 390/391 capacity boundaries) we exercise: + + - n=64: base only (no chain) — sanity baseline + - n=65: base + 1 overflow record (1 dep in overflow) + - n=200: base + 1 overflow (136 deps in overflow) + - n=391: base + 2 overflow (326 + 1 deps across two overflows) + +Validation: the barrier task in deps.json must have exactly N predecessors, +all of which are the producer ids. The consumer must have one explicit +predecessor — the barrier. +""" + +import json + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test +from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename + +# Path is relative to this file's directory (the SceneTestCase build helper +# resolves CALLABLE sources from there). dummy_task already ships the two +# kernels we need (write_const + copy_first), so we reuse those instead of +# duplicating the source. +DUMMY_KERNELS = "../../dummy_task/kernels" + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestDepGenChain(SceneTestCase): + """dep_gen overflow chain: many-to-one barrier with >64 explicit deps.""" + + RTOL = 0 + ATOL = 0 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/chain_barrier_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.INOUT, D.INOUT], # X, Y; N goes as scalar + }, + "incores": [ + { + "func_id": 0, + "name": "WRITE_CONST", + "source": f"{DUMMY_KERNELS}/aic/kernel_write_const.cpp", + "core_type": "aic", + # Single-AIC task with one INOUT tensor (args[0]). Declared so + # the tensor dump's per-subtask sum matches the payload. + "signature": [D.INOUT], + }, + { + "func_id": 1, + "name": "COPY_FIRST", + "source": f"{DUMMY_KERNELS}/aic/kernel_copy_first.cpp", + "core_type": "aic", + # Single-AIC task: copies args[0] -> args[1] (IN, INOUT). 
+ "signature": [D.IN, D.INOUT], + }, + ], + } + + # Sentinel must match kernel_write_const (writes 42.0f). + SENTINEL = 42.0 + INIT_VAL = -1.0 + + CASES = [ + { + "name": "n_64_no_chain", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 2, "block_dim": 1}, + "params": {"n": 64}, + }, + { + "name": "n_65_single_overflow", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 2, "block_dim": 1}, + "params": {"n": 65}, + }, + { + "name": "n_200_single_overflow", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 2, "block_dim": 1}, + "params": {"n": 200}, + }, + { + "name": "n_391_two_overflow", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 2, "block_dim": 1}, + "params": {"n": 391}, + }, + ] + + def generate_args(self, params): + # Single-element tensors are enough — kernel_write_const writes index 0 + # and kernel_copy_first reads index 0. + x = torch.full((16,), self.INIT_VAL, dtype=torch.float32) + y = torch.full((16,), self.INIT_VAL, dtype=torch.float32) + return TaskArgsBuilder( + Tensor("x", x), + Tensor("y", y), + Scalar("n", int(params["n"])), + ) + + def compute_golden(self, args, params): + # Producers each write SENTINEL to X[0]; consumer copies X[0] -> Y[0]. + # If the barrier didn't actually wait for all producers, the consumer + # could race ahead and copy INIT_VAL instead — making the host check + # a de facto sanity gate even before we look at deps.json. + args.x[0] = self.SENTINEL + args.y[0] = self.SENTINEL + + def test_run(self, st_platform, st_worker, request): + super().test_run(st_platform, st_worker, request) + if not self._effective_enable_dep_gen(request): + return + for case in self.CASES: + if st_platform in case.get("platforms", []): + self._post_validate(case) + + def _post_validate(self, case): + """Verify every explicit dep edge survived the writer → replay round-trip. 
+ + With dep_gen on, deps.json must contain N edges from the producers to + the barrier task (one per `set_dependencies` entry the orchestration + emitted), plus the consumer's one explicit edge back from the barrier. + Pre-chain code would truncate the producer→barrier edge set to 16/64. + """ + case_name = case["name"] + n = int(case["params"]["n"]) + safe_label = _sanitize_for_filename(f"TestDepGenChain_{case_name}") + outputs = _outputs_dir() + matches = sorted(outputs.glob(f"{safe_label}_*"), key=lambda p: p.stat().st_mtime) + assert matches, f"no output dir for case {case_name!r} — scene didn't run on this platform?" + out_dir = matches[-1] + deps_path = out_dir / "deps.json" + # _post_validate is only invoked when dep_gen was effectively enabled; + # absence of deps.json means the host runner declined to emit it (most + # likely reconcile_counters failed). Surface that as a hard failure + # rather than silently passing — the whole point of this test is to + # catch chain-side reconciliation regressions. + assert deps_path.exists(), ( + f"dep_gen was enabled but {deps_path} is missing. Likely cause: " + f"reconcile_counters() detected a count mismatch and suppressed deps.json emission. " + f"Check the run log for 'dep_gen reconcile' warnings." + ) + + with deps_path.open() as f: + deps = json.load(f) + + raw_edges = deps.get("edges", []) + # Project annotated edges → (pred, succ) — we only care about graph + # structure here; the annot-vs-oracle agreement gate already ran + # inside the replay before deps.json was written. + edges = set() + explicit_edges = set() + for e in raw_edges: + if not isinstance(e, dict): + continue + pred, succ = e.get("pred"), e.get("succ") + if pred is None or succ is None: + continue + pair = (int(pred), int(succ)) + edges.add(pair) + if e.get("source") == "explicit": + explicit_edges.add(pair) + + # Identify the barrier task: it's the task with exactly n explicit-source + # incoming edges. 
(Producers have 0; consumer has 1 — the one to barrier.) + explicit_by_succ = {} + for pred, succ in explicit_edges: + explicit_by_succ.setdefault(succ, set()).add(pred) + barrier_candidates = [tid for tid, preds in explicit_by_succ.items() if len(preds) == n] + assert len(barrier_candidates) == 1, ( + f"expected exactly one task with {n} explicit predecessors " + f"(the barrier), got {len(barrier_candidates)}: " + f"{[(tid, len(preds)) for tid, preds in explicit_by_succ.items()]}" + ) + barrier_id = barrier_candidates[0] + barrier_preds = explicit_by_succ[barrier_id] + + # All N producer→barrier edges must be present. This is the chain + # round-trip assertion: pre-chain code drops anything past index 63. + assert len(barrier_preds) == n, f"barrier has {len(barrier_preds)} preds, expected {n}" + + # Consumer must explicit-depend on the barrier — exactly one outgoing + # explicit edge from the barrier. + outgoing_explicit_from_barrier = {succ for pred, succ in explicit_edges if pred == barrier_id} + assert len(outgoing_explicit_from_barrier) == 1, ( + f"barrier {barrier_id} has {len(outgoing_explicit_from_barrier)} outgoing explicit edges, " + f"expected 1 (the consumer)" + ) + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/__init__.py b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/__init__.py new file mode 100644 index 000000000..ad03ca31b --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- diff --git a/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/_swimlane_validate.py b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/_swimlane_validate.py new file mode 100644 index 000000000..13efeadd2 --- /dev/null +++ b/tests/st/a2a3/fully_distributed_within_core/dfx/l2_swimlane/_swimlane_validate.py @@ -0,0 +1,240 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Shared l2_swimlane post-case validation. + +The vector_example and paged_attention swimlane tests run the same capture → +tool smoke → differential gate sequence; the only difference between them is +the workload itself. The helpers below are workload-agnostic so each test +file owns only its CALLABLE + cases. 
+ +The differential gate is the load-bearing assertion: it parses the script's +printed Pop / Fanout / Fanin totals and cross-checks them against an oracle +computed straight from the raw artifacts. The paged_attention test exercises +the per-task dedup branch in ``compute_dag_stats_from_deps`` because mixed +AIC+AIV tasks produce multiple perf rows per ``task_id``. +""" + +from __future__ import annotations + +import json +import re +import subprocess +import sys +from pathlib import Path + +from simpler_setup.scene_test import _outputs_dir, _sanitize_for_filename +from simpler_setup.tools.swimlane_converter import read_perf_data + +_REQUIRED_TASK_FIELDS = ( + "task_id", + "func_id", + "core_id", + "core_type", + "start_time_us", + "end_time_us", + # receive_time_us / local_setup_us are populated unconditionally by the + # AICore-side capture (v3 schema). propagation_us requires AICPU dispatch_ts + # and is therefore only present at level≥2 — not in this required-set. + "receive_time_us", + "local_setup_us", +) + + +def validate_perf_artifact(case_label: str, *, expected_task_count: int | None = None) -> None: + """Locate the latest output dir for ``case_label`` and run the full + capture-→-tools-→-differential sequence. + + Args: + case_label: full SceneTest case label (``f"{cls_name}_{case_name}"``) + used to glob the per-case ``outputs/