Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
9f7efd7
adding tracr back in again (working)
noabauma Jun 10, 2026
6c3e026
adding tracr markers and the INDEP_ORCH method
noabauma Jun 11, 2026
fd73895
Merge branch 'main' into tracr
noabauma Jun 11, 2026
653785a
merging main again
noabauma Jun 16, 2026
643537d
Merge branch 'main' into tracr
noabauma Jun 17, 2026
f51b297
Merge branch 'main' into tracr
noabauma Jun 17, 2026
d5c73c8
Merge branch 'main' into tracr
noabauma Jun 19, 2026
2ca1954
remove this redundant print
noabauma Jun 19, 2026
f1f53d5
Merge branch 'main' into tracr
noabauma Jun 22, 2026
87b9988
merging main again
noabauma Jun 23, 2026
aaa3b3a
Merge branch 'main' into tracr
noabauma Jun 23, 2026
d610b50
mergin main again
noabauma Jun 25, 2026
22d49f7
200 -> 1000
noabauma Jun 26, 2026
104c330
Merge branch 'main' into tracr
noabauma Jun 26, 2026
9d8699f
adding corrections gemini wanted
noabauma Jun 26, 2026
d1daa48
Merge remote-tracking branch 'origin/main' into tracr
noabauma Jun 26, 2026
7171a74
Merge branch 'tracr_save' into tracr
noabauma Jun 26, 2026
53a28ae
updating the CI
noabauma Jun 26, 2026
ea0b944
adding more ci fixes
noabauma Jun 26, 2026
079f147
minor fix for the ci
noabauma Jun 26, 2026
9373caa
mergein main again
noabauma Jun 29, 2026
b7f16be
Merge branch 'main' into tracr
noabauma Jun 30, 2026
83eef5a
Fix(tracr): adapt to refactored Runtime dev-split API (#1216)
noabauma Jun 30, 2026
068d11f
mergin main again...
noabauma Jul 1, 2026
279ef8c
style: clang-format TraCR changes (clang-format v21.1.0)
noabauma Jul 1, 2026
3b729e5
updating tracr version
noabauma Jul 1, 2026
d036300
style: ruff-format kernel_compiler.py TraCR changes
noabauma Jul 1, 2026
6c6c509
style: fix license headers on TraCR .cmake files
noabauma Jul 1, 2026
e5d7da4
style: add final newline to TraCR header files
noabauma Jul 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v5
with:
submodules: recursive
fetch-depth: 0

- name: Install build and lint tools
Expand Down Expand Up @@ -71,6 +72,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Set up C++ compiler (Linux)
if: runner.os == 'Linux'
Expand Down Expand Up @@ -113,6 +116,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v6
Expand Down Expand Up @@ -175,6 +180,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Load pinned pto-isa commit
uses: ./.github/actions/read-pto-isa
Expand Down Expand Up @@ -264,6 +271,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Load pinned pto-isa commit
uses: ./.github/actions/read-pto-isa
Expand Down Expand Up @@ -354,6 +363,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Set up C++ compiler
run: |
Expand Down Expand Up @@ -454,6 +465,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Load pinned pto-isa commit
uses: ./.github/actions/read-pto-isa
Expand Down Expand Up @@ -546,6 +559,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Load pinned pto-isa commit
uses: ./.github/actions/read-pto-isa
Expand Down Expand Up @@ -622,6 +637,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive
fetch-depth: 0
- name: Check file changes
id: check
Expand Down Expand Up @@ -668,6 +684,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Load pinned pto-isa commit
uses: ./.github/actions/read-pto-isa
Expand Down Expand Up @@ -743,6 +761,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Load pinned pto-isa commit
uses: ./.github/actions/read-pto-isa
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/sanitizers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
submodules: recursive

- name: Load pinned pto-isa commit
uses: ./.github/actions/read-pto-isa
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "tools/tracr"]
path = tools/tracr
url = https://github.com/huawei-csl/TracR.git
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
#include <cstdint>
#include <cstring>

#include <tracr/tracr.hpp>
#include <tracr_simpler_markers.hpp>

#include "pto_orchestration_api.h"

#define FUNC_QK_MATMUL 0
Expand Down Expand Up @@ -89,6 +92,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta

CYCLE_COUNT_START();

INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Read_Dimensions, 0);

// Read dimensions from tensor metadata
uint64_t batch = orch_args.tensor(0).ref().shapes[0];
uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
Expand All @@ -107,6 +112,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta

LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);

INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Reshape_Kernels, 0);

// Reshape tensors for kernel consumption (2D flattened)
void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
Expand Down Expand Up @@ -136,6 +143,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
Tensor context_lens =
make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);

INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Pre_Loop_Info, uint32_t(batch));

// Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size
uint32_t tile2d_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
uint32_t scalar_shapes[1] = {static_cast<uint32_t>(q_tile)};
Expand All @@ -148,11 +157,16 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
PROF_INC(prof_make_count, 4);
CYCLE_COUNT_LAP(prof_make_tensor);

LOG_INFO_V0(
"Thread %d: Orch PTO2_SCOPE loop: #batch=%" PRIu64 ", q_loop=%" PRIu64, g_TraCR_thread_idx, batch, q_loop
);

for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, PTO2_SCOPE_, uint32_t(b_idx + batch * q_idx));
PTO2_SCOPE() {
CYCLE_COUNT_LAP(prof_scope);
uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
Expand Down Expand Up @@ -250,7 +264,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
CYCLE_COUNT_LAP(prof_scope);
}
}

#ifdef ENABLE_PROFILING
uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
prof_submit_task + prof_scope;
Expand Down
21 changes: 20 additions & 1 deletion simpler_setup/kernel_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,17 @@ def get_orchestration_include_dirs(self, runtime_name: str) -> list[str]:
runtime_dir = str(self.project_root / "src" / arch / "runtime" / runtime_name / "runtime")
runtime_common_dir = str(self.project_root / "src" / arch / "runtime" / runtime_name / "common")
common_dir = str(self.project_root / "src" / "common" / "task_interface")
return [runtime_dir, runtime_common_dir, common_dir] + self.get_platform_include_dirs()
tracr_dir1 = str(self.project_root / "tools")
tracr_dir2 = str(self.project_root / "tools" / "tracr" / "include")
tracr_dir3 = str(self.project_root / "tools" / "tracr" / "extern")
return [
runtime_dir,
runtime_common_dir,
common_dir,
tracr_dir1,
tracr_dir2,
tracr_dir3,
] + self.get_platform_include_dirs()

def get_incore_include_dirs(self) -> list[str]:
"""
Expand Down Expand Up @@ -487,6 +497,15 @@ def _compile_orchestration_shared_lib(
if sys.platform != "darwin":
cmd.append("-Wl,--build-id=sha1")

if os.getenv("BUILD_TRACR", "OFF") == "ON":
cmd.extend(
[
"-DENABLE_TRACR",
"-DTRACR_DISABLE_FLUSH",
"-DUSE_HW_COUNTER",
]
)

if extra_sources:
for src in extra_sources:
src = os.path.abspath(src)
Expand Down
13 changes: 13 additions & 0 deletions src/a2a3/platform/onboard/aicpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,19 @@ message(STATUS "AICpu kernel: ${NUM_AICPU_KERNEL_SOURCES} source files")
message(VERBOSE "AICpu kernel sources: ${AICPU_KERNEL_SOURCES}")
add_library(aicpu_kernel SHARED ${AICPU_KERNEL_SOURCES})

# TraCR
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
tracr_enable(aicpu_kernel)

# Option to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
if(DEFINED ENV{INDEP_ORCH})
set(INDEP_ORCH $ENV{INDEP_ORCH})
endif()
if(INDEP_ORCH)
target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH)
endif()

option(WERROR "Treat compiler warnings as errors" ON)
if(DEFINED ENV{SIMPLER_DISABLE_WARNINGS_AS_ERRORS})
set(WERROR OFF)
Expand Down
13 changes: 13 additions & 0 deletions src/a2a3/platform/onboard/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ message(STATUS "Host runtime: ${NUM_HOST_RUNTIME_SOURCES} source files")
message(VERBOSE "Host runtime sources: ${HOST_RUNTIME_SOURCES}")
add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES})

# TraCR
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
tracr_enable(host_runtime)

# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
if(DEFINED ENV{INDEP_ORCH})
set(INDEP_ORCH $ENV{INDEP_ORCH})
endif()
if(INDEP_ORCH)
target_compile_definitions(host_runtime PRIVATE INDEP_ORCH)
endif()

# C++ standard (applied only to C++ files)
set_target_properties(host_runtime PROPERTIES
CXX_STANDARD 17
Expand Down
21 changes: 21 additions & 0 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
#include <iostream>
#include <string>
#include <vector>

#include <tracr_simpler_api.hpp>

#include "acl/acl.h"

// Include HAL constants from CANN (header only, library loaded dynamically)
Expand Down Expand Up @@ -264,6 +267,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {

if (prepare_runtime_for_launch(runtime, block_dim, launch_aicpu_num) != 0) return -1;

// Initialize TraCR memory on the device
#ifdef ENABLE_TRACR
// LOG_INFO_V9("[TraCR] thread[%d] DevAllocTraCR device_id_=%d", sched_getcpu(), device_id_);
rc = DevAllocTraCR(this, runtime);
if (rc != 0) {
LOG_ERROR("DevAllocTraCR failed rc=%d", rc);
return rc;
}
#endif
Comment on lines +270 to +278

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🩺 Stability & Availability | 🟠 Major | ⚡ Quick win

Make the TraCR lifecycle unwind-safe.

DevAllocTraCR() can fail after populating one runtime pointer, and every later return before Line 487 skips the only shown free path in StoreTracrData(). The new Line 490 return also bypasses teardown_shared_collectors_after_run() at Line 496. Failed runs will leak TraCR buffers and skip the collector stop/export path. Either guard the TraCR buffers with an all-exit cleanup path here or make the helper transactional, then return the export error only after teardown has run.

Also applies to: 485-493

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/a2a3/platform/onboard/host/device_runner.cpp` around lines 270 - 278, The
TraCR allocation/error path in device_runner::StoreTracrData is not unwind-safe,
so failed runs can skip cleanup and leak buffers. Make the
DevAllocTraCR()/runtime pointer setup transactional or add a single all-exit
cleanup path that always runs before returning, ensuring the StoreTracrData free
path and teardown_shared_collectors_after_run() are executed even on failures.
Keep the final export-error return only after teardown has completed, and
preserve the existing error logging around DevAllocTraCR and later failure
points.

// a2a3 onboard now uses the same host-computed, device-filtered affinity
// shape as a5. Host probes the AICPU user pool once, chooses the active
// cpu_ids deterministically, writes them into Runtime, and the AICPU-side
Expand Down Expand Up @@ -471,6 +483,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {

read_device_wall_ns();

// Download and Free TraCR memory from Device and store in memory (~/ascend/)
#ifdef ENABLE_TRACR
rc = StoreTracrData(this, runtime);
if (rc != 0) {
LOG_ERROR("FreeTraCR failed: %d", rc);
return -1;
}
#endif

// Tear down collectors. stop() joins mgmt then collector in the only safe
// order (mgmt's final-drain pass into L2 has poll as its consumer).
teardown_shared_collectors_after_run();
Expand Down
21 changes: 21 additions & 0 deletions src/a2a3/platform/sim/aicpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,27 @@ endif()
# Create shared library (host-compatible for dlopen)
add_library(aicpu_kernel SHARED ${AICPU_SOURCES})

# TraCR
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
tracr_enable(aicpu_kernel)

# TODO: move this somewhere such that EVERY platform launches this once. Placing this here is hacky...
# Only build the host-side trace post-processor when TraCR is enabled: it is an
# offline analysis tool, and it pulls in Linux-only APIs (sched_getcpu) that do
# not compile on the macOS packaging build.
if(BUILD_TRACR)
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr_postprocessing_script.cmake)
endif()

Comment on lines +77 to +84

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | 🏗️ Heavy lift

Hoist TraCR post-processing out of the sim-only target.

Lines 77-79 register tracr_postprocessing_script.cmake only from the sim aicpu target, and the TODO already notes this should run once for every platform. As written, onboard builds miss the advertised post-process step entirely. Move this include to a shared parent CMake entry so it is added exactly once for both sim and onboard builds.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/a2a3/platform/sim/aicpu/CMakeLists.txt` around lines 77 - 79, The TraCR
post-processing include is only registered from the sim-only aicpu CMake target,
so onboard builds never get the shared post-process step. Move the include of
tracr_postprocessing_script.cmake out of the aicpu-specific CMakeLists and into
a shared parent CMake entry that is evaluated for all platforms, ensuring it is
added exactly once for both sim and onboard builds.

# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
if(DEFINED ENV{INDEP_ORCH})
set(INDEP_ORCH $ENV{INDEP_ORCH})
endif()
if(INDEP_ORCH)
target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH)
endif()

option(WERROR "Treat compiler warnings as errors" ON)
if(DEFINED ENV{SIMPLER_DISABLE_WARNINGS_AS_ERRORS})
set(WERROR OFF)
Expand Down
13 changes: 13 additions & 0 deletions src/a2a3/platform/sim/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,19 @@ endif()
# Create shared library
add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES})

# TraCR
include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
tracr_enable(host_runtime)

# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
if(DEFINED ENV{INDEP_ORCH})
set(INDEP_ORCH $ENV{INDEP_ORCH})
endif()
if(INDEP_ORCH)
target_compile_definitions(host_runtime PRIVATE INDEP_ORCH)
endif()

# C++ standard (applied only to C++ files)
set_target_properties(host_runtime PROPERTIES
CXX_STANDARD 17
Expand Down
20 changes: 20 additions & 0 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <string>
#include <vector>

#include <tracr_simpler_api.hpp>

#include "aicpu/device_phase_aicpu.h"
#include "aicpu/platform_aicpu_affinity.h"
#include "callable_protocol.h"
Expand Down Expand Up @@ -253,6 +255,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
worker_count_ = num_aicore;
runtime.set_aicpu_thread_num(launch_aicpu_num);

// Initialize TraCR memory on the device
#ifdef ENABLE_TRACR
rc = DevAllocTraCR(this, runtime);
if (rc != 0) {
LOG_ERROR("DevAllocTraCR failed rc=%d", rc);
return rc;
}
#endif

Comment on lines +258 to +266

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🩺 Stability & Availability | 🟠 Major | ⚡ Quick win

Make the TraCR lifecycle unwind-safe.

DevAllocTraCR() can fail after populating one runtime pointer, and every later return before Line 529 skips the only shown free path in StoreTracrData(). The new Line 532 return also bypasses the collector stop/reconcile/export sequence at Lines 538-572. Failed runs will leak TraCR buffers and skip collector teardown. Add an all-exit TraCR cleanup path and defer the export failure until after the collector shutdown path has completed.

Also applies to: 527-535

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/a2a3/platform/sim/host/device_runner.cpp` around lines 238 - 246, The
TraCR setup in device_runner.cpp is not unwind-safe: `DevAllocTraCR()` can
partially initialize runtime state, and later early returns in
`StoreTracrData()` / the export path skip the only cleanup and collector
shutdown sequence. Refactor the `StoreTracrData()` flow to funnel all exits
through a single TraCR cleanup path that frees any allocated TraCR buffers and
always runs the collector stop/reconcile/export teardown before returning, and
defer propagating export failure until after that shutdown sequence completes.

int num_aic = block_dim;
uint32_t enable_profiling_flag = PROFILING_FLAG_NONE;
if (enable_dump_tensor_) {
Expand Down Expand Up @@ -571,6 +582,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
return runtime_rc;
}

// Download and Free TraCR memory from Device and store in memory (~/ascend/)
#ifdef ENABLE_TRACR
rc = StoreTracrData(this, runtime);
if (rc != 0) {
LOG_ERROR("FreeTraCR failed: %d", rc);
return -1;
}
#endif

// Tear down collectors. stop() joins mgmt then collector in the only safe
// order (mgmt's final-drain pass into L2 has poll as its consumer).
if (enable_l2_swimlane_) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@

// Tensor dump uses these defaults to size its selective mask table so task-id
// ring/slot lookup stays aligned with PTO2 task id layout.
#ifdef INDEP_ORCH
#define PTO2_TASK_WINDOW_SIZE 65536 // Default per-ring task window size (power of 2)
#else
#define PTO2_TASK_WINDOW_SIZE 16384 // Default per-ring task window size (power of 2)
#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers
#endif

#define PTO2_MAX_RING_DEPTH 4 // Number of task-id ring layers

#endif // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
Loading
Loading