diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7008917a8..e0fbf5f8e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,6 +21,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v5
         with:
+          submodules: recursive
           fetch-depth: 0
 
       - name: Install build and lint tools
@@ -71,6 +72,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Set up C++ compiler (Linux)
         if: runner.os == 'Linux'
@@ -113,6 +116,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v6
@@ -175,6 +180,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Load pinned pto-isa commit
         uses: ./.github/actions/read-pto-isa
@@ -264,6 +271,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Load pinned pto-isa commit
         uses: ./.github/actions/read-pto-isa
@@ -354,6 +363,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Set up C++ compiler
         run: |
@@ -454,6 +465,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Load pinned pto-isa commit
         uses: ./.github/actions/read-pto-isa
@@ -546,6 +559,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Load pinned pto-isa commit
         uses: ./.github/actions/read-pto-isa
@@ -622,6 +637,7 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v5
         with:
+          submodules: recursive
           fetch-depth: 0
       - name: Check file changes
         id: check
@@ -668,6 +684,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Load pinned pto-isa commit
         uses: ./.github/actions/read-pto-isa
@@ -743,6 +761,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Load pinned pto-isa commit
         uses: ./.github/actions/read-pto-isa
diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml
index 2444fab32..7661e19f0 100644
--- a/.github/workflows/sanitizers.yml
+++ b/.github/workflows/sanitizers.yml
@@ -38,6 +38,8 @@ jobs:
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
+        with:
+          submodules: recursive
 
       - name: Load pinned pto-isa commit
         uses: ./.github/actions/read-pto-isa
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..e8945dcab
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "tools/tracr"]
+	path = tools/tracr
+	url = https://github.com/huawei-csl/TracR.git
diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 2ed86cdf2..7220228d3 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -25,6 +25,9 @@
 #include <cstdint>
 #include <cstring>
 
+#include <tracr/tracr.hpp>
+#include <tracr_simpler_markers.hpp>
+
 #include "pto_orchestration_api.h"
 
 #define FUNC_QK_MATMUL 0
@@ -89,6 +92,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
 
     CYCLE_COUNT_START();
 
+    INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Read_Dimensions, 0);
+
     // Read dimensions from tensor metadata
     uint64_t batch = orch_args.tensor(0).ref().shapes[0];
     uint64_t num_heads = orch_args.tensor(0).ref().shapes[1];
@@ -107,6 +112,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
 
     LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);
 
+    INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Reshape_Kernels, 0);
+
     // Reshape tensors for kernel consumption (2D flattened)
     void *query_ptr = orch_args.tensor(0).ref().data_as<void>();
     void *kc_ptr = orch_args.tensor(1).ref().data_as<void>();
@@ -136,6 +143,8 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
     Tensor context_lens =
         make_tensor_external(orch_args.tensor(4).ref().data_as<void>(), cl_shapes, 1, DataType::INT32, false);
 
+    INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Pre_Loop_Info, uint32_t(batch));
+
     // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size
     uint32_t tile2d_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
     uint32_t scalar_shapes[1] = {static_cast<uint32_t>(q_tile)};
@@ -148,11 +157,16 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
     PROF_INC(prof_make_count, 4);
     CYCLE_COUNT_LAP(prof_make_tensor);
 
+    LOG_INFO_V0(
+        "Thread %d: Orch PTO2_SCOPE loop: #batch=%" PRIu64 ", q_loop=%" PRIu64, g_TraCR_thread_idx, batch, q_loop
+    );
+
     for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
         uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
         uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
         uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
         for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
+            INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, PTO2_SCOPE_, uint32_t(b_idx + batch * q_idx));
             PTO2_SCOPE() {
                 CYCLE_COUNT_LAP(prof_scope);
                 uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
@@ -250,7 +264,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const L2Ta
             CYCLE_COUNT_LAP(prof_scope);
         }
     }
-
 #ifdef ENABLE_PROFILING
     uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
                      prof_submit_task + prof_scope;
diff --git a/simpler_setup/kernel_compiler.py b/simpler_setup/kernel_compiler.py
index 2ea17b66e..85ac7aed5 100644
--- a/simpler_setup/kernel_compiler.py
+++ b/simpler_setup/kernel_compiler.py
@@ -143,7 +143,17 @@ def get_orchestration_include_dirs(self, runtime_name: str) -> list[str]:
         runtime_dir = str(self.project_root / "src" / arch / "runtime" / runtime_name / "runtime")
         runtime_common_dir = str(self.project_root / "src" / arch / "runtime" / runtime_name / "common")
         common_dir = str(self.project_root / "src" / "common" / "task_interface")
-        return [runtime_dir, runtime_common_dir, common_dir] + self.get_platform_include_dirs()
+        tracr_dir1 = str(self.project_root / "tools")
+        tracr_dir2 = str(self.project_root / "tools" / "tracr" / "include")
+        tracr_dir3 = str(self.project_root / "tools" / "tracr" / "extern")
+        return [
+            runtime_dir,
+            runtime_common_dir,
+            common_dir,
+            tracr_dir1,
+            tracr_dir2,
+            tracr_dir3,
+        ] + self.get_platform_include_dirs()
 
     def get_incore_include_dirs(self) -> list[str]:
         """
@@ -487,6 +497,15 @@ def _compile_orchestration_shared_lib(
         if sys.platform != "darwin":
             cmd.append("-Wl,--build-id=sha1")
 
+        if os.getenv("BUILD_TRACR", "OFF") == "ON":
+            cmd.extend(
+                [
+                    "-DENABLE_TRACR",
+                    "-DTRACR_DISABLE_FLUSH",
+                    "-DUSE_HW_COUNTER",
+                ]
+            )
+
         if extra_sources:
             for src in extra_sources:
                 src = os.path.abspath(src)
diff --git a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt
index f044df3df..84f76fb46 100644
--- a/src/a2a3/platform/onboard/aicpu/CMakeLists.txt
+++ b/src/a2a3/platform/onboard/aicpu/CMakeLists.txt
@@ -63,6 +63,19 @@ message(STATUS "AICpu kernel: ${NUM_AICPU_KERNEL_SOURCES} source files")
 message(VERBOSE "AICpu kernel sources: ${AICPU_KERNEL_SOURCES}")
 add_library(aicpu_kernel SHARED ${AICPU_KERNEL_SOURCES})
 
+# TraCR
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
+tracr_enable(aicpu_kernel)
+
+# Option to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
+option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
+if(DEFINED ENV{INDEP_ORCH})
+    set(INDEP_ORCH $ENV{INDEP_ORCH})
+endif()
+if(INDEP_ORCH)
+    target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH)
+endif()
+
 option(WERROR "Treat compiler warnings as errors" ON)
 if(DEFINED ENV{SIMPLER_DISABLE_WARNINGS_AS_ERRORS})
     set(WERROR OFF)
diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt
index f0a403b14..bbef3da5b 100644
--- a/src/a2a3/platform/onboard/host/CMakeLists.txt
+++ b/src/a2a3/platform/onboard/host/CMakeLists.txt
@@ -91,6 +91,19 @@ message(STATUS "Host runtime: ${NUM_HOST_RUNTIME_SOURCES} source files")
 message(VERBOSE "Host runtime sources: ${HOST_RUNTIME_SOURCES}")
 add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES})
 
+# TraCR
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
+tracr_enable(host_runtime)
+
+# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
+option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
+if(DEFINED ENV{INDEP_ORCH})
+    set(INDEP_ORCH $ENV{INDEP_ORCH})
+endif()
+if(INDEP_ORCH)
+    target_compile_definitions(host_runtime PRIVATE INDEP_ORCH)
+endif()
+
 # C++ standard (applied only to C++ files)
 set_target_properties(host_runtime PROPERTIES
     CXX_STANDARD 17
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index de517a9cc..9d3c8ef3e 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -28,6 +28,9 @@
 #include <iostream>
 #include <string>
 #include <vector>
+
+#include <tracr_simpler_api.hpp>
+
 #include "acl/acl.h"
 
 // Include HAL constants from CANN (header only, library loaded dynamically)
@@ -264,6 +267,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     if (prepare_runtime_for_launch(runtime, block_dim, launch_aicpu_num) != 0) return -1;
 
+    // Initialize TraCR memory on the device
+#ifdef ENABLE_TRACR
+    // LOG_INFO_V9("[TraCR] thread[%d] DevAllocTraCR device_id_=%d", sched_getcpu(), device_id_);
+    rc = DevAllocTraCR(this, runtime);
+    if (rc != 0) {
+        LOG_ERROR("DevAllocTraCR failed rc=%d", rc);
+        return rc;
+    }
+#endif
     // a2a3 onboard now uses the same host-computed, device-filtered affinity
     // shape as a5. Host probes the AICPU user pool once, chooses the active
     // cpu_ids deterministically, writes them into Runtime, and the AICPU-side
@@ -471,6 +483,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     read_device_wall_ns();
 
+    // Download and Free TraCR memory from Device and store in memory (~/ascend/)
+#ifdef ENABLE_TRACR
+    rc = StoreTracrData(this, runtime);
+    if (rc != 0) {
+        LOG_ERROR("FreeTraCR failed: %d", rc);
+        return -1;
+    }
+#endif
+
     // Tear down collectors. stop() joins mgmt then collector in the only safe
     // order (mgmt's final-drain pass into L2 has poll as its consumer).
     teardown_shared_collectors_after_run();
diff --git a/src/a2a3/platform/sim/aicpu/CMakeLists.txt b/src/a2a3/platform/sim/aicpu/CMakeLists.txt
index dd18486c1..11fa4349d 100644
--- a/src/a2a3/platform/sim/aicpu/CMakeLists.txt
+++ b/src/a2a3/platform/sim/aicpu/CMakeLists.txt
@@ -70,6 +70,27 @@ endif()
 # Create shared library (host-compatible for dlopen)
 add_library(aicpu_kernel SHARED ${AICPU_SOURCES})
 
+# TraCR
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
+tracr_enable(aicpu_kernel)
+
+# TODO: move this somewhere such that EVERY platform launches this once. Placing this here is hacky...
+# Only build the host-side trace post-processor when TraCR is enabled: it is an
+# offline analysis tool, and it pulls in Linux-only APIs (sched_getcpu) that do
+# not compile on the macOS packaging build.
+if(BUILD_TRACR)
+    include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr_postprocessing_script.cmake)
+endif()
+
+# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
+option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
+if(DEFINED ENV{INDEP_ORCH})
+    set(INDEP_ORCH $ENV{INDEP_ORCH})
+endif()
+if(INDEP_ORCH)
+    target_compile_definitions(aicpu_kernel PRIVATE INDEP_ORCH)
+endif()
+
 option(WERROR "Treat compiler warnings as errors" ON)
 if(DEFINED ENV{SIMPLER_DISABLE_WARNINGS_AS_ERRORS})
     set(WERROR OFF)
diff --git a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt
index 3b9f283f0..e460cdd7c 100644
--- a/src/a2a3/platform/sim/host/CMakeLists.txt
+++ b/src/a2a3/platform/sim/host/CMakeLists.txt
@@ -75,6 +75,19 @@ endif()
 # Create shared library
 add_library(host_runtime SHARED ${HOST_RUNTIME_SOURCES})
 
+# TraCR
+include(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../tools/tracr.cmake)
+tracr_enable(host_runtime)
+
+# Optional: to make the Orchestrator run independently (i.e. finalize before letting the schedulers run)
+option(INDEP_ORCH "Run Orchestrator independent from the Schedulers" OFF)
+if(DEFINED ENV{INDEP_ORCH})
+    set(INDEP_ORCH $ENV{INDEP_ORCH})
+endif()
+if(INDEP_ORCH)
+    target_compile_definitions(host_runtime PRIVATE INDEP_ORCH)
+endif()
+
 # C++ standard (applied only to C++ files)
 set_target_properties(host_runtime PROPERTIES
     CXX_STANDARD 17
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index c8e0929b6..232eba512 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -29,6 +29,8 @@
 #include <string>
 #include <vector>
 
+#include <tracr_simpler_api.hpp>
+
 #include "aicpu/device_phase_aicpu.h"
 #include "aicpu/platform_aicpu_affinity.h"
 #include "callable_protocol.h"
@@ -253,6 +255,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     worker_count_ = num_aicore;
     runtime.set_aicpu_thread_num(launch_aicpu_num);
 
+    // Initialize TraCR memory on the device
+#ifdef ENABLE_TRACR
+    rc = DevAllocTraCR(this, runtime);
+    if (rc != 0) {
+        LOG_ERROR("DevAllocTraCR failed rc=%d", rc);
+        return rc;
+    }
+#endif
+
     int num_aic = block_dim;
     uint32_t enable_profiling_flag = PROFILING_FLAG_NONE;
     if (enable_dump_tensor_) {
@@ -571,6 +582,15 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
         return runtime_rc;
     }
 
+    // Download and Free TraCR memory from Device and store in memory (~/ascend/)
+#ifdef ENABLE_TRACR
+    rc = StoreTracrData(this, runtime);
+    if (rc != 0) {
+        LOG_ERROR("FreeTraCR failed: %d", rc);
+        return -1;
+    }
+#endif
+
     // Tear down collectors. stop() joins mgmt then collector in the only safe
     // order (mgmt's final-drain pass into L2 has poll as its consumer).
     if (enable_l2_swimlane_) {
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
index 82bb7c193..39ddabdc3 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -20,7 +20,12 @@
 
 // Tensor dump uses these defaults to size its selective mask table so task-id
 // ring/slot lookup stays aligned with PTO2 task id layout.
+#ifdef INDEP_ORCH
+#define PTO2_TASK_WINDOW_SIZE 65536  // Default per-ring task window size (power of 2)
+#else
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
-#define PTO2_MAX_RING_DEPTH 4        // Number of task-id ring layers
+#endif
+
+#define PTO2_MAX_RING_DEPTH 4  // Number of task-id ring layers
 
 #endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
index 9f787c8c1..d22d20713 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h
@@ -186,6 +186,11 @@ class Runtime {
     // Task storage
     Task tasks[RUNTIME_MAX_TASKS];  // Fixed-size task array
 
+    // TraCR data placeholder
+    // These pointers hold the addresses of the memory allocated on the device
+    void *tracrData_;
+    void *tracrDataSizes_;
+
     // Filter-style affinity gate input (a2a3 onboard). Placed AFTER `tasks`
     // because AICore reads runtime->tasks[] by offset. Host fills these before
     // launch from AICPU OCCUPY; the device gate keeps threads whose
@@ -239,6 +244,10 @@ class Runtime {
     void set_worker_count(int n) { worker_count = n; }
     int get_aicpu_thread_num() const { return aicpu_thread_num; }
     void set_aicpu_thread_num(int n) { aicpu_thread_num = n; }
+    void *get_tracr_data() const { return tracrData_; }
+    void set_tracr_data(void *p) { tracrData_ = p; }
+    void *get_tracr_data_sizes() const { return tracrDataSizes_; }
+    void set_tracr_data_sizes(void *p) { tracrDataSizes_ = p; }
     Handshake *get_workers() { return workers; }
     int32_t get_aicpu_allowed_cpu_count() const { return aicpu_allowed_cpu_count; }
     void set_aicpu_allowed_cpu_count(int32_t n) { aicpu_allowed_cpu_count = n; }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 152f82fc4..f43f79894 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -22,6 +22,9 @@
 #include <sys/mman.h>
 #endif
 
+#include <tracr/tracr.hpp>
+#include <tracr_simpler_markers.hpp>
+
 #include "aicpu/device_time.h"
 #include "aicpu/device_phase_aicpu.h"
 #include "aicpu/orch_so_file.h"
@@ -192,6 +195,8 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
         return 0;
     }
 
+    INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Initializing, uint32_t(tracr_getcpu()));
+
     LOG_INFO_V0("AicpuExecutor: Initializing");
 
     if (runtime == nullptr) {
@@ -458,6 +463,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                     LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx);
                 }
 
+                INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Allocating, 0);
+
                 // sm_handle / rt are bound to *this* run's memory and must be
                 // (re)created every run, regardless of whether the SO itself was
                 // reused above.
@@ -591,6 +598,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 #if PTO2_PROFILING
             orch_cycle_start = get_sys_cnt_aicpu();
 #endif
+            INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Orchestrating, thread_idx);
             framework_bind_runtime(rt);
             if (*p_bind != nullptr) {
                 (*p_bind)(rt);
@@ -734,6 +742,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
         if (rt == nullptr) {
             LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx);
         } else {
+            INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Scheduling, thread_idx);
             sched_ctx_.bind_runtime(rt);
             if (serial_orch_sched_) {
                 sched_ctx_.wait_for_orchestration_done_before_dispatch(runtime, thread_idx);
@@ -751,6 +760,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
     // platform_deinit_aicore_regs is idempotent; orchestrator threads have
     // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly.
+    INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, De_Initializing, 0);
     int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx);
     if (shutdown_rc != 0 && run_rc == 0) {
         run_rc = shutdown_rc;
@@ -825,6 +835,64 @@ void AicpuExecutor::deinit(Runtime *runtime) {
 
 // ===== Public Entry Point =====
 
+/**
+ * Register the calling thread with TraCR and assign its g_TraCR_thread_idx.
+ *
+ * Indices are taken from g_TraCR_thread_idx_counter, which must start at 0; only index 0 runs INSTRUMENTATION_START().
+ */
+inline void TRACR_START() {
+    g_TraCR_thread_idx = g_TraCR_thread_idx_counter.fetch_add(1, std::memory_order_relaxed);
+
+    if (g_TraCR_thread_idx != 0) {
+        INSTRUMENTATION_THREAD_INIT();
+        return;
+    }
+    INSTRUMENTATION_START();
+}
+
+/**
+ * Copy this thread's TraCR traces into its TraCR::CAPACITY-wide slot of the runtime buffer, then finalize its tracer.
+ * NOTE: make sure g_TraCR_thread_idx starts at 0 and follows in the positive direction.
+ * NOTE(review): thread 0 ends instrumentation and resets the counter without waiting for the others - confirm.
+ */
+inline void TRACR_FINALIZE(Runtime *runtime) {
+    (void)(runtime);
+
+#ifdef ENABLE_TRACR
+    LOG_INFO_V9(
+        "[TraCR] thread[%d] dumping the #traces: %lu %p", g_TraCR_thread_idx, tracrThread->_traceIdx,
+        runtime->get_tracr_data()
+    );
+    // Clamp to one slot: a larger count would make the copy spill past this thread's slot.
+    const size_t slot_capacity = static_cast<size_t>(TraCR::CAPACITY);
+    const size_t recorded = static_cast<size_t>(tracrThread->_traceIdx);
+    const size_t trace_count = recorded < slot_capacity ? recorded : slot_capacity;
+    if (g_TraCR_thread_idx >= 0 && g_TraCR_thread_idx < runtime->get_aicpu_thread_num()) {
+        if (runtime->get_tracr_data() != nullptr && trace_count > 0) {
+            TraCR::Payload *tracrData = reinterpret_cast<TraCR::Payload *>(runtime->get_tracr_data());
+            const size_t payload_size = trace_count * sizeof(TraCR::Payload);
+            std::memcpy(&tracrData[g_TraCR_thread_idx * TraCR::CAPACITY], tracrThread->_traces.data(), payload_size);
+        }
+        if (runtime->get_tracr_data_sizes() != nullptr) {
+            reinterpret_cast<size_t *>(runtime->get_tracr_data_sizes())[g_TraCR_thread_idx] = trace_count;
+        }
+    } else {
+        LOG_ERROR(
+            "[TraCR] thread index %d out of bounds (max=%d)", g_TraCR_thread_idx, runtime->get_aicpu_thread_num()
+        );
+    }
+#endif
+
+    if (g_TraCR_thread_idx == 0) {
+        INSTRUMENTATION_END();
+        g_TraCR_thread_idx_counter.store(0, std::memory_order_relaxed);
+    } else {
+        INSTRUMENTATION_THREAD_FINALIZE();
+    }
+
+    g_TraCR_thread_idx = -1;
+}
+
 // Device orchestration SO registration entry. Exported directly by the runtime
 // (not via a platform forwarding shell): registration is a TMARB-only ability,
 // so the symbol lives where the capability does. host_build_graph does not
@@ -871,6 +939,12 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
 
     LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution");
 
+    // Initialize TraCR for every thread that enters aicpu_execute
+    TRACR_START();
+    LOG_INFO_V9(
+        "[TraCR] thread[%d:%d] start ENABLE_TRACR=%d", g_TraCR_thread_idx, tracr_getcpu(), INSTRUMENTATION_ACTIVE
+    );
+
     // Each phase is bracketed by its own scope so the start/end boundaries are
     // visible and an early `return` still records the end via the guard dtor.
     // rc / runtime_rc are declared out here because they outlive their phase.
@@ -909,6 +983,10 @@ extern "C" int32_t aicpu_execute(Runtime *runtime) {
         g_aicpu_executor.deinit(runtime);
     }
 
+    INSTRUMENTATION_MARK_RESET(g_TraCR_thread_idx);
+    // Finalize TraCR for every thread that entered aicpu_execute
+    TRACR_FINALIZE(runtime);
+
     if (runtime_rc != 0) {
         LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
         return runtime_rc;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 01194134a..584f5dc87 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -63,17 +63,26 @@
 // NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value.
 // Actual window size is passed at runtime to runtime_create_from_sm().
 // Use pto2_task_slot(sched, task_id) for slot calculation.
+#ifdef INDEP_ORCH
+#define PTO2_TASK_WINDOW_SIZE 65536  // Default per-ring task window size (power of 2)
+#else
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
+#endif
 
 // Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer)
 // Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
 #define PTO2_MAX_RING_DEPTH 4
 
 // Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH)
+#ifdef INDEP_ORCH
+#define PTO2_HEAP_SIZE (256 * 1024 * 1024 * 2)  // 512MB per ring (2GB total)
+#else
 #define PTO2_HEAP_SIZE (256 * 1024 * 1024)  // 256MB per ring (1GB total)
-#define PTO2_DEP_LIST_POOL_SIZE 16384       // Per-ring dependency list pool entries
-#define PTO2_TENSORMAP_POOL_SIZE (65536)    // TensorMap entry pool
-#define PTO2_TENSORMAP_NUM_BUCKETS 4096     // Power of 2 for fast hash (4096×8B=32KB fits L1)
+#endif
+
+#define PTO2_DEP_LIST_POOL_SIZE 16384     // Per-ring dependency list pool entries
+#define PTO2_TENSORMAP_POOL_SIZE (65536)  // TensorMap entry pool
+#define PTO2_TENSORMAP_NUM_BUCKETS 4096   // Power of 2 for fast hash (4096×8B=32KB fits L1)
 
 // Scope management
 #define PTO2_MAX_SCOPE_DEPTH 64  // Maximum nesting depth
@@ -91,7 +100,11 @@
 #define PTO2_EARLY_DISPATCH_QUEUE_SIZE 64
 
 // Wiring queue
+#ifdef INDEP_ORCH
+#define PTO2_WRIRING_QUEUE_SIZE 65536  // Per-shape queue size
+#else
 #define PTO2_WRIRING_QUEUE_SIZE 1024  // Per-shape queue size
+#endif
 
 // Fanin storage
 #define PTO2_FANIN_INLINE_CAP 64
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 8a41434de..656a98740 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -175,6 +175,11 @@ struct alignas(64) DeviceRuntimeLaunchDesc {
     // PTO2 integration: kernel_id -> GM function_bin_addr mapping
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
 
+    // TraCR data placeholder
+    // These pointers hold the addresses of the memory allocated on the device
+    void *tracrData_;
+    void *tracrDataSizes_;
+
     // Serial orchestrator -> scheduler start control.
     // When true, scheduler threads wait until orchestration has fully built the
     // task graph before entering resolve_and_dispatch().
@@ -243,6 +248,10 @@ class Runtime {
     void set_worker_count(int n) { dev.worker_count = n; }
     int get_aicpu_thread_num() const { return dev.aicpu_thread_num; }
     void set_aicpu_thread_num(int n) { dev.aicpu_thread_num = n; }
+    void *get_tracr_data() const { return dev.tracrData_; }
+    void set_tracr_data(void *p) { dev.tracrData_ = p; }
+    void *get_tracr_data_sizes() const { return dev.tracrDataSizes_; }
+    void set_tracr_data_sizes(void *p) { dev.tracrDataSizes_ = p; }
     Handshake *get_workers() { return dev.workers; }
     int32_t get_aicpu_allowed_cpu_count() const { return dev.aicpu_allowed_cpu_count; }
     void set_aicpu_allowed_cpu_count(int32_t n) { dev.aicpu_allowed_cpu_count = n; }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 8e1813367..6823307f4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -13,6 +13,9 @@
 #include <cinttypes>
 #include <cstdio>
 
+#include <tracr/tracr.hpp>
+#include <tracr_simpler_markers.hpp>
+
 #include "common/unified_log.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "aicpu/device_phase_aicpu.h"
@@ -984,6 +987,7 @@ SchedulerContext::init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched
 void SchedulerContext::deinit() {
     // Reset all per-core execution state
     for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
+        INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + i);
         core_exec_states_[i] = {};
         core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
         core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
index 774589865..f6409d077 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
@@ -12,6 +12,9 @@
 
 #include <algorithm>
 
+#include <tracr/tracr.hpp>
+#include <tracr_simpler_markers.hpp>
+
 #include "common/unified_log.h"
 #include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
@@ -384,6 +387,7 @@ void SchedulerContext::check_running_cores_for_completion(
 #endif
             );
             cur_thread_completed++;
+            INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + core_id);
         }
         if (t.running_done) {
             complete_slot_task(
@@ -395,6 +399,7 @@ void SchedulerContext::check_running_cores_for_completion(
 #endif
             );
             cur_thread_completed++;
+            INSTRUMENTATION_MARK_RESET(sched_thread_num_ + 1 + core_id);
         }
 
         // 2. Update slot data
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index c4a10369d..a5c7696f6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -14,6 +14,9 @@
 #include <cinttypes>
 #include <limits>
 
+#include <tracr/tracr.hpp>
+#include <tracr_simpler_markers.hpp>
+
 #include "common.h"  // debug_assert
 
 #include "common/unified_log.h"
@@ -167,10 +170,12 @@ SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core(
     build_payload(payload, slot_state, subslot, async_ctx, block_idx);
 
     if (to_pending) {
+        INSTRUMENTATION_MARK_SET(sched_thread_num_ + 1 + core_id, Running_Task_Pair, 0);
         core_exec_state.pending_subslot = subslot;
         core_exec_state.pending_slot_state = &slot_state;
         core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
     } else {
+        INSTRUMENTATION_MARK_SET(sched_thread_num_ + 1 + core_id, Running_Task_Single, 0);
         core_exec_state.running_subslot = subslot;
         core_exec_state.running_slot_state = &slot_state;
         core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
@@ -816,6 +821,17 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     constexpr bool pmu_active = false;
 #endif
 
+#ifdef INDEP_ORCH
+    INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Barrier, orchestrator_done_.load(std::memory_order_relaxed));
+    LOG_INFO_V9(
+        "[TraCR] Thread %d: Waiting before the Orch to finish: %d, orchestrator_done_=%d", g_TraCR_thread_idx,
+        g_TraCR_thread_idx_counter.load(), orchestrator_done_.load(std::memory_order_relaxed)
+    );
+    while (!orchestrator_done_.load(std::memory_order_acquire)) {
+        SPIN_WAIT_HINT();
+    }
+#endif
+
 #if PTO2_PROFILING
     l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
 #endif
@@ -924,6 +940,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 #endif
 
         // Phase 1: Check running cores for completion
+        INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase1, 0);
         int32_t completed_this_turn = 0;
 
         bool try_completed = tracker.has_any_running_cores();
@@ -1020,6 +1037,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
         // Phase 2 drain check
         if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
+            INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase2, 0);
             handle_drain_mode(thread_idx);
             continue;
         }
@@ -1030,6 +1048,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             wired = sched_->drain_wiring_queue(orchestrator_done_.load(std::memory_order_relaxed));
             if (wired > 0) {
                 made_progress = true;
+                INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase3, 0);
 #if PTO2_SCHED_PROFILING
                 l2_swimlane.phase_wiring_count += wired;
 #endif
@@ -1066,6 +1085,12 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             constexpr int DUMMY_DRAIN_BATCH = 16;
             PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
             int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
+
+            if (dummy_got > 0) {
+                (void)(dummy_got);
+                INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase3b, 0);
+            }
+
 #if PTO2_PROFILING
             // Dummy outer phase: covers handling of all dummies popped this
             // iter. Per-dummy DummyTask markers are emitted to a SEPARATE lane
@@ -1075,6 +1100,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             uint64_t dummy_outer_t0 =
                 (dummy_got > 0 && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
 #endif
+
             for (int di = 0; di < dummy_got; di++) {
                 PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
 
@@ -1175,6 +1201,8 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
         // Phase 4: MIX-strict-priority dispatch with phase-split and
         // cross-thread idle gating. See dispatch_ready_tasks for the policy.
+        INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Phase4, 0);
+
 #if PTO2_PROFILING
         uint64_t dispatch_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
 #endif
@@ -1334,6 +1362,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
                                   0;
 #endif
             while (deferred_release_count > 0) {
+                INSTRUMENTATION_MARK_SET(g_TraCR_thread_idx, Drain, 0);
 #if PTO2_SCHED_PROFILING
                 (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
 #else
diff --git a/tools/tracr b/tools/tracr
new file mode 160000
index 000000000..f0574cdad
--- /dev/null
+++ b/tools/tracr
@@ -0,0 +1 @@
+Subproject commit f0574cdad14f56da69706533f3fa8e4b75b4f476
diff --git a/tools/tracr.cmake b/tools/tracr.cmake
new file mode 100644
index 000000000..55013897c
--- /dev/null
+++ b/tools/tracr.cmake
@@ -0,0 +1,114 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+
+# Directory that contains this tracr.cmake file
+set(TRACR_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}")
+
+# BUILD_TRACR toggles the TraCR build. It is a CMake option that can also be set through an environment
+# variable of the same name, e.g.: BUILD_TRACR=ON pip install --no-build-isolation -e '.[test]'
+# Default is 'OFF'. When the environment variable is defined, its value takes precedence over the option.
+option(BUILD_TRACR "Enable TraCR" OFF)
+if(DEFINED ENV{BUILD_TRACR})
+    set(BUILD_TRACR $ENV{BUILD_TRACR})
+endif()
+
+function(tracr_enable target)
+    # Make the TraCR headers visible to `target` and, when BUILD_TRACR is true, add the TraCR compile
+    # definitions. A missing target or missing TraCR headers is a fatal configure error.
+    # Inputs besides `target`:
+    #   BUILD_TRACR     - switches the compile definitions on or off (the include paths are always added)
+    #   TRACR_CAPACITY  - optional cache string, forwarded as the TRACR_CAPACITY=<n> definition
+    #   TRACR_POLICY    - optional cache string selecting what happens when a trace buffer is full
+    message(STATUS "Enabling TraCR '${BUILD_TRACR}' for target: ${target}")
+
+    # Fail on a misspelled target name instead of configuring nothing.
+    if (NOT TARGET ${target})
+        message(FATAL_ERROR "Target '${target}' does not exist.")
+    endif()
+
+    # Location of the TraCR headers. They live in the tracr/ directory next to this file, which is a
+    # git submodule, so a checkout made without submodules stops here.
+    set(TRACR_HEADER_DIR "${TRACR_ROOT_DIR}/tracr/include")
+    if (NOT EXISTS "${TRACR_HEADER_DIR}/tracr/tracr.hpp")
+        message(FATAL_ERROR
+            "tracr.hpp not found at ${TRACR_HEADER_DIR}/tracr/tracr.hpp"
+        )
+    endif()
+
+    # --- include the directories ---
+    # SYSTEM: TraCR and its vendored third-party headers (e.g. extern/nlohmann
+    # json) are external to simpler and don't compile cleanly under the build's
+    # -Wall -Wextra -Werror (modern clang flags nlohmann's deprecated literal
+    # operators). Marking them system suppresses warnings from those headers.
+    target_include_directories(${target} SYSTEM PRIVATE
+        "${TRACR_HEADER_DIR}"             # TraCR itself
+        "${TRACR_ROOT_DIR}"               # the directory of this file
+        "${TRACR_ROOT_DIR}/tracr/extern"  # nlohmann json
+    )
+
+    # --- compiler flags of TraCR ---
+    # Nothing below applies to a build without tracing.
+    if (NOT BUILD_TRACR)
+        return()
+    endif()
+
+    # Flag to enable/disable TraCR calls at compile time
+    target_compile_definitions(${target} PRIVATE ENABLE_TRACR)
+
+    # TraCR threads capacity (default is 1<<20 ~= 1 million traces per thread = ~17MB per thread buffer size)
+    set(TRACR_CAPACITY "" CACHE STRING "Optional TraCR buffer capacity (empty = use internal default)")
+
+    if(NOT "${TRACR_CAPACITY}" STREQUAL "")
+        # Validate first, then report. The pattern also rejects 0 and leading zeros: 0 is not a positive
+        # capacity, and a leading zero would make the forwarded value an octal literal in C++.
+        if(NOT TRACR_CAPACITY MATCHES "^[1-9][0-9]*$")
+            message(FATAL_ERROR "TRACR_CAPACITY must be a positive integer")
+        endif()
+
+        message(STATUS "TraCR adding capacity: ${TRACR_CAPACITY}")
+
+        target_compile_definitions(${target} PRIVATE
+            TRACR_CAPACITY=${TRACR_CAPACITY}
+        )
+    endif()
+
+    # As the traces are collected on the Ascend device,
+    # there is no need to store them on the device filesystem.
+    target_compile_definitions(${target} PRIVATE TRACR_DISABLE_FLUSH USE_HW_COUNTER)
+
+    # TraCR full size buffer modes:
+    # default (none):              Abort if buffer is full
+    # TRACR_POLICY_PERIODIC:       If buffer is full, overwrite from the beginning
+    # TRACR_POLICY_IGNORE_IF_FULL: If buffer is full, ignore incoming traces
+    set(TRACR_POLICY "" CACHE STRING "TraCR policy (empty = use C++ default)")
+
+    # Offer the valid values as a selection list in cmake-gui / ccmake.
+    set_property(CACHE TRACR_POLICY PROPERTY STRINGS
+        ""  # default: abort if full
+        TRACR_POLICY_PERIODIC
+        TRACR_POLICY_IGNORE_IF_FULL
+    )
+
+    if("${TRACR_POLICY}" STREQUAL "")
+        message(STATUS "No TraCR policy given: using C++ default")
+    elseif(TRACR_POLICY STREQUAL "TRACR_POLICY_PERIODIC" OR
+           TRACR_POLICY STREQUAL "TRACR_POLICY_IGNORE_IF_FULL")
+        # A valid policy name is passed on unchanged as the compile definition of the same name.
+        message(STATUS "TraCR adding policy: '${TRACR_POLICY}'")
+        target_compile_definitions(${target} PRIVATE ${TRACR_POLICY})
+    else()
+        message(FATAL_ERROR "Unknown TRACR_POLICY: ${TRACR_POLICY}")
+    endif()
+
+    # Flag to enable TraCR debugging prints (TODO: Not yet working!)
+    # if (TRACR_DEBUG)
+    #     target_compile_definitions(${target} PRIVATE ENABLE_TRACR_DEBUG)
+    # endif()
+endfunction()
diff --git a/tools/tracr_postprocessing_script.cmake b/tools/tracr_postprocessing_script.cmake
new file mode 100644
index 000000000..b5cb4e49e
--- /dev/null
+++ b/tools/tracr_postprocessing_script.cmake
@@ -0,0 +1,28 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+
+set(BUILD_DIR "${CMAKE_CURRENT_LIST_DIR}/../build/output/bin/")  # output directory, relative to this file
+
+message(STATUS "TraCR: REAL_SOURCE_DIR: '${CMAKE_CURRENT_LIST_DIR}'")
+
+# Copy the Paraver format configuration file into BUILD_DIR (COPYONLY: no variable substitution)
+configure_file(
+    ${CMAKE_CURRENT_LIST_DIR}/tracr/postprocessing/paraver/state.cfg
+    ${BUILD_DIR}/state.cfg
+    COPYONLY
+)
+
+add_executable(tracr_process ${CMAKE_CURRENT_LIST_DIR}/tracr/postprocessing/tracr_process.cpp)
+
+tracr_enable(tracr_process)  # defined in tracr.cmake; this file does not include it itself
+
+# Place the compiled executable in BUILD_DIR as well
+set_target_properties(tracr_process PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${BUILD_DIR}
+)
diff --git a/tools/tracr_simpler_api.hpp b/tools/tracr_simpler_api.hpp
new file mode 100644
index 000000000..5b8110f9b
--- /dev/null
+++ b/tools/tracr_simpler_api.hpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * TraCR API functions for Simpler A2A3, A2A3sim, A5, A5sim
+ *
+ * TODO: A5 not yet able to test
+ */
+
+#pragma once
+
+#include <filesystem>  // C++17 or newer
+#include <fstream>
+#include <array>
+#include <string>
+#include <vector>
+#include <nlohmann/json.hpp>
+
+#include <tracr/tracr.hpp>
+#include <tracr_simpler_markers.hpp>
+
+namespace fs = std::filesystem;
+using json = nlohmann::json;
+
+// TraCR run bookkeeping. Everything here is `inline` so several translation units can include this header.
+inline size_t getSampleID() {
+    const char *env = std::getenv("PYPTO_RUN_SAMPLE_ID");
+    return env ? std::strtoul(env, nullptr, 10) : 0;  // unset or non-numeric -> 0 (std::stoul would throw)
+}
+inline size_t sampleID = getSampleID();  // suffix of the next trace directory; bumped by StoreTracrData()
+
+inline std::string tracr_dir = "~/ascend/tracr/proc.1";  // current trace output directory
+
+/**
+ * Expand a leading '~' in `path` to $HOME; a path that does not start with '~' is returned unchanged.
+ * `inline` because this header may be included from more than one translation unit.
+ * Throws std::runtime_error if expansion is needed and HOME is not set.
+ */
+inline fs::path expand_user_path(const std::string &path) {
+    if (path.empty() || path[0] != '~') return fs::path(path);
+
+    const char *home = std::getenv("HOME");
+    if (!home) throw std::runtime_error("HOME not set");
+
+    // Skip '~' and every '/' that follows: fs::path::operator/ drops `home` when its right side is absolute.
+    const size_t first = path.find_first_not_of('/', 1);
+    return fs::path(home) / path.substr(first == std::string::npos ? path.size() : first);
+}
+
+/**
+ * Dump each thread's trace records to `<tracr_dir>/thread.<t+1>/traces.bts` as raw TraCR::Payload bytes.
+ * `tracrData` is read as num_threads consecutive slabs of TraCR::CAPACITY payloads and `tracrDataSizes[t]`
+ * as the number of valid records in slab t; threads with no records get no file. Returns 0, or -1 on error.
+ */
+inline int TracrData2BTS(const TraCR::Payload *tracrData, const size_t *tracrDataSizes, const size_t num_threads) {
+    std::error_code ec;  // non-throwing filesystem overloads: failures are reported through the return code
+    const fs::path base_dir = expand_user_path(tracr_dir);
+    fs::create_directories(base_dir, ec);
+    if (ec) {
+        LOG_ERROR("Cannot create %s: %s", base_dir.string().c_str(), ec.message().c_str());
+        return -1;
+    }
+    for (uint32_t t = 0; t < num_threads; ++t) {
+        const size_t num_traces = tracrDataSizes[t];
+        if (num_traces == 0) continue;
+        if (num_traces > TraCR::CAPACITY) {
+            LOG_ERROR("Thread %u exceeds CAPACITY", t);
+            return -1;
+        }
+
+        const fs::path thread_dir = base_dir / ("thread." + std::to_string(t + 1));
+        const fs::path file_path = thread_dir / "traces.bts";
+        fs::create_directories(thread_dir, ec);
+        std::ofstream out(file_path, std::ios::binary);
+        if (ec || !out) {
+            // %s expects a C string; a fs::path object itself cannot be formatted that way.
+            LOG_ERROR("Cannot open %s", file_path.string().c_str());
+            return -1;
+        }
+
+        const TraCR::Payload *thread_ptr = tracrData + t * TraCR::CAPACITY;
+        out.write(reinterpret_cast<const char *>(thread_ptr), num_traces * sizeof(TraCR::Payload));
+        out.flush();  // push buffered bytes out now so a failed write shows up in the check below
+        if (!out) {
+            LOG_ERROR("Write failed for %s", file_path.string().c_str());
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/**
+ * Write `<tracr_dir>/metadata.json`: the channel names ("AICPU_<i>" per AICPU thread, "AICube_<i>" for a
+ * third of the worker count, "AIVector_<i>" for two thirds, plus a trailing "INVALID"), the channel count,
+ * and the marker-type table that maps the 1-based, zero-padded marker id to its name in MarkerTypeNames.
+ * Returns 0 on success, -1 if the file cannot be opened or written.
+ */
+template <typename RuntimeT>
+int StoreTracrMetaData(RuntimeT &runtime) {
+    const fs::path metadata_path = expand_user_path(tracr_dir) / "metadata.json";
+
+    // Channel order: AICPU threads first, then the AICube entries, then the AIVector entries.
+    nlohmann::json channel_names = nlohmann::json::array();
+    for (int i = 0; i < runtime.get_aicpu_thread_num(); ++i) {
+        channel_names.push_back("AICPU_" + std::to_string(i));
+    }
+    for (int i = 0; i < int(runtime.get_worker_count() / 3); ++i) {
+        channel_names.push_back("AICube_" + std::to_string(i));
+    }
+    for (int i = 0; i < int(2 * runtime.get_worker_count() / 3); ++i) {
+        channel_names.push_back("AIVector_" + std::to_string(i));
+    }
+    channel_names.push_back("INVALID");
+
+    nlohmann::json metadata;
+    metadata["channel_names"] = channel_names;
+    metadata["num_channels"] = channel_names.size();
+
+    // Marker ids are emitted as two-digit keys starting at "01".
+    metadata["markerTypes"] = nlohmann::json::object();
+    for (int i = 0; i < MARKERTYPE_COUNT; ++i) {
+        std::ostringstream oss;
+        oss << std::setw(2) << std::setfill('0') << (i + 1);
+        metadata["markerTypes"][oss.str()] = MarkerTypeNames[i];
+    }
+
+    metadata["pid"] = 1;
+    metadata["start_time"] = 0;
+    metadata["tid"] = 0;
+
+    std::ofstream file(metadata_path);
+    if (!file.is_open()) {
+        LOG_ERROR("Failed to open file for writing.\n");
+        return -1;
+    }
+
+    // Flush explicitly so a failed write (e.g. disk full) is reported instead of being lost on close.
+    file << metadata.dump(4);
+    file.flush();
+    if (!file) {
+        LOG_ERROR("Failed to write %s", metadata_path.string().c_str());
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Download the TraCR buffers from the device, release the device buffers, and store the traces plus
+ * metadata.json under `~/ascend/tracr_<sampleID>/proc.<1000 + device_id>/`.
+ * Returns 0 on success, otherwise -1 or the failing callee's return code.
+ */
+template <typename DeviceRunnerT, typename RuntimeT>
+int StoreTracrData(DeviceRunnerT *device_runner, RuntimeT &runtime) {
+    static_assert(
+        std::is_trivially_copyable_v<TraCR::Payload>, "TraCR::Payload must be trivially copyable for raw binary dump"
+    );
+
+    if (runtime.get_tracr_data() == nullptr) {
+        LOG_ERROR("runtime.tracrData_ is a nullptr");
+        return -1;
+    }
+    if (runtime.get_tracr_data_sizes() == nullptr) {
+        LOG_ERROR("runtime.tracrDataSizes_ is a nullptr");
+        return -1;
+    }
+    if (runtime.get_aicpu_thread_num() <= 0) {
+        LOG_ERROR("runtime.aicpu_thread_num is zero or negative: %d", runtime.get_aicpu_thread_num());
+        return -1;
+    }
+    const size_t num_threads = static_cast<size_t>(runtime.get_aicpu_thread_num());
+
+    // Host-side mirrors: one CAPACITY-sized slab of payloads and one record count per AICPU thread.
+    std::vector<TraCR::Payload> tracrData(TraCR::CAPACITY * num_threads);
+    std::vector<size_t> tracrDataSizes(num_threads);
+
+    int rc = device_runner->copy_from_device(
+        reinterpret_cast<void *>(tracrData.data()), reinterpret_cast<void *>(runtime.get_tracr_data()),
+        sizeof(TraCR::Payload) * tracrData.size()
+    );
+    if (rc != 0) {
+        LOG_ERROR("device_runner->copy_from_device 'tracrData' failed rc=%d", rc);
+    } else {
+        rc = device_runner->copy_from_device(
+            reinterpret_cast<void *>(tracrDataSizes.data()),
+            reinterpret_cast<void *>(runtime.get_tracr_data_sizes()), sizeof(size_t) * num_threads
+        );
+        if (rc != 0) LOG_ERROR("device_runner->copy_from_device 'tracrDataSizes' failed rc=%d", rc);
+    }
+
+    // The device copies are no longer needed whether or not the download worked; free them before any
+    // early return so a failed copy or file write cannot leak device memory, and clear the stale pointers.
+    device_runner->free_tensor(runtime.get_tracr_data());
+    device_runner->free_tensor(runtime.get_tracr_data_sizes());
+    runtime.set_tracr_data(nullptr);
+    runtime.set_tracr_data_sizes(nullptr);
+    if (rc != 0) return rc;
+
+    // Each call writes to a fresh directory: ~/ascend/tracr_<sampleID>/proc.<1000 + device_id>
+    tracr_dir =
+        "~/ascend/tracr_" + std::to_string(sampleID++) + "/proc." + std::to_string(1000 + device_runner->device_id());
+    rc = TracrData2BTS(tracrData.data(), tracrDataSizes.data(), num_threads);
+    if (rc != 0) {
+        LOG_ERROR("TracrData2BTS() failed");
+        return rc;
+    }
+
+    rc = StoreTracrMetaData(runtime);
+    if (rc != 0) {
+        LOG_ERROR("StoreTracrMetaData failed: %d", rc);
+        return rc;
+    }
+    return 0;
+}
+
+/**
+ * Allocate the device-side TraCR buffers (CAPACITY payloads and one size_t counter per AICPU thread) and
+ * store them on `runtime`; templated so the A2A3 and A5 runners can share it. Returns 0, or -1 on failure.
+ */
+template <typename DeviceRunnerT, typename RuntimeT>
+int DevAllocTraCR(DeviceRunnerT *device_runner, RuntimeT &runtime) {
+    const int thread_num = runtime.get_aicpu_thread_num();
+    if (thread_num <= 0) {  // a negative int would wrap to a huge size_t in the size arithmetic below
+        LOG_ERROR("runtime.aicpu_thread_num is zero or negative: %d", thread_num);
+        return -1;
+    }
+    const size_t data_bytes = sizeof(TraCR::Payload) * thread_num * TraCR::CAPACITY;
+    const size_t sizes_bytes = thread_num * sizeof(size_t);
+    runtime.set_tracr_data(device_runner->allocate_tensor(data_bytes));
+    if (runtime.get_tracr_data() == nullptr) {
+        LOG_ERROR("runtime.tracrData_: alloc %zu bytes failed", data_bytes);
+        return -1;
+    }
+    runtime.set_tracr_data_sizes(device_runner->allocate_tensor(sizes_bytes));
+    if (runtime.get_tracr_data_sizes() != nullptr) return 0;
+    LOG_ERROR("runtime.tracrDataSizes_: alloc %zu bytes failed", sizes_bytes);
+    device_runner->free_tensor(runtime.get_tracr_data());  // do not leak the first buffer
+    runtime.set_tracr_data(nullptr);
+    return -1;
+}
diff --git a/tools/tracr_simpler_markers.hpp b/tools/tracr_simpler_markers.hpp
new file mode 100644
index 000000000..99fd8cf59
--- /dev/null
+++ b/tools/tracr_simpler_markers.hpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * TraCR Simpler Marker Types
+ */
+
+#pragma once
+
+#include <atomic>
+#include <string_view>
+
+// sched_getcpu() is a glibc/Linux-only API, but the simulator/host build also
+// compiles on non-Linux targets (e.g. the macOS packaging CI). Route the TraCR
+// call sites through this portable shim instead of calling sched_getcpu directly.
+#if defined(__linux__)
+#include <sched.h>
+inline int tracr_getcpu() { return sched_getcpu(); }
+#else
+inline int tracr_getcpu() { return -1; }  // no CPU id available on this platform
+#endif
+
+// Process-wide TraCR thread index counter, starting at 0 (presumably hands out per-thread indices - confirm)
+inline std::atomic<int> g_TraCR_thread_idx_counter{0};
+
+// Per-thread TraCR index; every thread's copy starts at -1 and is assigned outside this header
+inline thread_local int g_TraCR_thread_idx{-1};
+
+#define MARKER_TYPES       \
+    X(Orchestrating)       \
+    X(Read_Dimensions)     \
+    X(Reshape_Kernels)     \
+    X(Pre_Loop_Info)       \
+    X(PTO2_SCOPE_)         \
+    X(Scheduling)          \
+    X(Phase1)              \
+    X(Phase2)              \
+    X(Phase3)              \
+    X(Phase3b)             \
+    X(Phase4)              \
+    X(Drain)               \
+    X(Initializing)        \
+    X(De_Initializing)     \
+    X(DLL_loading)         \
+    X(Allocating)          \
+    X(Running_Task_Single) \
+    X(Running_Task_Pair)   \
+    X(Barrier)
+
+enum MarkerType {  // one enumerator per MARKER_TYPES entry, numbered from 0 in list order
+#define X(name) name,
+    MARKER_TYPES
+#undef X
+
+        MARKERTYPE_COUNT  // number of marker types; not a marker itself
+};
+
+constexpr std::string_view MarkerTypeNames[] = {  // MarkerTypeNames[m] is the spelling of enumerator m
+#define X(name) #name,
+    MARKER_TYPES
+#undef X
+};