From db6d4b80097ee047ef22884833ccf725774df6e6 Mon Sep 17 00:00:00 2001
From: Peter Hizalev <phizalev@tenstorrent.com>
Date: Thu, 9 Apr 2026 20:02:47 -0700
Subject: [PATCH 01/31] [examples] Matmul tutorial (#485)

---
 docs/sphinx/elementwise-tutorial/index.md     |   2 +-
 docs/sphinx/index.rst                         |   1 +
 docs/sphinx/matmul-tutorial/index.md          | 446 ++++++++++++++++++
 examples/matmul-tutorial/step_0_ttnn_base.py  |  69 +++
 .../step_1_single_node_single_tile_block.py   | 210 +++++++++
 .../step_2_single_node_multitile_block.py     | 220 +++++++++
 examples/matmul-tutorial/step_3_multinode.py  | 236 +++++++++
 .../step_4_multinode_grid_auto.py             | 240 ++++++++++
 .../step_5_multidevice_shard_m.py             | 246 ++++++++++
 .../step_6_multidevice_shard_k.py             | 260 ++++++++++
 .../step_7_multidevice_shard_k_all_reduce.py  | 248 ++++++++++
 11 files changed, 2177 insertions(+), 1 deletion(-)
 create mode 100644 docs/sphinx/matmul-tutorial/index.md
 create mode 100644 examples/matmul-tutorial/step_0_ttnn_base.py
 create mode 100644 examples/matmul-tutorial/step_1_single_node_single_tile_block.py
 create mode 100644 examples/matmul-tutorial/step_2_single_node_multitile_block.py
 create mode 100644 examples/matmul-tutorial/step_3_multinode.py
 create mode 100644 examples/matmul-tutorial/step_4_multinode_grid_auto.py
 create mode 100644 examples/matmul-tutorial/step_5_multidevice_shard_m.py
 create mode 100644 examples/matmul-tutorial/step_6_multidevice_shard_k.py
 create mode 100644 examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py

diff --git a/docs/sphinx/elementwise-tutorial/index.md b/docs/sphinx/elementwise-tutorial/index.md
index 33faf632b..5eef11bd5 100644
--- a/docs/sphinx/elementwise-tutorial/index.md
+++ b/docs/sphinx/elementwise-tutorial/index.md
@@ -1,4 +1,4 @@
-# Elementwise Operation Tutorial
+# Elementwise Tutorial
 
 This tutorial walks through building a fused elementwise operation in TT-Lang,
 introducing one concept at a time. Each step is a self-contained runnable
diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst
index e3c1f2532..0f0a63169 100644
--- a/docs/sphinx/index.rst
+++ b/docs/sphinx/index.rst
@@ -10,6 +10,7 @@ TT-Lang Documentation
    getting-started
    tour/index
    elementwise-tutorial/index
+   matmul-tutorial/index
    programming-guide
    claude-skills
    testing
diff --git a/docs/sphinx/matmul-tutorial/index.md b/docs/sphinx/matmul-tutorial/index.md
new file mode 100644
index 000000000..cf84bf156
--- /dev/null
+++ b/docs/sphinx/matmul-tutorial/index.md
@@ -0,0 +1,446 @@
+# Matmul Tutorial
+
+This tutorial walks through building a fused matrix multiplication operation in
+TT-Lang, introducing one concept at a time. Each step is a self-contained
+runnable script.
+
+## The Goal
+
+We want to compute `y = relu(a @ b + c)` on 8192×8192 `bfloat16` tensors. The
+entire expression — matrix multiply, bias add, and activation — is the target
+for kernel fusion: instead of dispatching three separate TT-NN operations that
+each read and write DRAM, a custom TT-Lang operation streams tiles from DRAM
+into L1, accumulates the dot product across the K dimension, adds the bias, and
+applies relu before writing the result back. Later steps scale this to multiple
+nodes and multiple devices using data parallelism and K-sharding.
+
+## Step 0 — TT-NN Baseline
+
+**Script**: [`examples/matmul-tutorial/step_0_ttnn_base.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_0_ttnn_base.py)
+
+The starting point uses TT-NN directly, with no custom operation:
+
+```python
+y = ttnn.relu(ttnn.add(ttnn.matmul(a, b), c))
+```
+
+Each call dispatches a separate operation and writes an intermediate tensor back
+to DRAM. This is the reference we'll verify against as we build the custom
+operation. Correctness is measured with Pearson Correlation Coefficient (PCC)
+rather than `allclose` because matmul accumulates bfloat16 rounding differently
+from a reference float32 computation.
+
+## Step 1 — Single Node, Single-Tile Block
+
+**Script**: [`examples/matmul-tutorial/step_1_single_node_single_tile_block.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_1_single_node_single_tile_block.py)
+
+This step introduces the complete TT-Lang programming model. The operation fuses
+`relu(a @ b + c)` into a single pass, processing one 32×32 tile at a time on
+one node.
+
+### Operation function and grid
+
+An operation is a Python function decorated with `@ttl.operation()`. The `grid`
+argument selects how many nodes (Tensix cores) to run on. `grid=(1, 1)` means
+a single node.
+
+```python
+@ttl.operation(grid=(1, 1))
+def __tutorial_operation(
+    a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor, y: ttnn.Tensor
+):
+    ...
+```
+
+The function arguments are the tensors the operation reads and writes. They live
+in DRAM on device and are passed by the host at call time.
+
+### Dataflow buffers
+
+A *dataflow buffer* (DFB) is an L1 buffer shared between kernel functions within
+a node. It is created once in the operation scope from a tensor likeness and a
+block shape:
+
+```python
+a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+```
+
+`shape=(1, 1)` means each buffer entry holds one 32×32 tile. `block_count=2`
+allocates two blocks in L1 so that the reader and compute kernels can work
+concurrently — while compute processes one entry, the reader fills the other
+(double-buffering).
+
+Matmul needs one additional DFB that the elementwise tutorial does not use:
+`acc_dfb` holds the running accumulator for the K-reduction. Because compute
+both reads the previous partial sum and writes a new one in each k-step, two
+slots in `acc_dfb` alternate in a ping-pong pattern:
+
+```python
+acc_dfb = ttl.make_dataflow_buffer_like(y, shape=(1, 1), block_count=2)
+```
+
+### Kernel functions
+
+Three kernel functions run concurrently inside the operation:
+
+```python
+@ttl.compute()
+def compute(): ...
+
+@ttl.datamovement()
+def read(): ...
+
+@ttl.datamovement()
+def write(): ...
+```
+
+**Reader DM kernel** — for each output tile `(m, n)`, first reads the bias
+`c[m, n]` into `c_dfb`, then streams all k-tiles of `a` and `b` into their
+DFBs:
+
+```python
+for m_tile in range(m_tiles):
+    for n_tile in range(n_tiles):
+        with c_dfb.reserve() as c_blk:
+            ttl.copy(c[m_tile, n_tile], c_blk).wait()
+
+        for k_tile in range(k_tiles):
+            with a_dfb.reserve() as a_blk, b_dfb.reserve() as b_blk:
+                tx_a = ttl.copy(a[m_tile, k_tile], a_blk)
+                tx_b = ttl.copy(b[k_tile, n_tile], b_blk)
+                tx_a.wait(); tx_b.wait()
+```
+
+`ttl.copy` starts a non-blocking transfer; `tx.wait()` waits for completion.
+The index `a[m_tile, k_tile]` selects a tile in *tile coordinates* (not element
+coordinates). The `with` block calls `push()` on exit, signalling the compute
+kernel.
+
+**Compute kernel** — initializes the accumulator to zero, accumulates
+`a @ b` across all k-tiles, then adds the bias and applies relu:
+
+```python
+for _ in range(m_tiles):
+    for _ in range(n_tiles):
+        with acc_dfb.reserve() as acc_blk:
+            acc_blk.store(ttl.math.fill(acc_blk, 0))  # zero the accumulator
+
+        for _ in range(k_tiles):
+            with (
+                a_dfb.wait() as a_blk,
+                b_dfb.wait() as b_blk,
+                acc_dfb.wait() as pre_acc_blk,   # previous partial sum
+            ):
+                with acc_dfb.reserve() as acc_blk:
+                    acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+        with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+            with y_dfb.reserve() as y_blk:
+                y_blk.store(ttl.math.relu(c_blk + acc_blk))
+```
+
+`ttl.math.fill(acc_blk, 0)` produces a block expression that fills a block
+with a scalar value; `store()` materializes the expression. `wait()` blocks
+until the producer has pushed a filled block. `reserve()` blocks until the
+consumer has freed an entry. The `with` block automatically calls `pop()` on
+inputs and `push()` on the output when the scope exits.
+
+**Writer DM kernel** — copies completed output tiles from L1 back to DRAM:
+
+```python
+with y_dfb.wait() as y_blk:
+    ttl.copy(y_blk, y[m_tile, n_tile]).wait()
+```
+
+## Step 2 — Single Node, Multi-Tile Block
+
+**Script**: [`examples/matmul-tutorial/step_2_single_node_multitile_block.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_2_single_node_multitile_block.py)
+
+Processing one tile at a time incurs a synchronization round-trip per tile and
+limits the hardware's ability to amortize compute setup overhead. This step
+groups tiles into larger blocks so that each transfer and compute iteration
+covers a multi-tile patch.
+
+```python
+M_GRANULARITY = 4
+N_GRANULARITY = 4
+K_GRANULARITY = 4
+```
+
+The DFB shapes must match the tile dimensions of each tensor operand, which
+differ because the matmul operands have different roles:
+
+```python
+a_dfb = ttl.make_dataflow_buffer_like(
+    a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2  # M×K
+)
+b_dfb = ttl.make_dataflow_buffer_like(
+    b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2  # K×N
+)
+c_dfb = ttl.make_dataflow_buffer_like(
+    c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2  # M×N
+)
+```
+
+The iteration counts change from individual tiles to blocks, and the reader
+selects a tile range per transfer:
+
+```python
+m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block
+
+tx_a = ttl.copy(
+    a[start_m_tile:end_m_tile, start_k_tile:end_k_tile],
+    a_blk,
+)
+```
+
+The operation structure, synchronization pattern, and compute expression are
+unchanged from Step 1.
+
+## Step 3 — Multi-Node, Fixed Grid
+
+**Script**: [`examples/matmul-tutorial/step_3_multinode.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_3_multinode.py)
+
+This step parallelizes the operation across a 4×4 grid of nodes. For background
+on the Tenstorrent hardware architecture, we recommend reading the
+[TT Architecture and Metalium Guide](https://github.com/tenstorrent/tt-metal/blob/main/METALIUM_GUIDE.md).
+
+### Declaring a multi-node grid
+
+```python
+@ttl.operation(grid=(4, 4))
+def __tutorial_operation(...):
+```
+
+All nodes execute the same operation body. They differentiate their work using
+their coordinates in the grid.
+
+### Partitioning strategy
+
+For matmul, the M×N output space is partitioned across the grid. The K
+dimension is **not** partitioned: every node iterates over all k-blocks to
+accumulate its own independent partial product. No inter-node communication is
+required.
+
+`ttl.grid_size(dims=2)` returns `(grid_n, grid_m)` — the number of nodes along
+each dimension. `ttl.node(dims=2)` returns the `(node_n, node_m)` coordinates
+of the current node, zero-based.
+
+```python
+grid_n, grid_m = ttl.grid_size(dims=2)
+
+m_blocks_per_node = m_blocks // grid_m
+n_blocks_per_node = n_blocks // grid_n
+```
+
+### Mapping local to global indices
+
+Each DM kernel uses its node coordinates to offset into the global tensor:
+
+```python
+node_n, node_m = ttl.node(dims=2)
+
+for local_m_block in range(m_blocks_per_node):
+    m_block = node_m * m_blocks_per_node + local_m_block
+    ...
+for local_n_block in range(n_blocks_per_node):
+    n_block = node_n * n_blocks_per_node + local_n_block
+    ...
+```
+
+The compute kernel iterates over the same `m_blocks_per_node × n_blocks_per_node`
+count as the DM kernels, but does not need to know the node's coordinates
+directly — the DM kernels already stream only the relevant tiles into the DFBs.
+
+This version requires the block counts to be evenly divisible by the grid.
+See Step 4 for a version that handles arbitrary sizes.
+
+## Step 4 — Multi-Node, Auto Grid
+
+**Script**: [`examples/matmul-tutorial/step_4_multinode_grid_auto.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_4_multinode_grid_auto.py)
+
+This step removes two constraints from Step 3: the hard-coded grid size and
+the requirement for even divisibility.
+
+### Auto grid
+
+```python
+@ttl.operation(grid="auto")
+```
+
+`grid="auto"` lets the compiler select the largest grid that fits available
+hardware resources. The operation must work correctly for any grid the compiler
+may choose.
+
+### Ceiling division
+
+When the number of blocks does not divide evenly across the grid, floor division
+would leave the trailing blocks unassigned. Ceiling division ensures every block
+is assigned to some node:
+
+```python
+m_blocks_per_node = -(-m_blocks // grid_m)  # ceil(m_blocks / grid_m)
+n_blocks_per_node = -(-n_blocks // grid_n)  # ceil(n_blocks / grid_n)
+```
+
+### Bounds checking
+
+Nodes at the trailing edge may be assigned more iterations than there are
+actual blocks. All three kernel functions guard per-block work:
+
+```python
+for local_m_block in range(m_blocks_per_node):
+    m_block = node_m * m_blocks_per_node + local_m_block
+    if m_block < m_blocks:          # skip if past the end of the tensor
+        for local_n_block in range(n_blocks_per_node):
+            n_block = node_n * n_blocks_per_node + local_n_block
+            if n_block < n_blocks:  # skip if past the end of the tensor
+                ...
+```
+
+The guard must appear in every kernel function — compute, read, and write —
+so that they all agree on exactly which blocks to process.
+
+## Step 5 — Multi-Device, Shard M
+
+**Script**: [`examples/matmul-tutorial/step_5_multidevice_shard_m.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_5_multidevice_shard_m.py)
+
+This step scales the operation to multiple devices using SPMD
+(Single-Program Multiple-Data) mode. The TT-Lang operation body is unchanged
+from Step 4; only the tensor distribution across devices changes.
+
+### Opening a mesh device
+
+```python
+ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D)
+mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices))
+```
+
+A mesh device groups multiple physical devices into a single logical device.
+`FabricConfig.FABRIC_1D` configures a 1D fabric interconnect between devices.
+
+### M-sharding strategy
+
+The M dimension of the output is split evenly: device `i` computes rows
+`i * (M/n_devices)` through `(i+1) * (M/n_devices) - 1`. Because each row of
+the output only depends on the corresponding rows of `a` and `c`, and on the
+full matrix `b`, no inter-device communication is needed.
+
+```python
+a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=0))  # shard M rows
+b = from_torch(b, ttnn.ReplicateTensorToMesh(mesh_device))     # replicate K×N
+c = from_torch(c, ttnn.ShardTensorToMesh(mesh_device, dim=0))  # shard M rows
+y = from_torch(y, ttnn.ShardTensorToMesh(mesh_device, dim=0))  # shard M rows
+```
+
+`ShardTensorToMesh(dim=0)` splits the tensor along its first dimension across
+all devices. `ReplicateTensorToMesh` sends the same tensor to every device.
+
+### Gathering results
+
+After the operation, the per-device output shards are concatenated on the host:
+
+```python
+y = ttnn.to_torch(y, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0))
+```
+
+The TT-Lang operation runs identically on each device in SPMD mode — `grid="auto"`
+applies independently per device, filling the full per-device grid.
+
+## Step 6 — Multi-Device, Shard K
+
+**Script**: [`examples/matmul-tutorial/step_6_multidevice_shard_k.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_6_multidevice_shard_k.py)
+
+This step changes the sharding strategy: instead of splitting M across devices,
+the K (reduction) dimension is split. This allows the matrix multiply to be
+parallelized along the contraction axis at the cost of requiring a reduction
+step to combine results.
+
+### K-sharding strategy
+
+Each device computes a partial dot product over its K slice:
+
+```
+device i: y_i = a[:, K_i] @ b[K_i, :] + c_i
+```
+
+where `K_i` is the slice of K assigned to device `i`. The full result is
+`y = sum(y_i)`.
+
+```python
+a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=1))  # shard K cols
+b = from_torch(b, ttnn.ShardTensorToMesh(mesh_device, dim=0))  # shard K rows
+```
+
+### Handling the bias
+
+The bias `c` must only be added once, not once per device. To handle this
+within the uniform SPMD model, a stacked tensor is constructed where device 0
+receives the real `c` and all other devices receive zeros:
+
+```python
+replicated_cs = torch.zeros((M * n_devices, N), dtype=torch.bfloat16)
+replicated_cs[:M, :] = c  # only the first M rows carry the real bias
+replicated_cs = from_torch(replicated_cs, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+```
+
+After sharding along `dim=0`, device 0 gets `c` and devices 1..n−1 get zeros,
+so the summation `sum(a_i @ b_i + c_i)` correctly produces `a @ b + c`.
+
+### Host-side reduction
+
+Because each device's kernel produces only a partial sum, relu cannot be applied
+inside the kernel. The host collects the partial outputs and reduces them
+manually before activating:
+
+```python
+partial_ys = ttnn.to_torch(partial_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0))
+
+y = torch.zeros((M, N), dtype=torch.bfloat16)
+for i in range(n_devices):
+    y += partial_ys[i * M : (i + 1) * M, :]
+
+y = torch.relu(y)
+```
+
+The TT-Lang operation body drops the `ttl.math.relu` from Step 4 and stores
+the raw `c_blk + acc_blk` result, deferring activation to after the reduction.
+
+## Step 7 — Multi-Device, Shard K with All-Reduce
+
+**Script**: [`examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py)
+
+This step replaces the host-side manual reduction from Step 6 with an
+on-device all-reduce, keeping the result on the mesh and enabling the
+activation to be applied on-device as well.
+
+### All-reduce
+
+```python
+replicated_ys = ttnn.all_reduce(partial_ys)
+replicated_ys = ttnn.relu(replicated_ys)
+```
+
+`ttnn.all_reduce` sums `partial_ys` across all devices using the TT-Fabric
+interconnect. Each device ends up with the fully reduced M×N result — the
+output is replicated rather than sharded. `ttnn.relu` is then applied
+on-device to all replicas in parallel.
+
+### Verifying replicated results
+
+Because all-reduce replicates the result, every device holds a correct copy of
+the full output. The verification loop checks each device's copy independently:
+
+```python
+replicated_ys = ttnn.to_torch(
+    replicated_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0)
+)
+for i in range(n_devices):
+    y = replicated_ys[i * M : (i + 1) * M, :]
+    pcc = ...
+    assert pcc > 0.99
+```
+
+Compared to Step 6, this approach avoids the host round-trip for reduction and
+moves the relu entirely on-device. The TT-Lang operation body is identical to
+Step 6.
diff --git a/examples/matmul-tutorial/step_0_ttnn_base.py b/examples/matmul-tutorial/step_0_ttnn_base.py
new file mode 100644
index 000000000..f57a48403
--- /dev/null
+++ b/examples/matmul-tutorial/step_0_ttnn_base.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 0: TT-NN Baseline
+# ================================
+# This is the starting point: a matmul-bias-activation expressed entirely in
+# TT-NN.  No custom operation is involved.  TT-NN dispatches each op separately,
+# resulting in multiple DRAM round-trips.
+#
+# The operation: y = relu(a @ b + c)
+#
+# The subsequent tutorial steps replace this entire computation with a single
+# fused TT-Lang operation, showing how to take control of data movement and
+# compute explicitly.
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor):
+    # Upload a torch tensor to the device as a ttnn tensor: bfloat16 dtype,
+    # tile layout, DRAM memory config.  Uses the module-level `device`, which
+    # is only assigned further down, so call this after the device is opened.
+    return ttnn.from_torch(
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+    )
+
+
+torch.manual_seed(42)  # fixed seed so the random inputs are reproducible
+
+device = ttnn.open_device(device_id=0)
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+    # Reference computed with torch, before a, b, c are rebound to ttnn tensors.
+    expected_y = torch.relu(a @ b + c)
+
+    a = from_torch(a)
+    b = from_torch(b)
+    c = from_torch(c)
+
+    # TT-NN dispatches three separate operations: matmul, add, relu.
+    # With a custom TT-Lang operation we can fuse all three into a single
+    # kernel, reducing DRAM traffic and operation-launch overhead.
+
+    y = ttnn.relu(ttnn.add(ttnn.matmul(a, b), c))
+
+    y = ttnn.to_torch(y)
+    # Pearson correlation between the flattened result and the reference.
+    pcc = torch.corrcoef(
+        torch.stack([y.flatten().float(), expected_y.flatten().float()])
+    )[0, 1].item()
+
+    print(f"PCC {pcc:.6f}")
+
+    assert pcc > 0.99
+
+finally:
+    ttnn.close_device(device)  # runs even when the assert above fails
diff --git a/examples/matmul-tutorial/step_1_single_node_single_tile_block.py b/examples/matmul-tutorial/step_1_single_node_single_tile_block.py
new file mode 100644
index 000000000..a246a8daf
--- /dev/null
+++ b/examples/matmul-tutorial/step_1_single_node_single_tile_block.py
@@ -0,0 +1,210 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 1: Single Node, Single-Tile Block
+# ================================================
+# Introduces the core TT-Lang programming model for matmul:
+#   - @ttl.operation   — declares an operation and the grid it runs on
+#   - @ttl.compute     — the compute kernel: tile-level matrix multiply and add
+#   - @ttl.datamovement — DM kernels: move data between DRAM and L1
+#   - ttl.make_dataflow_buffer_like — creates an in-L1 dataflow buffer (DFB)
+#     that synchronizes data passing between kernels
+#   - ttl.copy / tx.wait — initiates and awaits a transfer
+#   - ttl.math.fill    — fills a block with a scalar value (used to zero the
+#     accumulator before the k-reduction loop)
+#
+# The operation fuses a @ b + c followed by relu into a single kernel,
+# processing one 32×32 tile at a time.  The outer m×n loop iterates over
+# output tiles; the inner k loop accumulates partial products.
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor):
+    # Upload a torch tensor to the device as a ttnn tensor: bfloat16 dtype,
+    # tile layout, DRAM memory config.  Uses the module-level `device`, which
+    # is only assigned further down, so call this after the device is opened.
+    return ttnn.from_torch(
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+    )
+
+
+import ttl
+
+# Tenstorrent hardware operates on 32×32 tiles.  Tensor dimensions in tile
+# coordinates are obtained by dividing the element-count by TILE_SIZE.
+
+TILE_SIZE = 32
+
+
+# @ttl.operation marks a Python function as a TT-Lang operation.
+# grid=(1, 1) means the operation runs on a single node (one Tensix core).
+# The function signature lists the tensors the operation reads and writes;
+# these live in DRAM and are passed by the host at call time.
+
+
+@ttl.operation(grid=(1, 1))
+def __tutorial_operation(
+    a: ttnn.Tensor,
+    b: ttnn.Tensor,
+    c: ttnn.Tensor,
+    y: ttnn.Tensor,
+) -> None:
+
+    # Compute iteration counts in tile coordinates.
+
+    m_tiles = a.shape[0] // TILE_SIZE  # rows of a, in tiles
+    n_tiles = b.shape[1] // TILE_SIZE  # columns of b, in tiles
+    k_tiles = a.shape[1] // TILE_SIZE  # columns of a (reduction dim), in tiles
+
+    # Dataflow buffers (DFBs) are L1 buffers shared between the kernel functions.
+    # shape=(1, 1) means each entry holds exactly one 32×32 tile.
+    # block_count=2 allocates two blocks, enabling double-buffering: while the
+    # compute kernel processes one entry, the DM kernel can fill the other.
+    #
+    # acc_dfb is the running accumulator for the k-reduction.  It is both
+    # produced and consumed by the compute kernel in a ping-pong pattern:
+    # each k-step reads the previous partial sum (pre_acc_blk) and writes a
+    # new one (acc_blk), so block_count=2 allows the two slots to alternate.
+
+    a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+    b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2)
+    c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2)
+    acc_dfb = ttl.make_dataflow_buffer_like(y, shape=(1, 1), block_count=2)
+    y_dfb = ttl.make_dataflow_buffer_like(y, shape=(1, 1), block_count=2)
+
+    # The DM reader runs concurrently with the compute kernel.
+    # For each output tile (m, n) it first reads the bias tile c[m, n], then
+    # streams all k input tiles for a and b into their respective DFBs.
+
+    @ttl.datamovement()
+    def read():
+        for m_tile in range(m_tiles):
+            for n_tile in range(n_tiles):
+
+                # Read the bias tile for this (m, n) output position first so
+                # it is available when the compute kernel finishes accumulating.
+
+                with c_dfb.reserve() as c_blk:
+                    tx_c = ttl.copy(
+                        c[m_tile, n_tile],
+                        c_blk,
+                    )
+
+                    tx_c.wait()
+
+                for k_tile in range(k_tiles):
+
+                    # Stream a[m, k] and b[k, n] tiles into L1 for each step
+                    # of the k-reduction.
+
+                    with (
+                        a_dfb.reserve() as a_blk,
+                        b_dfb.reserve() as b_blk,
+                    ):
+                        tx_a = ttl.copy(
+                            a[m_tile, k_tile],
+                            a_blk,
+                        )
+                        tx_b = ttl.copy(
+                            b[k_tile, n_tile],
+                            b_blk,
+                        )
+                        # Both copies are started before either one is awaited.
+                        tx_a.wait()
+                        tx_b.wait()
+
+    # The compute kernel accumulates partial matmul products across k, then
+    # adds the bias and applies relu before writing the result to y_dfb.
+
+    @ttl.compute()
+    def compute():
+        for _ in range(m_tiles):
+            for _ in range(n_tiles):
+
+                # Initialize the accumulator to zero before the k loop.
+                # ttl.math.fill produces a block expression; store() materializes
+                # it into acc_blk, and leaving the with block pushes it to the k loop.
+
+                with acc_dfb.reserve() as acc_blk:
+                    acc_blk.store(ttl.math.fill(acc_blk, 0))
+
+                for _ in range(k_tiles):
+
+                    # Consume the previous partial sum (pre_acc_blk) along with
+                    # the next a and b tiles, compute the updated partial sum,
+                    # and push it back into acc_dfb for the next k-step.
+
+                    with (
+                        a_dfb.wait() as a_blk,
+                        b_dfb.wait() as b_blk,
+                        acc_dfb.wait() as pre_acc_blk,
+                    ):
+                        with acc_dfb.reserve() as acc_blk:
+                            acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+                # After k is exhausted, add the bias and apply relu in one step.
+
+                with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+                    with y_dfb.reserve() as y_blk:
+                        y_blk.store(ttl.math.relu(c_blk + acc_blk))
+
+    # The DM writer reads completed output tiles from y_dfb and writes them
+    # back to the output tensor in DRAM.
+
+    @ttl.datamovement()
+    def write():
+        for m_tile in range(m_tiles):
+            for n_tile in range(n_tiles):
+                with y_dfb.wait() as y_blk:
+                    tx = ttl.copy(
+                        y_blk,
+                        y[m_tile, n_tile],
+                    )
+                    tx.wait()
+
+
+def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor):
+    y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16))
+    __tutorial_operation(a, b, c, y)  # the write kernel stores the result into y
+    return y
+
+
+torch.manual_seed(42)  # fixed seed so the random inputs are reproducible
+
+device = ttnn.open_device(device_id=0)
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+    # Reference computed with torch, before a, b, c are rebound to ttnn tensors.
+    expected_y = torch.relu(a @ b + c)
+
+    a = from_torch(a)
+    b = from_torch(b)
+    c = from_torch(c)
+    # Run the fused TT-Lang operation; it allocates and returns the output y.
+    y = tutorial_operation(a, b, c)
+
+    y = ttnn.to_torch(y)
+    # Pearson correlation between the flattened result and the reference.
+    pcc = torch.corrcoef(
+        torch.stack([y.flatten().float(), expected_y.flatten().float()])
+    )[0, 1].item()
+
+    print(f"PCC {pcc:.6f}")
+
+    assert pcc > 0.99
+
+finally:
+    ttnn.close_device(device)  # runs even when the assert above fails
diff --git a/examples/matmul-tutorial/step_2_single_node_multitile_block.py b/examples/matmul-tutorial/step_2_single_node_multitile_block.py
new file mode 100644
index 000000000..c2e88e4e0
--- /dev/null
+++ b/examples/matmul-tutorial/step_2_single_node_multitile_block.py
@@ -0,0 +1,220 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 2: Single Node, Multi-Tile Block
+# ===============================================
+# Builds on Step 1 by processing multiple tiles per dataflow buffer entry
+# instead of one tile at a time.
+#
+# New concepts introduced:
+#   - Multi-tile blocks: each DFB entry holds a granularity-sized patch of
+#     tiles.  Fewer, larger memory transfers reduce per-transfer overhead and
+#     give the compute kernel more work per synchronization round-trip.
+#   - Asymmetric block shapes: a, b, and c have different tile dimensions
+#     (M×K, K×N, and M×N respectively), so their DFBs use matching shapes.
+#
+# Everything else (single node, same three-kernel structure) is identical to
+# Step 1.  The loop bodies are unchanged; only the DFB shapes and the tensor
+# slice ranges differ.
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor):
+    return ttnn.from_torch(  # host torch tensor -> device ttnn tensor
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,  # module-level handle, opened before the first call
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+    )
+
+
+import ttl
+
+TILE_SIZE = 32  # tile edge length in elements (tiles are 32×32)
+
+# M_GRANULARITY, N_GRANULARITY, K_GRANULARITY control how many tiles fit along
+# each matmul dimension per block.  With all set to 4, each a-block is a 4×4
+# patch of tiles (128×128 elements), each b-block is 4×4, and each c/y-block
+# is 4×4 in M×N space.
+
+M_GRANULARITY = 4  # tiles per block along M
+N_GRANULARITY = 4  # tiles per block along N
+K_GRANULARITY = 4  # tiles per block along K
+
+
+@ttl.operation(grid=(1, 1))  # single node, as in Step 1
+def __tutorial_operation(
+    a: ttnn.Tensor,
+    b: ttnn.Tensor,
+    c: ttnn.Tensor,
+    y: ttnn.Tensor,
+) -> None:
+    m_tiles_per_block = M_GRANULARITY
+    n_tiles_per_block = N_GRANULARITY
+    k_tiles_per_block = K_GRANULARITY
+
+    # Counts are now in blocks, not tiles; floor division skips any partial block.
+
+    m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block
+    n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block
+    k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block
+
+    # DFB shapes match the tile dimensions of each tensor operand:
+    #   a: M×K → shape (m_tiles_per_block, k_tiles_per_block)
+    #   b: K×N → shape (k_tiles_per_block, n_tiles_per_block)
+    #   c, acc, y: M×N → shape (m_tiles_per_block, n_tiles_per_block)
+
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    c_dfb = ttl.make_dataflow_buffer_like(
+        c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    acc_dfb = ttl.make_dataflow_buffer_like(  # used only inside compute()
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    y_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+
+    @ttl.datamovement()
+    def read():
+        for m_block in range(m_blocks):
+
+            # Convert block index to tile index range for the tensor slice.
+
+            start_m_tile = m_block * m_tiles_per_block
+            end_m_tile = (m_block + 1) * m_tiles_per_block  # exclusive end
+
+            for n_block in range(n_blocks):
+                start_n_tile = n_block * n_tiles_per_block
+                end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                # Slice with a range to copy the entire M×N block in one transfer.
+
+                with c_dfb.reserve() as c_blk:  # bias block: once per output block
+                    tx_c = ttl.copy(
+                        c[
+                            start_m_tile:end_m_tile,
+                            start_n_tile:end_n_tile,
+                        ],
+                        c_blk,
+                    )
+
+                    tx_c.wait()
+
+                for k_block in range(k_blocks):  # one a/b block pair per k step
+                    start_k_tile = k_block * k_tiles_per_block
+                    end_k_tile = (k_block + 1) * k_tiles_per_block
+                    with (
+                        a_dfb.reserve() as a_blk,
+                        b_dfb.reserve() as b_blk,
+                    ):
+                        tx_a = ttl.copy(
+                            a[
+                                start_m_tile:end_m_tile,
+                                start_k_tile:end_k_tile,
+                            ],
+                            a_blk,
+                        )
+                        tx_b = ttl.copy(
+                            b[
+                                start_k_tile:end_k_tile,
+                                start_n_tile:end_n_tile,
+                            ],
+                            b_blk,
+                        )
+
+                        tx_a.wait()  # both copies were issued before waiting
+                        tx_b.wait()
+
+    # The compute kernel is unchanged in structure from Step 1.  The hardware
+    # now operates on full multi-tile blocks per iteration rather than single
+    # tiles, amortizing synchronization overhead over more compute work.
+
+    @ttl.compute()
+    def compute():
+        for _ in range(m_blocks):
+            for _ in range(n_blocks):
+                with acc_dfb.reserve() as acc_blk:
+                    acc_blk.store(ttl.math.fill(acc_blk, 0))  # zero the accumulator
+
+                for _ in range(k_blocks):
+                    with (
+                        a_dfb.wait() as a_blk,
+                        b_dfb.wait() as b_blk,
+                        acc_dfb.wait() as pre_acc_blk,  # running sum so far
+                    ):
+                        with acc_dfb.reserve() as acc_blk:
+                            acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+                with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+                    with y_dfb.reserve() as y_blk:
+                        y_blk.store(ttl.math.relu(c_blk + acc_blk))  # relu(a@b + c)
+
+    @ttl.datamovement()
+    def write():
+        for m_block in range(m_blocks):  # same (m, n) order as compute()
+            start_m_tile = m_block * m_tiles_per_block
+            end_m_tile = (m_block + 1) * m_tiles_per_block
+
+            for n_block in range(n_blocks):
+                start_n_tile = n_block * n_tiles_per_block
+                end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                with y_dfb.wait() as y_blk:  # next completed output block
+                    tx = ttl.copy(
+                        y_blk,
+                        y[
+                            start_m_tile:end_m_tile,
+                            start_n_tile:end_n_tile,
+                        ],
+                    )
+                    tx.wait()
+
+
+def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor):
+    y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16))
+    __tutorial_operation(a, b, c, y)  # the write kernel stores the result in y
+    return y  # still a device tensor; the caller converts it with ttnn.to_torch
+
+
+torch.manual_seed(42)  # fixed seed so the random inputs are reproducible
+
+device = ttnn.open_device(device_id=0)
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+
+    expected_y = torch.relu(a @ b + c)  # host (torch) reference result
+
+    a = from_torch(a)  # a, b, c are rebound to their device copies
+    b = from_torch(b)
+    c = from_torch(c)
+
+    y = tutorial_operation(a, b, c)
+
+    y = ttnn.to_torch(y)  # bring the result back as a torch tensor
+
+    pcc = torch.corrcoef(  # Pearson correlation, result vs. reference
+        torch.stack([y.flatten().float(), expected_y.flatten().float()])
+    )[0, 1].item()
+
+    print(f"PCC {pcc:.6f}")
+
+    assert pcc > 0.99
+
+finally:
+    ttnn.close_device(device)  # always release the device, even on failure
diff --git a/examples/matmul-tutorial/step_3_multinode.py b/examples/matmul-tutorial/step_3_multinode.py
new file mode 100644
index 000000000..23368397e
--- /dev/null
+++ b/examples/matmul-tutorial/step_3_multinode.py
@@ -0,0 +1,236 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 3: Multi-Node, Fixed Grid
+# ========================================
+# Extends Step 2 by running the operation across a grid of nodes in parallel.
+#
+# New concepts introduced:
+#   - grid=(4, 4)           — run the operation on a 4×4 grid of nodes (16 cores)
+#   - ttl.grid_size(dims=2) — query the (n, m) grid dimensions at runtime
+#   - ttl.node(dims=2)      — query this node's (node_n, node_m) position
+#
+# Each node processes an independent rectangular region of the output tensor,
+# partitioned along the M and N dimensions.  The K dimension is not partitioned:
+# every node iterates over all k_blocks, so it accumulates the complete product
+# for its region.  This requires the block counts to be evenly divisible by the
+# grid (see Step 4 for a version that handles remainders).
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor):
+    return ttnn.from_torch(  # host torch tensor -> device ttnn tensor
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,  # module-level handle, opened before the first call
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+    )
+
+
+import ttl
+
+TILE_SIZE = 32  # tile edge length in elements
+M_GRANULARITY = 4  # tiles per block along M
+N_GRANULARITY = 4  # tiles per block along N
+K_GRANULARITY = 4  # tiles per block along K
+
+
+# grid=(4, 4) launches the operation body on every node of a 4-column × 4-row
+# grid.  All nodes execute the same code; they differentiate their work via
+# ttl.node().
+
+
+@ttl.operation(grid=(4, 4))
+def __tutorial_operation(
+    a: ttnn.Tensor,
+    b: ttnn.Tensor,
+    c: ttnn.Tensor,
+    y: ttnn.Tensor,
+) -> None:
+    m_tiles_per_block = M_GRANULARITY
+    n_tiles_per_block = N_GRANULARITY
+    k_tiles_per_block = K_GRANULARITY
+
+    m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block  # whole-tensor counts
+    n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block
+    k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block
+
+    # ttl.grid_size returns (grid_n, grid_m) matching the (n, m) convention
+    # used by ttl.node.  The M×N output space is partitioned across the grid
+    # so that each node handles an independent slice of it.
+
+    grid_n, grid_m = ttl.grid_size(dims=2)
+
+    # Divide the total block count evenly across the grid.  Assumes m_blocks
+    # and n_blocks are multiples of the grid size; any remainder is skipped.
+
+    m_blocks_per_node = m_blocks // grid_m
+    n_blocks_per_node = n_blocks // grid_n
+
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    c_dfb = ttl.make_dataflow_buffer_like(
+        c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    acc_dfb = ttl.make_dataflow_buffer_like(  # used only inside compute()
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    y_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+
+    @ttl.datamovement()
+    def read():
+
+        # ttl.node() returns the zero-based coordinates of this specific node.
+        # node_n and node_m are used to offset into the global tensor.
+
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+
+            # Map local block index to global block index.
+
+            m_block = node_m * m_blocks_per_node + local_m_block
+            start_m_tile = m_block * m_tiles_per_block
+            end_m_tile = (m_block + 1) * m_tiles_per_block  # exclusive end
+
+            for local_n_block in range(n_blocks_per_node):
+                n_block = node_n * n_blocks_per_node + local_n_block
+                start_n_tile = n_block * n_tiles_per_block
+                end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                with c_dfb.reserve() as c_blk:  # bias block: once per output block
+                    tx_c = ttl.copy(
+                        c[
+                            start_m_tile:end_m_tile,
+                            start_n_tile:end_n_tile,
+                        ],
+                        c_blk,
+                    )
+
+                    tx_c.wait()
+
+                # Every node iterates over the full k dimension, so it computes
+                # the complete matmul result for its output block independently.
+
+                for k_block in range(k_blocks):
+                    start_k_tile = k_block * k_tiles_per_block
+                    end_k_tile = (k_block + 1) * k_tiles_per_block
+                    with (
+                        a_dfb.reserve() as a_blk,
+                        b_dfb.reserve() as b_blk,
+                    ):
+                        tx_a = ttl.copy(
+                            a[
+                                start_m_tile:end_m_tile,
+                                start_k_tile:end_k_tile,
+                            ],
+                            a_blk,
+                        )
+                        tx_b = ttl.copy(
+                            b[
+                                start_k_tile:end_k_tile,
+                                start_n_tile:end_n_tile,
+                            ],
+                            b_blk,
+                        )
+
+                        tx_a.wait()  # both copies were issued before waiting
+                        tx_b.wait()
+
+    # The compute kernel iterates over the blocks assigned to this node.
+    # It does not need to know its node coordinates: the DM kernels already
+    # stream only the relevant tiles into the DFBs.
+
+    @ttl.compute()
+    def compute():
+        for _ in range(m_blocks_per_node):
+            for _ in range(n_blocks_per_node):
+                with acc_dfb.reserve() as acc_blk:
+                    acc_blk.store(ttl.math.fill(acc_blk, 0))  # zero the accumulator
+
+                for _ in range(k_blocks):
+                    with (
+                        a_dfb.wait() as a_blk,
+                        b_dfb.wait() as b_blk,
+                        acc_dfb.wait() as pre_acc_blk,  # running sum so far
+                    ):
+                        with acc_dfb.reserve() as acc_blk:
+                            acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+                with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+                    with y_dfb.reserve() as y_blk:
+                        y_blk.store(ttl.math.relu(c_blk + acc_blk))  # relu(a@b + c)
+
+    @ttl.datamovement()
+    def write():
+        node_n, node_m = ttl.node(dims=2)  # same index mapping as read()
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            start_m_tile = m_block * m_tiles_per_block
+            end_m_tile = (m_block + 1) * m_tiles_per_block
+
+            for local_n_block in range(n_blocks_per_node):
+                n_block = node_n * n_blocks_per_node + local_n_block
+                start_n_tile = n_block * n_tiles_per_block
+                end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                with y_dfb.wait() as y_blk:  # next completed output block
+                    tx = ttl.copy(
+                        y_blk,
+                        y[
+                            start_m_tile:end_m_tile,
+                            start_n_tile:end_n_tile,
+                        ],
+                    )
+                    tx.wait()
+
+
+def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor):
+    y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16))
+    __tutorial_operation(a, b, c, y)  # the write kernel stores the result in y
+    return y  # still a device tensor; the caller converts it with ttnn.to_torch
+
+
+torch.manual_seed(42)  # fixed seed so the random inputs are reproducible
+
+device = ttnn.open_device(device_id=0)
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+
+    expected_y = torch.relu(a @ b + c)  # host (torch) reference result
+
+    a = from_torch(a)  # a, b, c are rebound to their device copies
+    b = from_torch(b)
+    c = from_torch(c)
+
+    y = tutorial_operation(a, b, c)
+
+    y = ttnn.to_torch(y)  # bring the result back as a torch tensor
+
+    pcc = torch.corrcoef(  # Pearson correlation, result vs. reference
+        torch.stack([y.flatten().float(), expected_y.flatten().float()])
+    )[0, 1].item()
+
+    print(f"PCC {pcc:.6f}")
+
+    assert pcc > 0.99
+
+finally:
+    ttnn.close_device(device)  # always release the device, even on failure
diff --git a/examples/matmul-tutorial/step_4_multinode_grid_auto.py b/examples/matmul-tutorial/step_4_multinode_grid_auto.py
new file mode 100644
index 000000000..aa87b49e0
--- /dev/null
+++ b/examples/matmul-tutorial/step_4_multinode_grid_auto.py
@@ -0,0 +1,240 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 4: Multi-Node, Auto Grid
+# =======================================
+# Extends Step 3 by removing the hard-coded grid size and handling tensor
+# dimensions that are not evenly divisible by the grid.
+#
+# New concepts introduced:
+#   - grid="auto"     — the compiler picks the largest grid available in the
+#                       hardware; the operation must not assume any specific
+#                       grid dimensions
+#   - ceiling division — ensures every block is assigned to a node even when
+#                        the block count doesn't divide evenly across the grid
+#   - bounds checking  — nodes at the trailing edge of the grid may have fewer
+#                        blocks to process; guard all per-block work with
+#                        `if m_block < m_blocks` / `if n_block < n_blocks`
+#
+# Because all three kernels must agree on which blocks to process, the bounds
+# check appears in every kernel function.
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor):
+    return ttnn.from_torch(  # host torch tensor -> device ttnn tensor
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,  # module-level handle, opened before the first call
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+    )
+
+
+import ttl
+
+TILE_SIZE = 32  # tile edge length in elements
+M_GRANULARITY = 4  # tiles per block along M
+N_GRANULARITY = 4  # tiles per block along N
+K_GRANULARITY = 4  # tiles per block along K
+
+
+# grid="auto" asks the compiler to select the grid at compile time based on
+# available hardware resources.  The operation body must work correctly for any
+# grid the compiler may choose.
+
+
+@ttl.operation(grid="auto")
+def __tutorial_operation(
+    a: ttnn.Tensor,
+    b: ttnn.Tensor,
+    c: ttnn.Tensor,
+    y: ttnn.Tensor,
+) -> None:
+    m_tiles_per_block = M_GRANULARITY
+    n_tiles_per_block = N_GRANULARITY
+    k_tiles_per_block = K_GRANULARITY
+
+    # Total block counts across the entire tensor (not per-node).
+
+    m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block
+    n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block
+    k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block
+
+    grid_n, grid_m = ttl.grid_size(dims=2)  # whatever grid "auto" selected
+
+    # Ceiling division: -(-x // y) is a concise Python idiom for ceil(x / y).
+    # This ensures every block is covered even when m_blocks or n_blocks is not
+    # a multiple of the grid size.  Nodes in the last row/column of the grid
+    # may receive fewer blocks and rely on the bounds checks below to skip
+    # out-of-range work.
+
+    m_blocks_per_node = -(-m_blocks // grid_m)  # divceil
+    n_blocks_per_node = -(-n_blocks // grid_n)  # divceil
+
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    c_dfb = ttl.make_dataflow_buffer_like(
+        c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    acc_dfb = ttl.make_dataflow_buffer_like(  # used only inside compute()
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    y_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+
+    @ttl.datamovement()
+    def read():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+
+            # Skip if this node was assigned more iterations than there are
+            # actual blocks (happens at the trailing edge of the grid).
+
+            if m_block < m_blocks:
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block  # exclusive end
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:  # same guard along N
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with c_dfb.reserve() as c_blk:  # bias block for this output
+                            tx_c = ttl.copy(
+                                c[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                                c_blk,
+                            )
+
+                            tx_c.wait()
+
+                        for k_block in range(k_blocks):  # one a/b pair per k step
+                            start_k_tile = k_block * k_tiles_per_block
+                            end_k_tile = (k_block + 1) * k_tiles_per_block
+                            with (
+                                a_dfb.reserve() as a_blk,
+                                b_dfb.reserve() as b_blk,
+                            ):
+                                tx_a = ttl.copy(
+                                    a[
+                                        start_m_tile:end_m_tile,
+                                        start_k_tile:end_k_tile,
+                                    ],
+                                    a_blk,
+                                )
+                                tx_b = ttl.copy(
+                                    b[
+                                        start_k_tile:end_k_tile,
+                                        start_n_tile:end_n_tile,
+                                    ],
+                                    b_blk,
+                                )
+
+                                tx_a.wait()  # both copies were issued before waiting
+                                tx_b.wait()
+
+    @ttl.compute()
+    def compute():
+        node_n, node_m = ttl.node(dims=2)  # needed only for the bounds checks
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:  # must mirror the guards in read()/write()
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        with acc_dfb.reserve() as acc_blk:
+                            acc_blk.store(ttl.math.fill(acc_blk, 0))  # start from zero
+
+                        for _ in range(k_blocks):
+                            with (
+                                a_dfb.wait() as a_blk,
+                                b_dfb.wait() as b_blk,
+                                acc_dfb.wait() as pre_acc_blk,  # running sum so far
+                            ):
+                                with acc_dfb.reserve() as acc_blk:
+                                    acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+                        with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+                            with y_dfb.reserve() as y_blk:
+                                y_blk.store(ttl.math.relu(c_blk + acc_blk))
+
+    @ttl.datamovement()
+    def write():
+        node_n, node_m = ttl.node(dims=2)  # same index mapping as read()
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with y_dfb.wait() as y_blk:  # next completed output block
+                            tx = ttl.copy(
+                                y_blk,
+                                y[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                            )
+                            tx.wait()
+
+
+def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor):
+    y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16))
+    __tutorial_operation(a, b, c, y)  # the write kernel stores the result in y
+    return y  # still a device tensor; the caller converts it with ttnn.to_torch
+
+
+torch.manual_seed(42)  # fixed seed so the random inputs are reproducible
+
+device = ttnn.open_device(device_id=0)
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+
+    expected_y = torch.relu(a @ b + c)  # host (torch) reference result
+
+    a = from_torch(a)  # a, b, c are rebound to their device copies
+    b = from_torch(b)
+    c = from_torch(c)
+
+    y = tutorial_operation(a, b, c)
+
+    y = ttnn.to_torch(y)  # bring the result back as a torch tensor
+
+    pcc = torch.corrcoef(  # Pearson correlation, result vs. reference
+        torch.stack([y.flatten().float(), expected_y.flatten().float()])
+    )[0, 1].item()
+
+    print(f"PCC {pcc:.6f}")
+
+    assert pcc > 0.99
+
+finally:
+    ttnn.close_device(device)  # always release the device, even on failure
diff --git a/examples/matmul-tutorial/step_5_multidevice_shard_m.py b/examples/matmul-tutorial/step_5_multidevice_shard_m.py
new file mode 100644
index 000000000..378f7b2e2
--- /dev/null
+++ b/examples/matmul-tutorial/step_5_multidevice_shard_m.py
@@ -0,0 +1,246 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 5: Multi-Device, Shard M
+# =======================================
+# Extends Step 4 to run across multiple devices using SPMD (Single-Program
+# Multiple-Data) mode.  The TT-Lang operation itself is unchanged; only the
+# tensor distribution across devices differs.
+#
+# New concepts introduced:
+#   - ttnn.MeshShape / ttnn.open_mesh_device — open a 1D mesh of all available
+#     devices
+#   - ttnn.ShardTensorToMesh(dim=0) — split a tensor along the M dimension so
+#     each device receives M/n_devices rows
+#   - ttnn.ReplicateTensorToMesh   — send the same tensor to every device
+#   - ttnn.ConcatMeshToTensor(dim=0) — gather per-device output tensors back to
+#     the host by concatenating along M
+#
+# Sharding strategy: a and c are sharded along M (rows), b is replicated.
+# Each device computes its portion of the M×N output independently with no
+# inter-device communication required.  The host concatenates the results.
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor, mesh_mapper):
+
+    # Upload a torch tensor to mesh-device DRAM as a tiled bfloat16 tensor;
+    # mesh_mapper decides how it is sharded or replicated across devices.
+
+    return ttnn.from_torch(
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=mesh_device,  # module-level mesh handle, opened before use
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+        mesh_mapper=mesh_mapper,
+    )
+
+
+import ttl
+
+TILE_SIZE = 32  # tile edge length in elements
+M_GRANULARITY = 4  # tiles per block along M
+N_GRANULARITY = 4  # tiles per block along N
+K_GRANULARITY = 4  # tiles per block along K
+
+
+# The TT-Lang operation body is identical to Step 4.  grid="auto" applies
+# independently to each device in SPMD mode; each device fills its own grid.
+
+
+@ttl.operation(grid="auto")  # grid is chosen per device in SPMD mode
+def tutorial_operation(
+    a: ttnn.Tensor,
+    b: ttnn.Tensor,
+    c: ttnn.Tensor,
+    y: ttnn.Tensor,
+) -> None:
+    m_tiles_per_block = M_GRANULARITY
+    n_tiles_per_block = N_GRANULARITY
+    k_tiles_per_block = K_GRANULARITY
+
+    m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block
+    n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block
+    k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block
+
+    grid_n, grid_m = ttl.grid_size(dims=2)
+
+    m_blocks_per_node = -(-m_blocks // grid_m)  # divceil
+    n_blocks_per_node = -(-n_blocks // grid_n)  # divceil
+
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    c_dfb = ttl.make_dataflow_buffer_like(
+        c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    acc_dfb = ttl.make_dataflow_buffer_like(  # used only inside compute()
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    y_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+
+    @ttl.datamovement()
+    def read():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:  # skip iterations past the last real block
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block  # exclusive end
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:  # same guard along N
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with c_dfb.reserve() as c_blk:  # bias block for this output
+                            tx_c = ttl.copy(
+                                c[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                                c_blk,
+                            )
+
+                            tx_c.wait()
+
+                        for k_block in range(k_blocks):  # one a/b pair per k step
+                            start_k_tile = k_block * k_tiles_per_block
+                            end_k_tile = (k_block + 1) * k_tiles_per_block
+                            with (
+                                a_dfb.reserve() as a_blk,
+                                b_dfb.reserve() as b_blk,
+                            ):
+                                tx_a = ttl.copy(
+                                    a[
+                                        start_m_tile:end_m_tile,
+                                        start_k_tile:end_k_tile,
+                                    ],
+                                    a_blk,
+                                )
+                                tx_b = ttl.copy(
+                                    b[
+                                        start_k_tile:end_k_tile,
+                                        start_n_tile:end_n_tile,
+                                    ],
+                                    b_blk,
+                                )
+
+                                tx_a.wait()  # both copies were issued before waiting
+                                tx_b.wait()
+
+    @ttl.compute()
+    def compute():
+        node_n, node_m = ttl.node(dims=2)  # needed only for the bounds checks
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:  # must mirror the guards in read()/write()
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        with acc_dfb.reserve() as acc_blk:
+                            acc_blk.store(ttl.math.fill(acc_blk, 0))  # start from zero
+
+                        for _ in range(k_blocks):
+                            with (
+                                a_dfb.wait() as a_blk,
+                                b_dfb.wait() as b_blk,
+                                acc_dfb.wait() as pre_acc_blk,  # running sum so far
+                            ):
+                                with acc_dfb.reserve() as acc_blk:
+                                    acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+                        with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+                            with y_dfb.reserve() as y_blk:
+                                y_blk.store(ttl.math.relu(c_blk + acc_blk))
+
+    @ttl.datamovement()
+    def write():
+        node_n, node_m = ttl.node(dims=2)  # same index mapping as read()
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with y_dfb.wait() as y_blk:  # next completed output block
+                            tx = ttl.copy(
+                                y_blk,
+                                y[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                            )
+                            tx.wait()
+
+
+torch.manual_seed(42)  # fixed seed so the random inputs are reproducible
+
+n_devices = ttnn.GetNumAvailableDevices()
+assert n_devices > 0 and (
+    n_devices & (n_devices - 1) == 0  # bit trick: true only for powers of two
+), "Number of available devices must be power of 2 "
+ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D)  # set before opening the mesh
+
+# Open a 1D mesh of all available devices.  Each device will process an
+# independent M/n_devices slice of the output rows.
+
+mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices))
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+
+    expected_y = torch.relu(a @ b + c)  # host (torch) reference, unsharded
+
+    # Distribute tensors across devices:
+    #   a: sharded along M (each device gets M/n_devices rows)
+    #   b: replicated on every device (all devices need the full K×N matrix)
+    #   c: sharded along M to match the corresponding rows of a
+
+    a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+    b = from_torch(b, ttnn.ReplicateTensorToMesh(mesh_device))
+    c = from_torch(c, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+
+    y = torch.zeros((M, N), dtype=torch.bfloat16)  # output, M-sharded like a and c
+    y = from_torch(y, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+
+    tutorial_operation(a, b, c, y)  # the write kernel stores the result in y
+
+    # Gather per-device output shards back to the host by concatenating along M.
+
+    y = ttnn.to_torch(y, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0))
+
+    pcc = torch.corrcoef(  # Pearson correlation, result vs. reference
+        torch.stack([y.flatten().float(), expected_y.flatten().float()])
+    )[0, 1].item()
+
+    print(f"PCC {pcc:.6f}")
+
+    assert pcc > 0.99
+
+finally:
+    ttnn.close_device(mesh_device)  # NOTE(review): or close_mesh_device? confirm
diff --git a/examples/matmul-tutorial/step_6_multidevice_shard_k.py b/examples/matmul-tutorial/step_6_multidevice_shard_k.py
new file mode 100644
index 000000000..2cd2cba88
--- /dev/null
+++ b/examples/matmul-tutorial/step_6_multidevice_shard_k.py
@@ -0,0 +1,260 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 6: Multi-Device, Shard K
+# ========================================
+# Changes the sharding strategy from Step 5: instead of splitting M across
+# devices, this step splits the K (reduction) dimension.
+#
+# New concepts introduced:
+#   - K-sharding: a is sharded along K (dim=1), b is sharded along K (dim=0).
+#     Each device computes a partial product a_i @ b_i over its K slice.
+#   - Partial results: because K is split, no single device has the full dot
+#     product.  Each device produces a partial sum y_i = a_i @ b_i + c_i
+#     (where only device 0 carries the real bias c; the rest use zeros).
+#   - Host-side reduction: the host manually sums partial_ys across devices
+#     and applies relu after the reduction.
+#
+# The TT-Lang operation body is mostly unchanged from Step 4.  The kernel produces
+# a partial output (no relu) and writes it to y; the relu is deferred to the
+# host so it can be applied after the cross-device reduction.
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor, mesh_mapper):
+    return ttnn.from_torch(
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=mesh_device,
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+        mesh_mapper=mesh_mapper,
+    )
+
+
+import ttl
+
+TILE_SIZE = 32
+M_GRANULARITY = 4
+N_GRANULARITY = 4
+K_GRANULARITY = 4
+
+
+# The operation body is identical to Step 4 except relu is removed from the
+# final store — the kernel now writes the raw c + acc result so the host can
+# sum partial outputs across devices before activating.
+
+
+@ttl.operation(grid="auto")
+def tutorial_operation(
+    a: ttnn.Tensor,
+    b: ttnn.Tensor,
+    c: ttnn.Tensor,
+    y: ttnn.Tensor,
+) -> None:
+    m_tiles_per_block = M_GRANULARITY
+    n_tiles_per_block = N_GRANULARITY
+    k_tiles_per_block = K_GRANULARITY
+
+    m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block
+    n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block
+    k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block
+
+    grid_n, grid_m = ttl.grid_size(dims=2)
+
+    m_blocks_per_node = -(-m_blocks // grid_m)  # divceil
+    n_blocks_per_node = -(-n_blocks // grid_n)  # divceil
+
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    c_dfb = ttl.make_dataflow_buffer_like(
+        c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    acc_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    y_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+
+    @ttl.datamovement()
+    def read():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with c_dfb.reserve() as c_blk:
+                            tx_c = ttl.copy(
+                                c[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                                c_blk,
+                            )
+
+                            tx_c.wait()
+
+                        for k_block in range(k_blocks):
+                            start_k_tile = k_block * k_tiles_per_block
+                            end_k_tile = (k_block + 1) * k_tiles_per_block
+                            with (
+                                a_dfb.reserve() as a_blk,
+                                b_dfb.reserve() as b_blk,
+                            ):
+                                tx_a = ttl.copy(
+                                    a[
+                                        start_m_tile:end_m_tile,
+                                        start_k_tile:end_k_tile,
+                                    ],
+                                    a_blk,
+                                )
+                                tx_b = ttl.copy(
+                                    b[
+                                        start_k_tile:end_k_tile,
+                                        start_n_tile:end_n_tile,
+                                    ],
+                                    b_blk,
+                                )
+
+                                tx_a.wait()
+                                tx_b.wait()
+
+    @ttl.compute()
+    def compute():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        with acc_dfb.reserve() as acc_blk:
+                            acc_blk.store(ttl.math.fill(acc_blk, 0))
+
+                        for _ in range(k_blocks):
+                            with (
+                                a_dfb.wait() as a_blk,
+                                b_dfb.wait() as b_blk,
+                                acc_dfb.wait() as pre_acc_blk,
+                            ):
+                                with acc_dfb.reserve() as acc_blk:
+                                    acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+                        with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+                            with y_dfb.reserve() as y_blk:
+                                y_blk.store(c_blk + acc_blk)
+
+    @ttl.datamovement()
+    def write():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with y_dfb.wait() as y_blk:
+                            tx = ttl.copy(
+                                y_blk,
+                                y[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                            )
+                            tx.wait()
+
+
+torch.manual_seed(42)
+
+n_devices = ttnn.GetNumAvailableDevices()
+assert n_devices > 0 and (
+    n_devices & (n_devices - 1) == 0
+), "Number of available devices must be power of 2 "
+ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D)
+mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices))
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+
+    expected_y = torch.relu(a @ b + c)
+
+    # Distribute tensors across devices for K-sharding:
+    #   a: sharded along K (dim=1) — each device gets an M×(K/n_devices) slice
+    #   b: sharded along K (dim=0) — each device gets a (K/n_devices)×N slice
+
+    a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=1))
+    b = from_torch(b, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+
+    # The bias c should only be added once, not once per device.  Build a
+    # stacked tensor of shape (M * n_devices, N): device 0 gets the real c,
+    # all other devices get zeros.  After sharding along dim=0, each device
+    # receives its M×N slice: c for device 0, zeros for the rest.
+
+    replicated_cs = torch.zeros((M * n_devices, N), dtype=torch.bfloat16)
+    replicated_cs[:M, :] = c
+    replicated_cs = from_torch(
+        replicated_cs, ttnn.ShardTensorToMesh(mesh_device, dim=0)
+    )
+
+    # partial_ys collects the per-device output: y_i = a_i @ b_i + c_i.
+    # These are partial sums that must be reduced on the host.
+
+    partial_ys = torch.zeros((M * n_devices, N), dtype=torch.bfloat16)
+    partial_ys = from_torch(partial_ys, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+
+    tutorial_operation(a, b, replicated_cs, partial_ys)
+
+    partial_ys = ttnn.to_torch(
+        partial_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0)
+    )
+
+    # Sum the partial products from each device to recover the full matmul
+    # result, then apply relu on the host.
+
+    y = torch.zeros((M, N))
+
+    for i in range(n_devices):
+        y += partial_ys[i * M : (i + 1) * M, :]
+
+    y = torch.relu(y)
+
+    pcc = torch.corrcoef(
+        torch.stack([y.flatten().float(), expected_y.flatten().float()])
+    )[0, 1].item()
+
+    print(f"PCC {pcc:.6f}")
+
+    assert pcc > 0.99
+
+finally:
+    ttnn.close_device(mesh_device)
diff --git a/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py b/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py
new file mode 100644
index 000000000..3a5259372
--- /dev/null
+++ b/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py
@@ -0,0 +1,248 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+#
+# Tutorial Step 7: Multi-Device, Shard K with All-Reduce
+# =======================================================
+# Replaces the host-side manual reduction from Step 6 with an on-device
+# all-reduce, keeping the final result on the mesh rather than pulling it
+# to the host for summation.
+#
+# New concepts introduced:
+#   - ttnn.all_reduce — sums partial_ys across all devices using the
+#     TT-Fabric interconnect; each device ends up with the fully reduced M×N
+#     result (the result is replicated across all devices)
+#   - Post-reduce activation: relu is applied on-device after all_reduce,
+#     replacing the host-side relu from Step 6
+#
+# The TT-Lang operation body and the K-sharding setup are identical to Step 6.
+# The only change is in the host code: ttnn.all_reduce + ttnn.relu replace the
+# manual Python loop that summed partial outputs.
+
+import ttnn
+import torch
+
+
+def from_torch(tensor: torch.Tensor, mesh_mapper):
+    return ttnn.from_torch(
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=mesh_device,
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+        mesh_mapper=mesh_mapper,
+    )
+
+
+import ttl
+
+TILE_SIZE = 32
+M_GRANULARITY = 4
+N_GRANULARITY = 4
+K_GRANULARITY = 4
+
+
+@ttl.operation(grid="auto")
+def tutorial_operation(
+    a: ttnn.Tensor,
+    b: ttnn.Tensor,
+    c: ttnn.Tensor,
+    y: ttnn.Tensor,
+) -> None:
+    m_tiles_per_block = M_GRANULARITY
+    n_tiles_per_block = N_GRANULARITY
+    k_tiles_per_block = K_GRANULARITY
+
+    m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block
+    n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block
+    k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block
+
+    grid_n, grid_m = ttl.grid_size(dims=2)
+
+    m_blocks_per_node = -(-m_blocks // grid_m)  # divceil
+    n_blocks_per_node = -(-n_blocks // grid_n)  # divceil
+
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    c_dfb = ttl.make_dataflow_buffer_like(
+        c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    acc_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+    y_dfb = ttl.make_dataflow_buffer_like(
+        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
+    )
+
+    @ttl.datamovement()
+    def read():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with c_dfb.reserve() as c_blk:
+                            tx_c = ttl.copy(
+                                c[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                                c_blk,
+                            )
+
+                            tx_c.wait()
+
+                        for k_block in range(k_blocks):
+                            start_k_tile = k_block * k_tiles_per_block
+                            end_k_tile = (k_block + 1) * k_tiles_per_block
+                            with (
+                                a_dfb.reserve() as a_blk,
+                                b_dfb.reserve() as b_blk,
+                            ):
+                                tx_a = ttl.copy(
+                                    a[
+                                        start_m_tile:end_m_tile,
+                                        start_k_tile:end_k_tile,
+                                    ],
+                                    a_blk,
+                                )
+                                tx_b = ttl.copy(
+                                    b[
+                                        start_k_tile:end_k_tile,
+                                        start_n_tile:end_n_tile,
+                                    ],
+                                    b_blk,
+                                )
+
+                                tx_a.wait()
+                                tx_b.wait()
+
+    @ttl.compute()
+    def compute():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        with acc_dfb.reserve() as acc_blk:
+                            acc_blk.store(ttl.math.fill(acc_blk, 0))
+
+                        for _ in range(k_blocks):
+                            with (
+                                a_dfb.wait() as a_blk,
+                                b_dfb.wait() as b_blk,
+                                acc_dfb.wait() as pre_acc_blk,
+                            ):
+                                with acc_dfb.reserve() as acc_blk:
+                                    acc_blk.store(pre_acc_blk + a_blk @ b_blk)
+
+                        with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
+                            with y_dfb.reserve() as y_blk:
+                                y_blk.store(c_blk + acc_blk)
+
+    @ttl.datamovement()
+    def write():
+        node_n, node_m = ttl.node(dims=2)
+
+        for local_m_block in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m_block
+            if m_block < m_blocks:
+                start_m_tile = m_block * m_tiles_per_block
+                end_m_tile = (m_block + 1) * m_tiles_per_block
+
+                for local_n_block in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n_block
+                    if n_block < n_blocks:
+                        start_n_tile = n_block * n_tiles_per_block
+                        end_n_tile = (n_block + 1) * n_tiles_per_block
+
+                        with y_dfb.wait() as y_blk:
+                            tx = ttl.copy(
+                                y_blk,
+                                y[
+                                    start_m_tile:end_m_tile,
+                                    start_n_tile:end_n_tile,
+                                ],
+                            )
+                            tx.wait()
+
+
+torch.manual_seed(42)
+
+n_devices = ttnn.GetNumAvailableDevices()
+assert n_devices > 1 and (
+    n_devices & (n_devices - 1) == 0
+), "Number of available devices must be >1 and be power of 2 "
+ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D)
+mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices))
+
+try:
+    M, K, N = 8192, 8192, 8192
+
+    a = torch.randn((M, K), dtype=torch.bfloat16)
+    b = torch.randn((K, N), dtype=torch.bfloat16)
+    c = torch.randn((M, N), dtype=torch.bfloat16)
+
+    expected_y = torch.relu(a @ b + c)
+
+    # K-sharding setup is identical to Step 6.
+
+    a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=1))
+    b = from_torch(b, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+
+    replicated_cs = torch.zeros((M * n_devices, N), dtype=torch.bfloat16)
+    replicated_cs[:M, :] = c
+    replicated_cs = from_torch(
+        replicated_cs, ttnn.ShardTensorToMesh(mesh_device, dim=0)
+    )
+
+    partial_ys = torch.zeros((M * n_devices, N), dtype=torch.bfloat16)
+    partial_ys = from_torch(partial_ys, ttnn.ShardTensorToMesh(mesh_device, dim=0))
+
+    tutorial_operation(a, b, replicated_cs, partial_ys)
+
+    # ttnn.all_reduce sums partial_ys across all devices using TT-Fabric,
+    # producing a fully reduced M×N result replicated on every device; it
+    # replaces the host-side summation loop from Step 6.  ttnn.relu is then
+    # applied on-device, replacing the host-side relu.
+
+    replicated_ys = ttnn.all_reduce(partial_ys)
+    replicated_ys = ttnn.relu(replicated_ys)
+
+    # Because the result is replicated, all devices hold the correct answer.
+    # Verify each device's copy against the expected output.
+
+    replicated_ys = ttnn.to_torch(
+        replicated_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0)
+    )
+
+    for i in range(n_devices):
+        y = replicated_ys[i * M : (i + 1) * M, :]
+
+        pcc = torch.corrcoef(
+            torch.stack([y.flatten().float(), expected_y.flatten().float()])
+        )[0, 1].item()
+
+        print(f"PCC {pcc:.6f}")
+
+        assert pcc > 0.99
+
+finally:
+    ttnn.close_device(mesh_device)

From 2544cb1dc169e603b54ab88d24141a38733032e2 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 9 Apr 2026 20:59:22 -0700
Subject: [PATCH 02/31] Allow subblocking for matmul accumulating computes

  SubblockComputeForDST previously skipped all accumulating computes.
  Matmul K accumulates in-place in DST without consuming DST slots, so
  subblocking the parallel (M, N) dims is safe. LowerMatmulBlock handles
  the subblocked compute before LowerToLoops sees it.
---
 .../TTL/Transforms/ConvertTTLComputeToSCF.cpp  | 11 ++++++++---
 .../Transforms/TTLSubblockComputeForDST.cpp    | 18 ++++++++++++------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp
index 97aa10d3e..1625fecaa 100644
--- a/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp
+++ b/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp
@@ -373,14 +373,19 @@ struct LowerComputeToLoops : OpRewritePattern<ComputeOp> {
                               })
                               .wasInterrupted();
 
-    assert(!(isSubblocked && isAccumulating) &&
-           "SubblockComputeForDST must skip accumulating computes");
-
     SmallVector<StringAttr> iterTypes;
     for (Attribute attr : op.getIteratorTypes()) {
       iterTypes.push_back(mlir::cast<StringAttr>(attr));
     }
 
+    // Subblocked accumulating computes (matmul K>1 with output > DST) are
+    // handled by LowerMatmulBlock, which generates the K reduction loop
+    // with per-K DstSections. By the time LowerToLoops runs, the ComputeOp
+    // has been replaced. This assert catches unexpected cases.
+    assert(!(isSubblocked && isAccumulating) &&
+           "subblocked accumulating computes should be handled by "
+           "LowerMatmulBlock before LowerToLoops");
+
     // Side-effect-only loops: no iter_args, no tensor.insert, no scf.yield
     // with tensor values. Stores are explicit side effects (tile_store).
     bool processingFailed = false;
diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
index c599257ec..493a607e5 100644
--- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
@@ -103,24 +103,30 @@ struct TTLSubblockComputeForDSTPass
     func::FuncOp funcOp = getOperation();
 
     // Collect compute ops to subblock (avoid modifying while walking).
-    // Skip accumulating computes -- subblocking would break reduction
-    // accumulation by splitting the reduction loop across subblocks.
+    // Skip non-matmul accumulating computes (e.g., reduce_tile) because
+    // subblocking would break their reduction accumulation semantics.
+    // Matmul accumulating computes are safe: K accumulates in-place in
+    // DST without consuming DST slots (effectiveTiles already excludes
+    // reduction dims for matmul -- see hasMatmulBlock logic below).
     SmallVector<ComputeOp> opsToSubblock;
     funcOp.walk([&](ComputeOp computeOp) {
       auto unrollAttr =
           computeOp->getAttrOfType<IntegerAttr>(kUnrollFactorAttrName);
       if (unrollAttr && unrollAttr.getInt() > 1) {
         bool hasAccumulating = false;
+        bool hasMatmulBlock = false;
         computeOp.getBody().walk([&](Operation *op) {
           if (op->hasTrait<TTLAccumulatingOpTrait>()) {
             hasAccumulating = true;
-            return WalkResult::interrupt();
           }
-          return WalkResult::advance();
+          if (isa<TileMatmulBlockOp>(op)) {
+            hasMatmulBlock = true;
+          }
         });
-        if (!hasAccumulating) {
-          opsToSubblock.push_back(computeOp);
+        if (hasAccumulating && !hasMatmulBlock) {
+          return;
         }
+        opsToSubblock.push_back(computeOp);
       }
     });
 

From 01cb9b6c5a53a541eb4b1176df9a041bce55b9dc Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 9 Apr 2026 23:07:39 -0700
Subject: [PATCH 03/31] initial pack_reconfig_l1_acc support

---
 include/ttlang/Dialect/TTL/Passes.td          | 18 +++++
 .../Transforms/TTKernelCombinePackTiles.cpp   | 14 ++++
 lib/Dialect/TTL/Pipelines/TTLPipelines.cpp    |  1 +
 lib/Dialect/TTL/Transforms/CMakeLists.txt     |  1 +
 .../Transforms/TTLAnnotateReductionLoops.cpp  | 71 +++++++++++++++++++
 .../Transforms/TTLSubblockComputeForDST.cpp   | 51 +++++++++++--
 python/ttl/ttl_api.py                         |  1 +
 test/python/test_matmul_k_accumulation.py     | 23 +++++-
 .../TTL/Transforms/subblock_matmul.mlir       | 47 ++++++------
 9 files changed, 196 insertions(+), 31 deletions(-)
 create mode 100644 lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp

diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 71035366e..8e8765181 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -46,6 +46,24 @@ def TTKernelInsertL1Accumulation
   ];
 }
 
+def TTLAnnotateReductionLoops
+    : Pass<"ttl-annotate-reduction-loops", "::mlir::func::FuncOp"> {
+  let summary = "Annotate user-written scf.for loops as reduction loops";
+  let description = [{
+    Detects user-written `scf.for` loops where all iterations store to the
+    same CB slot (reserved before the loop, pushed after) and annotates them
+    with `ttl.reduction_loop`. This enables `TTKernelInsertL1Accumulation`
+    to insert `pack_reconfig_l1_acc` guards so that packs accumulate across
+    iterations instead of overwriting.
+
+    Targets the pattern:
+      cb_reserve -> scf.for { compute(store to reserved CB) } -> cb_push
+  }];
+  let dependentDialects = [
+    "::mlir::scf::SCFDialect"
+  ];
+}
+
 def TTLConvertTTLToCompute
     : Pass<"convert-ttl-to-compute", "::mlir::func::FuncOp"> {
   let summary = "Lower TTL elementwise tensor ops to ttl.compute with tile ops";
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
index 1fa6a3503..5d0c5ad5f 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
@@ -11,12 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ttlang/Dialect/TTL/IR/TTL.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 #include "ttmlir/Dialect/TTKernel/IR/TTKernel.h"
 #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 
@@ -63,6 +65,18 @@ struct TTKernelCombinePackTilesPass
 
   void runOnOperation() override {
     getOperation().walk([](Block *block) {
+      // Skip blocks inside reduction loops: pack_tile_block is
+      // incompatible with L1 accumulation (pack_reconfig_l1_acc).
+      // L1 acc requires individual pack_tile calls so each K iteration
+      // can independently add to the existing L1 value.
+      for (Operation *parent = block->getParentOp(); parent;
+           parent = parent->getParentOp()) {
+        if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
+          if (forOp->hasAttr(kReductionLoopAttrName)) {
+            return;
+          }
+        }
+      }
       // Collect all combinable runs first, then replace them. Replacing
       // during iteration would invalidate the block's operation list.
       SmallVector<SmallVector<ttk::PackTileOp>> runs;
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index 81be3dad8..d059fc090 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -18,6 +18,7 @@ namespace mlir::tt::ttl {
 
 void createTTLToTTKernelPipeline(OpPassManager &pm,
                                  const TTLToTTKernelPipelineOptions &options) {
+  pm.addPass(createTTLAnnotateReductionLoops());
   pm.addPass(createTTLConvertTTLToCompute());
   {
     TTLSetComputeKernelConfigOptions configOpts;
diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt
index a5e2b3fb9..40d01413d 100644
--- a/lib/Dialect/TTL/Transforms/CMakeLists.txt
+++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_dialect_library(TTLangTTLTransforms
   LowerDPrintToEmitC.cpp
   LowerSignpostToEmitC.cpp
   TTLAnnotateCBAssociations.cpp
+  TTLAnnotateReductionLoops.cpp
   TTLDumpCBFlowGraph.cpp
   TTLLowerMatmulBlock.cpp
   TTLAssignDST.cpp
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
new file mode 100644
index 000000000..b7d090781
--- /dev/null
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
@@ -0,0 +1,71 @@
+// SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//===----------------------------------------------------------------------===//
+// TTL Annotate Reduction Loops
+//===----------------------------------------------------------------------===//
+//
+// Detects user-written scf.for loops that accumulate into the same CB slot
+// (reserve before loop, store inside, push after) and annotates them with
+// kReductionLoopAttrName for L1 accumulation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOps.h"
+#include "ttlang/Dialect/TTL/Passes.h"
+
+#include "mlir/Dialect/SCF/IR/SCF.h"
+
+#define DEBUG_TYPE "ttl-annotate-reduction-loops"
+
+namespace mlir::tt::ttl {
+
+#define GEN_PASS_DEF_TTLANNOTATEREDUCTIONLOOPS
+#include "ttlang/Dialect/TTL/Passes.h.inc"
+
+namespace {
+
+struct TTLAnnotateReductionLoopsPass
+    : public impl::TTLAnnotateReductionLoopsBase<
+          TTLAnnotateReductionLoopsPass> {
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+
+    func.walk([&](scf::ForOp forOp) {
+      // Skip loops already annotated (from compiler-generated tile loops).
+      if (forOp->hasAttr(kReductionLoopAttrName) ||
+          forOp->hasAttr(kTileLoopStrideAttrName) ||
+          forOp->hasAttr(kSubblockLoopStrideAttrName)) {
+        return;
+      }
+
+      // Check if the loop body contains a store (ttl.store) targeting a
+      // CB that was reserved (ttl.cb_reserve) before the loop.
+      bool hasReductionStore = false;
+      forOp.getBody()->walk([&](StoreOp store) {
+        Value view = store.getView();
+        // Trace through attach_cb to find the cb_reserve.
+        if (auto attachCB = view.getDefiningOp<AttachCBOp>()) {
+          view = attachCB.getTensor();
+        }
+        if (auto reserve = view.getDefiningOp<CBReserveOp>()) {
+          // The cb_reserve must be OUTSIDE the for loop (before it).
+          if (!forOp->isAncestor(reserve)) {
+            hasReductionStore = true;
+          }
+        }
+      });
+
+      if (hasReductionStore) {
+        forOp->setAttr(kReductionLoopAttrName,
+                       OpBuilder(forOp).getUnitAttr());
+      }
+    });
+  }
+};
+
+} // namespace
+
+} // namespace mlir::tt::ttl
diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
index 493a607e5..5903dbbdc 100644
--- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
@@ -216,13 +216,47 @@ struct TTLSubblockComputeForDSTPass
     SmallVector<int64_t> parallelSubblockSizes =
         computeMultiDimSubblockSizes(parallelDimSizes, parallelBudget);
 
-    // Expand back to full-rank subblock sizes: reduction dims get their full
-    // size, parallel dims get the computed subblock size.
+    // Expand back to full-rank subblock sizes. Parallel dims get the
+    // computed subblock size. For matmul when the parallel output exceeds
+    // DST capacity, reduction (K) dims are tiled to 1 for L1 accumulation:
+    // each K step packs to L1 independently, and TTKernelInsertL1Accumulation
+    // inserts pack_reconfig_l1_acc guards. When the output fits in DST, K
+    // stays at full size for DST accumulation (higher precision, fewer packs).
+    bool tileKToOne = false;
+    if (hasMatmulBlock) {
+      // Only tile K for standalone matmul (no accumulator). The fused
+      // prev + a @ b pattern has an accumulator operand that requires
+      // DST accumulation semantics (copy_tile + matmul_block). Tiling
+      // K would break the accumulator reload logic.
+      bool hasAccumulator = false;
+      computeOp.getBody().walk([&](TileMatmulBlockOp mmOp) {
+        if (mmOp.getAccumulator()) {
+          hasAccumulator = true;
+        }
+      });
+      if (!hasAccumulator) {
+        int64_t parallelProduct = 1;
+        for (auto sz : parallelSubblockSizes) {
+          parallelProduct *= sz;
+        }
+        // Tile K to 1 when: (1) subblocking IS needed (parallel output
+        // exceeds DST), (2) the subblock is strictly smaller than the
+        // full output, and (3) the subblock is non-trivial (> 1 tile).
+        // When the subblock degenerates to 1x1 (e.g., prime dimensions),
+        // K tiling provides no benefit -- the per-tile DST accumulation
+        // path handles it via generateAccumulatingLoops.
+        tileKToOne = parallelProduct > 1 &&
+                     parallelProduct < effectiveTiles &&
+                     effectiveTiles > unrollFactor;
+      }
+    }
     SmallVector<int64_t> subblockSizes(rank);
     int64_t parallelIdx = 0;
     for (int64_t d = 0; d < rank; ++d) {
       if (iterTypes[d] == utils::IteratorType::parallel) {
         subblockSizes[d] = parallelSubblockSizes[parallelIdx++];
+      } else if (tileKToOne) {
+        subblockSizes[d] = 1;
       } else {
         subblockSizes[d] = dimSizes[d];
       }
@@ -303,11 +337,14 @@ struct TTLSubblockComputeForDSTPass
     // can distinguish subblock loops from tile loops and compute correct
     // CB offsets (both linearized and per-dimension).
     for (size_t i = 0; i < subblockedDims.size(); ++i) {
-      loopNest.loops[i]->setAttr(
-          kSubblockLoopStrideAttrName,
-          b.getIndexAttr(blockStrides[subblockedDims[i]]));
-      loopNest.loops[i]->setAttr(kSubblockDimAttrName,
-                                 b.getIndexAttr(subblockedDims[i]));
+      int64_t dim = subblockedDims[i];
+      loopNest.loops[i]->setAttr(kSubblockLoopStrideAttrName,
+                                 b.getIndexAttr(blockStrides[dim]));
+      loopNest.loops[i]->setAttr(kSubblockDimAttrName, b.getIndexAttr(dim));
+      // Mark reduction dimension loops for L1 accumulation insertion.
+      if (iterTypes[dim] == utils::IteratorType::reduction) {
+        loopNest.loops[i]->setAttr(kReductionLoopAttrName, b.getUnitAttr());
+      }
     }
 
     // Precompute per-output subblock info: shape, tile count, and whether
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index 9cf005fa0..2d6603df1 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -1203,6 +1203,7 @@ def _compile_kernel(
         assign_dst_pass = f"ttl-assign-dst{{enable-fpu-binary-ops={fpu_flag}}}"
 
         pipeline_passes = [
+            "func.func(ttl-annotate-reduction-loops)",
             "func.func(convert-ttl-to-compute)",
             set_compute_config_pass,
             f"func.func({assign_dst_pass})",
diff --git a/test/python/test_matmul_k_accumulation.py b/test/python/test_matmul_k_accumulation.py
index a5d79d9d3..4ed632865 100644
--- a/test/python/test_matmul_k_accumulation.py
+++ b/test/python/test_matmul_k_accumulation.py
@@ -175,13 +175,30 @@ def test_matmul_k_accumulation_streaming(k_tiles, block_n, device):
 @pytest.mark.parametrize("k_tiles", K_TILES, ids=[f"K{k}" for k in K_TILES])
 @pytest.mark.requires_device
 def test_matmul_k_accumulation_single_fill(k_tiles, block_n, device):
-    """Kt>1 single-fill accumulation: tighter bounds (f32 DST)."""
+    """Kt>1 single-fill accumulation.
+
+    When the output block fits in DST (block_n <= 4 for f32), matmul_block
+    accumulates all K tiles in f32 DST with one bf16 truncation at the end
+    (tighter bounds). When the output exceeds DST capacity (block_n > 4
+    for f32), the compiler tiles K to 1 for L1 accumulation, producing one
+    bf16 truncation per K step (same bounds as the streaming test).
+    """
     scale = math.sqrt(k_tiles)
+    # DST capacity with fp32_dest_acc_en=true is 4. Output block is
+    # 1 x block_n. When block_n > 4, L1 acc activates with per-K-step
+    # bf16 truncation, requiring relaxed error bounds.
+    uses_l1_acc = block_n > 4
+    if uses_l1_acc:
+        max_err = 0.5 * scale
+        mean_err = 0.05 * scale
+    else:
+        max_err = 0.1 * scale
+        mean_err = 0.01 * scale
     _run(
         _make_matmul_kn,
         k_tiles,
         block_n,
         device,
-        max_err_limit=0.1 * scale,
-        mean_err_limit=0.01 * scale,
+        max_err_limit=max_err,
+        mean_err_limit=mean_err,
     )
diff --git a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir
index 9fb0212c9..aa4a891f8 100644
--- a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir
@@ -1,35 +1,40 @@
 // Tests for ttl-subblock-compute-for-dst with matmul computes.
 // Matmul K (reduction) accumulates in-place in DST, so only M*N parallel
-// tiles count toward the DST budget. Subblocking partitions the M*N output
-// space while keeping K whole in each subblock.
+// tiles count toward the DST budget. When the parallel output exceeds DST,
+// subblocking partitions M*N AND tiles K to 1 for L1 accumulation.
 
 // RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(convert-ttl-to-compute, ttl-set-compute-kernel-config, ttl-assign-dst{enable-fpu-binary-ops=0}, ttl-subblock-compute-for-dst))' --split-input-file | FileCheck %s
 
 // -----
 
-// Purpose: M*N=16 exceeds f32 DST capacity (4). K=3 is excluded from the
-// budget, so subblocking partitions the 4x4 output into 1x4 strips.
-// Loop on M (dim 0): 0 to 4 step 1. K (dim 2) stays at 3 in each subblock.
+// Purpose: M*N=16 exceeds f32 DST capacity (4). Subblocking partitions the
+// 4x4 output into 1x4 strips AND tiles K from 3 to 1. The K loop is
+// annotated with ttl.reduction_loop for L1 accumulation.
+// Loops: M (dim 0) 0..4 step 1, K (dim 2) 0..3 step 1.
 
-// CHECK-LABEL: func.func @matmul_subblock_k_excluded
+// CHECK-LABEL: func.func @matmul_subblock_k_tiled
 // CHECK-SAME:  fp32_dest_acc_en = true
 // Outer subblock loop over M dimension.
-// CHECK:       scf.for %[[IV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
-// A sliced on M, K kept whole: [iv, 0] [1, 3].
-// CHECK:         tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 3] [1, 1]
-// B not sliced (full [3, 4]).
-// CHECK:         tensor.extract_slice {{.*}}[0, 0] [3, 4] [1, 1]
-// Output sliced on M: [iv, 0] [1, 4].
-// CHECK:         tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 4] [1, 1]
-// Inner compute on subblock [1, 4, 3] (M=1, N=4, K=3).
-// CHECK:         ttl.compute
-// CHECK-SAME:      tensor<1x3x!ttcore.tile<32x32, bf16>>
-// CHECK-SAME:      tensor<3x4x!ttcore.tile<32x32, bf16>>
-// CHECK-SAME:      tensor<1x4x!ttcore.tile<32x32, bf16>>
-// CHECK-SAME:      iterator_types = ["parallel", "parallel", "reduction"]
-// CHECK:           ttl.tile_matmul_block
+// CHECK:       scf.for %[[MIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// Inner K reduction loop.
+// CHECK:         scf.for %[[KIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// A sliced on M and K: [miv, kiv] [1, 1].
+// CHECK:           tensor.extract_slice {{.*}}[%[[MIV]], %[[KIV]]] [1, 1] [1, 1]
+// B sliced on K: [kiv, 0] [1, 4].
+// CHECK:           tensor.extract_slice {{.*}}[%[[KIV]], 0] [1, 4] [1, 1]
+// Output sliced on M: [miv, 0] [1, 4].
+// CHECK:           tensor.extract_slice {{.*}}[%[[MIV]], 0] [1, 4] [1, 1]
+// Inner compute on subblock [1, 4, 1] (M=1, N=4, K=1).
+// CHECK:           ttl.compute
+// CHECK-SAME:        tensor<1x1x!ttcore.tile<32x32, bf16>>
+// CHECK-SAME:        tensor<1x4x!ttcore.tile<32x32, bf16>>
+// CHECK-SAME:        tensor<1x4x!ttcore.tile<32x32, bf16>>
+// CHECK-SAME:        iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK:             ttl.tile_matmul_block
+// K loop annotated for L1 accumulation.
+// CHECK:         } {{{.*}}ttl.reduction_loop{{.*}}}
 // CHECK:       }
-func.func @matmul_subblock_k_excluded(
+func.func @matmul_subblock_k_tiled(
     %arg0: tensor<4x3x!ttcore.tile<32x32, bf16>>,
     %arg1: tensor<3x4x!ttcore.tile<32x32, bf16>>) -> tensor<4x4x!ttcore.tile<32x32, bf16>> {
   %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[4, 3], !ttcore.tile<32x32, bf16>, 2>

From 513cc4d7dfc6d063db5f93d3d3446edef13ccc61 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 9 Apr 2026 23:07:50 -0700
Subject: [PATCH 04/31] preformat

---
 lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp | 3 +--
 lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp  | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
index b7d090781..181318da1 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
@@ -59,8 +59,7 @@ struct TTLAnnotateReductionLoopsPass
       });
 
       if (hasReductionStore) {
-        forOp->setAttr(kReductionLoopAttrName,
-                       OpBuilder(forOp).getUnitAttr());
+        forOp->setAttr(kReductionLoopAttrName, OpBuilder(forOp).getUnitAttr());
       }
     });
   }
diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
index 5903dbbdc..cb32c9103 100644
--- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
@@ -245,8 +245,7 @@ struct TTLSubblockComputeForDSTPass
         // When the subblock degenerates to 1x1 (e.g., prime dimensions),
         // K tiling provides no benefit -- the per-tile DST accumulation
         // path handles it via generateAccumulatingLoops.
-        tileKToOne = parallelProduct > 1 &&
-                     parallelProduct < effectiveTiles &&
+        tileKToOne = parallelProduct > 1 && parallelProduct < effectiveTiles &&
                      effectiveTiles > unrollFactor;
       }
     }

From 64764602bdf6deb23a89af436db5c85c3ec41a3d Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 9 Apr 2026 23:13:02 -0700
Subject: [PATCH 05/31] add pytest for l1 accumulation

---
 test/python/test_matmul_l1_acc.py | 191 ++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 test/python/test_matmul_l1_acc.py

diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py
new file mode 100644
index 000000000..599c33ea7
--- /dev/null
+++ b/test/python/test_matmul_l1_acc.py
@@ -0,0 +1,191 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Matmul L1 accumulation: reserve once, store K times, push once.
+
+The compiler detects the scf.for loop storing to the same reserved CB
+and annotates it for L1 accumulation. TTKernelInsertL1Accumulation inserts
+pack_reconfig_l1_acc guards so later K iterations pack additively to L1.
+
+Tests single-core and multicore configurations with various block sizes.
+"""
+
+# REQUIRES: ttnn
+# UNSUPPORTED: system-darwin
+# RUN: %python -m pytest %s -v --tb=short
+
+import pytest
+import torch
+import ttl
+
+ttnn = pytest.importorskip("ttnn", exc_type=ImportError)
+
+from ttlang_test_utils import to_dram
+from utils.correctness import assert_pcc
+
+TILE = 32
+
+
+def _make_l1_acc_kernel(block_m, block_n, grid="auto"):
+    """Matmul with L1 accumulation: reserve once, store K times, push once."""
+
+    @ttl.operation(grid=grid)
+    def kernel(a, b, out):
+        Mt = a.shape[0] // TILE
+        Kt = a.shape[1] // TILE
+        Nt = b.shape[1] // TILE
+
+        M_num = Mt // block_m
+        N_num = Nt // block_n
+
+        grid_n, grid_m = ttl.grid_size(dims=2)
+        m_per = -(-M_num // grid_m)
+        n_per = -(-N_num // grid_n)
+
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(block_m, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, block_n), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(
+            out, shape=(block_m, block_n), block_count=2
+        )
+
+        @ttl.compute()
+        def compute():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            out_blk = out_dfb.reserve()
+                            for _ in range(Kt):
+                                a_blk = a_dfb.wait()
+                                b_blk = b_dfb.wait()
+                                out_blk.store(a_blk @ b_blk)
+                                a_blk.pop()
+                                b_blk.pop()
+                            out_blk.push()
+
+        @ttl.datamovement()
+        def reader():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            for kt in range(Kt):
+                                with a_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        a[
+                                            m_off : m_off + block_m,
+                                            kt : kt + 1,
+                                        ],
+                                        blk,
+                                    ).wait()
+
+        @ttl.datamovement()
+        def writer():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            n_off = nb * block_n
+                            for kt in range(Kt):
+                                with b_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        b[
+                                            kt : kt + 1,
+                                            n_off : n_off + block_n,
+                                        ],
+                                        blk,
+                                    ).wait()
+                            with out_dfb.wait() as blk:
+                                ttl.copy(
+                                    blk,
+                                    out[
+                                        m_off : m_off + block_m,
+                                        n_off : n_off + block_n,
+                                    ],
+                                ).wait()
+
+    return kernel
+
+
+# Single-core tests (grid=(1,1))
+SINGLE_CORE_PARAMS = [
+    # (block_m, block_n, Kt)
+    (2, 2, 2),  # Output 2x2=4 fits in f32 DST
+    (2, 2, 4),  # K=4
+    (3, 3, 2),  # Output 3x3=9 > f32 DST(4)
+    (4, 4, 4),  # Output 4x4=16 > f32 DST(4)
+    (8, 8, 2),  # Large output, small K
+    (8, 8, 8),  # Large output, large K
+]
+
+
+@pytest.mark.parametrize(
+    "block_m,block_n,Kt",
+    SINGLE_CORE_PARAMS,
+    ids=[f"blk{m}x{n}_K{k}" for m, n, k in SINGLE_CORE_PARAMS],
+)
+@pytest.mark.requires_device
+def test_l1_acc_single_core(block_m, block_n, Kt, device):
+    """L1 accumulation on single core with various block sizes."""
+    M, K, N = block_m * TILE, Kt * TILE, block_n * TILE
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()
+
+    a = to_dram(a_torch, device)
+    b = to_dram(b_torch, device)
+    out = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_kernel(block_m, block_n, grid=(1, 1))
+    kernel(a, b, out)
+
+    result = ttnn.to_torch(out).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# Multicore tests (grid="auto") with multiple output blocks
+MULTI_CORE_PARAMS = [
+    # (Mt, Kt, Nt, block_m, block_n)
+    (16, 4, 16, 8, 8),  # 2x2 output blocks, K=4
+    (32, 8, 32, 8, 8),  # 4x4 output blocks, K=8
+    (128, 128, 128, 8, 8),  # 16x16 output blocks, K=128 (4096^3 shape)
+]
+
+
+@pytest.mark.parametrize(
+    "Mt,Kt,Nt,block_m,block_n",
+    MULTI_CORE_PARAMS,
+    ids=[
+        f"tiles{mt}x{kt}x{nt}_blk{bm}x{bn}" for mt, kt, nt, bm, bn in MULTI_CORE_PARAMS
+    ],
+)
+@pytest.mark.requires_device
+def test_l1_acc_multicore(Mt, Kt, Nt, block_m, block_n, device):
+    """L1 accumulation with multicore and multiple output blocks."""
+    M, K, N = Mt * TILE, Kt * TILE, Nt * TILE
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()
+
+    a = to_dram(a_torch, device)
+    b = to_dram(b_torch, device)
+    out = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_kernel(block_m, block_n)
+    kernel(a, b, out)
+
+    result = ttnn.to_torch(out).float()
+    assert_pcc(golden, result, threshold=0.999)

From 455ad86a56ab5edfef88eaff7b3647eecfe4f0b5 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 9 Apr 2026 23:24:56 -0700
Subject: [PATCH 06/31] correct placement of pack_reconfig_l1_acc

---
 include/ttlang/Dialect/TTL/IR/TTL.h           |  6 ++++
 .../TTKernelInsertL1Accumulation.cpp          | 30 ++++++++++++-------
 .../Transforms/TTLAnnotateReductionLoops.cpp  |  2 +-
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/include/ttlang/Dialect/TTL/IR/TTL.h b/include/ttlang/Dialect/TTL/IR/TTL.h
index 72ef23f5e..f96a4f1eb 100644
--- a/include/ttlang/Dialect/TTL/IR/TTL.h
+++ b/include/ttlang/Dialect/TTL/IR/TTL.h
@@ -64,6 +64,12 @@ constexpr llvm::StringLiteral kTileLoopStrideAttrName("ttl.tile_loop_stride");
 /// ComputeOp is lowered to loops.
 constexpr llvm::StringLiteral kReductionLoopAttrName("ttl.reduction_loop");
 
+/// Marks a user-written scf.for as an L1 accumulation loop. Each iteration
+/// packs to the same CB slot; pack_reconfig_l1_acc makes subsequent
+/// iterations additive. Distinct from kReductionLoopAttrName which marks
+/// compiler-generated DST accumulation loops.
+constexpr llvm::StringLiteral kL1AccLoopAttrName("ttl.l1_acc_loop");
+
 /// Output CB index on tile ops that need it for init insertion.
 constexpr llvm::StringLiteral
     kBcastOutputCBIndexAttrName("ttl.bcast_output_cb_index");
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 3c40a1872..713854283 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -35,26 +35,36 @@ namespace ttk = mlir::tt::ttkernel;
 
 namespace {
 
-/// Find the innermost enclosing reduction loop for an operation.
-static scf::ForOp findInnermostReductionLoop(Operation *op) {
+/// Find the enclosing loop that should control L1 accumulation.
+/// Prefers kL1AccLoopAttrName (user-annotated) over kReductionLoopAttrName
+/// (compiler-generated). Returns the innermost matching loop, but if a
+/// kL1AccLoopAttrName loop exists, kReductionLoopAttrName loops inside
+/// it are skipped (they use DST accumulation, not L1).
+static scf::ForOp findL1AccLoop(Operation *op) {
+  scf::ForOp l1AccLoop;
+  scf::ForOp reductionLoop;
   for (Operation *parent = op->getParentOp(); parent;
        parent = parent->getParentOp()) {
     if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
-      if (forOp->hasAttr(kReductionLoopAttrName)) {
-        return forOp;
+      if (forOp->hasAttr(kL1AccLoopAttrName)) {
+        l1AccLoop = forOp;
+      } else if (forOp->hasAttr(kReductionLoopAttrName) && !reductionLoop) {
+        reductionLoop = forOp;
       }
     }
   }
-  return nullptr;
+  // User-annotated L1 acc loop takes priority.
+  return l1AccLoop ? l1AccLoop : reductionLoop;
 }
 
-/// Find the outermost enclosing reduction loop for an operation.
-static scf::ForOp findOutermostReductionLoop(Operation *op) {
+/// Find the outermost enclosing L1 acc or reduction loop for the disable guard.
+static scf::ForOp findOutermostL1AccLoop(Operation *op) {
   scf::ForOp outermost;
   for (Operation *parent = op->getParentOp(); parent;
        parent = parent->getParentOp()) {
     if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
-      if (forOp->hasAttr(kReductionLoopAttrName)) {
+      if (forOp->hasAttr(kL1AccLoopAttrName) ||
+          forOp->hasAttr(kReductionLoopAttrName)) {
         outermost = forOp;
       }
     }
@@ -72,7 +82,7 @@ struct TTKernelInsertL1AccumulationPass
     // avoids invalidation issues from modifying IR during iteration.
     SmallVector<std::pair<ttk::TileRegsAcquireOp, scf::ForOp>> targets;
     moduleOp->walk([&](ttk::TileRegsAcquireOp acquireOp) {
-      auto reductionLoop = findInnermostReductionLoop(acquireOp);
+      auto reductionLoop = findL1AccLoop(acquireOp);
       if (!reductionLoop) {
         return;
       }
@@ -108,7 +118,7 @@ struct TTKernelInsertL1AccumulationPass
       ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
 
       // Disable L1 accumulation after the outermost reduction loop.
-      auto outermostLoop = findOutermostReductionLoop(acquireOp);
+      auto outermostLoop = findOutermostL1AccLoop(acquireOp);
       if (disabledLoops.insert(outermostLoop).second) {
         builder.setInsertionPointAfter(outermostLoop);
         Value disableFlag = arith::ConstantOp::create(
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
index 181318da1..7545aba6d 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
@@ -59,7 +59,7 @@ struct TTLAnnotateReductionLoopsPass
       });
 
       if (hasReductionStore) {
-        forOp->setAttr(kReductionLoopAttrName, OpBuilder(forOp).getUnitAttr());
+        forOp->setAttr(kL1AccLoopAttrName, OpBuilder(forOp).getUnitAttr());
       }
     });
   }

From b51e5544d25e80c07559a74dced7e1be4fbd348a Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 10 Apr 2026 08:10:18 -0700
Subject: [PATCH 07/31] more packer accumulation fixes

---
 .../Transforms/TTKernelCombinePackTiles.cpp   |  3 +-
 .../Transforms/TTKernelInsertInits.cpp        |  3 +-
 .../TTKernelInsertL1Accumulation.cpp          | 99 ++++++++++++-------
 .../Transforms/TTLSubblockComputeForDST.cpp   | 52 ++--------
 4 files changed, 75 insertions(+), 82 deletions(-)

diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
index 5d0c5ad5f..22eb9f2cc 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
@@ -72,7 +72,8 @@ struct TTKernelCombinePackTilesPass
       for (Operation *parent = block->getParentOp(); parent;
            parent = parent->getParentOp()) {
         if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
-          if (forOp->hasAttr(kReductionLoopAttrName)) {
+          if (forOp->hasAttr(kReductionLoopAttrName) ||
+              forOp->hasAttr(kL1AccLoopAttrName)) {
             return;
           }
         }
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
index fb63e3e4f..b28ccded4 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
@@ -359,7 +359,8 @@ static Operation *hoistAboveCompilerLoops(Operation *op) {
   while (auto *parentOp = insertBefore->getParentOp()) {
     if (isa<scf::ForOp>(parentOp) &&
         (parentOp->hasAttr(kTileLoopStrideAttrName) ||
-         parentOp->hasAttr(kSubblockLoopStrideAttrName))) {
+         parentOp->hasAttr(kSubblockLoopStrideAttrName) ||
+         parentOp->hasAttr(kL1AccLoopAttrName))) {
       insertBefore = parentOp;
     } else {
       break;
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 713854283..646d1be64 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -35,26 +35,23 @@ namespace ttk = mlir::tt::ttkernel;
 
 namespace {
 
-/// Find the enclosing loop that should control L1 accumulation.
-/// Prefers kL1AccLoopAttrName (user-annotated) over kReductionLoopAttrName
-/// (compiler-generated). Returns the innermost matching loop, but if a
-/// kL1AccLoopAttrName loop exists, kReductionLoopAttrName loops inside
-/// it are skipped (they use DST accumulation, not L1).
+/// Find the enclosing loop that controls L1 accumulation.
+/// Prefers kL1AccLoopAttrName (user-annotated). Falls back to innermost
+/// kReductionLoopAttrName (compiler-generated, for reduce ops).
 static scf::ForOp findL1AccLoop(Operation *op) {
-  scf::ForOp l1AccLoop;
-  scf::ForOp reductionLoop;
+  scf::ForOp reductionFallback;
   for (Operation *parent = op->getParentOp(); parent;
        parent = parent->getParentOp()) {
     if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
       if (forOp->hasAttr(kL1AccLoopAttrName)) {
-        l1AccLoop = forOp;
-      } else if (forOp->hasAttr(kReductionLoopAttrName) && !reductionLoop) {
-        reductionLoop = forOp;
+        return forOp;
+      }
+      if (forOp->hasAttr(kReductionLoopAttrName) && !reductionFallback) {
+        reductionFallback = forOp;
       }
     }
   }
-  // User-annotated L1 acc loop takes priority.
-  return l1AccLoop ? l1AccLoop : reductionLoop;
+  return reductionFallback;
 }
 
 /// Find the outermost enclosing L1 acc or reduction loop for the disable guard.
@@ -78,49 +75,77 @@ struct TTKernelInsertL1AccumulationPass
   void runOnOperation() override {
     auto moduleOp = getOperation();
 
-    // Collect all acquire ops inside reduction loops. Collecting first
-    // avoids invalidation issues from modifying IR during iteration.
-    SmallVector<std::pair<ttk::TileRegsAcquireOp, scf::ForOp>> targets;
+    // Collect L1 acc loops (kL1AccLoopAttrName or kReductionLoopAttrName)
+    // that enclose a TileRegsAcquireOp; loops with a Max reduce are skipped.
+    SmallVector<scf::ForOp> l1AccLoops;
+    llvm::SmallDenseSet<Operation *> seenLoops;
     moduleOp->walk([&](ttk::TileRegsAcquireOp acquireOp) {
-      auto reductionLoop = findL1AccLoop(acquireOp);
-      if (!reductionLoop) {
+      auto loop = findL1AccLoop(acquireOp);
+      if (!loop || !seenLoops.insert(loop).second) {
         return;
       }
-      // L1 accumulation uses additive packing -- only valid for sum
-      // reductions. Max reductions require DST accumulation (Phase 2)
-      // where the hardware max operation accumulates across iterations.
       bool hasMaxReduce = false;
-      reductionLoop->walk([&](ttk::ReduceTileOp reduceOp) {
+      loop->walk([&](ttk::ReduceTileOp reduceOp) {
         if (reduceOp.getReduceType() == ttk::ReduceType::Max) {
           hasMaxReduce = true;
         }
       });
       if (!hasMaxReduce) {
-        targets.emplace_back(acquireOp, reductionLoop);
+        l1AccLoops.push_back(loop);
       }
     });
 
+    // Insert pack_reconfig_l1_acc matching the tt-metal minimal_matmul
+    // pattern: enable at the END of the first K iteration (after all
+    // DstSections complete), disable after the loop. The enable guard
+    // uses `if (k == lb)` so it fires once when the first iteration
+    // finishes, and L1 acc stays enabled for all subsequent iterations.
     llvm::SmallDenseSet<Operation *> disabledLoops;
-    for (auto [acquireOp, reductionLoop] : targets) {
-      OpBuilder builder(acquireOp->getContext());
-      builder.setInsertionPointAfter(acquireOp);
-      Location loc = acquireOp.getLoc();
-
-      // Guard: if (loop_iv != lower_bound) pack_reconfig_l1_acc(1)
-      Value loopIV = reductionLoop.getInductionVar();
-      Value loopLB = reductionLoop.getLowerBound();
-      Value notFirstIter = arith::CmpIOp::create(
-          builder, loc, arith::CmpIPredicate::ne, loopIV, loopLB);
-      auto ifOp = scf::IfOp::create(builder, loc, notFirstIter);
+    for (scf::ForOp loop : l1AccLoops) {
+      OpBuilder builder(loop->getContext());
+      Location loc = loop.getLoc();
+
+      // Disable L1 acc before the loop to ensure clean state.
+      builder.setInsertionPoint(loop);
+      Value disablePre = arith::ConstantOp::create(
+          builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
+      ttk::PackReconfigL1AccOp::create(builder, loc, disablePre);
+
+      // Enable at end of first iteration, matching tt-metal:
+      //   if (k_block == 0) { PACK((llk_pack_reconfig_l1_acc(1))); }
+      Operation *yield = loop.getBody()->getTerminator();
+      builder.setInsertionPoint(yield);
+      Value loopIV = loop.getInductionVar();
+      Value loopLB = loop.getLowerBound();
+      Value isFirstIter = arith::CmpIOp::create(
+          builder, loc, arith::CmpIPredicate::eq, loopIV, loopLB);
+      auto ifOp = scf::IfOp::create(builder, loc, isFirstIter);
       builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
       Value enableFlag = arith::ConstantOp::create(
           builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
       ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
 
-      // Disable L1 accumulation after the outermost reduction loop.
-      auto outermostLoop = findOutermostL1AccLoop(acquireOp);
-      if (disabledLoops.insert(outermostLoop).second) {
-        builder.setInsertionPointAfter(outermostLoop);
+      // Disable after each L1 acc loop to prevent L1 acc state from
+      // leaking into outer loops or subsequent code.
+      if (disabledLoops.insert(loop.getOperation()).second) {
+        // For the outermost loop, place disable after the next cb_push_back
+        // (or right after the loop if none follows). Inner loops: after loop.
+        auto outermostLoop = findOutermostL1AccLoop(loop);
+        bool isOutermost = !outermostLoop || outermostLoop == loop;
+        if (isOutermost) {
+          // Scan forward for cb_push_back.
+          Operation *insertPoint = loop->getNextNode();
+          while (insertPoint && !isa<ttk::CBPushBackOp>(insertPoint)) {
+            insertPoint = insertPoint->getNextNode();
+          }
+          if (insertPoint) {
+            builder.setInsertionPointAfter(insertPoint);
+          } else {
+            builder.setInsertionPointAfter(loop);
+          }
+        } else {
+          builder.setInsertionPointAfter(loop);
+        }
         Value disableFlag = arith::ConstantOp::create(
             builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
         ttk::PackReconfigL1AccOp::create(builder, loc, disableFlag);
diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
index cb32c9103..f6c082780 100644
--- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
@@ -216,46 +216,15 @@ struct TTLSubblockComputeForDSTPass
     SmallVector<int64_t> parallelSubblockSizes =
         computeMultiDimSubblockSizes(parallelDimSizes, parallelBudget);
 
-    // Expand back to full-rank subblock sizes. Parallel dims get the
-    // computed subblock size. For matmul when the parallel output exceeds
-    // DST capacity, reduction (K) dims are tiled to 1 for L1 accumulation:
-    // each K step packs to L1 independently, and TTKernelInsertL1Accumulation
-    // inserts pack_reconfig_l1_acc guards. When the output fits in DST, K
-    // stays at full size for DST accumulation (higher precision, fewer packs).
-    bool tileKToOne = false;
-    if (hasMatmulBlock) {
-      // Only tile K for standalone matmul (no accumulator). The fused
-      // prev + a @ b pattern has an accumulator operand that requires
-      // DST accumulation semantics (copy_tile + matmul_block). Tiling
-      // K would break the accumulator reload logic.
-      bool hasAccumulator = false;
-      computeOp.getBody().walk([&](TileMatmulBlockOp mmOp) {
-        if (mmOp.getAccumulator()) {
-          hasAccumulator = true;
-        }
-      });
-      if (!hasAccumulator) {
-        int64_t parallelProduct = 1;
-        for (auto sz : parallelSubblockSizes) {
-          parallelProduct *= sz;
-        }
-        // Tile K to 1 when: (1) subblocking IS needed (parallel output
-        // exceeds DST), (2) the subblock is strictly smaller than the
-        // full output, and (3) the subblock is non-trivial (> 1 tile).
-        // When the subblock degenerates to 1x1 (e.g., prime dimensions),
-        // K tiling provides no benefit -- the per-tile DST accumulation
-        // path handles it via generateAccumulatingLoops.
-        tileKToOne = parallelProduct > 1 && parallelProduct < effectiveTiles &&
-                     effectiveTiles > unrollFactor;
-      }
-    }
+    // Reduction dims keep their full size. For matmul, K accumulates
+    // in-place in DST via matmul_block(kt=K_block). L1 accumulation
+    // across user-managed outer K iterations is handled separately by
+    // TTKernelInsertL1Accumulation (kL1AccLoopAttrName).
     SmallVector<int64_t> subblockSizes(rank);
     int64_t parallelIdx = 0;
     for (int64_t d = 0; d < rank; ++d) {
       if (iterTypes[d] == utils::IteratorType::parallel) {
         subblockSizes[d] = parallelSubblockSizes[parallelIdx++];
-      } else if (tileKToOne) {
-        subblockSizes[d] = 1;
       } else {
         subblockSizes[d] = dimSizes[d];
       }
@@ -336,14 +305,11 @@ struct TTLSubblockComputeForDSTPass
     // can distinguish subblock loops from tile loops and compute correct
     // CB offsets (both linearized and per-dimension).
     for (size_t i = 0; i < subblockedDims.size(); ++i) {
-      int64_t dim = subblockedDims[i];
-      loopNest.loops[i]->setAttr(kSubblockLoopStrideAttrName,
-                                 b.getIndexAttr(blockStrides[dim]));
-      loopNest.loops[i]->setAttr(kSubblockDimAttrName, b.getIndexAttr(dim));
-      // Mark reduction dimension loops for L1 accumulation insertion.
-      if (iterTypes[dim] == utils::IteratorType::reduction) {
-        loopNest.loops[i]->setAttr(kReductionLoopAttrName, b.getUnitAttr());
-      }
+      loopNest.loops[i]->setAttr(
+          kSubblockLoopStrideAttrName,
+          b.getIndexAttr(blockStrides[subblockedDims[i]]));
+      loopNest.loops[i]->setAttr(kSubblockDimAttrName,
+                                 b.getIndexAttr(subblockedDims[i]));
     }
 
     // Precompute per-output subblock info: shape, tile count, and whether

From ec703c780b00cc289954a82a30e424ed5f654a0f Mon Sep 17 00:00:00 2001
From: Alex Richins <arichins@tenstorrent.com>
Date: Fri, 10 Apr 2026 10:03:35 -0700
Subject: [PATCH 08/31] matmul cleanup and 2d mcast (#465)

### Problem description

The existing matmul examples had stale patterns from before the
`acc=True` removal, and `split_work_to_nodes` in
`utils/block_allocation.py` had a bug that produced incorrect block
parameters for certain matrix shapes. This change fixes both, adds a 2D
multicast matmul example, and generally cleans up and fixes the tests in
metal_examples.

  ### What's changed

- **API updates across metal examples**: Updated all existing metal
matmul examples
    (`1d_mcast_matmul`, `multinode_matmul`, `multinode_reuse_matmul`,
`single_node_matmul`) to match the changed ttnn APIs and the newly
updated metal headers.

- **ttlang matmul accumulation pattern**: Updated all ttlang matmul
examples to use
the explicit `+=` accumulation pattern, replacing the removed
`store(..., acc=True)`
API. Accumulation is now expressed as `acc = ttl.math.fill(out_blk, 0)`
followed
by `acc += a_blk @ b_blk` in the K loop, with a final
`out_blk.store(acc)`.

- **2D mcast matmul (metal)**: Added a metal reference implementation
under
    `examples/metal_examples/2d_mcast_matmul/metal/`

  - **2D mcast matmul (tt-lang)**: Added a tt-lang implementation under
`examples/metal_examples/2d_mcast_matmul/ttlang/`. Uses `ttl.Pipe` and
`ttl.PipeNet` to express the A (row-wise) and B (column-wise) multicast
patterns,
    with `get_large_matmul_params` for block parameter selection.

  ### Checklist

  - [ ] New/Existing tests provide coverage for changes
    - `test_block_allocation.py` updated with new coverage for the block
      allocation fix
    - 2D mcast matmul example manually tested on hardware and in sim
---
 .../1d_mcast_matmul/metal/1d_matmul_metal.py  |  58 +--
 .../metal/kernels/reuse_compute.cpp           |   8 +-
 .../2d_mcast_matmul/metal/2d_mcast_matmul.py  | 432 ++++++++++++++++++
 .../metal/kernels/bmm_large_block_zm.cpp      | 107 +++++
 ..._tile_layout_in0_receiver_in1_receiver.cpp | 106 +++++
 ...mm_tile_layout_in0_receiver_in1_sender.cpp | 156 +++++++
 ...mm_tile_layout_in0_sender_in1_receiver.cpp | 155 +++++++
 ..._bmm_tile_layout_in0_sender_in1_sender.cpp | 200 ++++++++
 .../metal/kernels/writer_bmm_tile_layout.cpp  |  57 +++
 .../2d_mcast_matmul/ttlang/2d_mcast_matmul.py | 182 ++++++++
 .../metal/kernels/mm_compute.cpp              |   4 +-
 .../metal/multinode_matmul.py                 | 104 +++--
 .../ttlang/multinode_matmul.py                |  28 +-
 .../metal/kernels/bmm_large_block_zm.cpp      |   4 +-
 .../metal/multinode_reuse_matmul.py           | 102 +++--
 .../ttlang/multinode_reuse_matmul.py          |   3 +-
 .../metal/kernels/mm_compute.cpp              |   4 +-
 .../metal/single_node_matmul.py               |  26 +-
 .../ttlang/single_node_matmul.py              |   4 +-
 examples/tt_upsample.py                       |  12 +-
 python/utils/__init__.py                      |   7 +-
 python/utils/block_allocation.py              |  60 +--
 test/python/test_block_allocation.py          | 100 ++--
 23 files changed, 1688 insertions(+), 231 deletions(-)
 create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py
 create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp
 create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp
 create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp
 create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp
 create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp
 create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp
 create mode 100644 examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py

diff --git a/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py b/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py
index f5b6a0157..22429529b 100644
--- a/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py
+++ b/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py
@@ -175,11 +175,11 @@ def test_1d_matmul_metal(
     ), "1D matmul requires multiple blocks to use all 4 kernels"
 
     # Single sender node at (0, 0) broadcasts to all other nodes
-    in0_sender_node = ttnn.NodeRangeSet(
-        [ttnn.NodeRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 0))]
+    in0_sender_node = ttnn.CoreRangeSet(
+        [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 0))]
     )
     # All compute nodes (entire grid used for computation)
-    all_nodes = ttnn.num_nodes_to_noderangeset(
+    all_nodes = ttnn.num_cores_to_corerangeset(
         num_worker_nodes, ttnn.CoreCoord(num_nodes_x, num_nodes_y), row_wise=True
     )
     # Receiver nodes are all nodes except the single sender node (0,0)
@@ -218,22 +218,22 @@ def test_1d_matmul_metal(
     block_count = 2
     a_cb_descriptor = ttnn.CBDescriptor(
         total_size=block_count * cb_page_size * (block_m * block_k),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[a_cb_format],
     )
     b_cb_descriptor = ttnn.CBDescriptor(
         total_size=block_count * cb_page_size * (block_n * block_k),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[b_cb_format],
     )
     out_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_page_size * (block_m * block_n),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[out_cb_format],
     )
     intermediate_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_page_size * (block_m * block_n),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[intermediate_cb_format],
     )
     in0_sender_semaphore_id = 0
@@ -346,19 +346,17 @@ def test_1d_matmul_metal(
         -(-num_worker_nodes // num_nodes_x) if num_nodes_x < num_worker_nodes else 1
     )
 
-    in0_sender_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)]
-    in0_receiver_rt_args = [
-        [[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)
-    ]
-    in1_writer_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)]
-    compute_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)]
+    in0_sender_rt_args = []
+    in0_receiver_rt_args = []
+    in1_writer_rt_args = []
+    compute_rt_args = []
 
     total_receivers = num_worker_nodes - 1
     print(
         f"1D matmul: Single sender at (0,0) multicasts to {total_receivers} receivers, across a grid of {num_x_nodes} x {num_y_nodes} nodes"
     )
 
-    noc_of_sender = device.worker_node_from_logical_node(ttnn.CoreCoord(0, 0))
+    noc_of_sender = device.worker_core_from_logical_core(ttnn.CoreCoord(0, 0))
 
     # Assign work to nodes
     worker_node_idx = 0
@@ -366,16 +364,17 @@ def test_1d_matmul_metal(
         for output_idx_x in range(num_x_nodes):
             if worker_node_idx >= num_worker_nodes:
                 break
+            core = ttnn.CoreCoord(output_idx_x, output_idx_y)
             # in0 sender args (only for node (0,0))
             # Single sender multicasts to all other nodes in the grid
             if output_idx_x == 0 and output_idx_y == 0:
                 # NOTE: multicast nocs require perfect rectangular node regions
                 # so when num_worker_nodes % num_nodes_x != 0, the last row of nodes will be multicasted to, but not utilized
-                mcast_end_node_noc = device.worker_node_from_logical_node(
+                mcast_end_node_noc = device.worker_core_from_logical_core(
                     ttnn.CoreCoord(num_x_nodes - 1, num_y_nodes - 1)
                 )
 
-                in0_sender_rt_args[output_idx_x][output_idx_y] = [
+                sender_args = [
                     a_tensor.buffer_address(),
                     0,
                     noc_of_sender.x,
@@ -383,39 +382,42 @@ def test_1d_matmul_metal(
                     mcast_end_node_noc.x,
                     mcast_end_node_noc.y,
                 ]
+                in0_sender_rt_args.append((core, sender_args))
                 print(
                     f"IN0_SENDER - RUNTIME_ARGS for node ({output_idx_x}, {output_idx_y}), worker: {worker_node_idx}"
                 )
                 print(
-                    f"IN0_SENDER_CORE - RUNTIME_ARGS ({len(in0_sender_rt_args[output_idx_x][output_idx_y])} args): {', '.join(map(str, in0_sender_rt_args[output_idx_x][output_idx_y]))}"
+                    f"IN0_SENDER_CORE - RUNTIME_ARGS ({len(sender_args)} args): {', '.join(map(str, sender_args))}"
                 )
 
             # in0 receiver args (for all nodes except (0,0))
             if not (output_idx_x == 0 and output_idx_y == 0):
-                in0_receiver_rt_args[output_idx_x][output_idx_y] = [
+                receiver_args = [
                     noc_of_sender.x,
                     noc_of_sender.y,
                 ]
+                in0_receiver_rt_args.append((core, receiver_args))
                 print(
                     f"IN0_RECEIVER - RUNTIME_ARGS for node ({output_idx_x}, {output_idx_y}), worker: {worker_node_idx}"
                 )
                 print(
-                    f"IN0_RECEIVER_CORE - RUNTIME_ARGS ({len(in0_receiver_rt_args[output_idx_x][output_idx_y])} args): {', '.join(map(str, in0_receiver_rt_args[output_idx_x][output_idx_y]))}"
+                    f"IN0_RECEIVER_CORE - RUNTIME_ARGS ({len(receiver_args)} args): {', '.join(map(str, receiver_args))}"
                 )
 
             # in1 reader + writer args (all nodes)
-            in1_writer_rt_args[output_idx_x][output_idx_y] = [
+            in1_writer_args = [
                 b_tensor.buffer_address(),
                 worker_node_idx * n_blocks_per_node * block_n,
                 output_tensor.buffer_address(),
                 worker_node_idx * n_blocks_per_node * block_n,
             ]
+            in1_writer_rt_args.append((core, in1_writer_args))
 
             print(
                 f"IN1_SENDER_WRITER - RUNTIME_ARGS for node ({output_idx_x}, {output_idx_y}), worker: {worker_node_idx}"
             )
             print(
-                f"IN1_SENDER_WRITER_CORE - RUNTIME_ARGS ({len(in1_writer_rt_args[output_idx_x][output_idx_y])} args): {', '.join(map(str, in1_writer_rt_args[output_idx_x][output_idx_y]))}"
+                f"IN1_SENDER_WRITER_CORE - RUNTIME_ARGS ({len(in1_writer_args)} args): {', '.join(map(str, in1_writer_args))}"
             )
             worker_node_idx += 1
 
@@ -427,7 +429,7 @@ def test_1d_matmul_metal(
     in0_sender_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/sender_in0_interleaved.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=in0_sender_node,
+        core_ranges=in0_sender_node,
         compile_time_args=in0_sender_compile_time_args,
         runtime_args=in0_sender_rt_args,
         config=ttnn.ReaderConfigDescriptor(),
@@ -436,7 +438,7 @@ def test_1d_matmul_metal(
     in0_receiver_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/reciever_in0_interleaved.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=in0_receiver_nodes,
+        core_ranges=in0_receiver_nodes,
         compile_time_args=in0_receiver_compile_time_args,
         runtime_args=in0_receiver_rt_args,
         config=ttnn.ReaderConfigDescriptor(),
@@ -445,7 +447,7 @@ def test_1d_matmul_metal(
     in1_writer_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/reader_in1_writer_out_interleaved.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=in1_writer_compile_time_args,
         runtime_args=in1_writer_rt_args,
         config=ttnn.WriterConfigDescriptor(),
@@ -454,7 +456,7 @@ def test_1d_matmul_metal(
     compute_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=compute_compile_time_args,
         runtime_args=compute_rt_args,
         config=computeConfig,
@@ -465,12 +467,12 @@ def test_1d_matmul_metal(
         ttnn.SemaphoreDescriptor(
             id=in0_sender_semaphore_id,
             initial_value=0,
-            node_ranges=in0_sender_node,
+            core_ranges=in0_sender_node,
         ),
         ttnn.SemaphoreDescriptor(
             id=in0_receiver_semaphore_id,
             initial_value=0,
-            node_ranges=all_nodes,
+            core_ranges=all_nodes,
         ),
     ]
 
@@ -503,6 +505,6 @@ def test_1d_matmul_metal(
     torch_output = torch.matmul(a_tensor_torch, b_tensor_torch)
 
     assert_with_ulp(torch_output, metal_output)
-    print("test passed.")
+    print("Test passed!")
 
     ttnn.close_device(device)
diff --git a/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp b/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp
index fd4834d32..ceed82689 100644
--- a/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp
+++ b/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp
@@ -4,10 +4,10 @@
 
 #include <cstdint>
 
-#include "compute_kernel_api/matmul.h"
-#include "compute_kernel_api/pack_untilize.h"
-#include "compute_kernel_api/tile_move_copy.h"
-#include "compute_kernel_api/transpose_wh.h"
+#include "api/compute/matmul.h"
+#include "api/compute/pack_untilize.h"
+#include "api/compute/tile_move_copy.h"
+#include "api/compute/transpose_wh.h"
 #include "internal/mod_div_lib.h"
 
 namespace NAMESPACE {
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py b/examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py
new file mode 100644
index 000000000..b09efb839
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py
@@ -0,0 +1,432 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+import ttnn
+from ttl.utils.block_allocation import get_large_matmul_params
+from ttl.utils.correctness import assert_with_ulp
+
+
+@pytest.mark.parametrize("M,K,N", [(3584, 768, 3072)])
+def test_2d_mcast_matmul(M, K, N):
+    device = ttnn.open_device(device_id=0)
+    Mt = M // ttnn.TILE_SIZE
+    Kt = K // ttnn.TILE_SIZE
+    Nt = N // ttnn.TILE_SIZE
+    in0_block_w = 2
+
+    device_grid = device.compute_with_storage_grid_size()
+    print(f"Device compute_with_storage_grid_size: ({device_grid.x}, {device_grid.y})")
+    num_nodes_x = device_grid.x
+    num_nodes_y = device_grid.y
+
+    block_params = get_large_matmul_params(
+        Mt, Nt, num_nodes_y, num_nodes_x, in0_block_w
+    )
+    per_node_M = block_params.block_h
+    per_node_N = block_params.block_w
+    out_subblock_h = block_params.subblock_h
+    out_subblock_w = block_params.subblock_w
+    assert per_node_M != 0, "get_large_matmul_params was not able to find a solution"
+    print(
+        f"per_node_M: {per_node_M}, per_node_N: {per_node_N}, out_subblock_h: {out_subblock_h}, out_subblock_w: {out_subblock_w}"
+    )
+    assert Mt % per_node_M == 0, "per_node_M must divide Mt"
+    assert Nt % per_node_N == 0, "per_node_N must divide Nt"
+    assert Kt % in0_block_w == 0, "in0_block_w must divide Kt"
+
+    num_blocks_y = Mt // per_node_M
+    num_blocks_x = Nt // per_node_N
+    assert (
+        num_blocks_x <= num_nodes_x and num_blocks_y <= num_nodes_y
+    ), "number of total blocks must be less than or equal to num nodes in each dimension"
+    assert (
+        num_blocks_x >= 2 and num_blocks_y >= 2
+    ), "2D mcast requires at least a 2x2 node grid"
+
+    num_active_x = num_blocks_x
+    num_active_y = num_blocks_y
+
+    all_nodes = ttnn.CoreRangeSet(
+        [
+            ttnn.CoreRange(
+                ttnn.CoreCoord(0, 0),
+                ttnn.CoreCoord(num_active_x - 1, num_active_y - 1),
+            )
+        ]
+    )
+    left_column = ttnn.CoreRangeSet(
+        [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, num_active_y - 1))]
+    )
+    all_except_left_column = ttnn.CoreRangeSet(
+        [
+            ttnn.CoreRange(
+                ttnn.CoreCoord(1, 0),
+                ttnn.CoreCoord(num_active_x - 1, num_active_y - 1),
+            )
+        ]
+    )
+    in0_sender_in1_sender = ttnn.CoreRangeSet(
+        [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 0))]
+    )
+    in0_sender_in1_receiver = ttnn.CoreRangeSet(
+        [ttnn.CoreRange(ttnn.CoreCoord(0, 1), ttnn.CoreCoord(0, num_active_y - 1))]
+    )
+    in0_receiver_in1_sender = ttnn.CoreRangeSet(
+        [ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(num_active_x - 1, 0))]
+    )
+    in0_receiver_in1_receiver = ttnn.CoreRangeSet(
+        [
+            ttnn.CoreRange(
+                ttnn.CoreCoord(1, 1),
+                ttnn.CoreCoord(num_active_x - 1, num_active_y - 1),
+            )
+        ]
+    )
+
+    dram_memory_config = ttnn.DRAM_MEMORY_CONFIG
+    a_tensor = ttnn.rand(
+        (M, K),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=dram_memory_config,
+    )
+    b_tensor = ttnn.rand(
+        (K, N),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=dram_memory_config,
+    )
+    output_tensor = ttnn.empty(
+        (M, N),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=dram_memory_config,
+    )
+    dtype_size = 2  # bfloat16
+    cb_page_size = dtype_size * ttnn.TILE_SIZE * ttnn.TILE_SIZE
+
+    a_cb = 0
+    b_cb = 1
+    out_cb = 16
+    intermediate_cb = 24
+
+    a_cb_format = ttnn.CBFormatDescriptor(
+        buffer_index=a_cb,
+        data_format=ttnn.bfloat16,
+        page_size=cb_page_size,
+    )
+    b_cb_format = ttnn.CBFormatDescriptor(
+        buffer_index=b_cb,
+        data_format=ttnn.bfloat16,
+        page_size=cb_page_size,
+    )
+    out_cb_format = ttnn.CBFormatDescriptor(
+        buffer_index=out_cb,
+        data_format=ttnn.bfloat16,
+        page_size=cb_page_size,
+    )
+    intermediate_cb_format = ttnn.CBFormatDescriptor(
+        buffer_index=intermediate_cb,
+        data_format=ttnn.bfloat16,
+        page_size=cb_page_size,
+    )
+
+    in0_block_tiles = per_node_M * in0_block_w
+    in1_block_tiles = per_node_N * in0_block_w
+    out_block_tiles = per_node_M * per_node_N
+    buffer_factor = 2
+    a_cb_descriptor = ttnn.CBDescriptor(
+        total_size=buffer_factor * cb_page_size * in0_block_tiles,
+        core_ranges=all_nodes,
+        format_descriptors=[a_cb_format],
+    )
+    b_cb_descriptor = ttnn.CBDescriptor(
+        total_size=buffer_factor * cb_page_size * in1_block_tiles,
+        core_ranges=all_nodes,
+        format_descriptors=[b_cb_format],
+    )
+    out_cb_descriptor = ttnn.CBDescriptor(
+        total_size=cb_page_size * out_block_tiles,
+        core_ranges=all_nodes,
+        format_descriptors=[out_cb_format],
+    )
+    intermediate_cb_descriptor = ttnn.CBDescriptor(
+        total_size=cb_page_size * out_block_tiles,
+        core_ranges=all_nodes,
+        format_descriptors=[intermediate_cb_format],
+    )
+
+    # 4 semaphores for 2D mcast handshake
+    in0_mcast_sender_semaphore_id = 0
+    in0_mcast_receiver_semaphore_id = 1
+    in1_mcast_sender_semaphore_id = 2
+    in1_mcast_receiver_semaphore_id = 3
+
+    semaphore_descriptors = [
+        ttnn.SemaphoreDescriptor(
+            id=in0_mcast_sender_semaphore_id,
+            initial_value=0,
+            core_ranges=all_nodes,
+        ),
+        ttnn.SemaphoreDescriptor(
+            id=in0_mcast_receiver_semaphore_id,
+            initial_value=0,
+            core_ranges=all_nodes,
+        ),
+        ttnn.SemaphoreDescriptor(
+            id=in1_mcast_sender_semaphore_id,
+            initial_value=0,
+            core_ranges=all_nodes,
+        ),
+        ttnn.SemaphoreDescriptor(
+            id=in1_mcast_receiver_semaphore_id,
+            initial_value=0,
+            core_ranges=all_nodes,
+        ),
+    ]
+
+    # Compute kernel compile time args
+    in0_num_subblocks = per_node_M // out_subblock_h
+    in0_block_num_tiles = out_subblock_h * in0_block_w * in0_num_subblocks
+    in0_subblock_num_tiles = out_subblock_h * in0_block_w
+
+    in1_num_subblocks = per_node_N // out_subblock_w
+    in1_block_num_tiles_compute = out_subblock_w * in0_block_w * in1_num_subblocks
+    in1_per_node_w = out_subblock_w * in1_num_subblocks
+
+    num_blocks = Kt // in0_block_w
+    out_subblock_num_tiles = out_subblock_h * out_subblock_w
+
+    compute_compile_time_args = [
+        in0_block_w,
+        in0_num_subblocks,
+        in0_block_num_tiles,
+        in0_subblock_num_tiles,
+        in1_num_subblocks,
+        in1_block_num_tiles_compute,
+        in1_per_node_w,
+        num_blocks,
+        out_subblock_h,
+        out_subblock_w,
+        out_subblock_num_tiles,
+    ]
+    reader_compile_time_args = ttnn.TensorAccessorArgs(a_tensor).get_compile_time_args()
+    reader_compile_time_args.extend(
+        ttnn.TensorAccessorArgs(b_tensor).get_compile_time_args()
+    )
+    writer_compile_time_args = ttnn.TensorAccessorArgs(
+        output_tensor
+    ).get_compile_time_args()
+
+    reader_rt_args_corner = []
+    reader_rt_args_left = []
+    reader_rt_args_top = []
+    reader_rt_args_interior = []
+    writer_rt_args_left = []
+    writer_rt_args_rest = []
+
+    for node_idx_y in range(num_active_y):
+        for node_idx_x in range(num_active_x):
+            left_node = ttnn.CoreCoord(0, node_idx_y)
+            left_node_plus_one = ttnn.CoreCoord(1, node_idx_y)
+            right_node = ttnn.CoreCoord(num_active_x - 1, node_idx_y)
+            top_node = ttnn.CoreCoord(node_idx_x, 0)
+            top_node_plus_one = ttnn.CoreCoord(node_idx_x, 1)
+            bottom_node = ttnn.CoreCoord(node_idx_x, num_active_y - 1)
+
+            left_phys = device.worker_core_from_logical_core(left_node)
+            left_plus_one_phys = device.worker_core_from_logical_core(
+                left_node_plus_one
+            )
+            right_phys = device.worker_core_from_logical_core(right_node)
+            top_phys = device.worker_core_from_logical_core(top_node)
+            top_plus_one_phys = device.worker_core_from_logical_core(top_node_plus_one)
+            bottom_phys = device.worker_core_from_logical_core(bottom_node)
+
+            core = ttnn.CoreCoord(node_idx_x, node_idx_y)
+            mm_reader_args = [
+                a_tensor.buffer_address(),
+                Kt * per_node_M * node_idx_y,  # in0 start tile
+                1,  # in0 stride w
+                Kt,  # in0 stride h
+                in0_block_w,  # in0 next block stride
+                in0_block_w,  # in0 block w
+                per_node_M,  # in0 block h
+                in0_block_w * per_node_M,  # in0 block num tiles
+                b_tensor.buffer_address(),
+                per_node_N * node_idx_x,  # in1 start tile
+                1,  # in1 stride w
+                Nt,  # in1 stride h
+                in0_block_w * Nt,  # in1 next block stride
+                per_node_N,  # in1 block w
+                in0_block_w,  # in1 block h
+                per_node_N * in0_block_w,  # in1 block num tiles
+                Kt // in0_block_w,  # num blocks
+                # in0 mcast args (rightward from left column)
+                # NOTE: Physical NOC coords may be inverted from logical coords.
+                # The kernel passes (end, start) to get_noc_multicast_addr to
+                # produce the correct physical bounding box for the NOC in use.
+                right_phys.x,  # in0_mcast_dest_noc_start
+                right_phys.y,
+                left_plus_one_phys.x,  # in0_mcast_dest_noc_end
+                left_plus_one_phys.y,
+                num_active_x - 1,  # in0 mcast num dests
+                left_phys.x,
+                left_phys.y,
+                in0_mcast_sender_semaphore_id,
+                in0_mcast_receiver_semaphore_id,
+                # in1 mcast args (downward from top row)
+                # NOTE: Same start/end convention as in0 above.
+                bottom_phys.x,  # in1_mcast_dest_noc_start
+                bottom_phys.y,
+                top_plus_one_phys.x,  # in1_mcast_dest_noc_end
+                top_plus_one_phys.y,
+                num_active_y - 1,  # in1 mcast num dests
+                top_phys.x,
+                top_phys.y,
+                in1_mcast_sender_semaphore_id,
+                in1_mcast_receiver_semaphore_id,
+            ]
+
+            writer_args = [
+                output_tensor.buffer_address(),
+                node_idx_x * per_node_N + node_idx_y * per_node_M * Nt,
+                1,  # stride w
+                Nt,  # stride h
+                out_subblock_w,  # next subblock stride w
+                out_subblock_h * Nt,  # next subblock stride h
+                out_subblock_w,
+                out_subblock_h,
+                out_subblock_w * out_subblock_h,
+                per_node_N // out_subblock_w,  # num subblocks w
+                per_node_M // out_subblock_h,  # num subblocks h
+            ]
+
+            if node_idx_x == 0 and node_idx_y == 0:
+                reader_rt_args_corner.append((core, mm_reader_args))
+                writer_rt_args_left.append((core, writer_args))
+            elif node_idx_x == 0:
+                reader_rt_args_left.append((core, mm_reader_args))
+                writer_rt_args_left.append((core, writer_args))
+            elif node_idx_y == 0:
+                reader_rt_args_top.append((core, mm_reader_args))
+                writer_rt_args_rest.append((core, writer_args))
+            else:
+                reader_rt_args_interior.append((core, mm_reader_args))
+                writer_rt_args_rest.append((core, writer_args))
+
+    # Left column (in0 senders): reader on RISCV_1/NOC0, writer on RISCV_0/NOC1
+    # Non-left column (in0 receivers): reader on RISCV_1/NOC1, writer on RISCV_0/NOC0
+    reader_config_noc0 = ttnn.DataMovementConfigDescriptor(
+        processor=ttnn.DataMovementProcessor.RISCV_1,
+        noc=ttnn.NOC.RISCV_0_default,
+    )
+    reader_config_noc1 = ttnn.DataMovementConfigDescriptor(
+        processor=ttnn.DataMovementProcessor.RISCV_1,
+        noc=ttnn.NOC.RISCV_1_default,
+    )
+    writer_config_noc0 = ttnn.DataMovementConfigDescriptor(
+        processor=ttnn.DataMovementProcessor.RISCV_0,
+        noc=ttnn.NOC.RISCV_0_default,
+    )
+    writer_config_noc1 = ttnn.DataMovementConfigDescriptor(
+        processor=ttnn.DataMovementProcessor.RISCV_0,
+        noc=ttnn.NOC.RISCV_1_default,
+    )
+
+    computeConfig = ttnn.ComputeConfigDescriptor()
+    computeConfig.math_fidelity = ttnn.MathFidelity.HiFi4
+
+    reader_corner = ttnn.KernelDescriptor(
+        kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp",
+        source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
+        core_ranges=in0_sender_in1_sender,
+        compile_time_args=reader_compile_time_args,
+        runtime_args=reader_rt_args_corner,
+        config=reader_config_noc0,
+    )
+    reader_left = ttnn.KernelDescriptor(
+        kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp",
+        source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
+        core_ranges=in0_sender_in1_receiver,
+        compile_time_args=reader_compile_time_args,
+        runtime_args=reader_rt_args_left,
+        config=reader_config_noc0,
+    )
+    reader_top = ttnn.KernelDescriptor(
+        kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp",
+        source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
+        core_ranges=in0_receiver_in1_sender,
+        compile_time_args=reader_compile_time_args,
+        runtime_args=reader_rt_args_top,
+        config=reader_config_noc1,
+    )
+    reader_interior = ttnn.KernelDescriptor(
+        kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp",
+        source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
+        core_ranges=in0_receiver_in1_receiver,
+        compile_time_args=reader_compile_time_args,
+        runtime_args=reader_rt_args_interior,
+        config=reader_config_noc1,
+    )
+    writer_left_col = ttnn.KernelDescriptor(
+        kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp",
+        source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
+        core_ranges=left_column,
+        compile_time_args=writer_compile_time_args,
+        runtime_args=writer_rt_args_left,
+        config=writer_config_noc1,
+    )
+    writer_rest = ttnn.KernelDescriptor(
+        kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp",
+        source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
+        core_ranges=all_except_left_column,
+        compile_time_args=writer_compile_time_args,
+        runtime_args=writer_rt_args_rest,
+        config=writer_config_noc0,
+    )
+    compute_kernel = ttnn.KernelDescriptor(
+        kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp",
+        source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
+        core_ranges=all_nodes,
+        compile_time_args=compute_compile_time_args,
+        runtime_args=[],
+        config=computeConfig,
+    )
+
+    program_descriptor = ttnn.ProgramDescriptor(
+        kernels=[
+            reader_corner,
+            reader_left,
+            reader_top,
+            reader_interior,
+            writer_left_col,
+            writer_rest,
+            compute_kernel,
+        ],
+        semaphores=semaphore_descriptors,
+        cbs=[
+            a_cb_descriptor,
+            b_cb_descriptor,
+            out_cb_descriptor,
+            intermediate_cb_descriptor,
+        ],
+    )
+
+    print("Launching generic_op...")
+    output = ttnn.generic_op([a_tensor, b_tensor, output_tensor], program_descriptor)
+    print("Completed generic_op.")
+    metal_output = ttnn.to_torch(output).to(torch.bfloat16)
+
+    a_tensor_torch = ttnn.to_torch(a_tensor).to(torch.bfloat16)
+    b_tensor_torch = ttnn.to_torch(b_tensor).to(torch.bfloat16)
+    torch_output = torch.matmul(a_tensor_torch, b_tensor_torch)
+
+    assert_with_ulp(torch_output, metal_output)
+
+    ttnn.close_device(device)
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp
new file mode 100644
index 000000000..a2cb6cc37
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp
@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Block matrix multiply compute kernel with z-major ordering.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include <cstdint>
+
+#include "api/compute/matmul.h"
+#include "api/compute/tile_move_copy.h"
+
+namespace NAMESPACE {
+void MAIN { // Blocked matmul: in0 from CB c_0, in1 from CB c_1, result to CB c_16.
+  uint32_t in0_block_w = get_compile_time_arg_val(0); // inner-dim trip count; also in0 row stride
+  uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer subblock loop bound
+  uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // c_0 tiles waited on / popped per block
+  uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // in0 index step per in0 subblock
+  uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // inner subblock loop bound
+  uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); // c_1 tiles waited on / popped per block
+  uint32_t in1_per_core_w = get_compile_time_arg_val(6); // in1 index step per inner-dim iteration
+  uint32_t num_blocks = get_compile_time_arg_val(7); // number of input blocks consumed
+  uint32_t out_subblock_h = get_compile_time_arg_val(8); // output subblock height loop bound
+  uint32_t out_subblock_w = get_compile_time_arg_val(9); // output subblock width loop bound
+  uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // tiles packed per subblock
+
+  mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16);
+  // With more than one block, partial results are spilled to c_24 and reloaded.
+  bool spill = num_blocks > 1;
+  bool enable_reload = false; // switched on after the first block when spilling
+  uint32_t out_num_tiles_to_wait = out_subblock_num_tiles; // growing c_16 reservation used in block 0
+
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    bool last_out = block == (num_blocks - 1); // final block packs to c_16 instead of c_24
+    // Wait until one full in0 block and one full in1 block are available.
+    cb_wait_front(tt::CBIndex::c_0, in0_block_num_tiles);
+    cb_wait_front(tt::CBIndex::c_1, in1_block_num_tiles);
+    int in0_index_subblock_offset = 0;
+    for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks;
+         in0_subblock++) {
+      int in1_index_subblock_offset = 0;
+      for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks;
+           in1_subblock++) {
+        acquire_dst();
+        // From the second block on, reload this subblock's partials from c_24.
+        if (enable_reload) {
+          copy_tile_to_dst_init_short(tt::CBIndex::c_24);
+          cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles);
+          for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
+            copy_tile(tt::CBIndex::c_24, i, i);
+          }
+          cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles);
+          mm_init_short(tt::CBIndex::c_0, tt::CBIndex::c_1); // back to matmul setup after the copy init
+        }
+
+        int dst_index = 0; // one destination slot per (h, w) output tile, in row-major order
+        int in0_index_h_offset = 0;
+        for (uint32_t h = 0; h < out_subblock_h; h++) {
+          for (uint32_t w = 0; w < out_subblock_w; w++) {
+            int in1_index_inner_dim_offset = 0;
+            for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) {
+              int in0_index =
+                  in0_index_subblock_offset + in0_index_h_offset + inner_dim;
+              int in1_index =
+                  in1_index_subblock_offset + in1_index_inner_dim_offset + w;
+              matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, in0_index,
+                           in1_index, dst_index);
+              in1_index_inner_dim_offset += in1_per_core_w;
+            }
+            dst_index++;
+          }
+          in0_index_h_offset += in0_block_w;
+        }
+        // Last block: pack to the output CB c_16; otherwise spill partials to c_24.
+        if (last_out) {
+          cb_reserve_back(tt::CBIndex::c_16, out_subblock_num_tiles);
+          for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
+            pack_tile(i, tt::CBIndex::c_16);
+          }
+          cb_push_back(tt::CBIndex::c_16, out_subblock_num_tiles);
+        } else {
+          if (block == 0) { // NOTE(review): presumably claims c_16 space while c_24 is in use - confirm
+            cb_reserve_back(tt::CBIndex::c_16, out_num_tiles_to_wait);
+            out_num_tiles_to_wait += out_subblock_num_tiles;
+          }
+          cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles);
+          for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
+            pack_tile(i, tt::CBIndex::c_24);
+          }
+          cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles);
+        }
+
+        release_dst();
+        in1_index_subblock_offset += out_subblock_w;
+      }
+      in0_index_subblock_offset += in0_subblock_num_tiles;
+    }
+
+    if (spill) {
+      enable_reload = true;
+    }
+    // Release the consumed input blocks.
+    cb_pop_front(tt::CBIndex::c_0, in0_block_num_tiles);
+    cb_pop_front(tt::CBIndex::c_1, in1_block_num_tiles);
+  }
+}
+} // namespace NAMESPACE
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp
new file mode 100644
index 000000000..59353e110
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp
@@ -0,0 +1,106 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Interior cores: pure receiver for both in0 (from left column) and in1
+// (from top row). Signals readiness via semaphore and waits for data.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() { // Interior-core reader: per block, handshake for in0 and then for in1.
+  // in0 tensor args (unused but kept for uniform arg layout)
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args (only in0_block_num_tiles is used by this kernel)
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args (unused but kept for uniform arg layout)
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args (only in1_block_num_tiles is used by this kernel)
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args (only the sender coordinates and the two semaphores are used here)
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args (only the sender coordinates and the two semaphores are used here)
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0; // circular buffer that receives in0 blocks
+  constexpr uint32_t cb_id_in1 = 1; // circular buffer that receives in1 blocks
+  // Local pointers to the receiver semaphores that are reset and polled in the loop.
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: receive via multicast from left column --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    // Reset the local flag, then tell the in0 sender that this core is ready.
+    noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID);
+
+    uint64_t in0_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y,
+                     in0_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1);
+    // Block until the local flag reads VALID, then publish the reserved tiles.
+    noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID);
+
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: receive via multicast from top row --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    // Same handshake as above, against the in1 sender.
+    noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID);
+
+    uint64_t in1_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y,
+                     in1_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1);
+    // Block until the local flag reads VALID, then publish the reserved tiles.
+    noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp
new file mode 100644
index 000000000..73ae2fa67
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp
@@ -0,0 +1,156 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Top row cores (excluding corner): receives in0 via multicast from the left
+// column. Reads in1 from DRAM and multicasts downward along the column.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() { // Top-row reader: receives in0 per block, reads in1 and multicasts it.
+  // in0 tensor args (unused on this core but kept for uniform arg layout)
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args (only in0_block_num_tiles is used by this kernel)
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args: base address plus tile-id strides used by the read loop
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args (only the sender coordinates and the two semaphores are used here)
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args (the in1 sender coordinates are not used: this core is the sender)
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0; // circular buffer that receives in0 blocks
+  constexpr uint32_t cb_id_in1 = 1; // circular buffer filled with in1 blocks read here
+
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_in1); // tile size of the in1 CB
+
+  uint32_t l1_write_addr_in1;
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+  *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; // source word for the semaphore multicast below
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_sender_semaphore_addr);
+  // s0_args is built only to locate where the in1 accessor's compile-time args start.
+  constexpr auto s0_args = TensorAccessorArgs<0>();
+  constexpr auto s1_args =
+      TensorAccessorArgs<s0_args.next_compile_time_args_offset()>();
+  const auto s1 =
+      TensorAccessor(s1_args, in1_tensor_addr, single_tile_size_bytes);
+
+  uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id;
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: receive via multicast from left column --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    // Reset the local flag, then tell the in0 sender that this core is ready.
+    noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID);
+
+    uint64_t in0_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y,
+                     in0_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1);
+    // Block until the local flag reads VALID, then publish the reserved tiles.
+    noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID);
+
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: read from DRAM and multicast downward --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    l1_write_addr_in1 = get_write_ptr(cb_id_in1);
+
+    uint32_t in1_start_address = l1_write_addr_in1; // start of the block; reused as multicast source
+    uint32_t in1_block_size_bytes = 0; // accumulated while issuing reads
+    // Read in1_block_h x in1_block_w tiles into consecutive tile slots of the CB.
+    uint32_t in1_tensor_row_start_tile_id =
+        in1_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in1_block_h; h++) {
+      uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in1_block_w; w++) {
+        noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr_in1);
+        l1_write_addr_in1 += single_tile_size_bytes;
+        in1_tensor_tile_id += in1_tensor_stride_w;
+        in1_block_size_bytes += single_tile_size_bytes;
+      }
+      in1_tensor_row_start_tile_id += in1_tensor_stride_h;
+    }
+    in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride;
+    // Wait for the reads issued above before forwarding the block.
+    noc_async_read_barrier();
+    // Wait until in1_mcast_num_dests receivers have signalled ready, then re-arm the counter.
+    noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr,
+                       in1_mcast_num_dests);
+    noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0);
+    // NOTE(review): start-then-end order here; the in0-sender kernels pass end-then-start - confirm vs NOC config.
+    uint64_t in1_multicast_data_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y, in1_start_address);
+    noc_async_write_multicast(in1_start_address, in1_multicast_data_addr,
+                              in1_block_size_bytes, in1_mcast_num_dests);
+    // Blackhole builds only: flush issued writes before the semaphore multicast.
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+    // Multicast the local VALID word into the receivers' in1 receiver semaphore.
+    uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y,
+        in1_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr,
+                                in1_mcast_receiver_semaphore_noc_addr,
+                                in1_mcast_num_dests);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp
new file mode 100644
index 000000000..02e045d6d
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp
@@ -0,0 +1,155 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Left column cores (excluding corner): reads in0 from DRAM and multicasts
+// rightward along the row. Receives in1 via multicast from the top row.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() { // Left-column reader: reads in0 and multicasts it, receives in1 per block.
+  // in0 tensor args: base address plus tile-id strides used by the read loop
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args (unused on this core but kept for uniform arg layout)
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args (only in1_block_num_tiles is used by this kernel)
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args (the in0 sender coordinates are not used: this core is the sender)
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args (only the sender coordinates and the two semaphores are used here)
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0; // circular buffer filled with in0 blocks read here
+  constexpr uint32_t cb_id_in1 = 1; // circular buffer that receives in1 blocks
+
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); // tile size of the in0 CB
+
+  uint32_t l1_write_addr_in0;
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+  *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; // source word for the semaphore multicast below
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_sender_semaphore_addr);
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+  // Accessor used to address in0 tiles by tile id.
+  constexpr auto s0_args = TensorAccessorArgs<0>();
+  const auto s0 =
+      TensorAccessor(s0_args, in0_tensor_addr, single_tile_size_bytes);
+
+  uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id;
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: read from DRAM and multicast rightward --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    l1_write_addr_in0 = get_write_ptr(cb_id_in0);
+
+    uint32_t in0_start_address = l1_write_addr_in0; // start of the block; reused as multicast source
+    uint32_t in0_block_size_bytes = 0; // accumulated while issuing reads
+    // Read in0_block_h x in0_block_w tiles into consecutive tile slots of the CB.
+    uint32_t in0_tensor_row_start_tile_id =
+        in0_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in0_block_h; h++) {
+      uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in0_block_w; w++) {
+        noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr_in0);
+        l1_write_addr_in0 += single_tile_size_bytes;
+        in0_tensor_tile_id += in0_tensor_stride_w;
+        in0_block_size_bytes += single_tile_size_bytes;
+      }
+      in0_tensor_row_start_tile_id += in0_tensor_stride_h;
+    }
+    in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride;
+    // Wait for the reads issued above before forwarding the block.
+    noc_async_read_barrier();
+    // Wait until in0_mcast_num_dests receivers have signalled ready, then re-arm the counter.
+    noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr,
+                       in0_mcast_num_dests);
+    noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0);
+    // NOTE(review): coordinates are passed end-then-start here - confirm this matches the NOC in use.
+    uint64_t in0_multicast_data_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_start_address);
+    noc_async_write_multicast(in0_start_address, in0_multicast_data_addr,
+                              in0_block_size_bytes, in0_mcast_num_dests);
+    // Blackhole builds only: flush issued writes before the semaphore multicast.
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+    // Multicast the local VALID word into the receivers' in0 receiver semaphore.
+    uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr,
+                                in0_mcast_receiver_semaphore_noc_addr,
+                                in0_mcast_num_dests);
+
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: receive via multicast from top row --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    // Reset the local flag, then tell the in1 sender that this core is ready.
+    noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID);
+
+    uint64_t in1_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y,
+                     in1_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1);
+    // Block until the local flag reads VALID, then publish the reserved tiles.
+    noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp
new file mode 100644
index 000000000..2e2f6c68c
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp
@@ -0,0 +1,200 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Corner core (0,0): reads both in0 and in1 from DRAM, multicasts
+// in0 rightward along the row and in1 downward along the column.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() { // Corner-core reader: reads in0 and in1 and multicasts both, block by block.
+  // in0 tensor args: base address plus tile-id strides used by the in0 read loop
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args: base address plus tile-id strides used by the in1 read loop
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args (the in0 sender coordinates are not used: this core is the sender)
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args (the in1 sender coordinates are not used: this core is the sender)
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0; // circular buffer filled with in0 blocks read here
+  constexpr uint32_t cb_id_in1 = 1; // circular buffer filled with in1 blocks read here
+  // Each input takes its tile size from its own CB, so in0 and in1 may differ in data format.
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0);
+  const uint32_t in1_single_tile_size_bytes = get_tile_size(cb_id_in1);
+  uint32_t l1_write_addr_in0;
+  uint32_t l1_write_addr_in1;
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+  *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; // source word for the in0 semaphore multicast
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+  *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; // source word for the in1 semaphore multicast
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_sender_semaphore_addr);
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_sender_semaphore_addr);
+  // Accessors for in0 and in1; the in1 compile-time args follow the in0 ones.
+  constexpr auto s0_args = TensorAccessorArgs<0>();
+  const auto s0 =
+      TensorAccessor(s0_args, in0_tensor_addr, single_tile_size_bytes);
+  constexpr auto s1_args =
+      TensorAccessorArgs<s0_args.next_compile_time_args_offset()>();
+  const auto s1 =
+      TensorAccessor(s1_args, in1_tensor_addr, in1_single_tile_size_bytes);
+
+  uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id;
+  uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id;
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: read from DRAM and multicast rightward --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    l1_write_addr_in0 = get_write_ptr(cb_id_in0);
+
+    uint32_t in0_start_address = l1_write_addr_in0; // start of the block; reused as multicast source
+    uint32_t in0_block_size_bytes = 0; // accumulated while issuing reads
+    // Read in0_block_h x in0_block_w tiles into consecutive tile slots of the CB.
+    uint32_t in0_tensor_row_start_tile_id =
+        in0_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in0_block_h; h++) {
+      uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in0_block_w; w++) {
+        noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr_in0);
+        l1_write_addr_in0 += single_tile_size_bytes;
+        in0_tensor_tile_id += in0_tensor_stride_w;
+        in0_block_size_bytes += single_tile_size_bytes;
+      }
+      in0_tensor_row_start_tile_id += in0_tensor_stride_h;
+    }
+    in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride;
+    // Wait for the reads issued above before forwarding the block.
+    noc_async_read_barrier();
+    // Wait until in0_mcast_num_dests receivers have signalled ready, then re-arm the counter.
+    noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr,
+                       in0_mcast_num_dests);
+    noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0);
+    // NOTE(review): coordinates are passed end-then-start here - confirm this matches the NOC in use.
+    uint64_t in0_multicast_data_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_start_address);
+    noc_async_write_multicast(in0_start_address, in0_multicast_data_addr,
+                              in0_block_size_bytes, in0_mcast_num_dests);
+    // Blackhole builds only: flush issued writes before the semaphore multicast.
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+    // Multicast the local VALID word into the receivers' in0 receiver semaphore.
+    uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr,
+                                in0_mcast_receiver_semaphore_noc_addr,
+                                in0_mcast_num_dests);
+
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: read from DRAM and multicast downward --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    l1_write_addr_in1 = get_write_ptr(cb_id_in1);
+
+    uint32_t in1_start_address = l1_write_addr_in1; // start of the block; reused as multicast source
+    uint32_t in1_block_size_bytes = 0; // accumulated while issuing reads
+    // Read in1_block_h x in1_block_w tiles, stepping by the in1 CB's tile size.
+    uint32_t in1_tensor_row_start_tile_id =
+        in1_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in1_block_h; h++) {
+      uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in1_block_w; w++) {
+        noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr_in1);
+        l1_write_addr_in1 += in1_single_tile_size_bytes;
+        in1_tensor_tile_id += in1_tensor_stride_w;
+        in1_block_size_bytes += in1_single_tile_size_bytes;
+      }
+      in1_tensor_row_start_tile_id += in1_tensor_stride_h;
+    }
+    in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride;
+    // Wait for the reads issued above before forwarding the block.
+    noc_async_read_barrier();
+    // Wait until in1_mcast_num_dests receivers have signalled ready, then re-arm the counter.
+    noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr,
+                       in1_mcast_num_dests);
+    noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0);
+    // NOTE(review): coordinates are passed end-then-start here - confirm this matches the NOC in use.
+    uint64_t in1_multicast_data_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y,
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_start_address);
+    noc_async_write_multicast(in1_start_address, in1_multicast_data_addr,
+                              in1_block_size_bytes, in1_mcast_num_dests);
+    // Blackhole builds only: flush issued writes before the semaphore multicast.
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+    // Multicast the local VALID word into the receivers' in1 receiver semaphore.
+    uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y,
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr,
+                                in1_mcast_receiver_semaphore_noc_addr,
+                                in1_mcast_num_dests);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp
new file mode 100644
index 000000000..d7a5565bc
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp
@@ -0,0 +1,57 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Writer kernel for block matmul output.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+
+void kernel_main() { // Drains the output CB and writes it to the output tensor, one subblock at a time.
+  uint32_t out_tensor_addr = get_arg_val<uint32_t>(0); // runtime arg 0: base address handed to the TensorAccessor
+  uint32_t out_tensor_start_tile_id = get_arg_val<uint32_t>(1); // tile id of the first tile this kernel writes
+  uint32_t out_tensor_stride_w = get_arg_val<uint32_t>(2); // tile-id step between tiles within a subblock row
+  uint32_t out_tensor_stride_h = get_arg_val<uint32_t>(3); // tile-id step between rows within a subblock
+  uint32_t out_tensor_next_subblock_stride_w = get_arg_val<uint32_t>(4); // tile-id step to the next subblock along w
+  uint32_t out_tensor_next_subblock_stride_h = get_arg_val<uint32_t>(5); // tile-id step to the next row of subblocks
+
+  uint32_t out_subblock_w = get_arg_val<uint32_t>(6); // tiles per subblock row (inner loop bound)
+  uint32_t out_subblock_h = get_arg_val<uint32_t>(7); // rows per subblock
+  uint32_t out_subblock_tile_count = get_arg_val<uint32_t>(8); // tiles waited on and popped per subblock; presumably w * h - confirm on host
+  uint32_t out_num_subblocks_w = get_arg_val<uint32_t>(9); // number of subblocks along w
+  uint32_t out_num_subblocks_h = get_arg_val<uint32_t>(10); // number of subblocks along h
+
+  constexpr uint32_t cb_id_out0 = 16; // circular buffer index the output tiles are consumed from
+
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_out0); // also used as the L1 read step per tile below
+
+  constexpr auto s_args = TensorAccessorArgs<0>(); // NOTE(review): <0> looks like the compile-time-arg offset - confirm
+  const auto s =
+      TensorAccessor(s_args, out_tensor_addr, single_tile_size_bytes); // maps a tile id to its destination for the writes below
+
+  uint32_t out_tensor_sbh_start_tile_id = out_tensor_start_tile_id; // first tile id of the current row of subblocks
+  for (uint32_t sbh = 0; sbh < out_num_subblocks_h; sbh++) {
+    uint32_t out_tensor_sbw_start_tile_id = out_tensor_sbh_start_tile_id; // first tile id of the current subblock
+    for (uint32_t sbw = 0; sbw < out_num_subblocks_w; sbw++) {
+      uint32_t out_tensor_sb_row_start_tile_id = out_tensor_sbw_start_tile_id; // first tile id of the current row inside the subblock
+
+      cb_wait_front(cb_id_out0, out_subblock_tile_count); // wait until one full subblock is available in the CB
+      uint32_t l1_read_addr = get_read_ptr(cb_id_out0); // read cursor, advanced by one tile per write below
+
+      for (uint32_t h = 0; h < out_subblock_h; h++) {
+        uint32_t out_tensor_tile_id = out_tensor_sb_row_start_tile_id;
+        for (uint32_t w = 0; w < out_subblock_w; w++) {
+          noc_async_write_tile(out_tensor_tile_id, s, l1_read_addr); // issue the write of one tile; completion is awaited at the barrier
+          l1_read_addr += single_tile_size_bytes; // tiles are taken back-to-back from the CB, row by row
+          out_tensor_tile_id += out_tensor_stride_w;
+        }
+        out_tensor_sb_row_start_tile_id += out_tensor_stride_h;
+      }
+
+      noc_async_write_barrier(); // issued before the pop so the CB pages are not released while writes are outstanding
+      cb_pop_front(cb_id_out0, out_subblock_tile_count); // release the subblock just written
+      out_tensor_sbw_start_tile_id += out_tensor_next_subblock_stride_w;
+    }
+    out_tensor_sbh_start_tile_id += out_tensor_next_subblock_stride_h;
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py b/examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py
new file mode 100644
index 000000000..1e67ff5d6
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py
@@ -0,0 +1,182 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+import ttnn
+
+import ttl
+from utils.block_allocation import get_large_matmul_params
+from utils.correctness import assert_with_ulp
+
+
+@ttl.operation(grid=("auto"))
+def tt_lang_2d_mcast_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor):  # computes a @ b and writes the result into `out`
+    assert a.shape[1] == b.shape[0], "Incompatible matrix shapes for multiplication."
+    assert a.shape[0] == out.shape[0], "Output matrix has incorrect number of rows."
+    assert b.shape[1] == out.shape[1], "Output matrix has incorrect number of columns."
+    M = a.shape[0]
+    N = b.shape[1]
+    K = a.shape[1]
+    Mt = M // ttnn.TILE_SIZE  # sizes in tiles; NOTE(review): floor division, nothing checks M/K/N are tile multiples
+    Kt = K // ttnn.TILE_SIZE
+    Nt = N // ttnn.TILE_SIZE
+
+    K_block_size = 2  # tiles of K consumed per inner step; must divide Kt (asserted below)
+
+    num_nodes_x, num_nodes_y = ttl.grid_size(dims=2)
+    block_params = get_large_matmul_params(
+        Mt, Nt, num_nodes_y, num_nodes_x, K_block_size
+    )
+    per_node_M = block_params.block_h  # output block height per node, in tiles
+    per_node_N = block_params.block_w  # output block width per node, in tiles
+    assert per_node_M != 0, "get_large_matmul_params was not able to find a solution"
+    print(f"per_node_M: {per_node_M}, per_node_N: {per_node_N}")
+    assert Mt % per_node_M == 0, "per_node_M must divide Mt"
+    assert Nt % per_node_N == 0, "per_node_N must divide Nt"
+    assert Kt % K_block_size == 0, "K_block_size must divide Kt"
+    num_blocks_y = Mt // per_node_M  # output blocks along M; one block per node row
+    num_blocks_x = Nt // per_node_N  # output blocks along N; one block per node column
+    assert (
+        num_blocks_x <= num_nodes_x and num_blocks_y <= num_nodes_y
+    ), "number of total blocks must be less than or equal to num nodes"
+    assert (
+        num_blocks_x >= 2 and num_blocks_y >= 2
+    ), "2D mcast requires at least a 2x2 active node grid"
+
+    num_active_x = num_blocks_x  # only this sub-grid of nodes does work; see the guards in the kernels below
+    num_active_y = num_blocks_y
+    num_blocks_k = Kt // K_block_size  # number of K steps each active node accumulates over
+
+    block_count = 2  # a and b buffers hold two blocks each
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(per_node_M, K_block_size), block_count=block_count
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(K_block_size, per_node_N), block_count=block_count
+    )
+    # Output buffer holds a single block (block_count=1), matching the metal implementation
+    out_dfb = ttl.make_dataflow_buffer_like(
+        out, shape=(per_node_M, per_node_N), block_count=1
+    )
+
+    # A multicast: left column (x=0) reads from DRAM and multicasts rightward along each row
+    a_pipes = [
+        ttl.Pipe((0, y), (slice(1, num_active_x), y)) for y in range(num_active_y)  # (0, y) -> x in [1, num_active_x)
+    ]
+    a_mcast_net = ttl.PipeNet(a_pipes)
+
+    # B multicast: top row (y=0) reads from DRAM and multicasts downward along each column
+    b_pipes = [
+        ttl.Pipe((x, 0), (x, slice(1, num_active_y))) for x in range(num_active_x)  # (x, 0) -> y in [1, num_active_y)
+    ]
+    b_mcast_net = ttl.PipeNet(b_pipes)
+
+    @ttl.compute()
+    def mm_compute():  # accumulates the per-node output block over all K steps
+        node_x, node_y = ttl.node(dims=2)
+        out_row = per_node_M * node_y  # first output tile row owned by this node
+        out_col = per_node_N * node_x  # first output tile column owned by this node
+        if (out_row < Mt) and (out_col < Nt):  # nodes outside the active sub-grid do nothing
+            with out_dfb.reserve() as out_blk:
+                acc = ttl.math.fill(out_blk, 0)  # accumulator starts at zero
+                for _ in range(num_blocks_k):
+                    with (
+                        a_dfb.wait() as a_blk,
+                        b_dfb.wait() as b_blk,
+                    ):
+                        acc += a_blk @ b_blk  # partial product for this K step
+                out_blk.store(acc)  # stored once, after the last K step
+
+    @ttl.datamovement()
+    def mm_reader():  # fills a_dfb and b_dfb for every K step, from DRAM or from a multicast pipe
+        node_x, node_y = ttl.node(dims=2)
+        out_row = per_node_M * node_y
+        out_col = per_node_N * node_x
+        if (out_row < Mt) and (out_col < Nt):  # same active-node guard as mm_compute
+            for block_k in range(num_blocks_k):
+                k = block_k * K_block_size  # first K tile of this step
+
+                # A: left column reads from DRAM and multicasts, other columns receive
+                with a_dfb.reserve() as a_blk:
+
+                    def a_pipe_src(pipe):
+                        in_rd = ttl.copy(
+                            a[
+                                out_row : (out_row + per_node_M),
+                                k : (k + K_block_size),
+                            ],
+                            a_blk,
+                        )
+                        in_rd.wait()  # the block is fully read before it is forwarded
+                        mcast_wr = ttl.copy(a_blk, pipe)
+                        mcast_wr.wait()
+
+                    def a_pipe_dst(pipe):
+                        mcast_rd = ttl.copy(pipe, a_blk)
+                        mcast_rd.wait()
+
+                    a_mcast_net.if_src(a_pipe_src)  # presumably runs only on nodes that are a pipe source in this net
+                    a_mcast_net.if_dst(a_pipe_dst)  # presumably runs only on nodes that are a pipe destination
+
+                # B: top row reads from DRAM and multicasts, other rows receive
+                with b_dfb.reserve() as b_blk:
+
+                    def b_pipe_src(pipe):
+                        in_rd = ttl.copy(
+                            b[
+                                k : (k + K_block_size),
+                                out_col : (out_col + per_node_N),
+                            ],
+                            b_blk,
+                        )
+                        in_rd.wait()  # the block is fully read before it is forwarded
+                        mcast_wr = ttl.copy(b_blk, pipe)
+                        mcast_wr.wait()
+
+                    def b_pipe_dst(pipe):
+                        mcast_rd = ttl.copy(pipe, b_blk)
+                        mcast_rd.wait()
+
+                    b_mcast_net.if_src(b_pipe_src)  # same source/destination dispatch as for A
+                    b_mcast_net.if_dst(b_pipe_dst)
+
+    @ttl.datamovement()
+    def mm_writer():  # copies the finished output block of this node into `out`
+        node_x, node_y = ttl.node(dims=2)
+        out_row = per_node_M * node_y
+        out_col = per_node_N * node_x
+        if (out_row < Mt) and (out_col < Nt):  # same active-node guard as mm_compute
+            with out_dfb.wait() as out_blk:
+                out_wr = ttl.copy(
+                    out_blk,
+                    out[
+                        out_row : (out_row + per_node_M),
+                        out_col : (out_col + per_node_N),
+                    ],
+                )
+                out_wr.wait()
+
+
+@pytest.mark.parametrize("M,K,N", [(3584, 768, 3072)])
+def test_2d_mcast_matmul_tt_lang(M, K, N):
+    """Test 2D multicast matmul operation."""
+    device = ttnn.open_device(device_id=0)
+    a = ttnn.rand((M, K), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)
+    b = ttnn.rand((K, N), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)
+    c = ttnn.empty((M, N), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)
+
+    tt_lang_2d_mcast_matmul(a, b, c)
+
+    golden = torch.matmul(
+        ttnn.to_torch(a).to(torch.bfloat16), ttnn.to_torch(b).to(torch.bfloat16)
+    )
+    result = ttnn.to_torch(c).to(torch.bfloat16)
+    assert_with_ulp(golden, result)
+    print("Test passed!")
+
+    ttnn.close_device(device)
+
+
+if __name__ == "__main__":  # allows running the example directly, without pytest
+    test_2d_mcast_matmul_tt_lang(3584, 768, 3072)  # same shape as the pytest parametrization above
diff --git a/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp b/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp
index 3dc7cdd89..543638ef3 100644
--- a/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp
+++ b/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include "compute_kernel_api/matmul.h"
-#include "compute_kernel_api/tile_move_copy.h"
+#include "api/compute/matmul.h"
+#include "api/compute/tile_move_copy.h"
 #include "hostdevcommon/kernel_structs.h"
 #include <cstdint>
 
diff --git a/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py b/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py
index 1cb65c431..3caea924b 100644
--- a/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py
+++ b/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py
@@ -28,8 +28,8 @@ def test_multinode_matmul(M, K, N):
 
     device_node_size = device.compute_with_storage_grid_size()
     upper_bound_node = ttnn.CoreCoord(device_node_size.x - 1, device_node_size.y - 1)
-    device_node_grid = ttnn.NodeRangeSet(
-        [ttnn.NodeRange(ttnn.CoreCoord(0, 0), upper_bound_node)]
+    device_node_grid = ttnn.CoreRangeSet(
+        [ttnn.CoreRange(ttnn.CoreCoord(0, 0), upper_bound_node)]
     )
     print(
         f"node_grid: {device_node_grid}, num_output_tiles_total: {num_output_tiles_total}"
@@ -92,17 +92,17 @@ def test_multinode_matmul(M, K, N):
 
     a_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_total_size,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[a_cb_format],
     )
     b_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_total_size,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[b_cb_format],
     )
     out_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_total_size,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[out_cb_format],
     )
 
@@ -121,9 +121,9 @@ def test_multinode_matmul(M, K, N):
     # as the larger one to enable indexing in
     num_x_nodes = upper_bound_node.x + 1
     num_y_nodes = upper_bound_node.y + 1
-    reader_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)]
-    writer_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)]
-    compute_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)]
+    reader_rt_args = []
+    writer_rt_args = []
+    compute_rt_args = []
     current_tile = 0
     for node_range in node_group_1.ranges():
         for x in range(node_range.start.x, node_range.end.x + 1):
@@ -131,21 +131,32 @@ def test_multinode_matmul(M, K, N):
                 print(
                     f"Assigning node ({x},{y}) tile {current_tile} work_per_node1 {work_per_node1}"
                 )
-                reader_rt_args[x][y] = [
-                    a_tensor.buffer_address(),
-                    b_tensor.buffer_address(),
-                    Mt,
-                    Kt,
-                    Nt,
-                    current_tile,
-                    work_per_node1,
-                ]
-                writer_rt_args[x][y] = [
-                    output_tensor.buffer_address(),
-                    work_per_node1,
-                    current_tile,
-                ]
-                compute_rt_args[x][y] = [work_per_node1, Kt]
+                core = ttnn.CoreCoord(x, y)
+                reader_rt_args.append(
+                    (
+                        core,
+                        [
+                            a_tensor.buffer_address(),
+                            b_tensor.buffer_address(),
+                            Mt,
+                            Kt,
+                            Nt,
+                            current_tile,
+                            work_per_node1,
+                        ],
+                    )
+                )
+                writer_rt_args.append(
+                    (
+                        core,
+                        [
+                            output_tensor.buffer_address(),
+                            work_per_node1,
+                            current_tile,
+                        ],
+                    )
+                )
+                compute_rt_args.append((core, [work_per_node1, Kt]))
                 current_tile += work_per_node1
 
     for node_range in node_group_2.ranges():
@@ -154,21 +165,32 @@ def test_multinode_matmul(M, K, N):
                 print(
                     f"Assigning node ({x},{y}) tile {current_tile} work_per_node2 {work_per_node2}"
                 )
-                reader_rt_args[x][y] = [
-                    a_tensor.buffer_address(),
-                    b_tensor.buffer_address(),
-                    Mt,
-                    Kt,
-                    Nt,
-                    current_tile,
-                    work_per_node2,
-                ]
-                writer_rt_args[x][y] = [
-                    output_tensor.buffer_address(),
-                    work_per_node2,
-                    current_tile,
-                ]
-                compute_rt_args[x][y] = [work_per_node2, Kt]
+                core = ttnn.CoreCoord(x, y)
+                reader_rt_args.append(
+                    (
+                        core,
+                        [
+                            a_tensor.buffer_address(),
+                            b_tensor.buffer_address(),
+                            Mt,
+                            Kt,
+                            Nt,
+                            current_tile,
+                            work_per_node2,
+                        ],
+                    )
+                )
+                writer_rt_args.append(
+                    (
+                        core,
+                        [
+                            output_tensor.buffer_address(),
+                            work_per_node2,
+                            current_tile,
+                        ],
+                    )
+                )
+                compute_rt_args.append((core, [work_per_node2, Kt]))
                 current_tile += work_per_node2
 
     # Compute config init can't handle options, set here
@@ -180,7 +202,7 @@ def test_multinode_matmul(M, K, N):
     reader_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/multinode_matmul/metal/kernels/mm_reader.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=reader_compile_time_args,
         runtime_args=reader_rt_args,
         config=ttnn.ReaderConfigDescriptor(),
@@ -188,7 +210,7 @@ def test_multinode_matmul(M, K, N):
     writer_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/multinode_matmul/metal/kernels/mm_writer.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=writer_compile_time_args,
         runtime_args=writer_rt_args,
         config=ttnn.WriterConfigDescriptor(),
@@ -196,7 +218,7 @@ def test_multinode_matmul(M, K, N):
     compute_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=[],
         runtime_args=compute_rt_args,
         config=computeConfig,
diff --git a/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py b/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py
index 5da5e1aba..6053d8312 100644
--- a/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py
+++ b/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
 #
 # SPDX-License-Identifier: Apache-2.0
-# up to tt-lang spec, not intended to compile or run currently
 import pytest
 import torch
 
@@ -9,21 +8,10 @@
 import ttl
 
 from utils.correctness import assert_with_ulp
-from utils.block_allocation import split_work_to_nodes
+from utils.block_allocation import get_number_of_nodes_from_ranges, split_work_to_nodes
 
 
-def get_number_of_nodes(grid_range):
-    total_nodes = 0
-    if len(grid_range) != 0:
-        start = grid_range[0]
-        end = grid_range[1]
-        x_range = end[0] - start[0] + 1
-        y_range = end[1] - start[1] + 1
-        total_nodes += x_range * y_range
-    return total_nodes
-
-
-@ttl.operation(grid=(13, 10))
+@ttl.operation(grid=("auto"))
 def tt_lang_multinode_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor):
     assert a.shape[1] == b.shape[0], "Incompatible matrix shapes for multiplication."
     assert a.shape[0] == out.shape[0], "Output matrix has incorrect number of rows."
@@ -44,15 +32,15 @@ def tt_lang_multinode_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor):
     print(f"num_output_tiles_total: {num_output_tiles_total}")
     all_nodes, node_group_1, node_group_2, work_per_node1, work_per_node2 = (
         split_work_to_nodes(
-            ttl.grid_size(dims=2), num_output_tiles_total, row_wise=True
+            (ttl.grid_size(dims=1),), num_output_tiles_total, row_wise=True
         )
     )
     print(
         f"all_nodes: {all_nodes}, node_group_1: {node_group_1}, node_group_2: {node_group_2}, work_per_node1: {work_per_node1}, work_per_node2: {work_per_node2}"
     )
 
-    num_nodes_group_1 = get_number_of_nodes(node_group_1)
-    num_nodes_group_2 = get_number_of_nodes(node_group_2)
+    num_nodes_group_1 = get_number_of_nodes_from_ranges(node_group_1)
+    num_nodes_group_2 = get_number_of_nodes_from_ranges(node_group_2)
 
     def get_tiles_per_node(node_id):
         if node_id < num_nodes_group_1:
@@ -112,7 +100,7 @@ def mm_writer():
                 out_wr.wait()
 
 
-@pytest.mark.parametrize("M,K,N", [(256, 256, 256), (512, 512, 512)])
+@pytest.mark.parametrize("M,K,N", [(640, 640, 640)])
 def test_multinode_matmul_tt_lang(M, K, N):
     """Test multinode matmul operation."""
     device = ttnn.open_device(device_id=0)
@@ -127,10 +115,12 @@ def test_multinode_matmul_tt_lang(M, K, N):
     )
     result = ttnn.to_torch(c).to(torch.bfloat16)
     assert_with_ulp(golden, result)
+    print("Test passed!")
 
     ttnn.close_device(device)
 
 
 if __name__ == "__main__":
-    # TODO: This won't work with 256, 256, 256
+    test_multinode_matmul_tt_lang(256, 256, 256)
+    test_multinode_matmul_tt_lang(512, 512, 512)
     test_multinode_matmul_tt_lang(640, 640, 640)
diff --git a/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp b/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp
index 5d8f71592..eb7518b7e 100644
--- a/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp
+++ b/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp
@@ -4,8 +4,8 @@
 
 #include <cstdint>
 
-#include "compute_kernel_api/matmul.h"
-#include "compute_kernel_api/tile_move_copy.h"
+#include "api/compute/matmul.h"
+#include "api/compute/tile_move_copy.h"
 
 namespace NAMESPACE {
 void MAIN {
diff --git a/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py b/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py
index b926571d8..a65636ded 100644
--- a/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py
+++ b/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py
@@ -44,9 +44,9 @@ def test_metal_matmul(M, K, N):
     assert (
         num_blocks_x <= num_nodes_x and num_blocks_y <= num_nodes_y
     ), "number of total blocks must be less than or equal to num nodes in each dimension"
-    all_nodes = ttnn.NodeRangeSet(
+    all_nodes = ttnn.CoreRangeSet(
         [
-            ttnn.NodeRange(
+            ttnn.CoreRange(
                 ttnn.CoreCoord(0, 0), ttnn.CoreCoord(num_blocks_x - 1, num_blocks_y - 1)
             )
         ]
@@ -105,24 +105,24 @@ def test_metal_matmul(M, K, N):
     block_count = 2
     a_cb_descriptor = ttnn.CBDescriptor(
         total_size=block_count * cb_page_size * (per_node_M * K_block_size),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[a_cb_format],
     )
     b_cb_descriptor = ttnn.CBDescriptor(
         total_size=block_count * cb_page_size * (per_node_N * K_block_size),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[b_cb_format],
     )
     # example has output cb not double buffered
     out_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_page_size * (per_node_M * per_node_N),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[out_cb_format],
     )
     # needs to be generated by compiler as tt-lang does not have a notion of the dst register, which is the only thing that this cb interacts with
     intermediate_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_page_size * (per_node_M * per_node_N),
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         format_descriptors=[intermediate_cb_format],
     )
 
@@ -164,9 +164,9 @@ def test_metal_matmul(M, K, N):
         output_tensor
     ).get_compile_time_args()
 
-    reader_rt_args = [[[] for _ in range(num_nodes_y)] for _ in range(num_nodes_x)]
-    writer_rt_args = [[[] for _ in range(num_nodes_y)] for _ in range(num_nodes_x)]
-    compute_rt_args = [[[] for _ in range(num_nodes_y)] for _ in range(num_nodes_x)]
+    reader_rt_args = []
+    writer_rt_args = []
+    compute_rt_args = []
     current_blk = 0
     print(
         f"num_blocks_x: {num_blocks_x}, num_blocks_y: {num_blocks_y}, output tiles is {Mt}x{Nt}"
@@ -180,39 +180,51 @@ def test_metal_matmul(M, K, N):
         for output_idx_x in range(num_blocks_x):
             node_x = current_blk % num_nodes_x
             node_y = current_blk // num_nodes_x
-            reader_rt_args[node_x][node_y] = [
-                a_tensor.buffer_address(),  # a_tensor_addr
-                Kt * per_node_M * output_idx_y,  # a_tensor_start_tile_id
-                1,  # a_tensor_stride_w
-                Kt,  # a_tensor_stride_h
-                K_block_size,  # a_tensor_next_block_stride
-                K_block_size,  # K_block_size
-                per_node_M,  # a_block_h
-                K_block_size * per_node_M,  # a_block_num_tiles
-                b_tensor.buffer_address(),  # b_tensor_addr
-                per_node_N * output_idx_x,  # b_tensor_start_tile_id
-                1,  # b_tensor_stride_w
-                Nt,  # b_tensor_stride_h
-                K_block_size * Nt,  # b_tensor_next_block_stride
-                per_node_N,  # b_block_w
-                K_block_size,  # b_block_h
-                per_node_N * K_block_size,  # b_block_num_tiles
-                Kt // K_block_size,  # num_blocks
-            ]
-            writer_rt_args[node_x][node_y] = [
-                output_tensor.buffer_address(),  # out_buffer_addr
-                (output_idx_x * per_node_N)
-                + (output_idx_y * per_node_M * Nt),  # out_tensor_start_tile_id
-                1,  # out_tensor_stride_w
-                Nt,  # out_tensor_stride_h
-                out_subblock_w,  # out_tensor_next_subblock_stride_w
-                out_subblock_h * Nt,  # out_tensor_next_subblock_stride_h
-                out_subblock_w,  # out_subblock_w
-                out_subblock_h,  # out_subblock_h
-                out_subblock_w * out_subblock_h,  # out_subblocks_w * out_subblocks_h
-                per_node_N // out_subblock_w,  # out_num_subblocks_w
-                per_node_M // out_subblock_h,  # out_num_subblocks_h
-            ]
+            core = ttnn.CoreCoord(node_x, node_y)
+            reader_rt_args.append(
+                (
+                    core,
+                    [
+                        a_tensor.buffer_address(),  # a_tensor_addr
+                        Kt * per_node_M * output_idx_y,  # a_tensor_start_tile_id
+                        1,  # a_tensor_stride_w
+                        Kt,  # a_tensor_stride_h
+                        K_block_size,  # a_tensor_next_block_stride
+                        K_block_size,  # K_block_size
+                        per_node_M,  # a_block_h
+                        K_block_size * per_node_M,  # a_block_num_tiles
+                        b_tensor.buffer_address(),  # b_tensor_addr
+                        per_node_N * output_idx_x,  # b_tensor_start_tile_id
+                        1,  # b_tensor_stride_w
+                        Nt,  # b_tensor_stride_h
+                        K_block_size * Nt,  # b_tensor_next_block_stride
+                        per_node_N,  # b_block_w
+                        K_block_size,  # b_block_h
+                        per_node_N * K_block_size,  # b_block_num_tiles
+                        Kt // K_block_size,  # num_blocks
+                    ],
+                )
+            )
+            writer_rt_args.append(
+                (
+                    core,
+                    [
+                        output_tensor.buffer_address(),  # out_buffer_addr
+                        (output_idx_x * per_node_N)
+                        + (output_idx_y * per_node_M * Nt),  # out_tensor_start_tile_id
+                        1,  # out_tensor_stride_w
+                        Nt,  # out_tensor_stride_h
+                        out_subblock_w,  # out_tensor_next_subblock_stride_w
+                        out_subblock_h * Nt,  # out_tensor_next_subblock_stride_h
+                        out_subblock_w,  # out_subblock_w
+                        out_subblock_h,  # out_subblock_h
+                        out_subblock_w
+                        * out_subblock_h,  # out_subblocks_w * out_subblocks_h
+                        per_node_N // out_subblock_w,  # out_num_subblocks_w
+                        per_node_M // out_subblock_h,  # out_num_subblocks_h
+                    ],
+                )
+            )
             print(
                 f"node {node_x},{node_y} assigned start out block slice [{output_idx_x * per_node_N}:{(output_idx_x + 1) * per_node_N}]x[{output_idx_y * per_node_M}:{(output_idx_y + 1) * per_node_M}]"
             )
@@ -224,7 +236,7 @@ def test_metal_matmul(M, K, N):
     reader_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/multinode_reuse_matmul/metal/kernels/reader_bmm_tile_layout.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=reader_compile_time_args,
         runtime_args=reader_rt_args,
         config=ttnn.ReaderConfigDescriptor(),
@@ -232,7 +244,7 @@ def test_metal_matmul(M, K, N):
     writer_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/multinode_reuse_matmul/metal/kernels/writer_bmm_tile_layout.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=writer_compile_time_args,
         runtime_args=writer_rt_args,
         config=ttnn.WriterConfigDescriptor(),
@@ -240,7 +252,7 @@ def test_metal_matmul(M, K, N):
     compute_kernel_descriptor = ttnn.KernelDescriptor(
         kernel_source="examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=all_nodes,
+        core_ranges=all_nodes,
         compile_time_args=compute_compile_time_args,
         runtime_args=compute_rt_args,
         config=computeConfig,
diff --git a/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py b/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py
index 3e51ed98d..60f230c18 100644
--- a/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py
+++ b/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py
@@ -10,7 +10,7 @@
 from utils.correctness import assert_with_ulp
 
 
-@ttl.operation(grid=(13, 10))
+@ttl.operation(grid=("auto"))
 def tt_lang_multinode_reuse_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor):
     assert a.shape[1] == b.shape[0], "Incompatible matrix shapes for multiplication."
     assert a.shape[0] == out.shape[0], "Output matrix has incorrect number of rows."
@@ -123,6 +123,7 @@ def test_multinode_reuse_matmul_tt_lang(M, K, N):
     )
     result = ttnn.to_torch(c).to(torch.bfloat16)
     assert_with_ulp(golden, result)
+    print("Test passed!")
 
     ttnn.close_device(device)
 
diff --git a/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp b/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp
index 985b074f2..6d5b82501 100644
--- a/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp
+++ b/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp
@@ -2,8 +2,8 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include "compute_kernel_api/matmul.h"
-#include "compute_kernel_api/tile_move_copy.h"
+#include "api/compute/matmul.h"
+#include "api/compute/tile_move_copy.h"
 #include "hostdevcommon/kernel_structs.h"
 #include <cstdint>
 
diff --git a/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py b/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py
index 626f54a04..49fe95298 100644
--- a/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py
+++ b/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py
@@ -63,22 +63,22 @@ def test_singlenode_matmul_metal(M, K, N):
 
     # single node grid
     node = ttnn.CoreCoord(0, 0)
-    node_grid = ttnn.NodeRangeSet([ttnn.NodeRange(node, node)])
+    node_grid = ttnn.CoreRangeSet([ttnn.CoreRange(node, node)])
     dfb_block_count = 2
     cb_total_size = dfb_block_count * cb_page_size
     a_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_total_size,
-        node_ranges=node_grid,
+        core_ranges=node_grid,
         format_descriptors=[a_cb_format],
     )
     b_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_total_size,
-        node_ranges=node_grid,
+        core_ranges=node_grid,
         format_descriptors=[b_cb_format],
     )
     out_cb_descriptor = ttnn.CBDescriptor(
         total_size=cb_total_size,
-        node_ranges=node_grid,
+        core_ranges=node_grid,
         format_descriptors=[out_cb_format],
     )
 
@@ -100,27 +100,27 @@ def test_singlenode_matmul_metal(M, K, N):
     computeConfig.math_approx_mode = False
 
     reader_kernel_descriptor = ttnn.KernelDescriptor(
-        kernel_source="examples/metal_examples/singlenode_matmul/metal/kernels/mm_reader.cpp",
+        kernel_source="examples/metal_examples/single_node_matmul/metal/kernels/mm_reader.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=node_grid,
+        core_ranges=node_grid,
         compile_time_args=reader_compile_time_args,
-        runtime_args=[[reader_rt_args]],
+        runtime_args=[(node, reader_rt_args)],
         config=ttnn.ReaderConfigDescriptor(),
     )
     writer_kernel_descriptor = ttnn.KernelDescriptor(
-        kernel_source="examples/metal_examples/singlenode_matmul/metal/kernels/mm_writer.cpp",
+        kernel_source="examples/metal_examples/single_node_matmul/metal/kernels/mm_writer.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=node_grid,
+        core_ranges=node_grid,
         compile_time_args=writer_compile_time_args,
-        runtime_args=[[writer_rt_args]],
+        runtime_args=[(node, writer_rt_args)],
         config=ttnn.WriterConfigDescriptor(),
     )
     compute_kernel_descriptor = ttnn.KernelDescriptor(
-        kernel_source="examples/metal_examples/singlenode_matmul/metal/kernels/mm_compute.cpp",
+        kernel_source="examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp",
         source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH,
-        node_ranges=node_grid,
+        core_ranges=node_grid,
         compile_time_args=compute_compile_time_args,
-        runtime_args=[[[]]],
+        runtime_args=[],
         config=computeConfig,
     )
 
diff --git a/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py b/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py
index f93dc64dd..fc022dd5f 100644
--- a/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py
+++ b/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py
@@ -1,9 +1,6 @@
 # SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
 #
 # SPDX-License-Identifier: Apache-2.0
-# up to tt-lang spec, not intended to compile or run currently
-import sys
-from pathlib import Path
 import ttnn
 import pytest
 import torch
@@ -76,6 +73,7 @@ def test_singlenode_matmul_tt_lang():
     )
     result = ttnn.to_torch(c).to(torch.bfloat16)
     assert_with_ulp(golden, result)
+    print("Test passed!")
 
     ttnn.close_device(device)
 
diff --git a/examples/tt_upsample.py b/examples/tt_upsample.py
index c1d785379..75e66d311 100644
--- a/examples/tt_upsample.py
+++ b/examples/tt_upsample.py
@@ -9,7 +9,7 @@
 import ttl
 
 from utils.correctness import assert_with_ulp
-from utils.block_allocation import split_work_to_nodes
+from utils.block_allocation import get_number_of_nodes_from_ranges, split_work_to_nodes
 
 
 @ttl.operation(grid=(8, 8))
@@ -33,12 +33,8 @@ def tt_lang_upsample_nearest_rowwise_interleaved(
         f"all_cores: {all_cores}, core_group_1: {core_group_1}, core_group_2: {core_group_2}, work_per_core1: {work_per_core1}, work_per_core2: {work_per_core2}"
     )
 
-    num_cores_group_1 = (
-        core_group_1[1][-1] - core_group_1[0][-1] + 1 if core_group_1 else 0
-    )
-    num_cores_group_2 = (
-        core_group_2[1][-1] - core_group_2[0][-1] + 1 if core_group_2 else 0
-    )
+    num_cores_group_1 = get_number_of_nodes_from_ranges(core_group_1)
+    num_cores_group_2 = get_number_of_nodes_from_ranges(core_group_2)
 
     def get_work_per_core(core_id):
         if core_id < num_cores_group_1:
@@ -129,8 +125,6 @@ def test_tt_lang_upsample_nearest_rowwise_interleaved(input_shape, scale_factor)
     )
 
     golden_tensor = ttnn.upsample(input_tensor, scale_factor)
-    print(f"golden_tensor: {golden_tensor}")
-    print(f"output_tensor: {output_tensor}")
 
     assert_with_ulp(output_tensor.to_torch(), golden_tensor.to_torch(), ulp_threshold=1)
     print("Test passed!")
diff --git a/python/utils/__init__.py b/python/utils/__init__.py
index ebb199bfa..ead3f17b7 100644
--- a/python/utils/__init__.py
+++ b/python/utils/__init__.py
@@ -4,12 +4,17 @@
 
 """Utility functions for tt-lang."""
 
-from .block_allocation import get_large_matmul_params, split_work_to_nodes
+from .block_allocation import (
+    get_large_matmul_params,
+    get_number_of_nodes_from_ranges,
+    split_work_to_nodes,
+)
 from .correctness import assert_allclose, assert_pcc, assert_with_ulp
 
 __all__ = [
     # block_allocation
     "split_work_to_nodes",
+    "get_number_of_nodes_from_ranges",
     "get_large_matmul_params",
     # correctness
     "assert_pcc",
diff --git a/python/utils/block_allocation.py b/python/utils/block_allocation.py
index 8729051fe..23ed81358 100644
--- a/python/utils/block_allocation.py
+++ b/python/utils/block_allocation.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import itertools
 import math
-from collections import namedtuple
 from typing import List, Tuple
 from collections import namedtuple
 
@@ -22,6 +21,19 @@ def get_number_of_nodes(grid: Tuple[int, ...]) -> int:
     return node_count
 
 
+def get_number_of_nodes_from_ranges(
+    ranges: List[Tuple[Tuple[int, ...], Tuple[int, ...]]],
+) -> int:
+    """Count the total number of nodes across a list of rectangular ranges."""
+    total = 0
+    for start, end in ranges:
+        count = 1
+        for s, e in zip(start, end):
+            count *= e - s + 1
+        total += count
+    return total
+
+
 def filter_factor_pairs_by_2d_grid(
     factor_pairs: list[Tuple[int, int]], grid: Tuple[int, int]
 ) -> list[Tuple[int, int]]:
@@ -153,13 +165,15 @@ def split_work_to_nodes(
     grid_size: Tuple[int, ...], units_to_divide: int, row_wise: bool = True
 ) -> Tuple[
     int,
-    Tuple[Tuple[int, ...], Tuple[int, ...]],
-    Tuple[Tuple[int, ...], Tuple[int, ...]],
+    List[Tuple[Tuple[int, ...], Tuple[int, ...]]],
+    List[Tuple[Tuple[int, ...], Tuple[int, ...]]],
     int,
     int,
 ]:
-    """Splits work units among nodes in a from a single device grid.
-    currently can produce work splits that cannot map to CoreRanges directly, particlarily in 1-d grids
+    """Splits work units among nodes from a single device grid.
+
+    Matches the semantics of ttnn.split_work_to_cores: each group is a list of
+    rectangular (start, end) coordinate ranges (like a CoreRangeSet).
 
     Args:
         grid_size: A tuple representing the dimensions of the node grid.
@@ -167,14 +181,14 @@ def split_work_to_nodes(
         row_wise: If True, split work in a row-wise manner; otherwise, column-wise.
 
     Returns: A tuple containing:
-            - total number of nodes
-            - node group 1 as a tuple of tuples, start coord to end coord rectangle [inclusive, inclusive]
-            - node group 2 as a tuple of tuples, start coord to end coord rectangle [inclusive, inclusive]
+            - total number of nodes used
+            - node group 1 as a list of (start_coord, end_coord) ranges [inclusive]
+            - node group 2 as a list of (start_coord, end_coord) ranges [inclusive]
             - work units per node in group 1
             - work units per node in group 2
     """
     if units_to_divide == 0:
-        return (0, (), (), 0, 0)
+        return (0, [], [], 0, 0)
     simplified_grid_size = remove_leading_ones(grid_size)
     assert len(simplified_grid_size) <= 2, "only supports grids with a single device"
     total_nodes = get_number_of_nodes(grid_size)
@@ -185,14 +199,12 @@ def split_work_to_nodes(
     ):  # more nodes than work units, assign 1 unit to first N nodes
         if len(simplified_grid_size) == 1:
             end_coord = ((0,) * (len(grid_size) - 1)) + (units_to_divide - 1,)
+            return (units_to_divide, [(start_coord, end_coord)], [], 1, 0)
         elif len(simplified_grid_size) == 2:
             ranges = num_nodes_to_grid_ranges(
                 start_coord, units_to_divide, grid_size, row_wise
             )
-            end_coord = ((0,) * (len(grid_size) - 2)) + ranges[-1][
-                1
-            ]  # Last range's end coordinate
-        return (units_to_divide, (start_coord, end_coord), (), 1, 0)
+            return (units_to_divide, ranges, [], 1, 0)
     else:
         # more work units than nodes, divide work as evenly as possible
         if len(simplified_grid_size) == 1:
@@ -202,8 +214,8 @@ def split_work_to_nodes(
             if remaining_work == 0:
                 return (
                     total_nodes,
-                    ((0,) * len(grid_size), end_coord_all),
-                    (),
+                    [((0,) * len(grid_size), end_coord_all)],
+                    [],
                     work_per_node,
                     0,
                 )
@@ -211,8 +223,8 @@ def split_work_to_nodes(
             start_coord_2 = ((0,) * (len(grid_size) - 1)) + (remaining_work,)
             return (
                 total_nodes,
-                ((0,) * len(grid_size), end_coord_1),
-                (start_coord_2, end_coord_all),
+                [((0,) * len(grid_size), end_coord_1)],
+                [(start_coord_2, end_coord_all)],
                 work_per_node + 1,
                 work_per_node,
             )
@@ -234,7 +246,7 @@ def split_work_to_nodes(
                 num_nodes_y = grid_size[-2]
                 prefix = (0,) * (len(grid_size) - 2)
                 end_coord = prefix + (num_nodes_y - 1, num_nodes_x - 1)
-                return (total_nodes, (start_coord, end_coord), (), work_per_node, 0)
+                return (total_nodes, [(start_coord, end_coord)], [], work_per_node, 0)
 
             # Uneven division - need two groups
             else:
@@ -280,18 +292,10 @@ def split_work_to_nodes(
                     start_coord_group2, num_nodes_group2, grid_size, row_wise
                 )
 
-                # For simplified return, we'll return the bounding boxes
-                # Group 1: from (0,0,...) to last coord of group 1
-                group1_bbox = (start_coord, last_coord_group1)
-
-                # Group 2: from start to last coord of group 2
-                last_coord_group2 = group2_ranges[-1][1]
-                group2_bbox = (start_coord_group2, last_coord_group2)
-
                 return (
                     total_nodes,
-                    group1_bbox,
-                    group2_bbox,
+                    group1_ranges,
+                    group2_ranges,
                     work_per_node + 1,
                     work_per_node,
                 )
diff --git a/test/python/test_block_allocation.py b/test/python/test_block_allocation.py
index c74f1b01c..8f472acd7 100644
--- a/test/python/test_block_allocation.py
+++ b/test/python/test_block_allocation.py
@@ -14,6 +14,7 @@
 
 from ttl.utils.block_allocation import (
     get_large_matmul_params,
+    get_number_of_nodes_from_ranges,
     split_work_to_nodes,
 )
 
@@ -29,6 +30,32 @@ def extract_coords_from_ttnn_corerangeset(core_range_set):
     return coords
 
 
+@pytest.mark.parametrize(
+    "ranges,expected",
+    [
+        # Empty range list
+        ([], 0),
+        # Single 1D range
+        ([((0,), (4,))], 5),
+        # Single 2D range: full rectangle
+        ([((0, 0), (3, 7))], 32),
+        # Single point
+        ([((2, 3), (2, 3))], 1),
+        # Multiple 2D ranges: L-shape (4 full rows + partial row)
+        ([((0, 0), (3, 7)), ((4, 0), (4, 3))], 36),
+        # Multiple 1D ranges (disjoint)
+        ([((0,), (2,)), ((4,), (6,))], 6),
+        # Multiple 2D ranges: partial row + full rows + partial row
+        ([((0, 4), (0, 7)), ((1, 0), (2, 7)), ((3, 0), (3, 2))], 4 + 16 + 3),
+        # 3D range with leading dimension
+        ([((0, 0, 0), (0, 2, 4))], 15),
+    ],
+)
+def test_get_number_of_nodes_from_ranges(ranges, expected):
+    """Test get_number_of_nodes_from_ranges with known inputs and expected counts."""
+    assert get_number_of_nodes_from_ranges(ranges) == expected
+
+
 @pytest.mark.parametrize(
     "grid_size_tuple,units,row_wise",
     [
@@ -36,6 +63,7 @@ def extract_coords_from_ttnn_corerangeset(core_range_set):
         ((8, 8), 100, True),
         ((8, 8), 100, False),
         ((8, 8), 65, True),
+        ((8, 8), 65, False),
         ((8, 8), 129, True),
         # Test even distribution
         ((8, 8), 64, True),
@@ -43,6 +71,7 @@ def extract_coords_from_ttnn_corerangeset(core_range_set):
         # Test with different grid sizes
         ((4, 8), 50, True),
         ((7, 9), 100, False),
+        ((7, 9), 100, True),
         # Test fewer units than cores
         ((8, 8), 10, True),
         ((8, 8), 20, False),
@@ -50,16 +79,21 @@ def extract_coords_from_ttnn_corerangeset(core_range_set):
         # Test edge cases
         ((8, 8), 63, True),
         ((8, 8), 127, True),
+        # 2D grids that force multiple CoreRanges per group (L-shapes)
+        ((13, 10), 200, True),
+        ((13, 10), 200, False),
+        ((5, 7), 50, True),
+        ((3, 12), 40, False),
+        # Small grids with multi-range groups
+        ((2, 3), 10, True),
+        ((3, 2), 8, False),
     ],
 )
 def test_split_work_to_nodes(grid_size_tuple, units, row_wise):
     """Compare results from split_work_to_nodes and ttnn.split_work_to_cores"""
-    # Call new function
     new_result = split_work_to_nodes(grid_size_tuple, units, row_wise)
     new_total, new_g1, new_g2, new_w1, new_w2 = new_result
 
-    # Call ttnn function
-    # Create CoreRangeSet from grid_size_tuple
     num_cores_x = grid_size_tuple[-1]
     num_cores_y = grid_size_tuple[-2]
     ttnn_grid = ttnn.CoreRangeSet(
@@ -73,15 +107,14 @@ def test_split_work_to_nodes(grid_size_tuple, units, row_wise):
     ttnn_result = ttnn.split_work_to_cores(ttnn_grid, units, row_wise)
     ttnn_total, ttnn_all, ttnn_g1, ttnn_g2, ttnn_w1, ttnn_w2 = ttnn_result
 
-    # Extract coordinates from ttnn function
     ttnn_g1_coords = extract_coords_from_ttnn_corerangeset(ttnn_g1)
     ttnn_g2_coords = extract_coords_from_ttnn_corerangeset(ttnn_g2)
 
-    # Verify work distribution matches
     assert new_w1 == ttnn_w1, f"Work per core G1 mismatch: {new_w1} vs {ttnn_w1}"
     assert new_w2 == ttnn_w2, f"Work per core G2 mismatch: {new_w2} vs {ttnn_w2}"
 
-    # Calculate total cores in each group from ttnn
+    new_g1_num_cores = get_number_of_nodes_from_ranges(new_g1)
+    new_g2_num_cores = get_number_of_nodes_from_ranges(new_g2)
     ttnn_g1_num_cores = sum(
         (end[1] - start[1] + 1) * (end[0] - start[0] + 1)
         for start, end in ttnn_g1_coords
@@ -91,36 +124,37 @@ def test_split_work_to_nodes(grid_size_tuple, units, row_wise):
         for start, end in ttnn_g2_coords
     )
 
-    # Verify total work matches
-    new_total_work = ttnn_g1_num_cores * new_w1 + ttnn_g2_num_cores * new_w2
-    ttnn_total_work = ttnn_g1_num_cores * ttnn_w1 + ttnn_g2_num_cores * ttnn_w2
     assert (
-        new_total_work == ttnn_total_work == units
-    ), f"Total work mismatch: {new_total_work} vs {ttnn_total_work} vs {units}"
-
-    # Verify group 1 coordinates
-    if new_g1 and ttnn_g1_coords:
-        new_g1_start, new_g1_end = new_g1
-        ttnn_g1_first_start = ttnn_g1_coords[0][0]
-        ttnn_g1_last_end = ttnn_g1_coords[-1][1]
+        new_g1_num_cores == ttnn_g1_num_cores
+    ), f"Group 1 core count mismatch: {new_g1_num_cores} vs {ttnn_g1_num_cores}"
+    assert (
+        new_g2_num_cores == ttnn_g2_num_cores
+    ), f"Group 2 core count mismatch: {new_g2_num_cores} vs {ttnn_g2_num_cores}"
+
+    new_total_work = new_g1_num_cores * new_w1 + new_g2_num_cores * new_w2
+    assert new_total_work == units, f"Total work mismatch: {new_total_work} vs {units}"
+
+    assert len(new_g1) == len(
+        ttnn_g1_coords
+    ), f"Group 1 range count mismatch: {len(new_g1)} vs {len(ttnn_g1_coords)}"
+    for i, (new_range, ttnn_range) in enumerate(zip(new_g1, ttnn_g1_coords)):
+        assert (
+            new_range[0] == ttnn_range[0]
+        ), f"G1 range {i} start mismatch: {new_range[0]} vs {ttnn_range[0]}"
+        assert (
+            new_range[1] == ttnn_range[1]
+        ), f"G1 range {i} end mismatch: {new_range[1]} vs {ttnn_range[1]}"
+
+    assert len(new_g2) == len(
+        ttnn_g2_coords
+    ), f"Group 2 range count mismatch: {len(new_g2)} vs {len(ttnn_g2_coords)}"
+    for i, (new_range, ttnn_range) in enumerate(zip(new_g2, ttnn_g2_coords)):
         assert (
-            new_g1_start == ttnn_g1_first_start and new_g1_end == ttnn_g1_last_end
-        ), f"Group 1 coordinates mismatch: new {new_g1_start} -> {new_g1_end}, ttnn {ttnn_g1_first_start} -> {ttnn_g1_last_end}"
-
-    # Verify group 2 coordinates
-    if new_g2 and ttnn_g2_coords:
-        new_g2_start, new_g2_end = new_g2
-        ttnn_g2_first_start = ttnn_g2_coords[0][0]
-        ttnn_g2_last_end = ttnn_g2_coords[-1][1]
+            new_range[0] == ttnn_range[0]
+        ), f"G2 range {i} start mismatch: {new_range[0]} vs {ttnn_range[0]}"
         assert (
-            new_g2_start == ttnn_g2_first_start and new_g2_end == ttnn_g2_last_end
-        ), f"Group 2 coordinates mismatch: new {new_g2_start} -> {new_g2_end}, ttnn {ttnn_g2_first_start} -> {ttnn_g2_last_end}"
-
-    # Check empty groups match
-    if not new_g1:
-        assert not ttnn_g1_coords, "Group 1 empty mismatch"
-    if not new_g2:
-        assert not ttnn_g2_coords, "Group 2 empty mismatch"
+            new_range[1] == ttnn_range[1]
+        ), f"G2 range {i} end mismatch: {new_range[1]} vs {ttnn_range[1]}"
 
 
 @pytest.mark.parametrize(

From fe2bb68d5dcb53c6c3aadb25e7ec5ee4a8d72ba8 Mon Sep 17 00:00:00 2001
From: Alex Richins <arichins@tenstorrent.com>
Date: Fri, 10 Apr 2026 12:36:17 -0700
Subject: [PATCH 09/31] consistent missing device error (#481)

Problem description
When users pass host tensors (e.g., after ttnn.from_device()) to a
@ttl.operation, they get an opaque AttributeError: 'NoneType' object has
no attribute 'compute_with_storage_grid_size' with no indication that
the tensor needs to be on a device. This happens at two call sites:

- _resolve_grid when grid='auto' -- tries to query the device compute grid
  from a None device
- CompiledTTNNKernel.__call__ -- validates the kernel grid against the
  device compute grid on a None device

Closes #389

What's changed
Added a shared _require_device(args) helper that scans tensor arguments
for an on-device tensor and returns the device. When all tensors are on
host, it raises a ValueError listing the host tensor shapes and showing
how to fix it:
```
ValueError: No device found on any tensor argument. All ttnn tensor inputs are on host:
  arg[0]: Shape([32, 32])
Place tensors on device before calling the operation, e.g.:
  ttnn.to_device(tensor, device)
  ttnn.from_torch(tensor, ..., device=device)
```

Both crash sites (_resolve_grid and CompiledTTNNKernel.__call__) now use
this helper instead of calling .device() without a None check. The five
other .device() call sites in the file already had proper None guards
and were not changed.

The simulator path (python/sim/) is unaffected.

Checklist
- [x] New/Existing tests provide coverage for changes
---
 python/ttl/ttl_api.py              |  41 ++++++---
 test/python/test_missing_device.py | 140 +++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+), 10 deletions(-)
 create mode 100644 test/python/test_missing_device.py

diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index 9cf005fa0..f15be2952 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -404,20 +404,41 @@ def _has_float32_args(args) -> bool:
     return False
 
 
+def _require_device(args):
+    """Extract the device from tensor arguments, raising if none are on-device.
+
+    Returns the first non-None device found. Raises ValueError with
+    a message listing which arguments are host tensors and suggesting
+    ttnn.to_device().
+    """
+    for i, arg in enumerate(args):
+        if is_ttnn_tensor(arg):
+            device = arg.device()
+            if device is not None:
+                return device
+    host_args = [
+        f"  arg[{i}]: {arg.shape}" for i, arg in enumerate(args) if is_ttnn_tensor(arg)
+    ]
+    if not host_args:
+        raise ValueError("No device found: no ttnn tensor arguments were provided.")
+    raise ValueError(
+        "No device found on any tensor argument. "
+        "All ttnn tensor inputs are on host:\n"
+        + "\n".join(host_args)
+        + "\nPlace tensors on device before calling the operation, e.g.:\n"
+        "  ttnn.to_device(tensor, device)\n"
+        "  ttnn.from_torch(tensor, ..., device=device)"
+    )
+
+
 def _resolve_grid(grid, args, kwargs):
     """Resolve grid, evaluating callable or 'auto' if needed."""
     if callable(grid):
         return grid(*args, **kwargs)
     if grid == "auto":
-        for arg in args:
-            if is_ttnn_tensor(arg) and hasattr(arg, "device"):
-                device = arg.device()
-                device_grid = device.compute_with_storage_grid_size()
-                return (device_grid.x, device_grid.y)
-        raise ValueError(
-            "grid='auto' requires at least one ttnn tensor argument "
-            "to determine device compute grid"
-        )
+        device = _require_device(args)
+        device_grid = device.compute_with_storage_grid_size()
+        return (device_grid.x, device_grid.y)
     return grid
 
 
@@ -529,7 +550,7 @@ def __call__(self, *args):
             raise ValueError(f"Expected {self.num_tensors} tensors, got {len(args)}")
 
         # Validate grid against device's compute grid.
-        device = args[0].device()
+        device = _require_device(args)
         device_grid = device.compute_with_storage_grid_size()
         kernel_grid = self.core_ranges.bounding_box().grid_size()
         if kernel_grid.x > device_grid.x or kernel_grid.y > device_grid.y:
diff --git a/test/python/test_missing_device.py b/test/python/test_missing_device.py
new file mode 100644
index 000000000..a7d772ecf
--- /dev/null
+++ b/test/python/test_missing_device.py
@@ -0,0 +1,140 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Tests for missing device error handling.
+
+Verifies that meaningful error messages are produced when operations
+receive host tensors instead of device tensors.
+"""
+
+import pytest
+import torch
+import ttl
+
+ttnn = pytest.importorskip("ttnn", exc_type=ImportError)
+
+
+@ttl.operation(grid="auto")
+def nop_auto_grid(a):
+    @ttl.compute()
+    def compute_nop():
+        pass
+
+    @ttl.datamovement()
+    def dm_nop1():
+        pass
+
+    @ttl.datamovement()
+    def dm_nop2():
+        pass
+
+
+@ttl.operation(grid="auto")
+def nop_auto_grid_2(a, b):
+    @ttl.compute()
+    def compute_nop():
+        pass
+
+    @ttl.datamovement()
+    def dm_nop1():
+        pass
+
+    @ttl.datamovement()
+    def dm_nop2():
+        pass
+
+
+@ttl.operation(grid=(1, 1))
+def nop_fixed_grid(a):
+    @ttl.compute()
+    def compute_nop():
+        pass
+
+    @ttl.datamovement()
+    def dm_nop1():
+        pass
+
+    @ttl.datamovement()
+    def dm_nop2():
+        pass
+
+
+def test_auto_grid_host_tensor():
+    """
+    grid='auto' with a host tensor should produce a clear error, not an AttributeError on NoneType.
+    """
+    a_host = ttnn.from_torch(
+        torch.zeros(32, 32, dtype=torch.bfloat16),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+    )
+
+    with pytest.raises(ValueError, match="No device found"):
+        nop_auto_grid(a_host)
+
+
+def test_fixed_grid_host_tensor():
+    """
+    grid=(1,1) with a host tensor should produce a clear error, not an AttributeError on NoneType.
+    """
+    a_host = ttnn.from_torch(
+        torch.zeros(32, 32, dtype=torch.bfloat16),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+    )
+
+    with pytest.raises(ValueError, match="No device found"):
+        nop_fixed_grid(a_host)
+
+
+def test_auto_grid_no_ttnn_tensors():
+    """
+    grid='auto' with no ttnn tensors should report that none were provided.
+    The fixed-grid path hits _require_device at __call__ time (post-compile),
+    so it can't be reached without a valid ttnn tensor to compile against.
+    """
+    with pytest.raises(ValueError, match="no ttnn tensor arguments were provided"):
+        nop_auto_grid(torch.zeros(32, 32, dtype=torch.bfloat16))
+
+
+def test_auto_grid_multiple_host_tensors():
+    """Error message should list all host tensor arguments."""
+    a_host = ttnn.from_torch(
+        torch.zeros(32, 32, dtype=torch.bfloat16),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+    )
+    b_host = ttnn.from_torch(
+        torch.zeros(64, 64, dtype=torch.bfloat16),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+    )
+
+    with pytest.raises(
+        ValueError, match="All ttnn tensor inputs are on host"
+    ) as exc_info:
+        nop_auto_grid_2(a_host, b_host)
+    msg = str(exc_info.value)
+    assert "arg[0]" in msg
+    assert "arg[1]" in msg
+
+
+def test_auto_grid_mixed_host_and_device(device):
+    """_require_device succeeds when at least one tensor is on-device."""
+    from ttl.ttl_api import _require_device
+
+    a_host = ttnn.from_torch(
+        torch.zeros(32, 32, dtype=torch.bfloat16),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+    )
+    b_device = ttnn.from_torch(
+        torch.zeros(32, 32, dtype=torch.bfloat16),
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+    )
+
+    assert _require_device((a_host, b_device)) is not None

From 0350205f59de59c2ffd6889aa2041be9df968d76 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 10 Apr 2026 13:03:46 -0700
Subject: [PATCH 10/31] generalize the packer L1 accumulation guard placement

---
 .../TTKernelInsertL1Accumulation.cpp          | 129 ++++++++++++------
 1 file changed, 85 insertions(+), 44 deletions(-)

diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 646d1be64..803f034ba 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 //
 // Inserts pack_reconfig_l1_acc guards inside reduction loops. When a
-// tile_regs_acquire is inside a reduction loop, the packer must switch
-// to L1 accumulation mode from the second iteration onwards so that
-// pack_tile adds to the existing L1 value instead of overwriting.
+// tile_regs_release is inside a reduction loop, the packer is switched
+// to L1 accumulation mode once after the first iteration's pack so that
+// subsequent iterations add to the existing L1 value instead of
+// overwriting. The L1 acc state persists across tile_regs boundaries.
 //
 // See docs/development/AccumulatingComputeLowering.md for design details.
 //
@@ -95,60 +96,100 @@ struct TTKernelInsertL1AccumulationPass
       }
     });
 
-    // Insert pack_reconfig_l1_acc matching the tt-metal minimal_matmul
-    // pattern: enable at the END of the first K iteration (after all
-    // DstSections complete), disable after the loop. The enable guard
-    // uses `if (k == lb)` so it fires once when the first iteration
-    // finishes, and L1 acc stays enabled for all subsequent iterations.
+    // L1 accumulation guard placement. For any loop that
+    // accumulates in L1 (matmul K loop or reduce loop), the pattern is:
+    //
+    //   pack_reconfig_l1_acc(0)                // disable before loop
+    //   for (iv = lb; ...) {
+    //     [subblock 0: acquire...pack...release]
+    //     [subblock N: acquire...pack...release]
+    //     if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first
+    //                                           // iteration's last pack
+    //   }
+    //   [cb_push_back if present]
+    //   pack_reconfig_l1_acc(0)                // disable after loop
+    //
+    // The L1 acc state persists across tile_regs boundaries, so the enable
+    // call only needs to happen once (after the first iteration completes
+    // all subblock packs). Disable guards are inserted once per outermost
+    // loop.
+
+    // Find the top-level operation in each L1 acc loop body that contains
+    // the last tile_regs_release. The release may be nested inside subblock
+    // loops, so we find the enclosing top-level op to insert after.
+    auto findTopLevelAncestor = [](Operation *op, Block *loopBody)
+        -> Operation * {
+      while (op && op->getBlock() != loopBody) {
+        op = op->getParentOp();
+      }
+      return op;
+    };
+
+    llvm::SmallDenseMap<Operation *, Operation *> enablePointPerLoop;
+    for (auto loop : l1AccLoops) {
+      Operation *lastTopLevel = nullptr;
+      loop->walk([&](ttk::TileRegsReleaseOp releaseOp) {
+        Operation *topLevel =
+            findTopLevelAncestor(releaseOp, loop.getBody());
+        if (topLevel) {
+          lastTopLevel = topLevel;
+        }
+      });
+      if (lastTopLevel) {
+        enablePointPerLoop[loop.getOperation()] = lastTopLevel;
+      }
+    }
+
     llvm::SmallDenseSet<Operation *> disabledLoops;
-    for (scf::ForOp loop : l1AccLoops) {
+    for (auto loop : l1AccLoops) {
+      auto iter = enablePointPerLoop.find(loop.getOperation());
+      if (iter == enablePointPerLoop.end()) {
+        continue;
+      }
+      Operation *enablePoint = iter->second;
       OpBuilder builder(loop->getContext());
-      Location loc = loop.getLoc();
-
-      // Disable L1 acc before the loop to ensure clean state.
-      builder.setInsertionPoint(loop);
-      Value disablePre = arith::ConstantOp::create(
-          builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
-      ttk::PackReconfigL1AccOp::create(builder, loc, disablePre);
-
-      // Enable at end of first iteration, matching tt-metal:
-      //   if (k_block == 0) { PACK((llk_pack_reconfig_l1_acc(1))); }
-      Operation *yield = loop.getBody()->getTerminator();
-      builder.setInsertionPoint(yield);
+      Location loc = enablePoint->getLoc();
+
+      // Conditional enable after the last subblock/release on the first
+      // iteration. Placed after the top-level op containing the last
+      // release so all subblock packs in iteration 0 write without
+      // accumulation.
+      builder.setInsertionPointAfter(enablePoint);
       Value loopIV = loop.getInductionVar();
       Value loopLB = loop.getLowerBound();
-      Value isFirstIter = arith::CmpIOp::create(
+      Value firstIter = arith::CmpIOp::create(
           builder, loc, arith::CmpIPredicate::eq, loopIV, loopLB);
-      auto ifOp = scf::IfOp::create(builder, loc, isFirstIter);
+      auto ifOp = scf::IfOp::create(builder, loc, firstIter);
       builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
       Value enableFlag = arith::ConstantOp::create(
           builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
       ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
 
-      // Disable after each L1 acc loop to prevent L1 acc state from
-      // leaking into outer loops or subsequent code.
-      if (disabledLoops.insert(loop.getOperation()).second) {
-        // For the outermost loop, place disable after cb_push_back.
-        // For inner loops, place directly after the loop.
-        auto outermostLoop = findOutermostL1AccLoop(loop);
-        bool isOutermost = !outermostLoop || outermostLoop == loop;
-        if (isOutermost) {
-          // Scan forward for cb_push_back.
-          Operation *insertPoint = loop->getNextNode();
-          while (insertPoint && !isa<ttk::CBPushBackOp>(insertPoint)) {
-            insertPoint = insertPoint->getNextNode();
-          }
-          if (insertPoint) {
-            builder.setInsertionPointAfter(insertPoint);
-          } else {
-            builder.setInsertionPointAfter(loop);
-          }
+      // Disable before and after the outermost L1 acc loop (once per loop).
+      auto outermostLoop = findOutermostL1AccLoop(loop);
+      if (!outermostLoop) {
+        outermostLoop = loop;
+      }
+      if (disabledLoops.insert(outermostLoop.getOperation()).second) {
+        // Disable before the loop.
+        builder.setInsertionPoint(outermostLoop);
+        Value disablePre = arith::ConstantOp::create(
+            builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
+        ttk::PackReconfigL1AccOp::create(builder, loc, disablePre);
+
+        // Disable after cb_push_back following the loop, or after the loop.
+        Operation *insertPoint = outermostLoop->getNextNode();
+        while (insertPoint && !isa<ttk::CBPushBackOp>(insertPoint)) {
+          insertPoint = insertPoint->getNextNode();
+        }
+        if (insertPoint) {
+          builder.setInsertionPointAfter(insertPoint);
         } else {
-          builder.setInsertionPointAfter(loop);
+          builder.setInsertionPointAfter(outermostLoop);
         }
-        Value disableFlag = arith::ConstantOp::create(
+        Value disablePost = arith::ConstantOp::create(
             builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
-        ttk::PackReconfigL1AccOp::create(builder, loc, disableFlag);
+        ttk::PackReconfigL1AccOp::create(builder, loc, disablePost);
       }
     }
   }

From f919130f7651edc9addc8e6236c0d29cbf816e0b Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 10 Apr 2026 13:04:40 -0700
Subject: [PATCH 11/31] add tests

---
 .../TTKernelInsertL1Accumulation.cpp          |   7 +-
 test/python/matmul_l1_acc_multinode.py        | 184 ++++++++++++++++++
 test/python/test_matmul_l1_acc_multinode.py   | 155 +++++++++++++++
 .../Transforms/insert_l1_accumulation.mlir    | 150 ++++++++++++++
 4 files changed, 492 insertions(+), 4 deletions(-)
 create mode 100644 test/python/matmul_l1_acc_multinode.py
 create mode 100644 test/python/test_matmul_l1_acc_multinode.py
 create mode 100644 test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir

diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 803f034ba..11a2985ea 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -117,8 +117,8 @@ struct TTKernelInsertL1AccumulationPass
     // Find the top-level operation in each L1 acc loop body that contains
     // the last tile_regs_release. The release may be nested inside subblock
     // loops, so we find the enclosing top-level op to insert after.
-    auto findTopLevelAncestor = [](Operation *op, Block *loopBody)
-        -> Operation * {
+    auto findTopLevelAncestor = [](Operation *op,
+                                   Block *loopBody) -> Operation * {
       while (op && op->getBlock() != loopBody) {
         op = op->getParentOp();
       }
@@ -129,8 +129,7 @@ struct TTKernelInsertL1AccumulationPass
     for (auto loop : l1AccLoops) {
       Operation *lastTopLevel = nullptr;
       loop->walk([&](ttk::TileRegsReleaseOp releaseOp) {
-        Operation *topLevel =
-            findTopLevelAncestor(releaseOp, loop.getBody());
+        Operation *topLevel = findTopLevelAncestor(releaseOp, loop.getBody());
         if (topLevel) {
           lastTopLevel = topLevel;
         }
diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py
new file mode 100644
index 000000000..f2bbdd968
--- /dev/null
+++ b/test/python/matmul_l1_acc_multinode.py
@@ -0,0 +1,184 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# REQUIRES: tt-device
+# RUN: env TTLANG_INITIAL_MLIR=%t.initial.mlir %python %s > %t.output 2>&1
+# RUN: FileCheck %s --check-prefix=CHECK-CPP < %t.output
+# RUN: FileCheck %s --check-prefix=CHECK-RESULT < %t.output
+
+"""
+Multinode matmul with L1 packer accumulation. Mirrors the benchmark kernel
+(make_matmul_l1_acc / v4_l1_acc): auto grid, split DMA (reader=A,
+writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 1024x1024x1024.
+
+The compute thread uses the "reserve once, store K times, push once" pattern.
+The compiler detects the K reduction loop and inserts pack_reconfig_l1_acc
+guards so each K iteration packs additively to L1.
+
+Verifies the L1 packer accumulation pattern in generated C++: disable before
+K loop, conditional enable after first iteration, disable after cb_push_back.
+"""
+
+import ttl
+
+try:
+    import ttnn
+except ImportError:
+    print("TTNN not available - exiting")
+    raise SystemExit(0)  # exit() is a site-module helper meant for the REPL
+
+import torch
+
+TILE = 32  # tile edge length in elements (tensor dims are divided by this)
+M_BLOCK = 8  # block extent along M, in tiles
+K_BLOCK = 8  # block extent along K, in tiles
+N_BLOCK = 8  # block extent along N, in tiles
+
+
+@ttl.operation(grid="auto")
+def matmul_l1_acc(a, b, out):  # blocked out = a @ b; K partials accumulate in out_dfb
+    Mt = a.shape[0] // TILE  # M, K, N extents in tiles
+    Kt = a.shape[1] // TILE
+    Nt = b.shape[1] // TILE
+
+    K_num_blocks = Kt // K_BLOCK  # floor division; remainder tiles are ignored
+    M_num_blocks = Mt // M_BLOCK
+    N_num_blocks = Nt // N_BLOCK
+
+    grid_n, grid_m = ttl.grid_size(dims=2)
+    m_blocks_per_node = -(-M_num_blocks // grid_m)  # ceiling division
+    n_blocks_per_node = -(-N_num_blocks // grid_n)  # ceiling division
+
+    a_dfb = ttl.make_dataflow_buffer_like(a, shape=(M_BLOCK, K_BLOCK), block_count=2)
+    b_dfb = ttl.make_dataflow_buffer_like(b, shape=(K_BLOCK, N_BLOCK), block_count=2)
+    out_dfb = ttl.make_dataflow_buffer_like(
+        out, shape=(M_BLOCK, N_BLOCK), block_count=2
+    )
+
+    @ttl.compute()
+    def compute():  # one output block per (m_block, n_block) owned by this node
+        node_n, node_m = ttl.node(dims=2)
+        for local_m in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m
+            if m_block < M_num_blocks:  # the ceiling split can overshoot
+                for local_n in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n
+                    if n_block < N_num_blocks:  # the ceiling split can overshoot
+                        out_blk = out_dfb.reserve()  # reserve once per output block
+                        for _ in range(K_num_blocks):
+                            a_blk = a_dfb.wait()
+                            b_blk = b_dfb.wait()
+                            out_blk.store(a_blk @ b_blk)  # same slot every K step
+                            a_blk.pop()
+                            b_blk.pop()
+                        out_blk.push()  # push once, after the K loop
+
+    @ttl.datamovement()
+    def reader():  # feeds a_dfb with A blocks, in the order compute consumes them
+        node_n, node_m = ttl.node(dims=2)
+        for local_m in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m
+            if m_block < M_num_blocks:
+                m_off = m_block * M_BLOCK
+                for local_n in range(n_blocks_per_node):  # A is re-read per N block
+                    n_block = node_n * n_blocks_per_node + local_n
+                    if n_block < N_num_blocks:
+                        for kb in range(K_num_blocks):
+                            k_off = kb * K_BLOCK
+                            with a_dfb.reserve() as a_blk:
+                                ttl.copy(
+                                    a[
+                                        m_off : m_off + M_BLOCK,
+                                        k_off : k_off + K_BLOCK,
+                                    ],
+                                    a_blk,
+                                ).wait()
+
+    @ttl.datamovement()
+    def writer():  # feeds b_dfb with B blocks, then copies each out_dfb block to out
+        node_n, node_m = ttl.node(dims=2)
+        for local_m in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m
+            if m_block < M_num_blocks:
+                m_off = m_block * M_BLOCK
+                for local_n in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n
+                    if n_block < N_num_blocks:
+                        n_off = n_block * N_BLOCK
+                        for kb in range(K_num_blocks):
+                            k_off = kb * K_BLOCK
+                            with b_dfb.reserve() as b_blk:
+                                ttl.copy(
+                                    b[
+                                        k_off : k_off + K_BLOCK,
+                                        n_off : n_off + N_BLOCK,
+                                    ],
+                                    b_blk,
+                                ).wait()
+                        with out_dfb.wait() as out_blk:  # after all K blocks are sent
+                            ttl.copy(
+                                out_blk,
+                                out[
+                                    m_off : m_off + M_BLOCK,
+                                    n_off : n_off + N_BLOCK,
+                                ],
+                            ).wait()
+
+
+# =============================================================================
+# C++ output: L1 packer accumulation pattern
+#   1. Disable before the K loop
+#   2. Conditional enable after the first iteration (iv == lb)
+#   3. Disable after cb_push_back following the loop
+# =============================================================================
+
+# CHECK-CPP:      PACK((llk_pack_reconfig_l1_acc(
+# CHECK-CPP-NEXT: for
+# CHECK-CPP:        matmul_block(
+# CHECK-CPP:        pack_tile
+# CHECK-CPP:        if (
+# CHECK-CPP-NEXT:   PACK((llk_pack_reconfig_l1_acc(
+# CHECK-CPP:      cb_push_back(
+# CHECK-CPP-NEXT: PACK((llk_pack_reconfig_l1_acc(
+
+# CHECK-RESULT: PASS
+
+if __name__ == "__main__":
+    device = ttnn.open_device(device_id=0)
+
+    try:
+        # 32x32x32 tiles = 1024x1024x1024, 8x8x8 blocks -> K_num_blocks=4
+        Mt, Kt, Nt = 32, 32, 32
+        M, K, N = Mt * TILE, Kt * TILE, Nt * TILE
+
+        a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+        b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+        golden = (a_torch.float() @ b_torch.float()).float()  # float32 reference
+
+        a_dev = ttnn.from_torch(
+            a_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device
+        )
+        b_dev = ttnn.from_torch(
+            b_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device
+        )
+        out_dev = ttnn.from_torch(
+            torch.zeros(M, N, dtype=torch.bfloat16),
+            dtype=ttnn.bfloat16,
+            layout=ttnn.TILE_LAYOUT,
+            device=device,
+        )
+
+        matmul_l1_acc(a_dev, b_dev, out_dev)  # result is written into out_dev
+
+        result = ttnn.to_torch(out_dev).float()
+        pcc = torch.corrcoef(torch.stack([result.flatten(), golden.flatten()]))[
+            0, 1
+        ].item()  # Pearson correlation between device result and reference
+        if pcc > 0.999:  # a NaN correlation compares false and reports FAIL
+            print("PASS")  # the token the result FileCheck pass looks for
+        else:
+            print(f"FAIL: PCC {pcc:.6f} < 0.999")
+
+    finally:
+        ttnn.close_device(device)  # release the device even if the run raised
diff --git a/test/python/test_matmul_l1_acc_multinode.py b/test/python/test_matmul_l1_acc_multinode.py
new file mode 100644
index 000000000..0f1d603a4
--- /dev/null
+++ b/test/python/test_matmul_l1_acc_multinode.py
@@ -0,0 +1,155 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Multinode matmul with L1 packer accumulation. Inputs are streamed one K tile
+at a time through double-buffered DFBs (block_count=2) by the data-movement
+threads; compute reads only from those L1 buffers. The compiler inserts
+pack_reconfig_l1_acc guards so K iterations after the first pack additively.
+
+Tests multicore configurations with a 2D grid and multiple K blocks.
+"""
+
+import pytest
+import torch
+import ttl
+
+ttnn = pytest.importorskip("ttnn", exc_type=ImportError)
+
+from ttlang_test_utils import to_dram
+from utils.correctness import assert_pcc
+
+TILE = 32  # tile edge length in elements (tensor dims are divided by this)
+
+
+def _make_l1_acc_multinode_kernel(block_m, block_n, grid="auto"):
+    """Multinode matmul with L1 accumulation.
+
+    A and B are streamed one K tile per iteration through double-buffered
+    DFBs (block_count=2). The compute thread reserves the output DFB once,
+    stores Kt times (triggering L1 accumulation), then pushes.
+    """
+
+    @ttl.operation(grid=grid)
+    def kernel(a, b, out):
+        Mt = a.shape[0] // TILE  # M, K, N extents in tiles
+        Kt = a.shape[1] // TILE
+        Nt = b.shape[1] // TILE
+
+        M_num = Mt // block_m  # floor division; remainder tiles are ignored
+        N_num = Nt // block_n
+
+        grid_n, grid_m = ttl.grid_size(dims=2)
+        m_per = -(-M_num // grid_m)  # ceiling division
+        n_per = -(-N_num // grid_n)  # ceiling division
+
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(block_m, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, block_n), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(
+            out, shape=(block_m, block_n), block_count=2
+        )
+
+        @ttl.compute()
+        def compute():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:  # the ceiling split can overshoot
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:  # the ceiling split can overshoot
+                            out_blk = out_dfb.reserve()  # once per output block
+                            for _ in range(Kt):  # K is walked one tile at a time
+                                a_blk = a_dfb.wait()
+                                b_blk = b_dfb.wait()
+                                out_blk.store(a_blk @ b_blk)  # same slot each step
+                                a_blk.pop()
+                                b_blk.pop()
+                            out_blk.push()  # push once, after the K loop
+
+        @ttl.datamovement()
+        def reader():  # feeds a_dfb with (block_m x 1)-tile slices of A
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):  # A is re-read for every N block
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            for kt in range(Kt):
+                                with a_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        a[
+                                            m_off : m_off + block_m,
+                                            kt : kt + 1,
+                                        ],
+                                        blk,
+                                    ).wait()
+
+        @ttl.datamovement()
+        def writer():  # feeds b_dfb, then copies each finished out_dfb block to out
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            n_off = nb * block_n
+                            for kt in range(Kt):
+                                with b_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        b[
+                                            kt : kt + 1,
+                                            n_off : n_off + block_n,
+                                        ],
+                                        blk,
+                                    ).wait()
+                            with out_dfb.wait() as blk:  # after all K tiles are sent
+                                ttl.copy(
+                                    blk,
+                                    out[
+                                        m_off : m_off + block_m,
+                                        n_off : n_off + block_n,
+                                    ],
+                                ).wait()
+
+    return kernel
+
+
+PARAMS = [
+    # (Mt, Kt, Nt, block_m, block_n, grid): sizes in tiles; grid is a tuple or "auto"
+    (4, 2, 4, 2, 2, (2, 2)),
+    (8, 4, 8, 4, 4, (2, 2)),
+    (8, 4, 8, 4, 4, "auto"),
+    (16, 8, 16, 8, 8, "auto"),
+]
+
+
+@pytest.mark.parametrize(
+    "Mt,Kt,Nt,block_m,block_n,grid",
+    PARAMS,
+    ids=[
+        f"tiles{mt}x{kt}x{nt}_blk{bm}x{bn}_grid{g}" for mt, kt, nt, bm, bn, g in PARAMS
+    ],
+)
+@pytest.mark.requires_device
+def test_l1_acc_multinode(Mt, Kt, Nt, block_m, block_n, grid, device):
+    """Multinode matmul with L1 packer accumulation across K iterations."""
+    M, K, N = Mt * TILE, Kt * TILE, Nt * TILE  # element extents from tile counts
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()  # float32 reference
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    out_dev = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_multinode_kernel(block_m, block_n, grid=grid)
+    kernel(a_dev, b_dev, out_dev)  # result is written into out_dev
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)  # presumably Pearson corr.; confirm
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
new file mode 100644
index 000000000..8c3375e06
--- /dev/null
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -0,0 +1,150 @@
+// Verifies ttkernel-insert-l1-accumulation: pack_reconfig_l1_acc guards are
+// inserted around reduction loops. The enable call happens once after the
+// first iteration's last pack (iv == lb), and disable guards bracket the
+// outermost loop.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s
+
+// Basic L1 acc loop: enable after first iteration, disable before/after loop.
+
+// CHECK-LABEL: func.func @basic_l1_acc_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32)
+// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:   scf.if %[[CMP]]
+// CHECK:     %[[ENABLE:.*]] = arith.constant 1 : i32
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[ENABLE]]) : (i32)
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32)
+func.func @basic_l1_acc_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()  // the enable guard is expected right after this op
+  } {ttl.l1_acc_loop}  // attribute that marks the loop for the pass under test
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()  // the trailing disable is expected after this push
+  return
+}
+
+// -----
+
+// Reduction loop fallback (ttl.reduction_loop attribute) with sum reduce.
+
+// CHECK-LABEL: func.func @reduction_loop_fallback
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   arith.cmpi eq
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+func.func @reduction_loop_fallback() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_sum>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.reduction_loop}  // compiler-generated marker; used when ttl.l1_acc_loop is absent
+  return  // no cb_push_back here, so the trailing disable follows the loop directly
+}
+
+// -----
+
+// Max reduce loops should NOT get L1 accumulation guards.
+
+// CHECK-LABEL: func.func @max_reduce_no_l1_acc
+// CHECK-NOT: pack_reconfig_l1_acc
+func.func @max_reduce_no_l1_acc() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    // Identical to the sum-reduce case except for <reduce_max>; max is not additive, so no guards are expected.
+    ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_max>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.reduction_loop}
+  return
+}
+
+// -----
+
+// No reduction loop attribute: no transformation.
+
+// CHECK-LABEL: func.func @no_reduction_loop
+// CHECK-NOT: pack_reconfig_l1_acc
+func.func @no_reduction_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  }  // no ttl.l1_acc_loop / ttl.reduction_loop attribute, so the pass must leave this loop alone
+  return
+}
+
+// -----
+
+// Subblocked loop: multiple acquire/release pairs per iteration inside nested
+// loops. The enable guard should appear once after the outermost subblock loop
+// (containing the last release), not after each individual release.
+
+// CHECK-LABEL: func.func @subblocked_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:   scf.for
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:   }
+// CHECK:   %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:   scf.if %[[CMP]]
+// CHECK:     %[[ENABLE:.*]] = arith.constant 1 : i32
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[ENABLE]]) : (i32)
+// CHECK: }
+// CHECK: ttkernel.pack_reconfig_l1_acc
+func.func @subblocked_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    scf.for %sb = %c0 to %c2 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.matmul_block(%cb, %cb, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    }  // inner subblock loop: the top-level op of the outer body that holds the last release
+  } {ttl.l1_acc_loop}  // only the outer loop is marked; the enable guard belongs to it
+  return  // no cb_push_back here, so the trailing disable follows the loop directly
+}

From 2d11d0457e3f8ee61ed90702cf32e5d0bded2caa Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 10 Apr 2026 13:32:26 -0700
Subject: [PATCH 12/31] update tests

---
 .../TTLToTTKernel/reduce_lowering.mlir        | 10 ++--
 .../TTL/Transforms/subblock_matmul.mlir       | 47 +++++++++----------
 2 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
index 9ddb49905..b7457dda8 100644
--- a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
+++ b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
@@ -87,16 +87,16 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c
 // FP32: %[[CB2:.*]] = ttkernel.get_compile_time_arg_val(2)
 // FP32: scf.for %[[IV:.*]] = %[[C0]] to %[[C2]] step %[[C1]]
 // FP32-NEXT:   ttkernel.tile_regs_acquire
-// L1 accumulation guard: enable from second iteration.
-// FP32:   %[[NOT_FIRST:.*]] = arith.cmpi ne, %[[IV]], %[[C0]]
-// FP32-NEXT:   scf.if %[[NOT_FIRST]]
-// FP32-NEXT:     ttkernel.pack_reconfig_l1_acc(%[[C1I]])
-// FP32:        }
 // FP32:   ttkernel.reduce_init({{.*}}<reduce_sum>, <reduce_dim_col>) {full_fp32}
 // FP32:   ttkernel.reduce_tile({{.*}}<reduce_sum>, <reduce_dim_col>) {full_fp32
 // FP32:   ttkernel.reduce_uninit
 // FP32:   ttkernel.pack_tile(%[[C0]], %[[CB2]], %[[C0]], true)
 // FP32:   ttkernel.tile_regs_release
+// L1 accumulation guard: enable once after the first iteration's pack.
+// FP32:   %[[FIRST:.*]] = arith.cmpi eq, %[[IV]], %[[C0]]
+// FP32-NEXT:   scf.if %[[FIRST]]
+// FP32-NEXT:     ttkernel.pack_reconfig_l1_acc(%[[C1I]])
+// FP32:        }
 // FP32: } {ttl.reduction_loop
 // Disable L1 accumulation after reduction loop.
 // FP32: ttkernel.pack_reconfig_l1_acc({{.*}}0{{.*}})
diff --git a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir
index aa4a891f8..9fb0212c9 100644
--- a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir
@@ -1,40 +1,35 @@
 // Tests for ttl-subblock-compute-for-dst with matmul computes.
 // Matmul K (reduction) accumulates in-place in DST, so only M*N parallel
-// tiles count toward the DST budget. When the parallel output exceeds DST,
-// subblocking partitions M*N AND tiles K to 1 for L1 accumulation.
+// tiles count toward the DST budget. Subblocking partitions the M*N output
+// space while keeping K whole in each subblock.
 
 // RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(convert-ttl-to-compute, ttl-set-compute-kernel-config, ttl-assign-dst{enable-fpu-binary-ops=0}, ttl-subblock-compute-for-dst))' --split-input-file | FileCheck %s
 
 // -----
 
-// Purpose: M*N=16 exceeds f32 DST capacity (4). Subblocking partitions the
-// 4x4 output into 1x4 strips AND tiles K from 3 to 1. The K loop is
-// annotated with ttl.reduction_loop for L1 accumulation.
-// Loops: M (dim 0) 0..4 step 1, K (dim 2) 0..3 step 1.
+// Purpose: M*N=16 exceeds f32 DST capacity (4). K=3 is excluded from the
+// budget, so subblocking partitions the 4x4 output into 1x4 strips.
+// Loop on M (dim 0): 0 to 4 step 1. K (dim 2) stays at 3 in each subblock.
 
-// CHECK-LABEL: func.func @matmul_subblock_k_tiled
+// CHECK-LABEL: func.func @matmul_subblock_k_excluded
 // CHECK-SAME:  fp32_dest_acc_en = true
 // Outer subblock loop over M dimension.
-// CHECK:       scf.for %[[MIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
-// Inner K reduction loop.
-// CHECK:         scf.for %[[KIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
-// A sliced on M and K: [miv, kiv] [1, 1].
-// CHECK:           tensor.extract_slice {{.*}}[%[[MIV]], %[[KIV]]] [1, 1] [1, 1]
-// B sliced on K: [kiv, 0] [1, 4].
-// CHECK:           tensor.extract_slice {{.*}}[%[[KIV]], 0] [1, 4] [1, 1]
-// Output sliced on M: [miv, 0] [1, 4].
-// CHECK:           tensor.extract_slice {{.*}}[%[[MIV]], 0] [1, 4] [1, 1]
-// Inner compute on subblock [1, 4, 1] (M=1, N=4, K=1).
-// CHECK:           ttl.compute
-// CHECK-SAME:        tensor<1x1x!ttcore.tile<32x32, bf16>>
-// CHECK-SAME:        tensor<1x4x!ttcore.tile<32x32, bf16>>
-// CHECK-SAME:        tensor<1x4x!ttcore.tile<32x32, bf16>>
-// CHECK-SAME:        iterator_types = ["parallel", "parallel", "reduction"]
-// CHECK:             ttl.tile_matmul_block
-// K loop annotated for L1 accumulation.
-// CHECK:         } {{{.*}}ttl.reduction_loop{{.*}}}
+// CHECK:       scf.for %[[IV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} {
+// A sliced on M, K kept whole: [iv, 0] [1, 3].
+// CHECK:         tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 3] [1, 1]
+// B not sliced (full [3, 4]).
+// CHECK:         tensor.extract_slice {{.*}}[0, 0] [3, 4] [1, 1]
+// Output sliced on M: [iv, 0] [1, 4].
+// CHECK:         tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 4] [1, 1]
+// Inner compute on subblock [1, 4, 3] (M=1, N=4, K=3).
+// CHECK:         ttl.compute
+// CHECK-SAME:      tensor<1x3x!ttcore.tile<32x32, bf16>>
+// CHECK-SAME:      tensor<3x4x!ttcore.tile<32x32, bf16>>
+// CHECK-SAME:      tensor<1x4x!ttcore.tile<32x32, bf16>>
+// CHECK-SAME:      iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK:           ttl.tile_matmul_block
 // CHECK:       }
-func.func @matmul_subblock_k_tiled(
+func.func @matmul_subblock_k_excluded(
     %arg0: tensor<4x3x!ttcore.tile<32x32, bf16>>,
     %arg1: tensor<3x4x!ttcore.tile<32x32, bf16>>) -> tensor<4x4x!ttcore.tile<32x32, bf16>> {
   %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[4, 3], !ttcore.tile<32x32, bf16>, 2>

From 1da0c9f16b1f44ec9fbdb2ad2f08711ca4bfccf6 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sun, 12 Apr 2026 17:11:13 -0700
Subject: [PATCH 13/31] comments

---
 include/ttlang/Dialect/TTL/Passes.td          | 28 ++++++++++++-----
 .../TTKernelInsertL1Accumulation.cpp          | 30 ++++++-------------
 .../Transforms/TTLAnnotateReductionLoops.cpp  |  7 +++--
 .../Transforms/TTLSubblockComputeForDST.cpp   |  2 ++
 .../TTLToTTKernel/reduce_lowering.mlir        |  3 +-
 .../Transforms/insert_l1_accumulation.mlir    | 18 +++++++++++
 6 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 8e8765181..b76a29138 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -31,12 +31,26 @@ def TTKernelInsertL1Accumulation
   let summary = "Insert L1 accumulation guards for reduction loops";
   let description = [{
     Inserts `pack_reconfig_l1_acc` guards inside reduction loops so that
-    pack operations accumulate into L1 instead of overwriting. On the first
-    iteration of a reduction loop, pack writes normally. On subsequent
-    iterations, the packer is reconfigured to add to the existing L1 value.
-
-    Reduction loops are identified by the `ttl.reduction_loop` attribute
-    on `scf.for` ops.
+    pack operations accumulate into L1 instead of overwriting. The enable
+    call happens once after the first iteration's last pack; the L1 acc
+    packer state persists across `tile_regs` boundaries. Disable guards
+    bracket the outermost reduction loop (parallel loops are not
+    considered). Max-reduce loops are excluded (max is not additive).
+
+    The pattern is:
+       pack_reconfig_l1_acc(0)                // disable before loop
+       for (iv = lb; ...) {
+         [subblock 0: acquire...pack...release]
+         [subblock N: acquire...pack...release]
+         if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first
+                                               // iteration's last pack
+       }
+       [cb_push_back if present]
+       pack_reconfig_l1_acc(0)                // disable after loop
+
+    Reduction loops are identified by the `ttl.l1_acc_loop` (user-written)
+    or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for`
+    ops, with `ttl.l1_acc_loop` taking precedence.
   }];
 
   let dependentDialects = [
@@ -52,7 +66,7 @@ def TTLAnnotateReductionLoops
   let description = [{
     Detects user-written `scf.for` loops where all iterations store to the
     same CB slot (reserved before the loop, pushed after) and annotates them
-    with `ttl.reduction_loop`. This enables `TTKernelInsertL1Accumulation`
+    with `ttl.l1_acc_loop`. This enables `TTKernelInsertL1Accumulation`
     to insert `pack_reconfig_l1_acc` guards so that packs accumulate across
     iterations instead of overwriting.
 
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 11a2985ea..2ec189092 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -5,16 +5,6 @@
 //===----------------------------------------------------------------------===//
 // TTKernel Insert L1 Accumulation
 //===----------------------------------------------------------------------===//
-//
-// Inserts pack_reconfig_l1_acc guards inside reduction loops. When a
-// tile_regs_release is inside a reduction loop, the packer is switched
-// to L1 accumulation mode once after the first iteration's pack so that
-// subsequent iterations add to the existing L1 value instead of
-// overwriting. The L1 acc state persists across tile_regs boundaries.
-//
-// See docs/development/AccumulatingComputeLowering.md for design details.
-//
-//===----------------------------------------------------------------------===//
 
 #include "ttlang/Dialect/TTL/IR/TTL.h"
 #include "ttlang/Dialect/TTL/Passes.h"
@@ -109,14 +99,13 @@ struct TTKernelInsertL1AccumulationPass
     //   [cb_push_back if present]
     //   pack_reconfig_l1_acc(0)                // disable after loop
     //
-    // The L1 acc state persists across tile_regs boundaries, so the enable
+    // The L1 acc state persists across multiple dst sections, so the enable
     // call only needs to happen once (after the first iteration completes
-    // all subblock packs). Disable guards are inserted once per outermost
-    // loop.
+    // all its packs). Disable guards are inserted once per outermost
+    // reduction loop (parallel loops are not considered).
 
-    // Find the top-level operation in each L1 acc loop body that contains
-    // the last tile_regs_release. The release may be nested inside subblock
-    // loops, so we find the enclosing top-level op to insert after.
+    // Find the insertion point for the enable guard: the top-level op in
+    // the loop body that contains the last tile_regs_release.
     auto findTopLevelAncestor = [](Operation *op,
                                    Block *loopBody) -> Operation * {
       while (op && op->getBlock() != loopBody) {
@@ -149,10 +138,9 @@ struct TTKernelInsertL1AccumulationPass
       OpBuilder builder(loop->getContext());
       Location loc = enablePoint->getLoc();
 
-      // Conditional enable after the last subblock/release on the first
-      // iteration. Placed after the top-level op containing the last
-      // release so all subblock packs in iteration 0 write without
-      // accumulation.
+      // Enable L1 acc once, at the end of the first iteration of the
+      // reduction loop. All packs in iteration 0 write without
+      // accumulation; subsequent iterations add to the existing L1 value.
       builder.setInsertionPointAfter(enablePoint);
       Value loopIV = loop.getInductionVar();
       Value loopLB = loop.getLowerBound();
@@ -164,7 +152,7 @@ struct TTKernelInsertL1AccumulationPass
           builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
       ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
 
-      // Disable before and after the outermost L1 acc loop (once per loop).
+      // Bracket the outermost reduction loop with disable guards.
       auto outermostLoop = findOutermostL1AccLoop(loop);
       if (!outermostLoop) {
         outermostLoop = loop;
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
index 7545aba6d..91d0f3c66 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
@@ -8,7 +8,7 @@
 //
 // Detects user-written scf.for loops that accumulate into the same CB slot
 // (reserve before loop, store inside, push after) and annotates them with
-// kReductionLoopAttrName for L1 accumulation.
+// kL1AccLoopAttrName for L1 accumulation.
 //
 //===----------------------------------------------------------------------===//
 
@@ -34,8 +34,9 @@ struct TTLAnnotateReductionLoopsPass
     func::FuncOp func = getOperation();
 
     func.walk([&](scf::ForOp forOp) {
-      // Skip loops already annotated (from compiler-generated tile loops).
-      if (forOp->hasAttr(kReductionLoopAttrName) ||
+      // Skip loops already annotated (compiler-generated or prior run).
+      if (forOp->hasAttr(kL1AccLoopAttrName) ||
+          forOp->hasAttr(kReductionLoopAttrName) ||
           forOp->hasAttr(kTileLoopStrideAttrName) ||
           forOp->hasAttr(kSubblockLoopStrideAttrName)) {
         return;
diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
index f6c082780..da6c65065 100644
--- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
@@ -122,6 +122,8 @@ struct TTLSubblockComputeForDSTPass
           if (isa<TileMatmulBlockOp>(op)) {
             hasMatmulBlock = true;
           }
+          return (hasAccumulating && hasMatmulBlock) ? WalkResult::interrupt()
+                                                     : WalkResult::advance();
         });
         if (hasAccumulating && !hasMatmulBlock) {
           return;
diff --git a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
index b7457dda8..d6fcd1453 100644
--- a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
+++ b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
@@ -81,6 +81,7 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c
 // FP32-DAG: %[[C0:.*]] = arith.constant 0 : index
 // FP32-DAG: %[[C1:.*]] = arith.constant 1 : index
 // FP32-DAG: %[[C2:.*]] = arith.constant 2 : index
+// FP32-DAG: %[[C0I:.*]] = arith.constant 0 : i32
 // FP32-DAG: %[[C1I:.*]] = arith.constant 1 : i32
 // FP32: %[[CB0:.*]] = ttkernel.get_compile_time_arg_val(0)
 // FP32: %[[CB1:.*]] = ttkernel.get_compile_time_arg_val(1)
@@ -99,7 +100,7 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c
 // FP32:        }
 // FP32: } {ttl.reduction_loop
 // Disable L1 accumulation after reduction loop.
-// FP32: ttkernel.pack_reconfig_l1_acc({{.*}}0{{.*}})
+// FP32: ttkernel.pack_reconfig_l1_acc(%[[C0I]])
 func.func @reduce_2x1_l1_acc() attributes {ttl.base_cta_index = 3 : i32, ttl.crta_indices = [], ttl.kernel_thread = #ttkernel.thread<compute>} {
   %c2 = arith.constant 2 : index
   %c1 = arith.constant 1 : index
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
index 8c3375e06..ef1c7542a 100644
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -148,3 +148,21 @@ func.func @subblocked_loop() attributes {ttkernel.thread = #ttkernel.thread<comp
   } {ttl.l1_acc_loop}
   return
 }
+
+// -----
+
+// L1 acc loop with no tile_regs_acquire/release inside: no guards inserted.
+
+// CHECK-LABEL: func.func @l1_acc_loop_no_sync
+// CHECK-NOT: pack_reconfig_l1_acc
+func.func @l1_acc_loop_no_sync() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+  } {ttl.l1_acc_loop}
+  return
+}

From c1bf820db707c0a0ab6e2aa8ad384b68d49ea6a8 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sun, 12 Apr 2026 21:21:24 -0700
Subject: [PATCH 14/31] rename ttl-annotate-reduction-loops to
 ttl-annotate-l1-acc-loops; other cleanup; add tests

---
 include/ttlang/Dialect/TTL/Passes.td          |   6 +-
 .../Transforms/TTKernelInsertInits.cpp        |  11 +-
 .../TTKernelInsertL1Accumulation.cpp          |  19 ++-
 lib/Dialect/TTL/Pipelines/TTLPipelines.cpp    |   2 +-
 lib/Dialect/TTL/Transforms/CMakeLists.txt     |   2 +-
 ...ionLoops.cpp => TTLAnnotateL1AccLoops.cpp} |  24 +--
 python/ttl/ttl_api.py                         |   2 +-
 .../TTLToTTKernel/reduce_lowering.mlir        |   2 +
 .../Transforms/insert_l1_accumulation.mlir    | 145 ++++++++++++++++++
 .../TTL/Transforms/annotate_l1_acc_loops.mlir | 119 ++++++++++++++
 10 files changed, 297 insertions(+), 35 deletions(-)
 rename lib/Dialect/TTL/Transforms/{TTLAnnotateReductionLoops.cpp => TTLAnnotateL1AccLoops.cpp} (70%)
 create mode 100644 test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir

diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index b76a29138..360a1f4e2 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -60,9 +60,9 @@ def TTKernelInsertL1Accumulation
   ];
 }
 
-def TTLAnnotateReductionLoops
-    : Pass<"ttl-annotate-reduction-loops", "::mlir::func::FuncOp"> {
-  let summary = "Annotate user-written scf.for loops as reduction loops";
+def TTLAnnotateL1AccLoops
+    : Pass<"ttl-annotate-l1-acc-loops", "::mlir::func::FuncOp"> {
+  let summary = "Annotate user-written scf.for loops for L1 accumulation";
   let description = [{
     Detects user-written `scf.for` loops where all iterations store to the
     same CB slot (reserved before the loop, pushed after) and annotates them
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
index b28ccded4..f6ddc93fa 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
@@ -348,12 +348,11 @@ analyzeSyncRegion(ttk::TileRegsAcquireOp acquireOp, Value &inputCB,
 }
 
 /// Find the outermost enclosing insertion point by walking up through
-/// compiler-generated loops (marked with ttl.tile_loop_stride or
-/// ttl.subblock_loop_stride). By construction, these loops iterate over tiles
-/// within a single ttl.compute whose input/output CBs are fixed, so the
-/// CB configuration is invariant across iterations and hoisting is safe.
-/// Stops at unmarked loops to avoid hoisting past user loops that could
-/// contain multiple sync regions with different CB configurations.
+/// loops with invariant CB configurations: compiler-generated tile/subblock
+/// loops (ttl.tile_loop_stride, ttl.subblock_loop_stride) and L1
+/// accumulation loops (ttl.l1_acc_loop). All use fixed CBs across
+/// iterations, so init hoisting is safe. Stops at unmarked loops to avoid
+/// hoisting past user loops with varying CB configurations.
 static Operation *hoistAboveCompilerLoops(Operation *op) {
   Operation *insertBefore = op;
   while (auto *parentOp = insertBefore->getParentOp()) {
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 2ec189092..41cfeb5a6 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -152,7 +152,10 @@ struct TTKernelInsertL1AccumulationPass
           builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
       ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
 
-      // Bracket the outermost reduction loop with disable guards.
+      // Bracket the outermost accumulation loop with disable guards.
+      // Both kL1AccLoopAttrName and kReductionLoopAttrName mean "all
+      // iterations write to the same CB slot," so the outermost such
+      // loop is the correct accumulation boundary.
       auto outermostLoop = findOutermostL1AccLoop(loop);
       if (!outermostLoop) {
         outermostLoop = loop;
@@ -164,13 +167,15 @@ struct TTKernelInsertL1AccumulationPass
             builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
         ttk::PackReconfigL1AccOp::create(builder, loc, disablePre);
 
-        // Disable after cb_push_back following the loop, or after the loop.
-        Operation *insertPoint = outermostLoop->getNextNode();
-        while (insertPoint && !isa<ttk::CBPushBackOp>(insertPoint)) {
-          insertPoint = insertPoint->getNextNode();
+        // Disable after any consecutive cb_push_back ops that follow the
+        // loop. Multi-output computes produce one push per output CB.
+        Operation *lastPush = nullptr;
+        for (Operation *op = outermostLoop->getNextNode();
+             op && isa<ttk::CBPushBackOp>(op); op = op->getNextNode()) {
+          lastPush = op;
         }
-        if (insertPoint) {
-          builder.setInsertionPointAfter(insertPoint);
+        if (lastPush) {
+          builder.setInsertionPointAfter(lastPush);
         } else {
           builder.setInsertionPointAfter(outermostLoop);
         }
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index d059fc090..85cc4e47c 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -18,7 +18,7 @@ namespace mlir::tt::ttl {
 
 void createTTLToTTKernelPipeline(OpPassManager &pm,
                                  const TTLToTTKernelPipelineOptions &options) {
-  pm.addPass(createTTLAnnotateReductionLoops());
+  pm.addPass(createTTLAnnotateL1AccLoops());
   pm.addPass(createTTLConvertTTLToCompute());
   {
     TTLSetComputeKernelConfigOptions configOpts;
diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt
index 40d01413d..061f72b0b 100644
--- a/lib/Dialect/TTL/Transforms/CMakeLists.txt
+++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt
@@ -6,7 +6,7 @@ add_mlir_dialect_library(TTLangTTLTransforms
   LowerDPrintToEmitC.cpp
   LowerSignpostToEmitC.cpp
   TTLAnnotateCBAssociations.cpp
-  TTLAnnotateReductionLoops.cpp
+  TTLAnnotateL1AccLoops.cpp
   TTLDumpCBFlowGraph.cpp
   TTLLowerMatmulBlock.cpp
   TTLAssignDST.cpp
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
similarity index 70%
rename from lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
rename to lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
index 91d0f3c66..a3246395d 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 //===----------------------------------------------------------------------===//
-// TTL Annotate Reduction Loops
+// TTL Annotate L1 Acc Loops
 //===----------------------------------------------------------------------===//
 //
 // Detects user-written scf.for loops that accumulate into the same CB slot
@@ -18,18 +18,17 @@
 
 #include "mlir/Dialect/SCF/IR/SCF.h"
 
-#define DEBUG_TYPE "ttl-annotate-reduction-loops"
+#define DEBUG_TYPE "ttl-annotate-l1-acc-loops"
 
 namespace mlir::tt::ttl {
 
-#define GEN_PASS_DEF_TTLANNOTATEREDUCTIONLOOPS
+#define GEN_PASS_DEF_TTLANNOTATEL1ACCLOOPS
 #include "ttlang/Dialect/TTL/Passes.h.inc"
 
 namespace {
 
-struct TTLAnnotateReductionLoopsPass
-    : public impl::TTLAnnotateReductionLoopsBase<
-          TTLAnnotateReductionLoopsPass> {
+struct TTLAnnotateL1AccLoopsPass
+    : public impl::TTLAnnotateL1AccLoopsBase<TTLAnnotateL1AccLoopsPass> {
   void runOnOperation() override {
     func::FuncOp func = getOperation();
 
@@ -46,16 +45,9 @@ struct TTLAnnotateReductionLoopsPass
       // CB that was reserved (ttl.cb_reserve) before the loop.
       bool hasReductionStore = false;
       forOp.getBody()->walk([&](StoreOp store) {
-        Value view = store.getView();
-        // Trace through attach_cb to find the cb_reserve.
-        if (auto attachCB = view.getDefiningOp<AttachCBOp>()) {
-          view = attachCB.getTensor();
-        }
-        if (auto reserve = view.getDefiningOp<CBReserveOp>()) {
-          // The cb_reserve must be OUTSIDE the for loop (before it).
-          if (!forOp->isAncestor(reserve)) {
-            hasReductionStore = true;
-          }
+        auto reserve = store.getView().getDefiningOp<CBReserveOp>();
+        if (reserve && !forOp->isAncestor(reserve)) {
+          hasReductionStore = true;
         }
       });
 
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index b8f0d208a..4360b627c 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -1224,7 +1224,7 @@ def _compile_kernel(
         assign_dst_pass = f"ttl-assign-dst{{enable-fpu-binary-ops={fpu_flag}}}"
 
         pipeline_passes = [
-            "func.func(ttl-annotate-reduction-loops)",
+            "func.func(ttl-annotate-l1-acc-loops)",
             "func.func(convert-ttl-to-compute)",
             set_compute_config_pass,
             f"func.func({assign_dst_pass})",
diff --git a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
index d6fcd1453..ab37a5760 100644
--- a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
+++ b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
@@ -86,6 +86,8 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c
 // FP32: %[[CB0:.*]] = ttkernel.get_compile_time_arg_val(0)
 // FP32: %[[CB1:.*]] = ttkernel.get_compile_time_arg_val(1)
 // FP32: %[[CB2:.*]] = ttkernel.get_compile_time_arg_val(2)
+// Disable L1 accumulation before the reduction loop.
+// FP32: ttkernel.pack_reconfig_l1_acc(%[[C0I]])
 // FP32: scf.for %[[IV:.*]] = %[[C0]] to %[[C2]] step %[[C1]]
 // FP32-NEXT:   ttkernel.tile_regs_acquire
 // FP32:   ttkernel.reduce_init({{.*}}<reduce_sum>, <reduce_dim_col>) {full_fp32}
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
index ef1c7542a..9d329427a 100644
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -166,3 +166,148 @@ func.func @l1_acc_loop_no_sync() attributes {ttkernel.thread = #ttkernel.thread<
   } {ttl.l1_acc_loop}
   return
 }
+
+// -----
+
+// L1 acc loop inside an unannotated outer loop (the realistic pattern:
+// outer M/N iteration loop wraps the inner K reduction loop). The disable
+// guards bracket the inner K loop, not the outer loop. Each outer
+// iteration gets a fresh disable-before -> K loop -> disable-after cycle.
+
+// CHECK-LABEL: func.func @l1_acc_inside_outer_loop
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK:   scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.pack_tile
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:     %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:     scf.if %[[CMP]]
+// CHECK:       ttkernel.pack_reconfig_l1_acc
+// CHECK:   }
+// CHECK:   ttkernel.cb_push_back
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+func.func @l1_acc_inside_outer_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %outer = %c0 to %c2 step %c1 {
+    scf.for %inner = %c0 to %c4 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.l1_acc_loop}
+    ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  }
+  return
+}
+
+// -----
+
+// Multiple consecutive cb_push_back ops after the loop (multi-output compute).
+// The disable guard should go after the last push.
+
+// CHECK-LABEL: func.func @multi_push_after_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   arith.cmpi eq
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+func.func @multi_push_after_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Nested l1_acc loops: reserve is outside both loops, so both are annotated
+// and all iterations accumulate into the same CB slot. Disable guards
+// bracket the outermost loop; the enable is guarded by the inner loop's
+// first iteration (IV == LB), so it first fires in outer iteration 0.
+
+// CHECK-LABEL: func.func @nested_l1_acc_loops
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:     arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:     scf.if
+// CHECK:       ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+func.func @nested_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  scf.for %outer = %c0 to %c2 step %c1 {
+    scf.for %inner = %c0 to %c4 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.l1_acc_loop}
+  } {ttl.l1_acc_loop}
+  return
+}
+
+// -----
+
+// Nested reduction loops (multi-dim reduce): all iterations contribute to
+// a single accumulated result. Same structure as nested l1_acc loops.
+
+// CHECK-LABEL: func.func @nested_reduction_loops
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:     arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:     scf.if
+// CHECK:       ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+func.func @nested_reduction_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  scf.for %row = %c0 to %c2 step %c1 {
+    scf.for %col = %c0 to %c2 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_sum>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.reduction_loop}
+  } {ttl.reduction_loop}
+  return
+}
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
new file mode 100644
index 000000000..aba630a66
--- /dev/null
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
@@ -0,0 +1,119 @@
+// Verifies ttl-annotate-l1-acc-loops: user-written scf.for loops that store
+// to a CB reserved outside the loop are annotated with ttl.l1_acc_loop.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --split-input-file | FileCheck %s
+
+// Loop storing to an externally reserved CB should be annotated.
+
+// CHECK-LABEL: func.func @external_reserve
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @external_reserve(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Loop where cb_reserve is INSIDE the loop should NOT be annotated.
+// (Negative check runs to `return`: loop attrs print after the closing brace.)
+// CHECK-LABEL: func.func @internal_reserve
+// CHECK: scf.for
+// CHECK-NOT: ttl.l1_acc_loop
+// CHECK: return
+func.func @internal_reserve(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %result = scf.for %iv = %c0 to %c4 step %c1 iter_args(%acc = %arg0) -> (tensor<1x1x!ttcore.tile<32x32, bf16>>) {
+    %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    scf.yield %mm : tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %result : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Loops already annotated with compiler-generated attributes should be skipped.
+
+// CHECK-LABEL: func.func @skip_tile_loop
+// CHECK: scf.for
+// CHECK: } {ttl.tile_loop_stride
+// CHECK-NOT: ttl.l1_acc_loop
+func.func @skip_tile_loop(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  } {ttl.tile_loop_stride = array<i64: 1>}
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Loops already annotated with ttl.reduction_loop should be skipped.
+
+// CHECK-LABEL: func.func @skip_reduction_loop
+// CHECK: scf.for
+// CHECK: } {ttl.reduction_loop
+// CHECK-NOT: ttl.l1_acc_loop
+func.func @skip_reduction_loop(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  } {ttl.reduction_loop}
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Loop without any store should NOT be annotated.
+// (Negative check runs to `return`: loop attrs print after the closing brace.)
+// CHECK-LABEL: func.func @no_store
+// CHECK: scf.for
+// CHECK-NOT: ttl.l1_acc_loop
+// CHECK: return
+func.func @no_store(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    // No ttl.store in the loop body.
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}

From 3109b916b8bc62959f03e59d4785717e326bd8c4 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sun, 12 Apr 2026 22:16:31 -0700
Subject: [PATCH 15/31] implement += on blocks

---
 include/ttlang/Dialect/TTL/IR/TTLOps.td       | 14 ++++--
 include/ttlang/Dialect/TTL/Passes.td          |  9 ++--
 .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp  | 19 ++++---
 python/ttl/_src/ttl_ast.py                    | 19 +++++++
 python/ttl/operators.py                       | 23 ++++++++-
 test/python/matmul_l1_acc_multinode.py        |  8 +--
 test/python/test_matmul_l1_acc.py             | 12 ++---
 test/python/test_matmul_l1_acc_multinode.py   |  9 ++--
 .../TTL/Transforms/annotate_l1_acc_loops.mlir | 49 +++++--------------
 9 files changed, 91 insertions(+), 71 deletions(-)

diff --git a/include/ttlang/Dialect/TTL/IR/TTLOps.td b/include/ttlang/Dialect/TTL/IR/TTLOps.td
index 93dfa2da8..5ad4c87a3 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOps.td
+++ b/include/ttlang/Dialect/TTL/IR/TTLOps.td
@@ -1039,8 +1039,13 @@ def TTL_StoreOp : TTL_Op<"store", [MemoryEffects<[MemWrite]>]> {
     `ttl.store` represents the user's intent to store a computed tensor into
     an output circular buffer.
 
-    Emitted by Python's `o.store(result)`. During `convert-ttl-to-compute`,
-    this op is transformed into a `ttl.tile_store` inside the compute body.
+    Emitted by Python's `o.store(result)` (overwrite) or `o += result`
+    (accumulate). During `convert-ttl-to-compute`, this op is transformed
+    into a `ttl.tile_store` inside the compute body.
+
+    When `accumulate` is set, the enclosing loop is annotated for L1
+    packer accumulation so that each iteration adds to the existing L1
+    value instead of overwriting.
 
     Example:
     ```mlir
@@ -1048,11 +1053,14 @@ def TTL_StoreOp : TTL_Op<"store", [MemoryEffects<[MemWrite]>]> {
     %result = ttl.add %a, %b : ...
     ttl.store %result, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>,
                                   tensor<1x1x!ttcore.tile<32x32, bf16>>
+    // Accumulating store (from +=):
+    ttl.store %result, %reserve {accumulate} : ...
     ```
   }];
   let arguments = (ins
     AnyRankedTensor:$tensor,
-    AnyRankedTensor:$view
+    AnyRankedTensor:$view,
+    OptionalAttr<UnitAttr>:$accumulate
   );
   let assemblyFormat = "$tensor `,` $view attr-dict `:` type($tensor) `,` type($view)";
   let hasVerifier = 1;
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 360a1f4e2..43540f6d1 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -64,14 +64,11 @@ def TTLAnnotateL1AccLoops
     : Pass<"ttl-annotate-l1-acc-loops", "::mlir::func::FuncOp"> {
   let summary = "Annotate user-written scf.for loops for L1 accumulation";
   let description = [{
-    Detects user-written `scf.for` loops where all iterations store to the
-    same CB slot (reserved before the loop, pushed after) and annotates them
-    with `ttl.l1_acc_loop`. This enables `TTKernelInsertL1Accumulation`
+    Detects user-written `scf.for` loops containing `ttl.store` ops with
+    the `accumulate` attribute (emitted by the `+=` operator) and annotates
+    them with `ttl.l1_acc_loop`. This enables `TTKernelInsertL1Accumulation`
     to insert `pack_reconfig_l1_acc` guards so that packs accumulate across
     iterations instead of overwriting.
-
-    Targets the pattern:
-      cb_reserve -> scf.for { compute(store to reserved CB) } -> cb_push
   }];
   let dependentDialects = [
     "::mlir::scf::SCFDialect"
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
index a3246395d..767317fdf 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -6,9 +6,9 @@
 // TTL Annotate L1 Acc Loops
 //===----------------------------------------------------------------------===//
 //
-// Detects user-written scf.for loops that accumulate into the same CB slot
-// (reserve before loop, store inside, push after) and annotates them with
-// kL1AccLoopAttrName for L1 accumulation.
+// Detects user-written scf.for loops containing accumulating stores
+// (ttl.store with the {accumulate} attribute, emitted by +=) and annotates
+// them with kL1AccLoopAttrName for L1 packer accumulation.
 //
 //===----------------------------------------------------------------------===//
 
@@ -41,17 +41,16 @@ struct TTLAnnotateL1AccLoopsPass
         return;
       }
 
-      // Check if the loop body contains a store (ttl.store) targeting a
-      // CB that was reserved (ttl.cb_reserve) before the loop.
-      bool hasReductionStore = false;
+      // Check if the loop body contains an accumulating store (ttl.store
+      // with the {accumulate} attribute, emitted by the ``+=`` operator).
+      bool hasAccumulatingStore = false;
       forOp.getBody()->walk([&](StoreOp store) {
-        auto reserve = store.getView().getDefiningOp<CBReserveOp>();
-        if (reserve && !forOp->isAncestor(reserve)) {
-          hasReductionStore = true;
+        if (store.getAccumulate()) {
+          hasAccumulatingStore = true;
         }
       });
 
-      if (hasReductionStore) {
+      if (hasAccumulatingStore) {
         forOp->setAttr(kL1AccLoopAttrName, OpBuilder(forOp).getUnitAttr());
       }
     });
diff --git a/python/ttl/_src/ttl_ast.py b/python/ttl/_src/ttl_ast.py
index 3ccfc5e37..5e4073309 100644
--- a/python/ttl/_src/ttl_ast.py
+++ b/python/ttl/_src/ttl_ast.py
@@ -8,6 +8,7 @@
 from typing import List, Optional, Set
 
 from pykernel._src.kernel_ast import TTCompilerBase
+from pykernel._src.utils import _get_type_str
 from ttl.dialects import arith, func, ttcore, ttkernel
 from ttl.ir import *
 
@@ -296,6 +297,24 @@ def visit_Call(self, node):
                     raise
                 self._raise_error(node, str(e))
 
+    def visit_AugAssign(self, node):
+        """Handle += on tensor blocks via the registered __iadd__ method."""
+        with self._loc_for_node(node):
+            target = self.visit(node.target)
+            if (
+                isinstance(node.op, ast.Add)
+                and hasattr(target, "type")
+                and isinstance(target.type, RankedTensorType)
+            ):
+                rhs = self.visit(node.value)
+                mlir_type = _get_type_str(target.type)
+                iadd_fn = self._fn_map.get(f"{mlir_type}.__iadd__")
+                if iadd_fn:
+                    result = iadd_fn(target, rhs)
+                    self.symbol_tables[-1][node.target.id] = result
+                    return
+            return super().visit_AugAssign(node)
+
     def visit_BinOp(self, node):
         """Override to inject auto-profiling and provide better error messages."""
         with self._loc_for_node(node):
diff --git a/python/ttl/operators.py b/python/ttl/operators.py
index de4c4ba03..5df40e193 100644
--- a/python/ttl/operators.py
+++ b/python/ttl/operators.py
@@ -111,9 +111,10 @@ def __matmul__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock:
         return ttl.matmul(result_type, ast_self, rhs)
 
     def store(ast_self: TensorBlock, rhs: TensorBlock) -> None:
-        """Store result tensor to the output CB reserve view.
+        """Store result tensor to the output CB reserve view (overwrite).
 
         Emits ttl.store with the result tensor and reserve view.
+        Always overwrites the CB slot. For accumulation, use ``+=``.
         """
         if not _is_block(ast_self):
             raise ValueError(
@@ -122,6 +123,26 @@ def store(ast_self: TensorBlock, rhs: TensorBlock) -> None:
         reserve = _get_reserve_from_block(ast_self)
         ttl.store(rhs, reserve)
 
+    def __iadd__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock:
+        """Accumulate into a reserved block via L1 packer accumulation.
+
+        Emits ttl.store with the ``accumulate`` attribute. When used
+        inside a loop, the compiler inserts ``pack_reconfig_l1_acc``
+        guards so that each iteration adds to the existing L1 value
+        instead of overwriting.
+
+        This is an interim mechanism; the spec's full pattern
+        (``fill`` + lazy ``BlockExpr`` ``+=`` + ``store``) is deferred
+        to the BlockExpr PR (#446).
+        """
+        if not _is_block(ast_self):
+            raise ValueError(
+                "+= must be called on a block acquired from reserve(), not a regular tensor"
+            )
+        reserve = _get_reserve_from_block(ast_self)
+        ttl.store(rhs, reserve, accumulate=True)
+        return ast_self
+
     def push(ast_self: TensorBlock) -> None:
         """
         Signal that data is ready in the circular buffer (producer release).
diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py
index f2bbdd968..e086d09b5 100644
--- a/test/python/matmul_l1_acc_multinode.py
+++ b/test/python/matmul_l1_acc_multinode.py
@@ -12,9 +12,9 @@
 (make_matmul_l1_acc / v4_l1_acc): auto grid, split DMA (reader=A,
 writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 1024x1024x1024.
 
-The compute thread uses the "reserve once, store K times, push once" pattern.
-The compiler detects the K reduction loop and inserts pack_reconfig_l1_acc
-guards so each K iteration packs additively to L1.
+The compute thread uses += for accumulation across K iterations. The
+compiler inserts pack_reconfig_l1_acc guards so each K iteration packs
+additively to L1.
 
 Verifies the L1 packer accumulation pattern in generated C++: disable before
 K loop, conditional enable after first iteration, disable after cb_push_back.
@@ -69,7 +69,7 @@ def compute():
                         for _ in range(K_num_blocks):
                             a_blk = a_dfb.wait()
                             b_blk = b_dfb.wait()
-                            out_blk.store(a_blk @ b_blk)
+                            out_blk += a_blk @ b_blk
                             a_blk.pop()
                             b_blk.pop()
                         out_blk.push()
diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py
index 599c33ea7..7f0b8bec3 100644
--- a/test/python/test_matmul_l1_acc.py
+++ b/test/python/test_matmul_l1_acc.py
@@ -3,11 +3,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-Matmul L1 accumulation: reserve once, store K times, push once.
+Matmul L1 accumulation via += across K iterations.
 
-The compiler detects the scf.for loop storing to the same reserved CB
-and annotates it as a reduction loop. TTKernelInsertL1Accumulation inserts
-pack_reconfig_l1_acc guards so each K iteration packs additively to L1.
+The += operator emits ttl.store with {accumulate}, which the compiler
+detects and annotates for L1 packer accumulation. Each K iteration packs
+additively to L1.
 
 Tests single-core and multicore configurations with various block sizes.
 """
@@ -29,7 +29,7 @@
 
 
 def _make_l1_acc_kernel(block_m, block_n, grid="auto"):
-    """Matmul with L1 accumulation: reserve once, store K times, push once."""
+    """Matmul with L1 accumulation via += across K iterations."""
 
     @ttl.operation(grid=grid)
     def kernel(a, b, out):
@@ -63,7 +63,7 @@ def compute():
                             for _ in range(Kt):
                                 a_blk = a_dfb.wait()
                                 b_blk = b_dfb.wait()
-                                out_blk.store(a_blk @ b_blk)
+                                out_blk += a_blk @ b_blk
                                 a_blk.pop()
                                 b_blk.pop()
                             out_blk.push()
diff --git a/test/python/test_matmul_l1_acc_multinode.py b/test/python/test_matmul_l1_acc_multinode.py
index 0f1d603a4..c0d456838 100644
--- a/test/python/test_matmul_l1_acc_multinode.py
+++ b/test/python/test_matmul_l1_acc_multinode.py
@@ -3,10 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-Multinode matmul with L1 packer accumulation, L1-only (no DRAM reads during
-compute). All input blocks are pre-loaded into L1 DFBs before the K reduction
-loop begins. The compiler inserts pack_reconfig_l1_acc guards so each K
-iteration packs additively to L1.
+Multinode matmul with L1 packer accumulation via += across K iterations.
+L1-only (no DRAM reads during compute). All input blocks are pre-loaded
+into L1 DFBs before the K reduction loop begins.
 
 Tests multicore configurations with a 2D grid and multiple K blocks.
 """
@@ -63,7 +62,7 @@ def compute():
                             for _ in range(Kt):
                                 a_blk = a_dfb.wait()
                                 b_blk = b_dfb.wait()
-                                out_blk.store(a_blk @ b_blk)
+                                out_blk += a_blk @ b_blk
                                 a_blk.pop()
                                 b_blk.pop()
                             out_blk.push()
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
index aba630a66..41f5bb935 100644
--- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
@@ -1,14 +1,14 @@
-// Verifies ttl-annotate-l1-acc-loops: user-written scf.for loops that store
-// to a CB reserved outside the loop are annotated with ttl.l1_acc_loop.
+// Verifies ttl-annotate-l1-acc-loops: scf.for loops containing
+// ttl.store with {accumulate} are annotated with ttl.l1_acc_loop.
 
 // RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --split-input-file | FileCheck %s
 
-// Loop storing to an externally reserved CB should be annotated.
+// Accumulating store inside a loop should annotate the loop.
 
-// CHECK-LABEL: func.func @external_reserve
+// CHECK-LABEL: func.func @accumulating_store
 // CHECK: scf.for
 // CHECK: } {ttl.l1_acc_loop}
-func.func @external_reserve(
+func.func @accumulating_store(
     %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
     %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
   %c0 = arith.constant 0 : index
@@ -22,20 +22,20 @@ func.func @external_reserve(
   %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
   scf.for %iv = %c0 to %c4 step %c1 {
     %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
-    ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
   }
   func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
 }
 
 // -----
 
-// Loop where cb_reserve is INSIDE the loop should NOT be annotated.
+// Plain store (no {accumulate}) should NOT annotate, even with external reserve.
 
-// CHECK-LABEL: func.func @internal_reserve
+// CHECK-LABEL: func.func @plain_store_no_annotation
 // CHECK: scf.for
 // CHECK-NOT: ttl.l1_acc_loop
 // CHECK: }
-func.func @internal_reserve(
+func.func @plain_store_no_annotation(
     %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
     %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
   %c0 = arith.constant 0 : index
@@ -46,39 +46,17 @@ func.func @internal_reserve(
   %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
   %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
   %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
-  %result = scf.for %iv = %c0 to %c4 step %c1 iter_args(%acc = %arg0) -> (tensor<1x1x!ttcore.tile<32x32, bf16>>) {
-    %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
     %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
     ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
-    scf.yield %mm : tensor<1x1x!ttcore.tile<32x32, bf16>>
   }
-  func.return %result : tensor<1x1x!ttcore.tile<32x32, bf16>>
-}
-
-// -----
-
-// Loops already annotated with compiler-generated attributes should be skipped.
-
-// CHECK-LABEL: func.func @skip_tile_loop
-// CHECK: scf.for
-// CHECK: } {ttl.tile_loop_stride
-// CHECK-NOT: ttl.l1_acc_loop
-func.func @skip_tile_loop(
-    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
-  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
-  scf.for %iv = %c0 to %c4 step %c1 {
-    ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
-  } {ttl.tile_loop_stride = array<i64: 1>}
   func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
 }
 
 // -----
 
-// Loops already annotated with ttl.reduction_loop should be skipped.
+// Already-annotated loops should be skipped.
 
 // CHECK-LABEL: func.func @skip_reduction_loop
 // CHECK: scf.for
@@ -92,7 +70,7 @@ func.func @skip_reduction_loop(
   %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
   %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
   scf.for %iv = %c0 to %c4 step %c1 {
-    ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %arg0, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
   } {ttl.reduction_loop}
   func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
 }
@@ -113,7 +91,6 @@ func.func @no_store(
   %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
   %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
   scf.for %iv = %c0 to %c4 step %c1 {
-    // No ttl.store in the loop body.
   }
   func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
 }

From 760d376ea5fbd9b9ca91f30b282c8789c43f12a2 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sun, 12 Apr 2026 22:50:57 -0700
Subject: [PATCH 16/31] use _set_var helper

---
 python/pykernel/_src/base_ast.py   |  4 ++++
 python/pykernel/_src/kernel_ast.py | 17 +++++++----------
 python/ttl/_src/ttl_ast.py         | 28 +++++++++++-----------------
 3 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/python/pykernel/_src/base_ast.py b/python/pykernel/_src/base_ast.py
index afc6767d9..754d16590 100644
--- a/python/pykernel/_src/base_ast.py
+++ b/python/pykernel/_src/base_ast.py
@@ -83,6 +83,10 @@ def _var_exists(self, var_name):
                 return sym_table
         return {}
 
+    def _set_var(self, var_name, value):
+        """Bind a variable in the current (innermost) scope."""
+        self.symbol_tables[-1][var_name] = value
+
     def visit_Module(self, node):
         # Set default basic block
         with InsertionPoint(self.insert_point), Location.unknown():
diff --git a/python/pykernel/_src/kernel_ast.py b/python/pykernel/_src/kernel_ast.py
index 60bf4fbe3..715b5a938 100644
--- a/python/pykernel/_src/kernel_ast.py
+++ b/python/pykernel/_src/kernel_ast.py
@@ -211,8 +211,8 @@ def visit_For(self, node):
         with InsertionPoint(for_op.body), Location.unknown():
             self.symbol_tables.append({})
 
-            # Add the iterator into the symbol_table
-            self.symbol_tables[-1][node.target.id] = for_op.induction_variable
+            # Add the iterator into the symbol table.
+            self._set_var(node.target.id, for_op.induction_variable)
 
             for stmt in node.body:
                 self.visit(stmt)
@@ -259,17 +259,15 @@ def visit_Assign(self, node):
                     raise ValueError(
                         f"Not enough values to unpack from rt_args slice (expected {len(_vars)}, got {len(values)})"
                     )
-                # Since we are unpacking a tuple, types can't be assigned here:
-                sym_table = self.symbol_tables[-1]
+                # Since we are unpacking a tuple, types can't be assigned here.
                 for i in range(len(_vars)):
-                    sym_table[_tuple.elts[i].id] = values[i]
+                    self._set_var(_tuple.elts[i].id, values[i])
 
                 # Exit out of function now
                 return
 
         var = self.visit(node.targets[0])
         value = self.visit(node.value)
-        sym_table = self.symbol_tables[-1]
 
         # Handle Subscript Assignment here
         if isinstance(node.targets[0], ast.Subscript):
@@ -282,13 +280,12 @@ def visit_Assign(self, node):
         if hasattr(var, "type") and isinstance(var.type, MemRefType):
             memref.StoreOp(value, var, [arith.ConstantOp(IndexType.get(self.ctx), 0)])
         else:
-            sym_table[var_name] = value
+            self._set_var(var_name, value)
 
     def visit_AnnAssign(self, node):
         # NOTE: TTKernel types can not be used with memrefs
         var = self.visit(node.target)
         value = self.visit(node.value)
-        sym_table = self.symbol_tables[-1]
         var_name = node.target.id
 
         # Check the annotation for array creation
@@ -308,7 +305,7 @@ def visit_AnnAssign(self, node):
                 memref_type = MemRefType.get(
                     [elt.value for elt in node.annotation.elts[1:]], var_type
                 )
-                sym_table[var_name] = memref.alloca(memref_type, [], [])
+                self._set_var(var_name, memref.alloca(memref_type, [], []))
                 return
             else:
                 raise NotImplementedError(
@@ -324,7 +321,7 @@ def visit_AnnAssign(self, node):
             var_type = value.type
             memref_type = MemRefType.get([1], var_type)
             var = memref.alloca(memref_type, [], [])
-            sym_table[var_name] = var
+            self._set_var(var_name, var)
         else:
             assert isinstance(var, MemRefType), "Can not AnnAssign to non-memref types"
 
diff --git a/python/ttl/_src/ttl_ast.py b/python/ttl/_src/ttl_ast.py
index 5e4073309..898ec4d6e 100644
--- a/python/ttl/_src/ttl_ast.py
+++ b/python/ttl/_src/ttl_ast.py
@@ -182,11 +182,10 @@ def visit_Assign(self, node):
                 f"Cannot unpack {len(value)} values into {len(targets)} variables"
             )
 
-        sym_table = self.symbol_tables[-1]
         for elt, val in zip(targets, value):
             if not isinstance(elt, ast.Name):
                 raise ValueError("Tuple unpacking requires simple variable names")
-            sym_table[elt.id] = val
+            self._set_var(elt.id, val)
 
     def _loc_for_node(self, node):
         """Return file location for node if debug_locations enabled, else name location."""
@@ -311,7 +310,7 @@ def visit_AugAssign(self, node):
                 iadd_fn = self._fn_map.get(f"{mlir_type}.__iadd__")
                 if iadd_fn:
                     result = iadd_fn(target, rhs)
-                    self.symbol_tables[-1][node.target.id] = result
+                    self._set_var(node.target.id, result)
                     return
             return super().visit_AugAssign(node)
 
@@ -545,8 +544,8 @@ def _emit_entry(self, node):
         self.symbol_tables.append({})
         func_bb = self.func_entry.add_entry_block()
 
-        # Add ttl module to symbol table
-        self.symbol_tables[-1]["ttl"] = ttl
+        # Add ttl module to symbol table.
+        self._set_var("ttl", ttl)
 
         # Ensure TTL dialect is registered for type parsing
         ttl.ensure_dialects_registered(self.ctx)
@@ -555,12 +554,12 @@ def _emit_entry(self, node):
 
         # Emit function body
         with InsertionPoint(func_bb):
-            # Map TensorAccessor function arguments to symbol table
+            # Map TensorAccessor function arguments to symbol table.
             for i, name in enumerate(self._tensor_accessor_names):
-                self.symbol_tables[-1][name] = func_bb.arguments[i]
+                self._set_var(name, func_bb.arguments[i])
                 self.streams.add(name)
 
-            # Prepopulate other captures (non-tensor)
+            # Prepopulate other captures (non-tensor).
             from ..circular_buffer import CircularBuffer
 
             for name, val in self.captures.items():
@@ -568,16 +567,11 @@ def _emit_entry(self, node):
                     continue  # Already handled via function arguments
                 assert isinstance(name, str)
                 if isinstance(val, int):
-                    self.symbol_tables[-1][name] = arith.ConstantOp(
-                        IndexType.get(self.ctx), val
-                    )
+                    self._set_var(name, arith.ConstantOp(IndexType.get(self.ctx), val))
                 elif isinstance(val, float):
-                    self.symbol_tables[-1][name] = arith.ConstantOp(
-                        F32Type.get(self.ctx), val
-                    )
+                    self._set_var(name, arith.ConstantOp(F32Type.get(self.ctx), val))
                 elif isinstance(val, CircularBuffer):
-                    cb_val = self._emit_cb_from_capture(val)
-                    self.symbol_tables[-1][name] = cb_val
+                    self._set_var(name, self._emit_cb_from_capture(val))
                 else:
                     self._raise_error(
                         node, f"Invalid capture type for var {name}: {type(val)}"
@@ -963,7 +957,7 @@ def visit_With(self, node):
                             optional_vars,
                             "'with ... as var' requires a simple variable name",
                         )
-                    self.symbol_tables[-1][optional_vars.id] = acquire_result
+                    self._set_var(optional_vars.id, acquire_result)
 
             for stmt in node.body:
                 self.visit(stmt)

From 903af079cb4fb187fb8a531a8f32d7e7fcfd5799 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sun, 12 Apr 2026 23:14:02 -0700
Subject: [PATCH 17/31] fix test

---
 test/python/matmul_l1_acc_multinode.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py
index e086d09b5..293a472b2 100644
--- a/test/python/matmul_l1_acc_multinode.py
+++ b/test/python/matmul_l1_acc_multinode.py
@@ -133,14 +133,16 @@ def writer():
 #   3. Disable after cb_push_back following the loop
 # =============================================================================
 
-# CHECK-CPP:      PACK((llk_pack_reconfig_l1_acc(
-# CHECK-CPP-NEXT: for
+# CHECK-CPP-DAG:  int32_t [[ENABLE:v[0-9]+]] = 1;
+# CHECK-CPP-DAG:  int32_t [[DISABLE:v[0-9]+]] = 0;
+# CHECK-CPP:      PACK((llk_pack_reconfig_l1_acc([[DISABLE]])));
+# CHECK-CPP:      for
 # CHECK-CPP:        matmul_block(
 # CHECK-CPP:        pack_tile
 # CHECK-CPP:        if (
-# CHECK-CPP-NEXT:   PACK((llk_pack_reconfig_l1_acc(
+# CHECK-CPP-NEXT:   PACK((llk_pack_reconfig_l1_acc([[ENABLE]])));
 # CHECK-CPP:      cb_push_back(
-# CHECK-CPP-NEXT: PACK((llk_pack_reconfig_l1_acc(
+# CHECK-CPP:      PACK((llk_pack_reconfig_l1_acc([[DISABLE]])));
 
 # CHECK-RESULT: PASS
 

From 61d0a1523fc020016885e4c8e4d4d7657222e005 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Mon, 13 Apr 2026 07:07:08 -0700
Subject: [PATCH 18/31] add subblocking test

---
 .../TTLToTTKernel/matmul_subblock_l1_acc.mlir | 107 ++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir

diff --git a/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir
new file mode 100644
index 000000000..ebf2b5aeb
--- /dev/null
+++ b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir
@@ -0,0 +1,107 @@
+// Matmul with subblocking AND L1 accumulation. Output 3x3 bf16 = 9 tiles
+// exceeds bf16 DST capacity (8), triggering subblocking. The user K loop
+// with {accumulate} triggers L1 acc annotation and pack_reconfig_l1_acc
+// guard insertion.
+
+// RUN: ttlang-opt %s \
+// RUN:   -pass-pipeline='builtin.module( \
+// RUN:     func.func(ttl-annotate-l1-acc-loops, convert-ttl-to-compute, \
+// RUN:       ttl-assign-dst{enable-fpu-binary-ops=0}, \
+// RUN:       ttl-subblock-compute-for-dst, ttl-lower-matmul-block, \
+// RUN:       ttl-lower-to-loops{dst-accumulation=1}, ttl-schedule-operations, \
+// RUN:       ttl-annotate-cb-associations), \
+// RUN:     convert-ttl-to-ttkernel, ttkernel-insert-inits, \
+// RUN:     ttkernel-insert-l1-accumulation, canonicalize, cse)' \
+// RUN:   --split-input-file | FileCheck %s
+
+// CHECK-LABEL: func.func @matmul_3x3_k_loop
+// Disable before the K loop.
+// CHECK-DAG: %[[C0_I32:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C1_I32:.*]] = arith.constant 1 : i32
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+// K loop with subblock loops inside.
+// CHECK: scf.for %[[K_IV:.*]] = %[[K_LB:.*]] to
+// Subblock loop: acquire, matmul, 3 pack_tiles (3x1 subblock), release.
+// CHECK:   scf.for
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.matmul_block
+// CHECK-COUNT-3: ttkernel.pack_tile
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:   }
+// Enable after first K iteration.
+// CHECK:   arith.cmpi eq, %[[K_IV]], %[[K_LB]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[C1_I32]])
+// CHECK: }
+// Disable after push.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+func.func @matmul_3x3_k_loop(
+    %arg0: tensor<3x2x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<2x3x!ttcore.tile<32x32, bf16>>) -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<3x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<3x2x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x3x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x3x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c2 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<3x2x!ttcore.tile<32x32, bf16>>, tensor<2x3x!ttcore.tile<32x32, bf16>> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<3x3x!ttcore.tile<32x32, bf16>>, tensor<3x3x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<3x3x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// 8x8 output (64 tiles >> DST capacity 8) with K=4: heavily subblocked.
+// Verifies that multiple levels of subblock loops all sit inside the 
+// K loop's L1 acc guards.
+
+// CHECK-LABEL: func.func @matmul_8x8_k4
+// CHECK-DAG: %[[C0_I32:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C1_I32:.*]] = arith.constant 1 : i32
+// Disable before K loop.
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+// K loop -> subblock row loop -> acquire, matmul K loop, 8x pack, release.
+// CHECK: scf.for %[[K_IV:.*]] = %[[K_LB:.*]] to
+// CHECK:   scf.for
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     scf.for
+// CHECK:       ttkernel.matmul_block
+// CHECK-COUNT-8: ttkernel.pack_tile
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:   }
+// Enable after first K iteration.
+// CHECK:   arith.cmpi eq, %[[K_IV]], %[[K_LB]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[C1_I32]])
+// CHECK: }
+// Disable after push.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+func.func @matmul_8x8_k4(
+    %arg0: tensor<8x8x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<8x8x!ttcore.tile<32x32, bf16>>) -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<8x8x!ttcore.tile<32x32, bf16>>, !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<8x8x!ttcore.tile<32x32, bf16>>, !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[8, 8], !ttcore.tile<32x32, bf16>, 2> -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<8x8x!ttcore.tile<32x32, bf16>>, tensor<8x8x!ttcore.tile<32x32, bf16>> -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<8x8x!ttcore.tile<32x32, bf16>>, tensor<8x8x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<8x8x!ttcore.tile<32x32, bf16>>
+}

From 7f8f840d7fbeefa228cd5f5b340a582eeab6418d Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Mon, 13 Apr 2026 07:58:56 -0700
Subject: [PATCH 19/31] precommit

---
 .../ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir
index ebf2b5aeb..3e37a056a 100644
--- a/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir
+++ b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir
@@ -60,7 +60,7 @@ func.func @matmul_3x3_k_loop(
 // -----
 
 // 8x8 output (64 tiles >> DST capacity 8) with K=4: heavily subblocked.
-// Verifies that multiple levels of subblock loops all sit inside the 
+// Verifies that multiple levels of subblock loops all sit inside the
 // K loop's L1 acc guards.
 
 // CHECK-LABEL: func.func @matmul_8x8_k4

From 6b9a14135221073fb4ee75f1cdfced53304e067d Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Mon, 13 Apr 2026 16:57:56 -0700
Subject: [PATCH 20/31] address comments

---
 .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp  |  9 +++--
 test/python/matmul_l1_acc_multinode.py        | 16 ++++++---
 test/python/test_matmul_l1_acc_multinode.py   |  3 ++
 .../TTL/Transforms/annotate_l1_acc_loops.mlir | 34 +++++++++++++++++++
 4 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
index 767317fdf..7757c04ed 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -41,11 +41,14 @@ struct TTLAnnotateL1AccLoopsPass
         return;
       }
 
-      // Check if the loop body contains an accumulating store (ttl.store
-      // with the {accumulate} attribute, emitted by the ``+=`` operator).
+      // Check if this loop directly contains an accumulating store
+      // (ttl.store with the {accumulate} attribute, emitted by +=).
+      // Only count stores whose nearest enclosing scf.for is this forOp,
+      // so that nested inner loops are not attributed to outer loops.
       bool hasAccumulatingStore = false;
       forOp.getBody()->walk([&](StoreOp store) {
-        if (store.getAccumulate()) {
+        if (store.getAccumulate() &&
+            store->getParentOfType<scf::ForOp>() == forOp) {
           hasAccumulatingStore = true;
         }
       });
diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py
index 293a472b2..822cc7692 100644
--- a/test/python/matmul_l1_acc_multinode.py
+++ b/test/python/matmul_l1_acc_multinode.py
@@ -8,9 +8,12 @@
 # RUN: FileCheck %s --check-prefix=CHECK-RESULT < %t.output
 
 """
-Multinode matmul with L1 packer accumulation. Mirrors the benchmark kernel
-(make_matmul_l1_acc / v4_l1_acc): auto grid, split DMA (reader=A,
-writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 1024x1024x1024.
+Multinode matmul with L1 packer accumulation. Auto grid, split DMA (reader=A,
+writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 3072x1024x3072.
+
+The larger dimensions (96x32x96 tiles, 12x4x12 blocks) ensure each core
+handles multiple output blocks (ceil(12/8)=2 per axis on an 8x8 grid),
+exercising the per-block L1 acc disable/re-enable sequence.
 
 The compute thread uses += for accumulation across K iterations. The
 compiler inserts pack_reconfig_l1_acc guards so each K iteration packs
@@ -150,8 +153,11 @@ def writer():
     device = ttnn.open_device(device_id=0)
 
     try:
-        # 32x32x32 tiles = 1024x1024x1024, 8x8x8 blocks -> K_num_blocks=4
-        Mt, Kt, Nt = 32, 32, 32
+        # 96x32x96 tiles = 3072x1024x3072, 8x8x8 blocks -> 12x4x12 blocks.
+        # With an 8x8 grid each core handles ceil(12/8)=2 M-blocks and
+        # 2 N-blocks (4 output blocks), exercising the per-block L1 acc
+        # disable/re-enable sequence.
+        Mt, Kt, Nt = 96, 32, 96
         M, K, N = Mt * TILE, Kt * TILE, Nt * TILE
 
         a_torch = torch.randn(M, K, dtype=torch.bfloat16)
diff --git a/test/python/test_matmul_l1_acc_multinode.py b/test/python/test_matmul_l1_acc_multinode.py
index c0d456838..70f3fdab6 100644
--- a/test/python/test_matmul_l1_acc_multinode.py
+++ b/test/python/test_matmul_l1_acc_multinode.py
@@ -125,6 +125,9 @@ def writer():
     (8, 4, 8, 4, 4, (2, 2)),
     (8, 4, 8, 4, 4, "auto"),
     (16, 8, 16, 8, 8, "auto"),
+    # Multi-block per core: M_num=4, N_num=4 on 2x2 grid -> 2 blocks/core/axis.
+    # Exercises per-block L1 acc disable/re-enable across output blocks.
+    (16, 4, 16, 4, 4, (2, 2)),
 ]
 
 
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
index 41f5bb935..9596fe1b2 100644
--- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
@@ -77,6 +77,40 @@ func.func @skip_reduction_loop(
 
 // -----
 
+// Nested loops: only the innermost loop containing the accumulating store
+// should be annotated; the trailing CHECK-NOT rejects an annotated outer loop.
+
+// CHECK-LABEL: func.func @nested_only_inner
+// CHECK: scf.for
+// CHECK-NOT: ttl.l1_acc_loop
+// CHECK:   scf.for
+// CHECK:   } {ttl.l1_acc_loop}
+// CHECK-NOT: ttl.l1_acc_loop
+func.func @nested_only_inner(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  // Outer loop (e.g., N-block loop) — should NOT be annotated.
+  scf.for %outer = %c0 to %c4 step %c1 {
+    // Inner loop (K-accumulation loop) — should be annotated.
+    scf.for %iv = %c0 to %c4 step %c1 {
+      %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+      ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    }
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
 // Loop without any store should NOT be annotated.
 
 // CHECK-LABEL: func.func @no_store

From 3a3a264e3e5a71b335c8b0fc2bbcaceb6db097e5 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Mon, 13 Apr 2026 17:41:13 -0700
Subject: [PATCH 21/31] add --ttl-strict-f32-acc compiler option

---
 docs/sphinx/reference/compiler-options.md     | 21 +++++-
 include/ttlang/Dialect/TTL/Passes.td          | 12 +++
 .../Dialect/TTL/Pipelines/TTLPipelines.h      |  4 +
 .../TTKernelInsertL1Accumulation.cpp          | 28 +++++++
 lib/Dialect/TTL/Pipelines/TTLPipelines.cpp    |  6 +-
 python/ttl/compiler_options.py                |  8 ++
 python/ttl/ttl_api.py                         |  3 +-
 .../insert_l1_accumulation_invalid.mlir       | 75 +++++++++++++++++++
 8 files changed, 152 insertions(+), 5 deletions(-)
 create mode 100644 test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir

diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md
index 8f329e75d..189ddb0c0 100644
--- a/docs/sphinx/reference/compiler-options.md
+++ b/docs/sphinx/reference/compiler-options.md
@@ -17,6 +17,7 @@ python my_kernel.py --no-ttl-maximize-dst
 | `--ttl-block-matmul` / `--no-ttl-block-matmul` | enabled | Emit `matmul_block` (processes the full tile block atomically) instead of per-tile matmul loops. Disabling this option is not yet supported. |
 | `--ttl-auto-sync` / `--no-ttl-auto-sync` | disabled | Let the compiler insert and move DFB synchronization ops. When enabled, reserve/push may be refined to per-subblock granularity. When disabled, user-placed reserve/push is preserved as written. |
 | `--ttl-combine-pack-tiles` / `--no-ttl-combine-pack-tiles` | enabled | Combine consecutive `pack_tile` ops on the same CB with contiguous DST and CB indices into a single `pack_tile_block` call. |
+| `--ttl-strict-f32-acc` / `--no-ttl-strict-f32-acc` | disabled | Error at compile time if a `+=` accumulation loop's output block exceeds f32 DST capacity (4 tiles with double-buffering). When enabled, guarantees each accumulation step fits in a single DST section without subblocking. |
 
 ### Other Ways to Set These
 
@@ -110,6 +111,7 @@ ttlang-opt input.mlir -p 'ttl-to-ttkernel-pipeline{maximize-dst=true lower-to-em
 | `use-block-matmul` | bool | `true` | Lower matmul to block-level hardware calls (`experimental::matmul_block`). |
 | `auto-sync` | bool | `false` | Let the compiler insert and move DFB synchronization ops. |
 | `combine-pack-tiles` | bool | `true` | Combine consecutive `pack_tile` ops into `pack_tile_block`. |
+| `strict-f32-acc` | bool | `false` | Error if a `+=` accumulation loop's output block exceeds f32 DST capacity. |
 | `lower-to-emitc` | bool | `false` | Run the TTKernel-to-EmitC backend (produces C++ source). |
 
 The pipeline runs these passes in order:
@@ -125,9 +127,10 @@ The pipeline runs these passes in order:
 9. `ttl-annotate-cb-associations` — annotate block args with CB indices
 10. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel
 11. `ttkernel-insert-inits` — insert hardware init ops before compute ops
-12. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)*
-13. Canonicalization and CSE cleanup
-14. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions`
+12. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity
+13. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)*
+14. Canonicalization and CSE cleanup
+15. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions`
 
 ### Individual Pass Options
 
@@ -185,3 +188,15 @@ Analyze circular buffer producer/consumer relationships and dump the flow graph.
 ```bash
 ttlang-opt input.mlir -p 'ttl-dump-cb-flow-graph{output="/tmp/cb_graph.json"}'
 ```
+
+#### `ttkernel-insert-l1-accumulation`
+
+Insert `pack_reconfig_l1_acc` guards around reduction and accumulation loops.
+
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `strict-f32-acc` | bool | `false` | Error if a user-written `+=` accumulation loop requires subblocking because the output block exceeds f32 DST capacity (4 tiles with double-buffering). |
+
+```bash
+ttlang-opt input.mlir -p 'builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})'
+```
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 43540f6d1..bcc284405 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -51,8 +51,20 @@ def TTKernelInsertL1Accumulation
     Reduction loops are identified by the `ttl.l1_acc_loop` (user-written)
     or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for`
     ops, with `ttl.l1_acc_loop` taking precedence.
+
+    When `strict-f32-acc` is enabled, the pass errors if a user-written
+    accumulation loop (`+=`, marked `ttl.l1_acc_loop`) contains subblock
+    loops, indicating the output block exceeds f32 DST capacity. This
+    guarantees each K iteration fits in a single DST section.
   }];
 
+  let options = [
+    Option<"strictF32Acc", "strict-f32-acc", "bool", "false",
+           "Error if a user-written accumulation loop requires subblocking "
+           "(output exceeds f32 DST capacity). Guarantees full f32 "
+           "precision per accumulation step.">
+  ];
+
   let dependentDialects = [
     "::mlir::arith::ArithDialect",
     "::mlir::scf::SCFDialect",
diff --git a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
index 18b4c8a4d..0014affd7 100644
--- a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
+++ b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
@@ -44,6 +44,10 @@ struct TTLToTTKernelPipelineOptions
       *this, "reduce-full-fp32",
       llvm::cl::desc("Enable FP32 accumulation for reduce operations."),
       llvm::cl::init(true)};
+  Option<bool> strictF32Acc{
+      *this, "strict-f32-acc",
+      llvm::cl::desc("Error if accumulation output exceeds f32 DST capacity."),
+      llvm::cl::init(false)};
 };
 
 void createTTLToTTKernelPipeline(mlir::OpPassManager &pm,
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 41cfeb5a6..ae98876b7 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -63,6 +63,10 @@ static scf::ForOp findOutermostL1AccLoop(Operation *op) {
 struct TTKernelInsertL1AccumulationPass
     : public impl::TTKernelInsertL1AccumulationBase<
           TTKernelInsertL1AccumulationPass> {
+  using Base =
+      impl::TTKernelInsertL1AccumulationBase<TTKernelInsertL1AccumulationPass>;
+  using Base::Base;
+
   void runOnOperation() override {
     auto moduleOp = getOperation();
 
@@ -86,6 +90,30 @@ struct TTKernelInsertL1AccumulationPass
       }
     });
 
+    // When --strict-f32-acc is set, error if any user-written accumulation
+    // loop (kL1AccLoopAttrName, from +=) contains subblock loops, which
+    // indicates the output block exceeds f32 DST capacity.
+    // TODO(ttl): Instead of erroring, allocate an f32 L1 temporary and
+    // emit a cast to bf16 after the loop. This would give full f32
+    // precision regardless of block size, at the cost of 2x L1 per tile.
+    if (strictF32Acc) {
+      for (auto loop : l1AccLoops) {
+        if (!loop->hasAttr(kL1AccLoopAttrName))
+          continue;
+        bool hasSubblockLoop = false;
+        loop->walk([&](scf::ForOp inner) {
+          if (inner->hasAttr(kSubblockLoopStrideAttrName))
+            hasSubblockLoop = true;
+        });
+        if (hasSubblockLoop) {
+          loop->emitError(
+              "output block exceeds f32 DST capacity; reduce block "
+              "dimensions or compile without --ttl-strict-f32-acc");
+          return signalPassFailure();
+        }
+      }
+    }
+
     // L1 accumulation guard placement. For any loop that
     // accumulates in L1 (matmul K loop or reduce loop), the pattern is:
     //
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index 85cc4e47c..cc64fe3f5 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -53,7 +53,11 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
     pm.addPass(createTTLConvertTTLToTTKernel(ttkOpts));
   }
   pm.addPass(createTTKernelInsertInits());
-  pm.addPass(createTTKernelInsertL1Accumulation());
+  {
+    TTKernelInsertL1AccumulationOptions l1AccOpts;
+    l1AccOpts.strictF32Acc = options.strictF32Acc;
+    pm.addPass(createTTKernelInsertL1Accumulation(l1AccOpts));
+  }
   if (options.combinePackTiles) {
     pm.addNestedPass<func::FuncOp>(createTTKernelCombinePackTiles());
   }
diff --git a/python/ttl/compiler_options.py b/python/ttl/compiler_options.py
index 6871cc7fe..bccf088ae 100644
--- a/python/ttl/compiler_options.py
+++ b/python/ttl/compiler_options.py
@@ -76,6 +76,13 @@ def _make_parser() -> argparse.ArgumentParser:
         action=argparse.BooleanOptionalAction,
         help="Enable FP32 accumulation for matmul operations (default: enabled).",
     )
+    p.add_argument(
+        "--ttl-strict-f32-acc",
+        default=None,
+        dest="strict_f32_acc",
+        action=argparse.BooleanOptionalAction,
+        help="Error if accumulation (+=) output block exceeds f32 DST capacity (default: disabled).",
+    )
     return p
 
 
@@ -121,6 +128,7 @@ class CompilerOptions:
     combine_pack_tiles: bool = True
     reduce_full_fp32: bool = True
     matmul_full_fp32: bool = True
+    strict_f32_acc: bool = False
 
     # Fields that were explicitly provided (not defaulted). Excluded from
     # equality and hashing so two instances with the same bool values are
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index 4360b627c..f2c02cc71 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -1267,11 +1267,12 @@ def _compile_kernel(
             pipeline_passes.append(f'ttl-dump-cb-flow-graph{{output="{cb_flow_json}"}}')
 
         reduce_fp32_flag = int(compiler_options.reduce_full_fp32)
+        strict_f32_flag = int(compiler_options.strict_f32_acc)
         pipeline_passes += [
             "ttl-lower-dprint-to-emitc",
             f"convert-ttl-to-ttkernel{{reduce-full-fp32={reduce_fp32_flag}}}",
             "ttkernel-insert-inits",
-            "ttkernel-insert-l1-accumulation",
+            f"ttkernel-insert-l1-accumulation{{strict-f32-acc={strict_f32_flag}}}",
         ]
         if compiler_options.combine_pack_tiles:
             pipeline_passes.append("func.func(ttkernel-combine-pack-tiles)")
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir
new file mode 100644
index 000000000..e79698877
--- /dev/null
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir
@@ -0,0 +1,75 @@
+// Negative tests for ttkernel-insert-l1-accumulation with --strict-f32-acc.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})' --verify-diagnostics --split-input-file
+
+// L1 acc loop with subblock loop inside: strict-f32-acc should error.
+
+func.func @strict_f32_subblock_error() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  // expected-error @below {{output block exceeds f32 DST capacity}}
+  scf.for %iv = %c0 to %c4 step %c1 {
+    scf.for %sb = %c0 to %c2 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.matmul_block(%cb, %cb, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.subblock_loop_stride = 1 : index}
+  } {ttl.l1_acc_loop}
+  return
+}
+
+// -----
+
+// L1 acc loop WITHOUT subblock loops: strict-f32-acc should pass.
+
+// expected-no-diagnostics
+func.func @strict_f32_no_subblock_ok() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Reduction loop (compiler-generated) with subblock: strict-f32-acc should
+// NOT error (only user-written l1_acc_loop triggers the check).
+
+// expected-no-diagnostics
+func.func @strict_f32_reduction_loop_ok() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  scf.for %iv = %c0 to %c2 step %c1 {
+    scf.for %sb = %c0 to %c2 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_sum>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.subblock_loop_stride = 1 : index}
+  } {ttl.reduction_loop}
+  return
+}

From 7295718a0ca11e3a44ee052d1120f888911108d5 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Mon, 13 Apr 2026 18:05:37 -0700
Subject: [PATCH 22/31] add TODO

---
 .../Transforms/TTKernelInsertL1Accumulation.cpp       | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index ae98876b7..a6bfffcc1 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -98,17 +98,18 @@ struct TTKernelInsertL1AccumulationPass
     // precision regardless of block size, at the cost of 2x L1 per tile.
     if (strictF32Acc) {
       for (auto loop : l1AccLoops) {
-        if (!loop->hasAttr(kL1AccLoopAttrName))
+        if (!loop->hasAttr(kL1AccLoopAttrName)) {
           continue;
+        }
         bool hasSubblockLoop = false;
         loop->walk([&](scf::ForOp inner) {
-          if (inner->hasAttr(kSubblockLoopStrideAttrName))
+          if (inner->hasAttr(kSubblockLoopStrideAttrName)) {
             hasSubblockLoop = true;
+          }
         });
         if (hasSubblockLoop) {
-          loop->emitError(
-              "output block exceeds f32 DST capacity; reduce block "
-              "dimensions or compile without --ttl-strict-f32-acc");
+          loop->emitError("output block exceeds f32 DST capacity; reduce block "
+                          "dimensions or compile without --ttl-strict-f32-acc");
           return signalPassFailure();
         }
       }

From 7c5ad7571bbc4154ae880f9e2a97dabfa1011ce4 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 07:01:21 -0700
Subject: [PATCH 23/31] update doc

---
 docs/sphinx/reference/compiler-options.md | 31 ++++++++++++-----------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md
index 189ddb0c0..b18157c0a 100644
--- a/docs/sphinx/reference/compiler-options.md
+++ b/docs/sphinx/reference/compiler-options.md
@@ -116,21 +116,22 @@ ttlang-opt input.mlir -p 'ttl-to-ttkernel-pipeline{maximize-dst=true lower-to-em
 
 The pipeline runs these passes in order:
 
-1. `convert-ttl-to-compute` — lower TTL elementwise tensor ops to `ttl.compute` with tile ops
-2. `ttl-set-compute-kernel-config` — set `fp32_dest_acc_en` / `dst_full_sync_en` defaults
-3. `ttl-assign-dst` — DST register allocation (linear scan with copy insertion)
-4. `ttl-subblock-compute-for-dst` — tile `ttl.compute` into DST-sized subblocks *(only if `maximize-dst=true`)*; optionally refine reserve/push to per-subblock granularity *(only if `auto-sync=true`)*
-5. `ttl-insert-tile-regs-sync` — insert math/pack thread synchronization
-6. `ttl-lower-matmul-block` — mark block-matmul computes and expand stores *(only if `use-block-matmul=true`)*
-7. `ttl-lower-to-loops` — lower `ttl.compute` to `scf.for` loops
-8. `ttl-schedule-operations` — reorder tile ops by dependency depth and kind *(only if `maximize-dst=true`)*
-9. `ttl-annotate-cb-associations` — annotate block args with CB indices
-10. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel
-11. `ttkernel-insert-inits` — insert hardware init ops before compute ops
-12. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity
-13. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)*
-14. Canonicalization and CSE cleanup
-15. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions`
+1. `ttl-annotate-l1-acc-loops` — detect `+=` accumulation loops and annotate for L1 packer accumulation
+2. `convert-ttl-to-compute` — lower TTL elementwise tensor ops to `ttl.compute` with tile ops
+3. `ttl-set-compute-kernel-config` — set `fp32_dest_acc_en` / `dst_full_sync_en` defaults
+4. `ttl-assign-dst` — DST register allocation (linear scan with copy insertion)
+5. `ttl-subblock-compute-for-dst` — tile `ttl.compute` into DST-sized subblocks *(only if `maximize-dst=true`)*; optionally refine reserve/push to per-subblock granularity *(only if `auto-sync=true`)*
+6. `ttl-insert-tile-regs-sync` — insert math/pack thread synchronization
+7. `ttl-lower-matmul-block` — mark block-matmul computes and expand stores *(only if `use-block-matmul=true`)*
+8. `ttl-lower-to-loops` — lower `ttl.compute` to `scf.for` loops
+9. `ttl-schedule-operations` — reorder tile ops by dependency depth and kind *(only if `maximize-dst=true`)*
+10. `ttl-annotate-cb-associations` — annotate block args with CB indices
+11. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel
+12. `ttkernel-insert-inits` — insert hardware init ops before compute ops
+13. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity
+14. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)*
+15. Canonicalization and CSE cleanup
+16. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions`
 
 ### Individual Pass Options
 

From f4112403ba6cf912d7875af9bb2b0dca7e32bb88 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 07:05:24 -0700
Subject: [PATCH 24/31] move the strict-f32-acc option to the subblocking pass;
 other cleanup

---
 docs/sphinx/reference/compiler-options.md     | 14 +---
 include/ttlang/Dialect/TTL/Passes.td          | 17 ++---
 .../TTKernelInsertL1Accumulation.cpp          | 44 +++--------
 lib/Dialect/TTL/Pipelines/TTLPipelines.cpp    |  7 +-
 .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp  |  6 +-
 .../Transforms/TTLSubblockComputeForDST.cpp   | 34 +++++++++
 python/ttl/ttl_api.py                         |  6 +-
 .../insert_l1_accumulation_invalid.mlir       | 75 -------------------
 .../subblock_strict_f32_acc_invalid.mlir      | 61 +++++++++++++++
 9 files changed, 120 insertions(+), 144 deletions(-)
 delete mode 100644 test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir
 create mode 100644 test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir

diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md
index b18157c0a..ff3596620 100644
--- a/docs/sphinx/reference/compiler-options.md
+++ b/docs/sphinx/reference/compiler-options.md
@@ -128,7 +128,7 @@ The pipeline runs these passes in order:
 10. `ttl-annotate-cb-associations` — annotate block args with CB indices
 11. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel
 12. `ttkernel-insert-inits` — insert hardware init ops before compute ops
-13. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity
+13. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops
 14. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)*
 15. Canonicalization and CSE cleanup
 16. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions`
@@ -173,6 +173,7 @@ Partition `ttl.compute` into DST-sized subblocks.
 | Option | Type | Default | Description |
 |---|---|---|---|
 | `subblock-sync` | bool | `false` | Refine DFB reserve/push to per-subblock granularity, enabling `pack_tile_block` for contiguous subblocks. When disabled, user-placed reserve/push is preserved. |
+| `strict-f32-acc` | bool | `false` | Error if a `+=` accumulation loop with non-f32 output requires subblocking. Subblocking reduces accumulation precision because bf16 L1 intermediates truncate f32 DST values. |
 
 ```bash
 ttlang-opt input.mlir -p 'func.func(ttl-subblock-compute-for-dst{subblock-sync=true})'
@@ -190,14 +191,3 @@ Analyze circular buffer producer/consumer relationships and dump the flow graph.
 ttlang-opt input.mlir -p 'ttl-dump-cb-flow-graph{output="/tmp/cb_graph.json"}'
 ```
 
-#### `ttkernel-insert-l1-accumulation`
-
-Insert `pack_reconfig_l1_acc` guards around reduction and accumulation loops.
-
-| Option | Type | Default | Description |
-|---|---|---|---|
-| `strict-f32-acc` | bool | `false` | Error if a user-written `+=` accumulation loop requires subblocking because the output block exceeds f32 DST capacity (4 tiles with double-buffering). |
-
-```bash
-ttlang-opt input.mlir -p 'builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})'
-```
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index bcc284405..f4a49c4fc 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -52,19 +52,8 @@ def TTKernelInsertL1Accumulation
     or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for`
     ops, with `ttl.l1_acc_loop` taking precedence.
 
-    When `strict-f32-acc` is enabled, the pass errors if a user-written
-    accumulation loop (`+=`, marked `ttl.l1_acc_loop`) contains subblock
-    loops, indicating the output block exceeds f32 DST capacity. This
-    guarantees each K iteration fits in a single DST section.
   }];
 
-  let options = [
-    Option<"strictF32Acc", "strict-f32-acc", "bool", "false",
-           "Error if a user-written accumulation loop requires subblocking "
-           "(output exceeds f32 DST capacity). Guarantees full f32 "
-           "precision per accumulation step.">
-  ];
-
   let dependentDialects = [
     "::mlir::arith::ArithDialect",
     "::mlir::scf::SCFDialect",
@@ -225,7 +214,11 @@ def TTLSubblockComputeForDST
     Option<"subblockSync", "subblock-sync", "bool", "false",
            "Refine DFB reserve/push to per-subblock granularity, enabling "
            "pack_tile_block for contiguous subblocks. When disabled (default), "
-           "user-placed reserve/push is preserved.">
+           "user-placed reserve/push is preserved.">,
+    Option<"strictF32Acc", "strict-f32-acc", "bool", "false",
+           "Error if a user-written accumulation loop (+=) with non-f32 "
+           "output requires subblocking. Subblocking reduces accumulation "
+           "precision because bf16 L1 intermediates are narrower than f32 DST.">
   ];
 
   let dependentDialects = [
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index a6bfffcc1..14409d95b 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -63,10 +63,6 @@ static scf::ForOp findOutermostL1AccLoop(Operation *op) {
 struct TTKernelInsertL1AccumulationPass
     : public impl::TTKernelInsertL1AccumulationBase<
           TTKernelInsertL1AccumulationPass> {
-  using Base =
-      impl::TTKernelInsertL1AccumulationBase<TTKernelInsertL1AccumulationPass>;
-  using Base::Base;
-
   void runOnOperation() override {
     auto moduleOp = getOperation();
 
@@ -90,31 +86,6 @@ struct TTKernelInsertL1AccumulationPass
       }
     });
 
-    // When --strict-f32-acc is set, error if any user-written accumulation
-    // loop (kL1AccLoopAttrName, from +=) contains subblock loops, which
-    // indicates the output block exceeds f32 DST capacity.
-    // TODO(ttl): Instead of erroring, allocate an f32 L1 temporary and
-    // emit a cast to bf16 after the loop. This would give full f32
-    // precision regardless of block size, at the cost of 2x L1 per tile.
-    if (strictF32Acc) {
-      for (auto loop : l1AccLoops) {
-        if (!loop->hasAttr(kL1AccLoopAttrName)) {
-          continue;
-        }
-        bool hasSubblockLoop = false;
-        loop->walk([&](scf::ForOp inner) {
-          if (inner->hasAttr(kSubblockLoopStrideAttrName)) {
-            hasSubblockLoop = true;
-          }
-        });
-        if (hasSubblockLoop) {
-          loop->emitError("output block exceeds f32 DST capacity; reduce block "
-                          "dimensions or compile without --ttl-strict-f32-acc");
-          return signalPassFailure();
-        }
-      }
-    }
-
     // L1 accumulation guard placement. For any loop that
     // accumulates in L1 (matmul K loop or reduce loop), the pattern is:
     //
@@ -190,11 +161,13 @@ struct TTKernelInsertL1AccumulationPass
         outermostLoop = loop;
       }
       if (disabledLoops.insert(outermostLoop.getOperation()).second) {
+        Location disableLoc = outermostLoop->getLoc();
         // Disable before the loop.
         builder.setInsertionPoint(outermostLoop);
-        Value disablePre = arith::ConstantOp::create(
-            builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
-        ttk::PackReconfigL1AccOp::create(builder, loc, disablePre);
+        Value disablePre =
+            arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(),
+                                      builder.getI32IntegerAttr(0));
+        ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePre);
 
         // Disable after any consecutive cb_push_back ops that follow the
         // loop. Multi-output computes produce one push per output CB.
@@ -208,9 +181,10 @@ struct TTKernelInsertL1AccumulationPass
         } else {
           builder.setInsertionPointAfter(outermostLoop);
         }
-        Value disablePost = arith::ConstantOp::create(
-            builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
-        ttk::PackReconfigL1AccOp::create(builder, loc, disablePost);
+        Value disablePost =
+            arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(),
+                                      builder.getI32IntegerAttr(0));
+        ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePost);
       }
     }
   }
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index cc64fe3f5..805a3c951 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -33,6 +33,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
   if (options.maximizeDST) {
     TTLSubblockComputeForDSTOptions subblockOpts;
     subblockOpts.subblockSync = options.autoSync;
+    subblockOpts.strictF32Acc = options.strictF32Acc;
     pm.addPass(createTTLSubblockComputeForDST(subblockOpts));
   }
   if (options.useBlockMatmul) {
@@ -53,11 +54,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
     pm.addPass(createTTLConvertTTLToTTKernel(ttkOpts));
   }
   pm.addPass(createTTKernelInsertInits());
-  {
-    TTKernelInsertL1AccumulationOptions l1AccOpts;
-    l1AccOpts.strictF32Acc = options.strictF32Acc;
-    pm.addPass(createTTKernelInsertL1Accumulation(l1AccOpts));
-  }
+  pm.addPass(createTTKernelInsertL1Accumulation());
   if (options.combinePackTiles) {
     pm.addNestedPass<func::FuncOp>(createTTKernelCombinePackTiles());
   }
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
index 7757c04ed..0f1f65ede 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -46,15 +46,17 @@ struct TTLAnnotateL1AccLoopsPass
       // Only count stores whose nearest enclosing scf.for is this forOp,
       // so that nested inner loops are not attributed to outer loops.
       bool hasAccumulatingStore = false;
-      forOp.getBody()->walk([&](StoreOp store) {
+      forOp.getBody()->walk([&](StoreOp store) -> WalkResult {
         if (store.getAccumulate() &&
             store->getParentOfType<scf::ForOp>() == forOp) {
           hasAccumulatingStore = true;
+          return WalkResult::interrupt();
         }
+        return WalkResult::advance();
       });
 
       if (hasAccumulatingStore) {
-        forOp->setAttr(kL1AccLoopAttrName, OpBuilder(forOp).getUnitAttr());
+        forOp->setAttr(kL1AccLoopAttrName, UnitAttr::get(forOp->getContext()));
       }
     });
   }
diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
index da6c65065..0d5c21add 100644
--- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
@@ -17,6 +17,8 @@
 #include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 
+#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h"
+
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -189,6 +191,38 @@ struct TTLSubblockComputeForDSTPass
       return success();
     }
 
+    // When --strict-f32-acc is set and this compute needs subblocking inside a
+    // user-written accumulation loop (+=), error if the output type is not f32.
+    // Subblocking a non-f32 accumulating compute reduces precision: the f32 DST
+    // partial sums are truncated to bf16 when packed to L1 per K step.
+    // TODO(ttl): Instead of erroring, allocate an f32 L1 temporary and
+    // emit a cast to bf16 after the loop, giving full f32 precision at
+    // the cost of 2x L1 per tile.
+    if (strictF32Acc) {
+      bool insideL1AccLoop = false;
+      for (Operation *parent = computeOp->getParentOp(); parent;
+           parent = parent->getParentOp()) {
+        if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
+          if (forOp->hasAttr(kL1AccLoopAttrName)) {
+            insideL1AccLoop = true;
+            break;
+          }
+        }
+      }
+      if (insideL1AccLoop) {
+        auto outType =
+            cast<RankedTensorType>(computeOp.getDpsInits()[0].getType());
+        auto tileType = cast<ttcore::TileType>(outType.getElementType());
+        if (tileType.getDataType() != ttcore::DataType::Float32) {
+          return computeOp.emitError(
+              "subblocking accumulation loop reduces precision: bf16 L1 "
+              "intermediates truncate f32 DST partial sums per K step; "
+              "reduce block dimensions to fit in f32 DST, use f32 output "
+              "type, or compile without --ttl-strict-f32-acc");
+        }
+      }
+    }
+
     // Only parallel dimensions are candidates for subblocking; reduction
     // dimensions must be fully included in each subblock. Matmul K is
     // excluded because it accumulates in-place (see hasMatmulBlock above).
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index f2c02cc71..f2f161dde 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -1231,8 +1231,9 @@ def _compile_kernel(
         ]
         if compiler_options.maximize_dst:
             subblock_sync = "true" if compiler_options.auto_sync else "false"
+            strict_f32 = "true" if compiler_options.strict_f32_acc else "false"
             pipeline_passes.append(
-                f"func.func(ttl-subblock-compute-for-dst{{subblock-sync={subblock_sync}}})"
+                f"func.func(ttl-subblock-compute-for-dst{{subblock-sync={subblock_sync} strict-f32-acc={strict_f32}}})"
             )
         if compiler_options.use_block_matmul:
             pipeline_passes.append("func.func(ttl-lower-matmul-block)")
@@ -1267,12 +1268,11 @@ def _compile_kernel(
             pipeline_passes.append(f'ttl-dump-cb-flow-graph{{output="{cb_flow_json}"}}')
 
         reduce_fp32_flag = int(compiler_options.reduce_full_fp32)
-        strict_f32_flag = int(compiler_options.strict_f32_acc)
         pipeline_passes += [
             "ttl-lower-dprint-to-emitc",
             f"convert-ttl-to-ttkernel{{reduce-full-fp32={reduce_fp32_flag}}}",
             "ttkernel-insert-inits",
-            f"ttkernel-insert-l1-accumulation{{strict-f32-acc={strict_f32_flag}}}",
+            "ttkernel-insert-l1-accumulation",
         ]
         if compiler_options.combine_pack_tiles:
             pipeline_passes.append("func.func(ttkernel-combine-pack-tiles)")
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir
deleted file mode 100644
index e79698877..000000000
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir
+++ /dev/null
@@ -1,75 +0,0 @@
-// Negative tests for ttkernel-insert-l1-accumulation with --strict-f32-acc.
-
-// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})' --verify-diagnostics --split-input-file
-
-// L1 acc loop with subblock loop inside: strict-f32-acc should error.
-
-func.func @strict_f32_subblock_error() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
-  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c0_i32 = arith.constant 0 : i32
-  %c1_i32 = arith.constant 1 : i32
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-  // expected-error @below {{output block exceeds f32 DST capacity}}
-  scf.for %iv = %c0 to %c4 step %c1 {
-    scf.for %sb = %c0 to %c2 step %c1 {
-      ttkernel.tile_regs_acquire() : () -> ()
-      ttkernel.matmul_block(%cb, %cb, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
-      ttkernel.tile_regs_commit() : () -> ()
-      ttkernel.tile_regs_wait() : () -> ()
-      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
-      ttkernel.tile_regs_release() : () -> ()
-    } {ttl.subblock_loop_stride = 1 : index}
-  } {ttl.l1_acc_loop}
-  return
-}
-
-// -----
-
-// L1 acc loop WITHOUT subblock loops: strict-f32-acc should pass.
-
-// expected-no-diagnostics
-func.func @strict_f32_no_subblock_ok() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
-  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %c4_i32 = arith.constant 4 : i32
-  scf.for %iv = %c0 to %c4 step %c1 {
-    ttkernel.tile_regs_acquire() : () -> ()
-    ttkernel.tile_regs_commit() : () -> ()
-    ttkernel.tile_regs_wait() : () -> ()
-    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
-    ttkernel.tile_regs_release() : () -> ()
-  } {ttl.l1_acc_loop}
-  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
-  return
-}
-
-// -----
-
-// Reduction loop (compiler-generated) with subblock: strict-f32-acc should
-// NOT error (only user-written l1_acc_loop triggers the check).
-
-// expected-no-diagnostics
-func.func @strict_f32_reduction_loop_ok() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
-  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
-  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
-  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c2 = arith.constant 2 : index
-  scf.for %iv = %c0 to %c2 step %c1 {
-    scf.for %sb = %c0 to %c2 step %c1 {
-      ttkernel.tile_regs_acquire() : () -> ()
-      ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_sum>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
-      ttkernel.tile_regs_commit() : () -> ()
-      ttkernel.tile_regs_wait() : () -> ()
-      ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
-      ttkernel.tile_regs_release() : () -> ()
-    } {ttl.subblock_loop_stride = 1 : index}
-  } {ttl.reduction_loop}
-  return
-}
diff --git a/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir b/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir
new file mode 100644
index 000000000..8c4318acd
--- /dev/null
+++ b/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir
@@ -0,0 +1,61 @@
+// Negative tests for ttl-subblock-compute-for-dst with --strict-f32-acc.
+// The check fires when a user-written accumulation loop (+=) with non-f32
+// output requires subblocking, because bf16 L1 intermediates truncate f32
+// DST partial sums per K step.
+
+// RUN: ttlang-opt %s \
+// RUN:   --pass-pipeline='builtin.module(func.func( \
+// RUN:     ttl-annotate-l1-acc-loops, convert-ttl-to-compute, \
+// RUN:     ttl-assign-dst{enable-fpu-binary-ops=0}, \
+// RUN:     ttl-subblock-compute-for-dst{strict-f32-acc=true}))' \
+// RUN:   --verify-diagnostics --split-input-file
+
+// bf16 output 3x3 = 9 tiles exceeds f32 DST capacity (4): should error.
+
+func.func @strict_f32_subblock_bf16_error(
+    %arg0: tensor<3x2x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<2x3x!ttcore.tile<32x32, bf16>>) -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>, fp32_dest_acc_en} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<3x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<3x2x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x3x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x3x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c2 step %c1 {
+    // expected-error @below {{subblocking accumulation loop reduces precision}}
+    %mm = ttl.matmul %a, %b : tensor<3x2x!ttcore.tile<32x32, bf16>>, tensor<2x3x!ttcore.tile<32x32, bf16>> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<3x3x!ttcore.tile<32x32, bf16>>, tensor<3x3x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<3x3x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// bf16 output 2x2 = 4 tiles fits in f32 DST (4): no subblocking, no error.
+
+// expected-no-diagnostics
+func.func @strict_f32_fits_in_dst_ok(
+    %arg0: tensor<2x2x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<2x2x!ttcore.tile<32x32, bf16>>) -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>, fp32_dest_acc_en} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<2x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[2, 2], !ttcore.tile<32x32, bf16>, 2> -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c2 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<2x2x!ttcore.tile<32x32, bf16>>, tensor<2x2x!ttcore.tile<32x32, bf16>> -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<2x2x!ttcore.tile<32x32, bf16>>, tensor<2x2x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<2x2x!ttcore.tile<32x32, bf16>>
+}

From c40a42e5a7385b6c38aae3600f772f3589096bbc Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 07:27:03 -0700
Subject: [PATCH 25/31] precommit

---
 docs/sphinx/reference/compiler-options.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md
index ff3596620..78662db94 100644
--- a/docs/sphinx/reference/compiler-options.md
+++ b/docs/sphinx/reference/compiler-options.md
@@ -190,4 +190,3 @@ Analyze circular buffer producer/consumer relationships and dump the flow graph.
 ```bash
 ttlang-opt input.mlir -p 'ttl-dump-cb-flow-graph{output="/tmp/cb_graph.json"}'
 ```
-

From 3fc7ca9c96774ee5dd85276ad94dd0ada41579c8 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 08:45:04 -0700
Subject: [PATCH 26/31] rewrite L1 acc annotation with dominance, fix
 consecutive += loops, and add tests

1. Two consecutive += loops to same reserve
   MLIR: annotate_l1_acc_loops.mlir::consecutive_loops_same_reserve
         insert_l1_accumulation.mlir::consecutive_l1_acc_loops
   Device: test_matmul_l1_acc.py::test_l1_acc_consecutive_loops

2. Mixing += and .store() in same loop
   MLIR: annotate_l1_acc_loops.mlir::mixed_acc_and_plain_store
   Device: test_matmul_l1_acc.py::test_l1_acc_mixed_store

3. += with non-matmul RHS (sum reduction)
   MLIR: annotate_l1_acc_loops.mlir::non_matmul_accumulate
   Device: test_matmul_l1_acc.py::test_l1_acc_sum_reduction

4. Multiple += to different outputs in same loop
   MLIR: insert_l1_accumulation.mlir::two_outputs_one_loop
   Device: test_matmul_l1_acc.py::test_l1_acc_multi_output

5. Output fits in f32 DST (block_n <= 4)
   MLIR: matmul_subblock_l1_acc.mlir::matmul_3x3_k_loop
   Device: test_matmul_l1_acc.py::test_l1_acc_single_core[blk2x2_K2..K8]

6. K=1 single iteration
   MLIR: annotate_l1_acc_loops.mlir::single_iteration
   Device: test_matmul_l1_acc.py::test_l1_acc_single_iteration

7. += inside conditional
   MLIR: annotate_l1_acc_loops.mlir::acc_inside_conditional
   Device: not tested (DSL does not generate runtime conditionals in compute)
---
 include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h   |  22 ++
 include/ttlang/Dialect/TTL/Passes.td          |  36 +--
 .../TTKernelInsertL1Accumulation.cpp          | 241 ++++++++------
 .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp  |  62 ++--
 test/python/test_matmul_l1_acc.py             | 306 ++++++++++++++++++
 .../Transforms/insert_l1_accumulation.mlir    | 157 ++++++++-
 .../TTL/Transforms/annotate_l1_acc_loops.mlir | 143 ++++++++
 7 files changed, 825 insertions(+), 142 deletions(-)

diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
index 94ab10d59..fbe4f5323 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
+++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
@@ -8,10 +8,12 @@
 #include "ttlang/Dialect/TTL/IR/TTL.h"
 #include "ttlang/Dialect/TTL/IR/TTLOps.h"
 #include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h"
+#include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 #include "llvm/ADT/SetVector.h"
@@ -427,6 +429,26 @@ inline TileOp createTileOpWithPlaceholderDstIndex(OpBuilder &builder,
   return tileOp;
 }
 
+/// Collect the CB values targeted by pack_tile ops inside a loop.
+inline llvm::SmallDenseSet<Value, 2> getPackTileCBs(scf::ForOp loop) {
+  namespace ttk = mlir::tt::ttkernel;
+  llvm::SmallDenseSet<Value, 2> cbs;
+  loop->walk([&](ttk::PackTileOp packOp) { cbs.insert(packOp.getOutCb()); });
+  return cbs;
+}
+
+/// Returns true if two loops share any pack_tile CB target.
+inline bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB) {
+  auto cbsA = getPackTileCBs(loopA);
+  auto cbsB = getPackTileCBs(loopB);
+  for (auto cb : cbsA) {
+    if (cbsB.contains(cb)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 } // namespace mlir::tt::ttl
 
 #endif // TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index f4a49c4fc..6a92a0ebe 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -30,26 +30,22 @@ def TTKernelInsertL1Accumulation
     : Pass<"ttkernel-insert-l1-accumulation", "::mlir::ModuleOp"> {
   let summary = "Insert L1 accumulation guards for reduction loops";
   let description = [{
-    Inserts `pack_reconfig_l1_acc` guards inside reduction loops so that
-    pack operations accumulate into L1 instead of overwriting. The enable
-    call happens once after the first iteration's last pack; the L1 acc
-    packer state persists across `tile_regs` boundaries. Disable guards
-    bracket the outermost reduction loop (parallel loops are not
-    considered). Max-reduce loops are excluded (max is not additive).
-
-    The pattern is:
-       pack_reconfig_l1_acc(0)                // disable before loop
-       for (iv = lb; ...) {
-         [subblock 0: acquire...pack...release]
-         [subblock N: acquire...pack...release]
-         if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first
-                                               // iteration's last pack
-       }
-       [cb_push_back if present]
-       pack_reconfig_l1_acc(0)                // disable after loop
-
-    Reduction loops are identified by the `ttl.l1_acc_loop` (user-written)
-    or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for`
+    Inserts `pack_reconfig_l1_acc` guards so that pack operations inside
+    annotated loops accumulate into L1 instead of overwriting. Max-reduce
+    loops are excluded (max is not additive).
+
+    Loops are grouped into accumulation scopes: consecutive sibling loops
+    that pack to the same CB share a single disable pair. Nested annotated
+    loops are folded into the outermost ancestor. Each group gets:
+    - `pack_reconfig_l1_acc(0)` before the first loop.
+    - `if (iv == lb) pack_reconfig_l1_acc(1)` after each loop's first
+      iteration (the L1 acc state persists across `tile_regs` boundaries).
+    - For the second+ loop in a group, an enable before the loop to re-enable
+      L1 acc after init ops that may reset packer state.
+    - `pack_reconfig_l1_acc(0)` after the last `cb_push_back`.
+
+    Loops are identified by the `ttl.l1_acc_loop` (user-written) or
+    `ttl.reduction_loop` (compiler-generated) attributes on `scf.for`
     ops, with `ttl.l1_acc_loop` taking precedence.
 
   }];
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 14409d95b..9a4e2a394 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 
 #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
@@ -26,9 +27,10 @@ namespace ttk = mlir::tt::ttkernel;
 
 namespace {
 
-/// Find the enclosing loop that controls L1 accumulation.
-/// Prefers kL1AccLoopAttrName (user-annotated). Falls back to innermost
-/// kReductionLoopAttrName (compiler-generated, for reduce ops).
+/// Find the innermost enclosing L1 acc or reduction loop.
+/// User-written += loops (kL1AccLoopAttrName) take precedence over
+/// compiler-generated reduction loops because the user-specified loop
+/// structure determines the accumulation granularity.
 static scf::ForOp findL1AccLoop(Operation *op) {
   scf::ForOp reductionFallback;
   for (Operation *parent = op->getParentOp(); parent;
@@ -45,15 +47,16 @@ static scf::ForOp findL1AccLoop(Operation *op) {
   return reductionFallback;
 }
 
-/// Find the outermost enclosing L1 acc or reduction loop for the disable guard.
-static scf::ForOp findOutermostL1AccLoop(Operation *op) {
-  scf::ForOp outermost;
-  for (Operation *parent = op->getParentOp(); parent;
+/// Walk from loop up through parent ops, returning the outermost
+/// annotated ancestor. Returns loop itself if no annotated ancestor exists.
+static scf::ForOp findOutermostAnnotatedAncestor(scf::ForOp loop) {
+  scf::ForOp outermost = loop;
+  for (Operation *parent = loop->getParentOp(); parent;
        parent = parent->getParentOp()) {
-    if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
-      if (forOp->hasAttr(kL1AccLoopAttrName) ||
-          forOp->hasAttr(kReductionLoopAttrName)) {
-        outermost = forOp;
+    if (auto parentFor = dyn_cast<scf::ForOp>(parent)) {
+      if (parentFor->hasAttr(kL1AccLoopAttrName) ||
+          parentFor->hasAttr(kReductionLoopAttrName)) {
+        outermost = parentFor;
       }
     }
   }
@@ -66,15 +69,25 @@ struct TTKernelInsertL1AccumulationPass
   void runOnOperation() override {
     auto moduleOp = getOperation();
 
-    // Collect L1 acc loops (kL1AccLoopAttrName or kReductionLoopAttrName)
-    // that contain pack_tile activity.
+    // Walk from TileRegsAcquireOp upward to find annotated loops —
+    // only loops with actual pack activity need L1 acc guards.
     SmallVector<scf::ForOp> l1AccLoops;
-    llvm::SmallDenseSet<Operation *> seenLoops;
+    llvm::SmallDenseSet<Operation *> visitedLoops;
     moduleOp->walk([&](ttk::TileRegsAcquireOp acquireOp) {
       auto loop = findL1AccLoop(acquireOp);
-      if (!loop || !seenLoops.insert(loop).second) {
+      if (!loop || !visitedLoops.insert(loop).second) {
         return;
       }
+      // Skip if this pass already ran (idempotency).
+      bool alreadyProcessed = false;
+      loop->walk([&](ttk::PackReconfigL1AccOp) {
+        alreadyProcessed = true;
+        return WalkResult::interrupt();
+      });
+      if (alreadyProcessed) {
+        return;
+      }
+      // Max reduce is not additive — L1 acc would corrupt the running max.
       bool hasMaxReduce = false;
       loop->walk([&](ttk::ReduceTileOp reduceOp) {
         if (reduceOp.getReduceType() == ttk::ReduceType::Max) {
@@ -86,106 +99,134 @@ struct TTKernelInsertL1AccumulationPass
       }
     });
 
-    // L1 accumulation guard placement. For any loop that
-    // accumulates in L1 (matmul K loop or reduce loop), the pattern is:
-    //
-    //   pack_reconfig_l1_acc(0)                // disable before loop
-    //   for (iv = lb; ...) {
-    //     [subblock 0: acquire...pack...release]
-    //     [subblock N: acquire...pack...release]
-    //     if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first
-    //                                           // iteration's last pack
-    //   }
-    //   [cb_push_back if present]
-    //   pack_reconfig_l1_acc(0)                // disable after loop
-    //
-    // The L1 acc state persists across multiple dst sections, so the enable
-    // call only needs to happen once (after the first iteration completes
-    // all its packs). Disable guards are inserted once per outermost
-    // reduction loop (parallel loops are not considered).
-
-    // Find the insertion point for the enable guard: the top-level op in
-    // the loop body that contains the last tile_regs_release.
-    auto findTopLevelAncestor = [](Operation *op,
-                                   Block *loopBody) -> Operation * {
-      while (op && op->getBlock() != loopBody) {
-        op = op->getParentOp();
-      }
-      return op;
-    };
-
-    llvm::SmallDenseMap<Operation *, Operation *> enablePointPerLoop;
+    // The enable guard goes after the last pack in the first iteration.
+    // Packs live inside tile_regs_acquire/release sections, which may be
+    // nested in subblock loops. The top-level ancestor of the last release
+    // in the loop body is the correct insertion point.
+    llvm::SmallDenseMap<Operation *, Operation *> l1AccEnablePoint;
     for (auto loop : l1AccLoops) {
-      Operation *lastTopLevel = nullptr;
+      Operation *lastReleaseAncestor = nullptr;
       loop->walk([&](ttk::TileRegsReleaseOp releaseOp) {
-        Operation *topLevel = findTopLevelAncestor(releaseOp, loop.getBody());
-        if (topLevel) {
-          lastTopLevel = topLevel;
+        if (auto *ancestor =
+                loop.getBody()->findAncestorOpInBlock(*releaseOp)) {
+          lastReleaseAncestor = ancestor;
         }
       });
-      if (lastTopLevel) {
-        enablePointPerLoop[loop.getOperation()] = lastTopLevel;
+      if (lastReleaseAncestor) {
+        l1AccEnablePoint[loop.getOperation()] = lastReleaseAncestor;
       }
     }
 
-    llvm::SmallDenseSet<Operation *> disabledLoops;
+    // Step 1: Group loops into accumulation scopes. Consecutive sibling
+    // loops that pack to the same CB share a single disable pair. Nested
+    // annotated loops are folded into the outermost ancestor.
+    struct AccGroup {
+      scf::ForOp rootLoop;
+      SmallVector<scf::ForOp> loops;
+      Operation *scopeEnd = nullptr;
+    };
+    SmallVector<AccGroup> groups;
+    llvm::SmallDenseSet<Operation *> assignedToGroup;
+
     for (auto loop : l1AccLoops) {
-      auto iter = enablePointPerLoop.find(loop.getOperation());
-      if (iter == enablePointPerLoop.end()) {
+      if (!l1AccEnablePoint.count(loop.getOperation())) {
         continue;
       }
-      Operation *enablePoint = iter->second;
-      OpBuilder builder(loop->getContext());
-      Location loc = enablePoint->getLoc();
-
-      // Enable L1 acc once, at the end of the first iteration of the
-      // reduction loop. All packs in iteration 0 write without
-      // accumulation; subsequent iterations add to the existing L1 value.
-      builder.setInsertionPointAfter(enablePoint);
-      Value loopIV = loop.getInductionVar();
-      Value loopLB = loop.getLowerBound();
-      Value firstIter = arith::CmpIOp::create(
-          builder, loc, arith::CmpIPredicate::eq, loopIV, loopLB);
-      auto ifOp = scf::IfOp::create(builder, loc, firstIter);
-      builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-      Value enableFlag = arith::ConstantOp::create(
-          builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
-      ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
-
-      // Bracket the outermost accumulation loop with disable guards.
-      // Both kL1AccLoopAttrName and kReductionLoopAttrName mean "all
-      // iterations write to the same CB slot," so the outermost such
-      // loop is the correct accumulation boundary.
-      auto outermostLoop = findOutermostL1AccLoop(loop);
-      if (!outermostLoop) {
-        outermostLoop = loop;
+      if (assignedToGroup.contains(loop.getOperation())) {
+        continue;
       }
-      if (disabledLoops.insert(outermostLoop.getOperation()).second) {
-        Location disableLoc = outermostLoop->getLoc();
-        // Disable before the loop.
-        builder.setInsertionPoint(outermostLoop);
-        Value disablePre =
-            arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(),
-                                      builder.getI32IntegerAttr(0));
-        ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePre);
-
-        // Disable after any consecutive cb_push_back ops that follow the
-        // loop. Multi-output computes produce one push per output CB.
-        Operation *lastPush = nullptr;
-        for (Operation *op = outermostLoop->getNextNode();
-             op && isa<ttk::CBPushBackOp>(op); op = op->getNextNode()) {
-          lastPush = op;
+
+      scf::ForOp rootLoop = findOutermostAnnotatedAncestor(loop);
+
+      AccGroup group;
+      group.rootLoop = rootLoop;
+      group.loops.push_back(loop);
+      assignedToGroup.insert(loop.getOperation());
+
+      // Collect sibling annotated loops that share a pack CB target.
+      for (Operation *op = rootLoop->getNextNode(); op;
+           op = op->getNextNode()) {
+        if (isa<ttk::CBPushBackOp>(op)) {
+          break;
         }
-        if (lastPush) {
-          builder.setInsertionPointAfter(lastPush);
+        auto sibling = dyn_cast<scf::ForOp>(op);
+        if (!sibling) {
+          continue;
+        }
+        if (!sibling->hasAttr(kL1AccLoopAttrName) &&
+            !sibling->hasAttr(kReductionLoopAttrName)) {
+          break;
+        }
+        if (!sharePackCB(rootLoop, sibling)) {
+          break;
+        }
+        group.loops.push_back(sibling);
+        assignedToGroup.insert(sibling.getOperation());
+      }
+
+      // Scope ends at the last cb_push_back directly after the group, if any.
+      Operation *lastInGroup = group.loops.size() > 1
+                                   ? group.loops.back().getOperation()
+                                   : rootLoop.getOperation();
+      group.scopeEnd = lastInGroup;
+      for (Operation *op = lastInGroup->getNextNode(); op;
+           op = op->getNextNode()) {
+        if (isa<ttk::CBPushBackOp>(op)) {
+          group.scopeEnd = op;
         } else {
-          builder.setInsertionPointAfter(outermostLoop);
+          break;
+        }
+      }
+
+      groups.push_back(std::move(group));
+    }
+
+    // Step 2: Emit guards per group.
+    for (auto &group : groups) {
+      OpBuilder builder(group.rootLoop->getContext());
+      Location disableLoc = group.rootLoop->getLoc();
+
+      // Disable before the group.
+      builder.setInsertionPoint(group.rootLoop);
+      Value disableFlag =
+          arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(),
+                                    builder.getI32IntegerAttr(0));
+      ttk::PackReconfigL1AccOp::create(builder, disableLoc, disableFlag);
+
+      for (size_t idx = 0; idx < group.loops.size(); ++idx) {
+        scf::ForOp loop = group.loops[idx];
+        auto iter = l1AccEnablePoint.find(loop.getOperation());
+        if (iter == l1AccEnablePoint.end()) {
+          continue;
         }
-        Value disablePost =
-            arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(),
-                                      builder.getI32IntegerAttr(0));
-        ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePost);
+
+        // For the 2nd+ loop in a group, re-enable L1 acc before
+        // the loop because init ops between loops reset packer state.
+        if (idx > 0) {
+          builder.setInsertionPoint(loop);
+          Value enableFlag = arith::ConstantOp::create(
+              builder, loop->getLoc(), builder.getI32Type(),
+              builder.getI32IntegerAttr(1));
+          ttk::PackReconfigL1AccOp::create(builder, loop->getLoc(), enableFlag);
+        }
+
+        // Conditional enable after the first iteration's last pack.
+        Operation *afterOp = iter->second;
+        Location loc = afterOp->getLoc();
+        builder.setInsertionPointAfter(afterOp);
+        Value firstIter =
+            arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::eq,
+                                  loop.getInductionVar(), loop.getLowerBound());
+        auto ifOp = scf::IfOp::create(builder, loc, firstIter);
+        builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+        Value enableFlag = arith::ConstantOp::create(
+            builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
+        ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
       }
+
+      // Disable after the scope end.
+      builder.setInsertionPointAfter(group.scopeEnd);
+      ttk::PackReconfigL1AccOp::create(builder, disableLoc, disableFlag);
     }
   }
 };
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
index 0f1f65ede..ab025919a 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -10,6 +10,14 @@
 // (ttl.store with the {accumulate} attribute, emitted by +=) and annotates
 // them with kL1AccLoopAttrName for L1 packer accumulation.
 //
+// Uses dominance: for each accumulating store, verifies the destination
+// cb_reserve properly dominates the enclosing loop (the reserve is outside
+// the loop, so the same L1 slot persists across iterations).
+//
+// TTKernelInsertL1Accumulation uses the annotated loops to find enable
+// points, and the enclosing cb_reserve_back/cb_push_back pair to determine
+// the accumulation scope for disable guards.
+//
 //===----------------------------------------------------------------------===//
 
 #include "ttlang/Dialect/TTL/IR/TTL.h"
@@ -17,6 +25,7 @@
 #include "ttlang/Dialect/TTL/Passes.h"
 
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Dominance.h"
 
 #define DEBUG_TYPE "ttl-annotate-l1-acc-loops"
 
@@ -27,37 +36,48 @@ namespace mlir::tt::ttl {
 
 namespace {
 
+/// Returns true if the loop carries any ttl.* annotation, indicating it
+/// was generated or already processed by a compiler pass.
+static bool hasCompilerAnnotation(scf::ForOp loop) {
+  for (auto attr : loop->getAttrs()) {
+    if (attr.getName().getValue().starts_with("ttl.")) {
+      return true;
+    }
+  }
+  return false;
+}
+
 struct TTLAnnotateL1AccLoopsPass
     : public impl::TTLAnnotateL1AccLoopsBase<TTLAnnotateL1AccLoopsPass> {
   void runOnOperation() override {
     func::FuncOp func = getOperation();
+    DominanceInfo domInfo(func);
 
-    func.walk([&](scf::ForOp forOp) {
-      // Skip loops already annotated (compiler-generated or prior run).
-      if (forOp->hasAttr(kL1AccLoopAttrName) ||
-          forOp->hasAttr(kReductionLoopAttrName) ||
-          forOp->hasAttr(kTileLoopStrideAttrName) ||
-          forOp->hasAttr(kSubblockLoopStrideAttrName)) {
+    func.walk([&](StoreOp store) {
+      if (!store.getAccumulate()) {
         return;
       }
 
-      // Check if this loop directly contains an accumulating store
-      // (ttl.store with the {accumulate} attribute, emitted by +=).
-      // Only count stores whose nearest enclosing scf.for is this forOp,
-      // so that nested inner loops are not attributed to outer loops.
-      bool hasAccumulatingStore = false;
-      forOp.getBody()->walk([&](StoreOp store) -> WalkResult {
-        if (store.getAccumulate() &&
-            store->getParentOfType<scf::ForOp>() == forOp) {
-          hasAccumulatingStore = true;
-          return WalkResult::interrupt();
-        }
-        return WalkResult::advance();
-      });
+      auto enclosingLoop = store->getParentOfType<scf::ForOp>();
+      if (!enclosingLoop) {
+        return;
+      }
+      if (hasCompilerAnnotation(enclosingLoop)) {
+        return;
+      }
 
-      if (hasAccumulatingStore) {
-        forOp->setAttr(kL1AccLoopAttrName, UnitAttr::get(forOp->getContext()));
+      // The reserve must properly dominate the enclosing loop: the
+      // reserve is outside the loop so the same L1 slot persists across
+      // iterations. If the reserve is inside the loop, each iteration
+      // gets a fresh slot and accumulation is meaningless.
+      Value reserve = store.getView();
+      Operation *reserveOp = reserve.getDefiningOp();
+      if (reserveOp && !domInfo.properlyDominates(reserveOp, enclosingLoop)) {
+        return;
       }
+
+      enclosingLoop->setAttr(kL1AccLoopAttrName,
+                             UnitAttr::get(enclosingLoop->getContext()));
     });
   }
 };
diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py
index 7f0b8bec3..074a72637 100644
--- a/test/python/test_matmul_l1_acc.py
+++ b/test/python/test_matmul_l1_acc.py
@@ -189,3 +189,309 @@ def test_l1_acc_multicore(Mt, Kt, Nt, block_m, block_n, device):
 
     result = ttnn.to_torch(out).float()
     assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# Non-matmul accumulation: += with a passthrough copy (sum reduction).
+# ---------------------------------------------------------------------------
+
+
+def _make_sum_reduction_kernel():
+    """Sum K input blocks via += (no matmul)."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(inp, out):
+        Kt = inp.shape[0] // TILE
+        inp_dfb = ttl.make_dataflow_buffer_like(inp, shape=(1, 1), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            out_blk = out_dfb.reserve()
+            for _ in range(Kt):
+                inp_blk = inp_dfb.wait()
+                out_blk += inp_blk
+                inp_blk.pop()
+            out_blk.push()
+
+        @ttl.datamovement()
+        def dm_read():
+            for kt in range(Kt):
+                with inp_dfb.reserve() as blk:
+                    ttl.copy(inp[kt : kt + 1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def dm_write():
+            with out_dfb.wait() as blk:
+                ttl.copy(blk, out[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.parametrize("Kt", [2, 4, 8], ids=[f"K{k}" for k in [2, 4, 8]])
+@pytest.mark.requires_device
+def test_l1_acc_sum_reduction(Kt, device):
+    """Sum K tiles via += without matmul (passthrough accumulation)."""
+    inp_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16)
+    golden = inp_torch.float().reshape(Kt, TILE, TILE).sum(dim=0)
+
+    inp_dev = to_dram(inp_torch, device)
+    out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_sum_reduction_kernel()
+    kernel(inp_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# K=1 single iteration: accumulation with one loop iteration.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_l1_acc_single_iteration(device):
+    """K=1: single-iteration += loop. Semantically equivalent to plain store."""
+    M, K, N = TILE, TILE, 2 * TILE
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    out_dev = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_kernel(1, 2, grid=(1, 1))
+    kernel(a_dev, b_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# Consecutive += loops to the same reserve (two input streams).
+# ---------------------------------------------------------------------------
+
+
+def _make_consecutive_acc_kernel(K1, K2):
+    """Two consecutive += loops to one output: out = (a@b summed K1) + (c@d summed K2)."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(a, b, c, d, out):
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2)
+        c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2)
+        d_dfb = ttl.make_dataflow_buffer_like(d, shape=(1, 1), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            out_blk = out_dfb.reserve()
+            for _ in range(K1):
+                a_blk = a_dfb.wait()
+                b_blk = b_dfb.wait()
+                out_blk += a_blk @ b_blk
+                a_blk.pop()
+                b_blk.pop()
+            for _ in range(K2):
+                c_blk = c_dfb.wait()
+                d_blk = d_dfb.wait()
+                out_blk += c_blk @ d_blk
+                c_blk.pop()
+                d_blk.pop()
+            out_blk.push()
+
+        @ttl.datamovement()
+        def reader():
+            for kt in range(K1):
+                with a_dfb.reserve() as blk:
+                    ttl.copy(a[0:1, kt : kt + 1], blk).wait()
+                with b_dfb.reserve() as blk:
+                    ttl.copy(b[kt : kt + 1, 0:1], blk).wait()
+            for kt in range(K2):
+                with c_dfb.reserve() as blk:
+                    ttl.copy(c[0:1, kt : kt + 1], blk).wait()
+                with d_dfb.reserve() as blk:
+                    ttl.copy(d[kt : kt + 1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def writer():
+            with out_dfb.wait() as blk:
+                ttl.copy(blk, out[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.requires_device
+def test_l1_acc_consecutive_loops(device):
+    """Two consecutive += loops to the same reserve block."""
+    K1, K2 = 2, 3
+    a_torch = torch.randn(TILE, K1 * TILE, dtype=torch.bfloat16)
+    b_torch = torch.randn(K1 * TILE, TILE, dtype=torch.bfloat16)
+    c_torch = torch.randn(TILE, K2 * TILE, dtype=torch.bfloat16)
+    d_torch = torch.randn(K2 * TILE, TILE, dtype=torch.bfloat16)
+    golden = (
+        (a_torch.float() @ b_torch.float()) + (c_torch.float() @ d_torch.float())
+    ).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    c_dev = to_dram(c_torch, device)
+    d_dev = to_dram(d_torch, device)
+    out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_consecutive_acc_kernel(K1, K2)
+    kernel(a_dev, b_dev, c_dev, d_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# Mixed .store() then += (overwrite first, accumulate rest).
+# ---------------------------------------------------------------------------
+
+
+def _make_mixed_store_acc_kernel(total_k):
+    """First iteration overwrites via .store(), rest accumulate via +=."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(a, b, out):
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            out_blk = out_dfb.reserve()
+            a_blk = a_dfb.wait()
+            b_blk = b_dfb.wait()
+            out_blk.store(a_blk @ b_blk)
+            a_blk.pop()
+            b_blk.pop()
+            for _ in range(total_k - 1):
+                a_blk = a_dfb.wait()
+                b_blk = b_dfb.wait()
+                out_blk += a_blk @ b_blk
+                a_blk.pop()
+                b_blk.pop()
+            out_blk.push()
+
+        @ttl.datamovement()
+        def reader():
+            for _ in range(total_k):
+                with a_dfb.reserve() as blk:
+                    ttl.copy(a[0:1, 0:1], blk).wait()
+                with b_dfb.reserve() as blk:
+                    ttl.copy(b[0:1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def writer():
+            with out_dfb.wait() as blk:
+                ttl.copy(blk, out[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.parametrize("total_k", [2, 4], ids=[f"K{k}" for k in [2, 4]])
+@pytest.mark.requires_device
+def test_l1_acc_mixed_store(total_k, device):
+    """.store() first iteration, += for rest. Result = K * (a @ b)."""
+    a_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16)
+    b_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16)
+    golden = (total_k * (a_torch.float() @ b_torch.float())).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_mixed_store_acc_kernel(total_k)
+    kernel(a_dev, b_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# Multiple += to different outputs in the same loop.
+# ---------------------------------------------------------------------------
+
+
+def _make_multi_output_kernel(Kt):
+    """One loop with += to two independent outputs."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(a, b, c, d, out_a, out_b):
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2)
+        c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2)
+        d_dfb = ttl.make_dataflow_buffer_like(d, shape=(1, 1), block_count=2)
+        out_a_dfb = ttl.make_dataflow_buffer_like(out_a, shape=(1, 1), block_count=2)
+        out_b_dfb = ttl.make_dataflow_buffer_like(out_b, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            blk_a = out_a_dfb.reserve()
+            blk_b = out_b_dfb.reserve()
+            for _ in range(Kt):
+                a_blk = a_dfb.wait()
+                b_blk = b_dfb.wait()
+                blk_a += a_blk @ b_blk
+                a_blk.pop()
+                b_blk.pop()
+                c_blk = c_dfb.wait()
+                d_blk = d_dfb.wait()
+                blk_b += c_blk @ d_blk
+                c_blk.pop()
+                d_blk.pop()
+            blk_a.push()
+            blk_b.push()
+
+        @ttl.datamovement()
+        def reader():
+            for kt in range(Kt):
+                with a_dfb.reserve() as blk:
+                    ttl.copy(a[0:1, kt : kt + 1], blk).wait()
+                with b_dfb.reserve() as blk:
+                    ttl.copy(b[kt : kt + 1, 0:1], blk).wait()
+                with c_dfb.reserve() as blk:
+                    ttl.copy(c[0:1, kt : kt + 1], blk).wait()
+                with d_dfb.reserve() as blk:
+                    ttl.copy(d[kt : kt + 1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def writer():
+            with out_a_dfb.wait() as blk:
+                ttl.copy(blk, out_a[0:1, 0:1]).wait()
+            with out_b_dfb.wait() as blk:
+                ttl.copy(blk, out_b[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.requires_device
+def test_l1_acc_multi_output(device):
+    """Two independent += outputs in the same K loop."""
+    Kt = 4
+    a_torch = torch.randn(TILE, Kt * TILE, dtype=torch.bfloat16)
+    b_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16)
+    c_torch = torch.randn(TILE, Kt * TILE, dtype=torch.bfloat16)
+    d_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16)
+    golden_a = (a_torch.float() @ b_torch.float()).float()
+    golden_b = (c_torch.float() @ d_torch.float()).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    c_dev = to_dram(c_torch, device)
+    d_dev = to_dram(d_torch, device)
+    out_a_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+    out_b_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_multi_output_kernel(Kt)
+    kernel(a_dev, b_dev, c_dev, d_dev, out_a_dev, out_b_dev)
+
+    result_a = ttnn.to_torch(out_a_dev).float()
+    result_b = ttnn.to_torch(out_b_dev).float()
+    assert_pcc(golden_a, result_a, threshold=0.999)
+    assert_pcc(golden_b, result_b, threshold=0.999)
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
index 9d329427a..07750a148 100644
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -1,9 +1,11 @@
 // Verifies ttkernel-insert-l1-accumulation: pack_reconfig_l1_acc guards are
 // inserted around reduction loops. The enable call happens once after the
 // first iteration's last pack (iv == lb), and disable guards bracket the
-// outermost loop.
+// accumulation scope.
 
 // RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s
+// Idempotency: running twice produces the same output.
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation, ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s
 
 // Basic L1 acc loop: enable after first iteration, disable before/after loop.
 
@@ -20,6 +22,7 @@
 // CHECK: }
 // CHECK: ttkernel.cb_push_back
 // CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32)
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
 func.func @basic_l1_acc_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
   %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
   %c0 = arith.constant 0 : index
@@ -48,6 +51,7 @@ func.func @basic_l1_acc_loop() attributes {ttkernel.thread = #ttkernel.thread<co
 // CHECK:   scf.if
 // CHECK:     ttkernel.pack_reconfig_l1_acc
 // CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
 func.func @reduction_loop_fallback() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
   %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
   %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
@@ -128,6 +132,7 @@ func.func @no_reduction_loop() attributes {ttkernel.thread = #ttkernel.thread<co
 // CHECK:     ttkernel.pack_reconfig_l1_acc(%[[ENABLE]]) : (i32)
 // CHECK: }
 // CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
 func.func @subblocked_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
   %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
   %c0 = arith.constant 0 : index
@@ -187,6 +192,7 @@ func.func @l1_acc_loop_no_sync() attributes {ttkernel.thread = #ttkernel.thread<
 // CHECK:   }
 // CHECK:   ttkernel.cb_push_back
 // CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
 func.func @l1_acc_inside_outer_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
   %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
   %c0 = arith.constant 0 : index
@@ -222,6 +228,7 @@ func.func @l1_acc_inside_outer_loop() attributes {ttkernel.thread = #ttkernel.th
 // CHECK: ttkernel.cb_push_back
 // CHECK: ttkernel.cb_push_back
 // CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
 func.func @multi_push_after_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
   %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
   %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
@@ -259,6 +266,7 @@ func.func @multi_push_after_loop() attributes {ttkernel.thread = #ttkernel.threa
 // CHECK:     scf.if
 // CHECK:       ttkernel.pack_reconfig_l1_acc
 // CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
 func.func @nested_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
   %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
   %c0 = arith.constant 0 : index
@@ -292,6 +300,7 @@ func.func @nested_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread<
 // CHECK:     scf.if
 // CHECK:       ttkernel.pack_reconfig_l1_acc
 // CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
 func.func @nested_reduction_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
   %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
   %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
@@ -311,3 +320,149 @@ func.func @nested_reduction_loops() attributes {ttkernel.thread = #ttkernel.thre
   } {ttl.reduction_loop}
   return
 }
+
+// -----
+
+// Two consecutive L1 acc loops writing to the same CB.
+// The reserve/push scope spans both loops. One disable pair brackets the
+// entire scope; each loop keeps its own first-iteration enable guard.
+
+// CHECK-LABEL: func.func @consecutive_l1_acc_loops
+// CHECK: ttkernel.cb_reserve_back
+// Disable before first loop.
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// First loop with enable guard.
+// CHECK: scf.for %[[IV1:.*]] = %[[LB1:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   arith.cmpi eq, %[[IV1]], %[[LB1]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// No disable between the loops. Unconditional enable re-arms L1 acc
+// after any init ops that may reset packer state.
+// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for %[[IV2:.*]] = %[[LB2:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   arith.cmpi eq, %[[IV2]], %[[LB2]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// Push then disable.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @consecutive_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  ttkernel.cb_reserve_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  scf.for %iv1 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  scf.for %iv2 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Single loop with two independent accumulating outputs.
+// Both pack to different CBs but share one L1 acc enable/disable scope.
+
+// CHECK-LABEL: func.func @two_outputs_one_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// Enable after the last release (second output).
+// CHECK:   arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// Two pushes then disable.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @two_outputs_one_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Idempotency: input already has pack_reconfig_l1_acc guards. Running
+// the pass again should not insert duplicates.
+
+// CHECK-LABEL: func.func @already_guarded
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @already_guarded() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c4_i32 = arith.constant 4 : i32
+  ttkernel.pack_reconfig_l1_acc(%c0_i32) : (i32) -> ()
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+    %cmp = arith.cmpi eq, %iv, %c0 : index
+    scf.if %cmp {
+      ttkernel.pack_reconfig_l1_acc(%c1_i32) : (i32) -> ()
+    }
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  ttkernel.pack_reconfig_l1_acc(%c0_i32) : (i32) -> ()
+  return
+}
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
index 9596fe1b2..4f6fc6861 100644
--- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
@@ -111,6 +111,149 @@ func.func @nested_only_inner(
 
 // -----
 
+// Two consecutive += loops to the same reserve. Both should be annotated.
+
+// CHECK-LABEL: func.func @consecutive_loops_same_reserve
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @consecutive_loops_same_reserve(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg2: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg3: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb3 = ttl.bind_cb {cb_index = 3, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb4 = ttl.bind_cb {cb_index = 4, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %c = ttl.attach_cb %arg2, %cb2 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %d = ttl.attach_cb %arg3, %cb3 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb4 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %c, %d : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Loop with both += and plain .store() to same reserve. The loop
+// contains an accumulating store, so it should be annotated.
+
+// CHECK-LABEL: func.func @mixed_acc_and_plain_store
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @mixed_acc_and_plain_store(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// += with non-matmul RHS (passthrough accumulation).
+
+// CHECK-LABEL: func.func @non_matmul_accumulate
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @non_matmul_accumulate(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb1 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttl.store %a, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// K=1 (single iteration loop). Should still be annotated.
+
+// CHECK-LABEL: func.func @single_iteration
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @single_iteration(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c1 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// += inside scf.if inside scf.for. The store's nearest enclosing
+// ForOp is the outer loop, so it should be annotated.
+
+// CHECK-LABEL: func.func @acc_inside_conditional
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @acc_inside_conditional(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %cond: i1) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    scf.if %cond {
+      %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+      ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    }
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
 // Loop without any store should NOT be annotated.
 
 // CHECK-LABEL: func.func @no_store

From d5bcba8c1a6b8d664e4b191de25e790a61c16dff Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 09:25:06 -0700
Subject: [PATCH 27/31] update comments. add another test for consecutive loops
 targeting different CBs

---
 .../TTKernelInsertL1Accumulation.cpp          | 16 ++++---
 .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp  |  4 +-
 .../Transforms/insert_l1_accumulation.mlir    | 46 +++++++++++++++++++
 3 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 9a4e2a394..bbafe955e 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -144,6 +144,8 @@ struct TTKernelInsertL1AccumulationPass
       assignedToGroup.insert(loop.getOperation());
 
       // Collect sibling annotated loops that share a pack CB target.
+      // sharePackCB walks recursively, so for nested loops (rootLoop
+      // wrapping loop), it finds pack_tile ops inside the inner loop.
       for (Operation *op = rootLoop->getNextNode(); op;
            op = op->getNextNode()) {
         if (isa<ttk::CBPushBackOp>(op)) {
@@ -164,16 +166,16 @@ struct TTKernelInsertL1AccumulationPass
         assignedToGroup.insert(sibling.getOperation());
       }
 
-      // Scope ends at the last trailing cb_push_back.
-      Operation *lastInGroup = group.loops.size() > 1
-                                   ? group.loops.back().getOperation()
-                                   : rootLoop.getOperation();
-      group.scopeEnd = lastInGroup;
-      for (Operation *op = lastInGroup->getNextNode(); op;
+      // Find scope end: scan forward from rootLoop past sibling loops
+      // and trailing cb_push_back ops. Starts from rootLoop (not the
+      // last inner loop) because push_back ops are siblings of rootLoop,
+      // not of nested inner loops.
+      group.scopeEnd = rootLoop;
+      for (Operation *op = rootLoop->getNextNode(); op;
            op = op->getNextNode()) {
         if (isa<ttk::CBPushBackOp>(op)) {
           group.scopeEnd = op;
-        } else {
+        } else if (!assignedToGroup.contains(op)) {
           break;
         }
       }
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
index ab025919a..b2b247c6d 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -15,8 +15,8 @@
 // the loop, so the same L1 slot persists across iterations).
 //
 // TTKernelInsertL1Accumulation uses the annotated loops to find enable
-// points, and the enclosing cb_reserve_back/cb_push_back pair to determine
-// the accumulation scope for disable guards.
+// points, and groups consecutive sibling loops by shared pack CB targets
+// to determine the accumulation scope for disable guards.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
index 07750a148..2caaee3e3 100644
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -466,3 +466,49 @@ func.func @already_guarded() attributes {ttkernel.thread = #ttkernel.thread<comp
   ttkernel.pack_reconfig_l1_acc(%c0_i32) : (i32) -> ()
   return
 }
+
+// -----
+
+// Two consecutive annotated loops packing to DIFFERENT CBs.
+// Each loop gets its own independent disable pair.
+
+// CHECK-LABEL: func.func @different_cb_siblings
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv1 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  scf.for %iv2 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}

From dea4d75a5b881f517a6f7609c9acc3e55903efba Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 09:45:35 -0700
Subject: [PATCH 28/31] final cleanup

---
 .../TTKernelInsertL1Accumulation.cpp          | 16 ++++--
 .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp  | 18 +++++++
 test/python/test_matmul_l1_acc.py             | 12 ++---
 .../Transforms/insert_l1_accumulation.mlir    | 52 +++++++++++++++++++
 .../TTL/Transforms/annotate_l1_acc_loops.mlir | 34 +-----------
 .../annotate_l1_acc_loops_invalid.mlir        | 28 ++++++++++
 6 files changed, 117 insertions(+), 43 deletions(-)
 create mode 100644 test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir

diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index bbafe955e..f1b8c3e7c 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -166,17 +166,23 @@ struct TTKernelInsertL1AccumulationPass
         assignedToGroup.insert(sibling.getOperation());
       }
 
-      // Find scope end: scan forward from rootLoop past sibling loops
-      // and trailing cb_push_back ops. Starts from rootLoop (not the
-      // last inner loop) because push_back ops are siblings of rootLoop,
-      // not of nested inner loops.
+      // Find scope end: scan forward from rootLoop past grouped siblings,
+      // init ops between them, and trailing cb_push_back ops. Only stop
+      // at a non-grouped ForOp (a different accumulation scope) or a
+      // cb_reserve_back (start of a new reserve region).
+      // TODO: Consider adding structural accumulation_region ops to make this
+      // more robust and composable.
       group.scopeEnd = rootLoop;
       for (Operation *op = rootLoop->getNextNode(); op;
            op = op->getNextNode()) {
         if (isa<ttk::CBPushBackOp>(op)) {
           group.scopeEnd = op;
-        } else if (!assignedToGroup.contains(op)) {
+        } else if (isa<ttk::CBReserveBackOp>(op)) {
           break;
+        } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+          if (!assignedToGroup.contains(forOp)) {
+            break;
+          }
         }
       }
 
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
index b2b247c6d..20972e5ce 100644
--- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -52,6 +52,7 @@ struct TTLAnnotateL1AccLoopsPass
   void runOnOperation() override {
     func::FuncOp func = getOperation();
     DominanceInfo domInfo(func);
+    bool failed = false;
 
     func.walk([&](StoreOp store) {
       if (!store.getAccumulate()) {
@@ -66,6 +67,19 @@ struct TTLAnnotateL1AccLoopsPass
         return;
       }
 
+      // Conditional += is not supported: the L1 acc enable guard is conditional
+      // based on the loop induction variable, not on whether a pack actually
+      // executed. If the condition is false on iteration 0, subsequent
+      // iterations accumulate into uninitialized L1.
+      if (store->getParentOp() != enclosingLoop.getOperation()) {
+        store->emitError(
+            "+= inside a conditional is not supported (#504); move "
+            "the condition outside the accumulation loop or use a "
+            "separate loop for the conditional path");
+        failed = true;
+        return;
+      }
+
       // The reserve must properly dominate the enclosing loop: the
       // reserve is outside the loop so the same L1 slot persists across
       // iterations. If the reserve is inside the loop, each iteration
@@ -79,6 +93,10 @@ struct TTLAnnotateL1AccLoopsPass
       enclosingLoop->setAttr(kL1AccLoopAttrName,
                              UnitAttr::get(enclosingLoop->getContext()));
     });
+
+    if (failed) {
+      signalPassFailure();
+    }
   }
 };
 
diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py
index 074a72637..8a8ed4d9a 100644
--- a/test/python/test_matmul_l1_acc.py
+++ b/test/python/test_matmul_l1_acc.py
@@ -349,12 +349,12 @@ def test_l1_acc_consecutive_loops(device):
 
 
 # ---------------------------------------------------------------------------
-# Mixed .store() then += (overwrite first, accumulate rest).
+# .store() before loop, += inside loop (overwrite then accumulate).
 # ---------------------------------------------------------------------------
 
 
-def _make_mixed_store_acc_kernel(total_k):
-    """First iteration overwrites via .store(), rest accumulate via +=."""
+def _make_store_then_acc_kernel(total_k):
+    """.store() before the += loop, then K-1 iterations accumulate via +=."""
 
     @ttl.operation(grid=(1, 1))
     def kernel(a, b, out):
@@ -396,8 +396,8 @@ def writer():
 
 @pytest.mark.parametrize("total_k", [2, 4], ids=[f"K{k}" for k in [2, 4]])
 @pytest.mark.requires_device
-def test_l1_acc_mixed_store(total_k, device):
-    """.store() first iteration, += for rest. Result = K * (a @ b)."""
+def test_l1_acc_store_then_acc(total_k, device):
+    """.store() before loop, += inside loop. Result = K * (a @ b)."""
     a_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16)
     b_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16)
     golden = (total_k * (a_torch.float() @ b_torch.float())).float()
@@ -406,7 +406,7 @@ def test_l1_acc_mixed_store(total_k, device):
     b_dev = to_dram(b_torch, device)
     out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
 
-    kernel = _make_mixed_store_acc_kernel(total_k)
+    kernel = _make_store_then_acc_kernel(total_k)
     kernel(a_dev, b_dev, out_dev)
 
     result = ttnn.to_torch(out_dev).float()
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
index 2caaee3e3..b4648f589 100644
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -512,3 +512,55 @@ func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.threa
   ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
   return
 }
+
+// -----
+
+// Consecutive annotated loops with init ops between them (the real-world
+// pattern from the full pipeline). The scope must span past the init ops
+// to include the push after the second loop.
+
+// CHECK-LABEL: func.func @consecutive_with_init_between
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @consecutive_with_init_between() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb_in0 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb_in1 = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  ttkernel.cb_reserve_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  scf.for %iv1 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.matmul_block(%cb_in0, %cb_in0, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  // Stand-in (a dummy constant) for an init op between the two loops (as generated by TTKernelInsertInits).
+  %dummy_init = arith.constant 42 : i32
+  scf.for %iv2 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.matmul_block(%cb_in1, %cb_in1, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
index 4f6fc6861..5042dae62 100644
--- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
@@ -152,10 +152,10 @@ func.func @consecutive_loops_same_reserve(
 // Loop with both += and plain .store() to same reserve. The loop
 // contains an accumulating store, so it should be annotated.
 
-// CHECK-LABEL: func.func @mixed_acc_and_plain_store
+// CHECK-LABEL: func.func @store_and_acc_in_same_loop
 // CHECK: scf.for
 // CHECK: } {ttl.l1_acc_loop}
-func.func @mixed_acc_and_plain_store(
+func.func @store_and_acc_in_same_loop(
     %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
     %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
   %c0 = arith.constant 0 : index
@@ -224,36 +224,6 @@ func.func @single_iteration(
 
 // -----
 
-// += inside scf.if inside scf.for. The store's nearest enclosing
-// ForOp is the outer loop, so it should be annotated.
-
-// CHECK-LABEL: func.func @acc_inside_conditional
-// CHECK: scf.for
-// CHECK: } {ttl.l1_acc_loop}
-func.func @acc_inside_conditional(
-    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
-    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>,
-    %cond: i1) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c4 = arith.constant 4 : index
-  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
-  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
-  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
-  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
-  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
-  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
-  scf.for %iv = %c0 to %c4 step %c1 {
-    scf.if %cond {
-      %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
-      ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
-    }
-  }
-  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
-}
-
-// -----
-
 // Loop without any store should NOT be annotated.
 
 // CHECK-LABEL: func.func @no_store
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir
new file mode 100644
index 000000000..494c1ba1d
--- /dev/null
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir
@@ -0,0 +1,28 @@
+// Negative test: ttl-annotate-l1-acc-loops rejects += inside a conditional.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --verify-diagnostics --split-input-file
+
+// += inside scf.if inside scf.for is rejected.
+
+func.func @acc_inside_conditional(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %cond: i1) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    scf.if %cond {
+      %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+      // expected-error @below {{+= inside a conditional is not supported (#504)}}
+      ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    }
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}

From 6003a7d0dfa060301c3f5e94e39810836d8e82b3 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 10:44:59 -0700
Subject: [PATCH 29/31] Downgrade from `mm_block_init` to `mm_block_init_short`
 for the 2nd+ loop in an accumulation group to avoid clobbering the
 Pack_L1_Acc register on Wormhole, while keeping the UNPACK+MATH
 reconfiguration that's needed for the different input CBs.

---
 .../TTKernelInsertL1Accumulation.cpp          | 29 ++++++++++++++++++-
 .../Transforms/insert_l1_accumulation.mlir    | 10 +++++--
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index f1b8c3e7c..2fe14aaeb 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -189,7 +189,34 @@ struct TTKernelInsertL1AccumulationPass
       groups.push_back(std::move(group));
     }
 
-    // Step 2: Emit guards per group.
+    // Step 2: For the 2nd+ loop in each group, downgrade full
+    // MatmulBlockInitOp to MatmulBlockInitShortOp. The full init
+    // writes config.val[3]=0 which clobbers the Pack_L1_Acc register
+    // bits on Wormhole. init_short only reconfigures UNPACK+MATH,
+    // leaving the PACK configuration (including L1 acc) intact.
+    for (auto &group : groups) {
+      for (size_t idx = 1; idx < group.loops.size(); ++idx) {
+        scf::ForOp loop = group.loops[idx];
+        // The init was hoisted before the loop by InsertInits.
+        for (Operation *op = loop->getPrevNode(); op; op = op->getPrevNode()) {
+          if (auto fullInit = dyn_cast<ttk::MatmulBlockInitOp>(op)) {
+            OpBuilder builder(fullInit);
+            ttk::MatmulBlockInitShortOp::create(
+                builder, fullInit->getLoc(), fullInit.getIn0Cb(),
+                fullInit.getIn1Cb(), fullInit.getTranspose(),
+                fullInit.getCtDim(), fullInit.getRtDim(), fullInit.getKtDim());
+            fullInit->erase();
+            break;
+          }
+          // Stop at a loop or other boundary.
+          if (isa<scf::ForOp>(op)) {
+            break;
+          }
+        }
+      }
+    }
+
+    // Step 3: Emit guards per group.
     for (auto &group : groups) {
       OpBuilder builder(group.rootLoop->getContext());
       Location disableLoc = group.rootLoop->getLoc();
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
index b4648f589..c92fa4e2b 100644
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -524,7 +524,10 @@ func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.threa
 // CHECK: scf.for
 // CHECK:   ttkernel.pack_reconfig_l1_acc
 // CHECK: }
-// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0
+// The full mm_block_init between loops is downgraded to init_short
+// to avoid clobbering Pack_L1_Acc register bits.
+// CHECK-NOT: "ttkernel.mm_block_init"(
+// CHECK: "ttkernel.mm_block_init_short"(
 // CHECK: ttkernel.pack_reconfig_l1_acc
 // CHECK: scf.for
 // CHECK:   ttkernel.pack_reconfig_l1_acc
@@ -551,8 +554,9 @@ func.func @consecutive_with_init_between() attributes {ttkernel.thread = #ttkern
     ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
     ttkernel.tile_regs_release() : () -> ()
   } {ttl.l1_acc_loop}
-  // Init op between the two loops (as generated by TTKernelInsertInits).
-  %dummy_init = arith.constant 42 : i32
+  // Full init between the two loops (as generated by TTKernelInsertInits).
+  // Should be downgraded to init_short by InsertL1Accumulation.
+  "ttkernel.mm_block_init"(%cb_in1, %cb_in1, %cb, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
   scf.for %iv2 = %c0 to %c4 step %c1 {
     ttkernel.tile_regs_acquire() : () -> ()
     ttkernel.matmul_block(%cb_in1, %cb_in1, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()

From fd5b0bf6c945fb51b676ebb8daa4f4eb7464e3bf Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 11:57:57 -0700
Subject: [PATCH 30/31] Move loop grouping to TTLOpsUtils, init_short to InsertInits

---
 include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h   |  38 +++---
 .../Transforms/TTKernelInsertInits.cpp        |  33 ++++-
 .../TTKernelInsertL1Accumulation.cpp          | 118 +-----------------
 lib/Dialect/TTL/IR/TTLOpsUtils.cpp            | 105 ++++++++++++++++
 .../Transforms/insert_l1_accumulation.mlir    |  11 +-
 5 files changed, 160 insertions(+), 145 deletions(-)

diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
index fbe4f5323..9db998b35 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
+++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
@@ -5,17 +5,15 @@
 #ifndef TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H
 #define TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H
 
-#include "ttlang/Dialect/TTL/IR/TTL.h"
-#include "ttlang/Dialect/TTL/IR/TTLOps.h"
-#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h"
-#include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
-
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
+#include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOps.h"
+#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h"
 #include "llvm/ADT/SetVector.h"
 #include <cstdint>
 #include <optional>
@@ -430,24 +428,22 @@ inline TileOp createTileOpWithPlaceholderDstIndex(OpBuilder &builder,
 }
 
 /// Collect the CB values targeted by pack_tile ops inside a loop.
-inline llvm::SmallDenseSet<Value, 2> getPackTileCBs(scf::ForOp loop) {
-  namespace ttk = mlir::tt::ttkernel;
-  llvm::SmallDenseSet<Value, 2> cbs;
-  loop->walk([&](ttk::PackTileOp packOp) { cbs.insert(packOp.getOutCb()); });
-  return cbs;
-}
+llvm::SmallDenseSet<Value, 2> getPackTileCBs(scf::ForOp loop);
 
 /// Returns true if two loops share any pack_tile CB target.
-inline bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB) {
-  auto cbsA = getPackTileCBs(loopA);
-  auto cbsB = getPackTileCBs(loopB);
-  for (auto cb : cbsA) {
-    if (cbsB.contains(cb)) {
-      return true;
-    }
-  }
-  return false;
-}
+bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB);
+
+/// A group of consecutive sibling loops that pack to the same output CB.
+struct LoopGroup {
+  scf::ForOp rootLoop;
+  SmallVector<scf::ForOp> loops;
+  Operation *scopeEnd = nullptr;
+};
+
+/// Collect groups of annotated sibling loops that share a pack CB target.
+SmallVector<LoopGroup> collectLoopGroups(
+    ArrayRef<scf::ForOp> l1AccLoops,
+    const llvm::SmallDenseMap<Operation *, Operation *> &enablePointPerLoop);
 
 } // namespace mlir::tt::ttl
 
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
index f6ddc93fa..075bc1184 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
@@ -23,6 +23,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 
 #include "ttmlir/Dialect/TTKernel/IR/TTKernel.h"
@@ -403,8 +404,36 @@ static LogicalResult insertCommonInits(ModuleOp moduleOp) {
       inputCB = outputCB;
     }
 
-    if (analysis.hasMatmul && in0CB && in1CB) {
-      // mm_block_init configures UNPACK + MATH + PACK for matmul_block.
+    // When a matmul init is hoisted before a loop that shares an
+    // output CB with a preceding sibling annotated loop, use
+    // init_short. The full init reconfigures the PACK pipeline
+    // which clobbers packer state (including L1 acc on Wormhole).
+    // init_short only reconfigures UNPACK+MATH.
+    bool useInitShort = false;
+    if (analysis.hasMatmul) {
+      if (auto forOp = dyn_cast<scf::ForOp>(insertBefore)) {
+        if (forOp->hasAttr(kL1AccLoopAttrName) ||
+            forOp->hasAttr(kReductionLoopAttrName)) {
+          for (Operation *prev = forOp->getPrevNode(); prev;
+               prev = prev->getPrevNode()) {
+            if (auto prevFor = dyn_cast<scf::ForOp>(prev)) {
+              if ((prevFor->hasAttr(kL1AccLoopAttrName) ||
+                   prevFor->hasAttr(kReductionLoopAttrName)) &&
+                  sharePackCB(prevFor, forOp)) {
+                useInitShort = true;
+              }
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    if (analysis.hasMatmul && in0CB && in1CB && useInitShort) {
+      ttk::MatmulBlockInitShortOp::create(
+          builder, loc, in0CB, in1CB, analysis.matmulTranspose,
+          analysis.matmulCt, analysis.matmulRt, analysis.matmulKt);
+    } else if (analysis.hasMatmul && in0CB && in1CB) {
       ttk::MatmulBlockInitOp::create(
           builder, loc, in0CB, in1CB, outputCB, analysis.matmulTranspose,
           analysis.matmulCt, analysis.matmulRt, analysis.matmulKt);
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 2fe14aaeb..21a029bce 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -47,22 +47,6 @@ static scf::ForOp findL1AccLoop(Operation *op) {
   return reductionFallback;
 }
 
-/// Walk from loop up through parent ops, returning the outermost
-/// annotated ancestor. Returns loop itself if no annotated ancestor exists.
-static scf::ForOp findOutermostAnnotatedAncestor(scf::ForOp loop) {
-  scf::ForOp outermost = loop;
-  for (Operation *parent = loop->getParentOp(); parent;
-       parent = parent->getParentOp()) {
-    if (auto parentFor = dyn_cast<scf::ForOp>(parent)) {
-      if (parentFor->hasAttr(kL1AccLoopAttrName) ||
-          parentFor->hasAttr(kReductionLoopAttrName)) {
-        outermost = parentFor;
-      }
-    }
-  }
-  return outermost;
-}
-
 struct TTKernelInsertL1AccumulationPass
     : public impl::TTKernelInsertL1AccumulationBase<
           TTKernelInsertL1AccumulationPass> {
@@ -117,106 +101,10 @@ struct TTKernelInsertL1AccumulationPass
       }
     }
 
-    // Step 1: Group loops into accumulation scopes. Consecutive sibling
-    // loops that pack to the same CB share a single disable pair. Nested
-    // annotated loops are folded into the outermost ancestor.
-    struct AccGroup {
-      scf::ForOp rootLoop;
-      SmallVector<scf::ForOp> loops;
-      Operation *scopeEnd = nullptr;
-    };
-    SmallVector<AccGroup> groups;
-    llvm::SmallDenseSet<Operation *> assignedToGroup;
-
-    for (auto loop : l1AccLoops) {
-      if (!l1AccEnablePoint.count(loop.getOperation())) {
-        continue;
-      }
-      if (assignedToGroup.contains(loop.getOperation())) {
-        continue;
-      }
-
-      scf::ForOp rootLoop = findOutermostAnnotatedAncestor(loop);
-
-      AccGroup group;
-      group.rootLoop = rootLoop;
-      group.loops.push_back(loop);
-      assignedToGroup.insert(loop.getOperation());
-
-      // Collect sibling annotated loops that share a pack CB target.
-      // sharePackCB walks recursively, so for nested loops (rootLoop
-      // wrapping loop), it finds pack_tile ops inside the inner loop.
-      for (Operation *op = rootLoop->getNextNode(); op;
-           op = op->getNextNode()) {
-        if (isa<ttk::CBPushBackOp>(op)) {
-          break;
-        }
-        auto sibling = dyn_cast<scf::ForOp>(op);
-        if (!sibling) {
-          continue;
-        }
-        if (!sibling->hasAttr(kL1AccLoopAttrName) &&
-            !sibling->hasAttr(kReductionLoopAttrName)) {
-          break;
-        }
-        if (!sharePackCB(rootLoop, sibling)) {
-          break;
-        }
-        group.loops.push_back(sibling);
-        assignedToGroup.insert(sibling.getOperation());
-      }
-
-      // Find scope end: scan forward from rootLoop past grouped siblings,
-      // init ops between them, and trailing cb_push_back ops. Only stop
-      // at a non-grouped ForOp (a different accumulation scope) or a
-      // cb_reserve_back (start of a new reserve region).
-      // TODO: Consider adding structural accumulation_region ops to make this
-      // more robust and composable.
-      group.scopeEnd = rootLoop;
-      for (Operation *op = rootLoop->getNextNode(); op;
-           op = op->getNextNode()) {
-        if (isa<ttk::CBPushBackOp>(op)) {
-          group.scopeEnd = op;
-        } else if (isa<ttk::CBReserveBackOp>(op)) {
-          break;
-        } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
-          if (!assignedToGroup.contains(forOp)) {
-            break;
-          }
-        }
-      }
-
-      groups.push_back(std::move(group));
-    }
-
-    // Step 2: For the 2nd+ loop in each group, downgrade full
-    // MatmulBlockInitOp to MatmulBlockInitShortOp. The full init
-    // writes config.val[3]=0 which clobbers the Pack_L1_Acc register
-    // bits on Wormhole. init_short only reconfigures UNPACK+MATH,
-    // leaving the PACK configuration (including L1 acc) intact.
-    for (auto &group : groups) {
-      for (size_t idx = 1; idx < group.loops.size(); ++idx) {
-        scf::ForOp loop = group.loops[idx];
-        // The init was hoisted before the loop by InsertInits.
-        for (Operation *op = loop->getPrevNode(); op; op = op->getPrevNode()) {
-          if (auto fullInit = dyn_cast<ttk::MatmulBlockInitOp>(op)) {
-            OpBuilder builder(fullInit);
-            ttk::MatmulBlockInitShortOp::create(
-                builder, fullInit->getLoc(), fullInit.getIn0Cb(),
-                fullInit.getIn1Cb(), fullInit.getTranspose(),
-                fullInit.getCtDim(), fullInit.getRtDim(), fullInit.getKtDim());
-            fullInit->erase();
-            break;
-          }
-          // Stop at a loop or other boundary.
-          if (isa<scf::ForOp>(op)) {
-            break;
-          }
-        }
-      }
-    }
+    // Group consecutive sibling loops that pack to the same CB.
+    auto groups = collectLoopGroups(l1AccLoops, l1AccEnablePoint);
 
-    // Step 3: Emit guards per group.
+    // Emit guards per group.
     for (auto &group : groups) {
       OpBuilder builder(group.rootLoop->getContext());
       Location disableLoc = group.rootLoop->getLoc();
diff --git a/lib/Dialect/TTL/IR/TTLOpsUtils.cpp b/lib/Dialect/TTL/IR/TTLOpsUtils.cpp
index de5bd5a2c..3e4993885 100644
--- a/lib/Dialect/TTL/IR/TTLOpsUtils.cpp
+++ b/lib/Dialect/TTL/IR/TTLOpsUtils.cpp
@@ -4,6 +4,8 @@
 
 #include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 
+#include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
+
 namespace mlir::tt::ttl {
 
 //===----------------------------------------------------------------------===//
@@ -132,4 +134,107 @@ llvm::StringRef describeTraceFailure(TraceFailureReason reason) {
   llvm_unreachable("unhandled TraceFailureReason");
 }
 
+//===----------------------------------------------------------------------===//
+// Loop grouping for L1 accumulation and init selection
+//===----------------------------------------------------------------------===//
+
+namespace ttk = mlir::tt::ttkernel;
+
+llvm::SmallDenseSet<Value, 2> getPackTileCBs(scf::ForOp loop) {
+  llvm::SmallDenseSet<Value, 2> cbs;
+  loop->walk([&](ttk::PackTileOp packOp) { cbs.insert(packOp.getOutCb()); });
+  return cbs;
+}
+
+bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB) {
+  auto cbsA = getPackTileCBs(loopA);
+  auto cbsB = getPackTileCBs(loopB);
+  for (auto cb : cbsA) {
+    if (cbsB.contains(cb)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+SmallVector<LoopGroup> collectLoopGroups(
+    ArrayRef<scf::ForOp> l1AccLoops,
+    const llvm::SmallDenseMap<Operation *, Operation *> &enablePointPerLoop) {
+  // Outermost annotated ForOp ancestor of `loop`; `loop` itself if none.
+  auto findRoot = [](scf::ForOp loop) -> scf::ForOp {
+    scf::ForOp outermost = loop;
+    for (Operation *parent = loop->getParentOp(); parent;
+         parent = parent->getParentOp()) {
+      if (auto parentFor = dyn_cast<scf::ForOp>(parent)) {
+        if (parentFor->hasAttr(kL1AccLoopAttrName) ||
+            parentFor->hasAttr(kReductionLoopAttrName)) {
+          outermost = parentFor;
+        }
+      }
+    }
+    return outermost;
+  };
+
+  SmallVector<LoopGroup> groups;
+  llvm::SmallDenseSet<Operation *> assigned;
+
+  for (auto loop : l1AccLoops) {
+    if (!enablePointPerLoop.count(loop.getOperation())) {
+      continue;
+    }
+    if (assigned.contains(loop.getOperation())) {
+      continue;
+    }
+
+    scf::ForOp rootLoop = findRoot(loop);
+
+    LoopGroup group;
+    group.rootLoop = rootLoop;
+    group.loops.push_back(loop);
+    assigned.insert(loop.getOperation());
+
+    // Collect following annotated siblings of rootLoop that share a pack
+    // CB target, stopping at a cb_push_back, an unannotated loop, or a loop
+    // with no shared CB. The walk in sharePackCB covers nested pack_tile ops.
+    for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) {
+      if (isa<ttk::CBPushBackOp>(op)) {
+        break;
+      }
+      auto sibling = dyn_cast<scf::ForOp>(op);
+      if (!sibling) {
+        continue;
+      }
+      if (!sibling->hasAttr(kL1AccLoopAttrName) &&
+          !sibling->hasAttr(kReductionLoopAttrName)) {
+        break;
+      }
+      if (!sharePackCB(rootLoop, sibling)) {
+        break;
+      }
+      group.loops.push_back(sibling);
+      assigned.insert(sibling.getOperation());
+    }
+
+    // Find scope end: the last cb_push_back after rootLoop that precedes
+    // any ForOp not in `assigned` or any cb_reserve_back. Other ops (e.g.
+    // inits) are skipped; scopeEnd stays rootLoop if none is found.
+    group.scopeEnd = rootLoop;
+    for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) {
+      if (isa<ttk::CBPushBackOp>(op)) {
+        group.scopeEnd = op;
+      } else if (isa<ttk::CBReserveBackOp>(op)) {
+        break;
+      } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+        if (!assigned.contains(forOp)) {
+          break;
+        }
+      }
+    }
+
+    groups.push_back(std::move(group));
+  }
+
+  return groups;
+}
+
 } // namespace mlir::tt::ttl
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
index c92fa4e2b..ca2ba92d4 100644
--- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -524,10 +524,7 @@ func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.threa
 // CHECK: scf.for
 // CHECK:   ttkernel.pack_reconfig_l1_acc
 // CHECK: }
-// The full mm_block_init between loops is downgraded to init_short
-// to avoid clobbering Pack_L1_Acc register bits.
-// CHECK-NOT: "ttkernel.mm_block_init"(
-// CHECK: "ttkernel.mm_block_init_short"(
+// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0
 // CHECK: ttkernel.pack_reconfig_l1_acc
 // CHECK: scf.for
 // CHECK:   ttkernel.pack_reconfig_l1_acc
@@ -554,9 +551,9 @@ func.func @consecutive_with_init_between() attributes {ttkernel.thread = #ttkern
     ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
     ttkernel.tile_regs_release() : () -> ()
   } {ttl.l1_acc_loop}
-  // Full init between the two loops (as generated by TTKernelInsertInits).
-  // Should be downgraded to init_short by InsertL1Accumulation.
-  "ttkernel.mm_block_init"(%cb_in1, %cb_in1, %cb, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
+  // init_short between the two loops (InsertInits emits init_short when
+  // sibling loops share an output CB, to avoid clobbering PACK config).
+  "ttkernel.mm_block_init_short"(%cb_in1, %cb_in1, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
   scf.for %iv2 = %c0 to %c4 step %c1 {
     ttkernel.tile_regs_acquire() : () -> ()
     ttkernel.matmul_block(%cb_in1, %cb_in1, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()

From 3394dfc1d84d9ad27e57f9cb588137f5db23a3db Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Tue, 14 Apr 2026 12:02:33 -0700
Subject: [PATCH 31/31] reorg includes

---
 include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
index 9db998b35..2f2b6763a 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
+++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
@@ -5,16 +5,18 @@
 #ifndef TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H
 #define TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H
 
+#include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOps.h"
+#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h"
+
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
-#include "ttlang/Dialect/TTL/IR/TTL.h"
-#include "ttlang/Dialect/TTL/IR/TTLOps.h"
-#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h"
 #include "llvm/ADT/SetVector.h"
+
 #include <cstdint>
 #include <optional>