From db6d4b80097ee047ef22884833ccf725774df6e6 Mon Sep 17 00:00:00 2001 From: Peter Hizalev Date: Thu, 9 Apr 2026 20:02:47 -0700 Subject: [PATCH 01/31] [examples] Matmul tutorial (#485) --- docs/sphinx/elementwise-tutorial/index.md | 2 +- docs/sphinx/index.rst | 1 + docs/sphinx/matmul-tutorial/index.md | 446 ++++++++++++++++++ examples/matmul-tutorial/step_0_ttnn_base.py | 69 +++ .../step_1_single_node_single_tile_block.py | 210 +++++++++ .../step_2_single_node_multitile_block.py | 220 +++++++++ examples/matmul-tutorial/step_3_multinode.py | 236 +++++++++ .../step_4_multinode_grid_auto.py | 240 ++++++++++ .../step_5_multidevice_shard_m.py | 246 ++++++++++ .../step_6_multidevice_shard_k.py | 260 ++++++++++ .../step_7_multidevice_shard_k_all_reduce.py | 248 ++++++++++ 11 files changed, 2177 insertions(+), 1 deletion(-) create mode 100644 docs/sphinx/matmul-tutorial/index.md create mode 100644 examples/matmul-tutorial/step_0_ttnn_base.py create mode 100644 examples/matmul-tutorial/step_1_single_node_single_tile_block.py create mode 100644 examples/matmul-tutorial/step_2_single_node_multitile_block.py create mode 100644 examples/matmul-tutorial/step_3_multinode.py create mode 100644 examples/matmul-tutorial/step_4_multinode_grid_auto.py create mode 100644 examples/matmul-tutorial/step_5_multidevice_shard_m.py create mode 100644 examples/matmul-tutorial/step_6_multidevice_shard_k.py create mode 100644 examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py diff --git a/docs/sphinx/elementwise-tutorial/index.md b/docs/sphinx/elementwise-tutorial/index.md index 33faf632b..5eef11bd5 100644 --- a/docs/sphinx/elementwise-tutorial/index.md +++ b/docs/sphinx/elementwise-tutorial/index.md @@ -1,4 +1,4 @@ -# Elementwise Operation Tutorial +# Elementwise Tutorial This tutorial walks through building a fused elementwise operation in TT-Lang, introducing one concept at a time. 
Each step is a self-contained runnable diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index e3c1f2532..0f0a63169 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -10,6 +10,7 @@ TT-Lang Documentation getting-started tour/index elementwise-tutorial/index + matmul-tutorial/index programming-guide claude-skills testing diff --git a/docs/sphinx/matmul-tutorial/index.md b/docs/sphinx/matmul-tutorial/index.md new file mode 100644 index 000000000..cf84bf156 --- /dev/null +++ b/docs/sphinx/matmul-tutorial/index.md @@ -0,0 +1,446 @@ +# Matmul Tutorial + +This tutorial walks through building a fused matrix multiplication operation in +TT-Lang, introducing one concept at a time. Each step is a self-contained +runnable script. + +## The Goal + +We want to compute `y = relu(a @ b + c)` on 8192×8192 `bfloat16` tensors. The +entire expression — matrix multiply, bias add, and activation — is the target +for kernel fusion: instead of dispatching three separate TT-NN operations that +each read and write DRAM, a custom TT-Lang operation streams tiles from DRAM +into L1, accumulates the dot product across the K dimension, adds the bias, and +applies relu before writing the result back. Later steps scale this to multiple +nodes and multiple devices using data parallelism and K-sharding. + +## Step 0 — TT-NN Baseline + +**Script**: [`examples/matmul-tutorial/step_0_ttnn_base.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_0_ttnn_base.py) + +The starting point uses TT-NN directly, with no custom operation: + +```python +y = ttnn.relu(ttnn.add(ttnn.matmul(a, b), c)) +``` + +Each call dispatches a separate operation and writes an intermediate tensor back +to DRAM. This is the reference we'll verify against as we build the custom +operation. Correctness is measured with Pearson Correlation Coefficient (PCC) +rather than `allclose` because matmul accumulates bfloat16 rounding differently +from a reference float32 computation. 
+ +## Step 1 — Single Node, Single-Tile Block + +**Script**: [`examples/matmul-tutorial/step_1_single_node_single_tile_block.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_1_single_node_single_tile_block.py) + +This step introduces the complete TT-Lang programming model. The operation fuses +`relu(a @ b + c)` into a single pass, processing one 32×32 tile at a time on +one node. + +### Operation function and grid + +An operation is a Python function decorated with `@ttl.operation()`. The `grid` +argument selects how many nodes (Tensix cores) to run on. `grid=(1, 1)` means +a single node. + +```python +@ttl.operation(grid=(1, 1)) +def __tutorial_operation( + a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor, y: ttnn.Tensor +): + ... +``` + +The function arguments are the tensors the operation operates on. They live in +DRAM on device and are passed by the host at call time. + +### Dataflow buffers + +A *dataflow buffer* (DFB) is an L1 buffer shared between kernel functions within +a node. It is created once in the operation scope from a tensor likeness and a +block shape: + +```python +a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2) +``` + +`shape=(1, 1)` means each buffer entry holds one 32×32 tile. `block_count=2` +allocates two blocks in L1 so that the reader and compute kernels can work +concurrently — while compute processes one entry, the reader fills the other +(double-buffering). + +Matmul needs one additional DFB that the elementwise tutorial does not use: +`acc_dfb` holds the running accumulator for the K-reduction. Because compute +both reads the previous partial sum and writes a new one in each k-step, two +slots in `acc_dfb` alternate in a ping-pong pattern: + +```python +acc_dfb = ttl.make_dataflow_buffer_like(y, shape=(1, 1), block_count=2) +``` + +### Kernel functions + +Three kernel functions run concurrently inside the operation: + +```python +@ttl.compute() +def compute(): ... 
+ +@ttl.datamovement() +def read(): ... + +@ttl.datamovement() +def write(): ... +``` + +**Reader DM kernel** — for each output tile `(m, n)`, first reads the bias +`c[m, n]` into `c_dfb`, then streams all k-tiles of `a` and `b` into their +DFBs: + +```python +for m_tile in range(m_tiles): + for n_tile in range(n_tiles): + with c_dfb.reserve() as c_blk: + ttl.copy(c[m_tile, n_tile], c_blk).wait() + + for k_tile in range(k_tiles): + with a_dfb.reserve() as a_blk, b_dfb.reserve() as b_blk: + tx_a = ttl.copy(a[m_tile, k_tile], a_blk) + tx_b = ttl.copy(b[k_tile, n_tile], b_blk) + tx_a.wait(); tx_b.wait() +``` + +`ttl.copy` starts a non-blocking transfer; `tx.wait()` waits for completion. +The index `a[m_tile, k_tile]` selects a tile in *tile coordinates* (not element +coordinates). The `with` block calls `push()` on exit, signalling the compute +kernel. + +**Compute kernel** — initializes the accumulator to zero, accumulates +`a @ b` across all k-tiles, then adds the bias and applies relu: + +```python +for _ in range(m_tiles): + for _ in range(n_tiles): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(ttl.math.fill(acc_blk, 0)) # zero the accumulator + + for _ in range(k_tiles): + with ( + a_dfb.wait() as a_blk, + b_dfb.wait() as b_blk, + acc_dfb.wait() as pre_acc_blk, # previous partial sum + ): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(pre_acc_blk + a_blk @ b_blk) + + with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk: + with y_dfb.reserve() as y_blk: + y_blk.store(ttl.math.relu(c_blk + acc_blk)) +``` + +`ttl.math.fill(acc_blk, 0)` produces a block expression that fills a block +with a scalar value; `store()` materializes the expression. `wait()` blocks +until the reader has pushed a filled tile. `reserve()` blocks until the writer +has freed an entry. The `with` block automatically calls `pop()` on inputs and +`push()` on the output when the scope exits. 
+ +**Writer DM kernel** — copies completed output tiles from L1 back to DRAM: + +```python +with y_dfb.wait() as y_blk: + ttl.copy(y_blk, y[m_tile, n_tile]).wait() +``` + +## Step 2 — Single Node, Multi-Tile Block + +**Script**: [`examples/matmul-tutorial/step_2_single_node_multitile_block.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_2_single_node_multitile_block.py) + +Processing one tile at a time incurs a synchronization round-trip per tile and +limits the hardware's ability to amortize compute setup overhead. This step +groups tiles into larger blocks so that each transfer and compute iteration +covers a multi-tile patch. + +```python +M_GRANULARITY = 4 +N_GRANULARITY = 4 +K_GRANULARITY = 4 +``` + +The DFB shapes must match the tile dimensions of each tensor operand, which +differ because the matmul operands have different roles: + +```python +a_dfb = ttl.make_dataflow_buffer_like( + a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2 # M×K +) +b_dfb = ttl.make_dataflow_buffer_like( + b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2 # K×N +) +c_dfb = ttl.make_dataflow_buffer_like( + c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 # M×N +) +``` + +The iteration counts change from individual tiles to blocks, and the reader +selects a tile range per transfer: + +```python +m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block + +tx_a = ttl.copy( + a[start_m_tile:end_m_tile, start_k_tile:end_k_tile], + a_blk, +) +``` + +The operation structure, synchronization pattern, and compute expression are +unchanged from Step 1. + +## Step 3 — Multi-Node, Fixed Grid + +**Script**: [`examples/matmul-tutorial/step_3_multinode.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_3_multinode.py) + +This step parallelizes the operation across a 4×4 grid of nodes. 
For background +on the Tenstorrent hardware architecture, we recommend reading the +[TT Architecture and Metalium Guide](https://github.com/tenstorrent/tt-metal/blob/main/METALIUM_GUIDE.md). + +### Declaring a multi-node grid + +```python +@ttl.operation(grid=(4, 4)) +def __tutorial_operation(...): +``` + +All nodes execute the same operation body. They differentiate their work using +their coordinates in the grid. + +### Partitioning strategy + +For matmul, the M×N output space is partitioned across the grid. The K +dimension is **not** partitioned: every node iterates over all k-blocks to +accumulate its own independent partial product. No inter-node communication is +required. + +`ttl.grid_size(dims=2)` returns `(grid_n, grid_m)` — the number of nodes along +each dimension. `ttl.node(dims=2)` returns the `(node_n, node_m)` coordinates +of the current node, zero-based. + +```python +grid_n, grid_m = ttl.grid_size(dims=2) + +m_blocks_per_node = m_blocks // grid_m +n_blocks_per_node = n_blocks // grid_n +``` + +### Mapping local to global indices + +Each DM kernel uses its node coordinates to offset into the global tensor: + +```python +node_n, node_m = ttl.node(dims=2) + +for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + ... +for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + ... +``` + +The compute kernel iterates over the same `m_blocks_per_node × n_blocks_per_node` +count as the DM kernels, but does not need to know the node's coordinates +directly — the DM kernels already stream only the relevant tiles into the DFBs. + +This version requires the block counts to be evenly divisible by the grid. +See Step 4 for a version that handles arbitrary sizes.
+ +## Step 4 — Multi-Node, Auto Grid + +**Script**: [`examples/matmul-tutorial/step_4_multinode_grid_auto.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_4_multinode_grid_auto.py) + +This step removes two constraints from Step 3: the hard-coded grid size and +the requirement for even divisibility. + +### Auto grid + +```python +@ttl.operation(grid="auto") +``` + +`grid="auto"` lets the compiler select the largest grid that fits available +hardware resources. The operation must work correctly for any grid the compiler +may choose. + +### Ceiling division + +When the number of blocks does not divide evenly across the grid, floor division +would leave the trailing blocks unassigned. Ceiling division ensures every block +is assigned to some node: + +```python +m_blocks_per_node = -(-m_blocks // grid_m) # ceil(m_blocks / grid_m) +n_blocks_per_node = -(-n_blocks // grid_n) # ceil(n_blocks / grid_n) +``` + +### Bounds checking + +Nodes at the trailing edge may be assigned more iterations than there are +actual blocks. All three kernel functions guard per-block work: + +```python +for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: # skip if past the end of the tensor + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: # skip if past the end of the tensor + ... +``` + +The guard must appear in every kernel function — compute, read, and write — +so that they all agree on exactly which blocks to process. + +## Step 5 — Multi-Device, Shard M + +**Script**: [`examples/matmul-tutorial/step_5_multidevice_shard_m.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_5_multidevice_shard_m.py) + +This step scales the operation to multiple devices using SPMD +(Single-Program Multiple-Data) mode.
The TT-Lang operation body is unchanged +from Step 4; only the tensor distribution across devices changes. + +### Opening a mesh device + +```python +ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D) +mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices)) +``` + +A mesh device groups multiple physical devices into a single logical device. +`FabricConfig.FABRIC_1D` configures a 1D fabric interconnect between devices. + +### M-sharding strategy + +The M dimension of the output is split evenly: device `i` computes rows +`i * (M/n_devices)` through `(i+1) * (M/n_devices) - 1`. Because each row of +the output only depends on the corresponding rows of `a` and `c`, and on the +full matrix `b`, no inter-device communication is needed. + +```python +a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=0)) # shard M rows +b = from_torch(b, ttnn.ReplicateTensorToMesh(mesh_device)) # replicate K×N +c = from_torch(c, ttnn.ShardTensorToMesh(mesh_device, dim=0)) # shard M rows +y = from_torch(y, ttnn.ShardTensorToMesh(mesh_device, dim=0)) # shard M rows +``` + +`ShardTensorToMesh(dim=0)` splits the tensor along its first dimension across +all devices. `ReplicateTensorToMesh` sends the same tensor to every device. + +### Gathering results + +After the operation, the per-device output shards are concatenated on the host: + +```python +y = ttnn.to_torch(y, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0)) +``` + +The TT-Lang operation runs identically on each device in SPMD mode — `grid="auto"` +applies independently per device, filling the full per-device grid. + +## Step 6 — Multi-Device, Shard K + +**Script**: [`examples/matmul-tutorial/step_6_multidevice_shard_k.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_6_multidevice_shard_k.py) + +This step changes the sharding strategy: instead of splitting M across devices, +the K (reduction) dimension is split.
This allows the matrix multiply to be +parallelized along the contraction axis at the cost of requiring a reduction +step to combine results. + +### K-sharding strategy + +Each device computes a partial dot product over its K slice: + +``` +device i: y_i = a[:, K_i] @ b[K_i, :] + c_i +``` + +where `K_i` is the slice of K assigned to device `i`. The full result is +`y = sum(y_i)`. + +```python +a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=1)) # shard K cols +b = from_torch(b, ttnn.ShardTensorToMesh(mesh_device, dim=0)) # shard K rows +``` + +### Handling the bias + +The bias `c` must only be added once, not once per device. To handle this +within the uniform SPMD model, a stacked tensor is constructed where device 0 +receives the real `c` and all other devices receive zeros: + +```python +replicated_cs = torch.zeros((M * n_devices, N), dtype=torch.bfloat16) +replicated_cs[:M, :] = c # only the first M rows carry the real bias +replicated_cs = from_torch(replicated_cs, ttnn.ShardTensorToMesh(mesh_device, dim=0)) +``` + +After sharding along `dim=0`, device 0 gets `c` and devices 1..n−1 get zeros, +so the summation `sum(a_i @ b_i + c_i)` correctly produces `a @ b + c`. + +### Host-side reduction + +Because the kernel produces partial sums, relu cannot be applied inside the kernel. +The host collects the partial outputs and reduces them manually before +activating: + +```python +partial_ys = ttnn.to_torch(partial_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0)) + +y = torch.zeros((M, N), dtype=torch.bfloat16) +for i in range(n_devices): + y += partial_ys[i * M : (i + 1) * M, :] + +y = torch.relu(y) +``` + +The TT-Lang operation body drops the `ttl.math.relu` from Step 4 and stores +the raw `c_blk + acc_blk` result, deferring activation to after the reduction.
+ +## Step 7 — Multi-Device, Shard K with All-Reduce + +**Script**: [`examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py`](https://github.com/tenstorrent/tt-lang/blob/main/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py) + +This step replaces the host-side manual reduction from Step 6 with an +on-device all-reduce, keeping the result on the mesh and enabling the +activation to be applied on-device as well. + +### All-reduce + +```python +replicated_ys = ttnn.all_reduce(partial_ys) +replicated_ys = ttnn.relu(replicated_ys) +``` + +`ttnn.all_reduce` sums `partial_ys` across all devices using the TT-Fabric +interconnect. Each device ends up with the fully reduced M×N result — the +output is replicated rather than sharded. `ttnn.relu` is then applied +on-device to all replicas in parallel. + +### Verifying replicated results + +Because all-reduce replicates the result, every device holds a correct copy of +the full output. The verification loop checks each device's copy independently: + +```python +replicated_ys = ttnn.to_torch( + replicated_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0) +) +for i in range(n_devices): + y = replicated_ys[i * M : (i + 1) * M, :] + pcc = ... + assert pcc > 0.99 +``` + +Compared to Step 6, this approach avoids the host round-trip for reduction and +moves the relu entirely on-device. The TT-Lang operation body is identical to +Step 6. diff --git a/examples/matmul-tutorial/step_0_ttnn_base.py b/examples/matmul-tutorial/step_0_ttnn_base.py new file mode 100644 index 000000000..f57a48403 --- /dev/null +++ b/examples/matmul-tutorial/step_0_ttnn_base.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 0: TT-NN Baseline +# ================================ +# This is the starting point: a matmul-bias-activation expressed entirely in +# TT-NN. No custom operation is involved. 
TT-NN dispatches each op separately, +# resulting in multiple DRAM round-trips. +# +# The operation: y = relu(a @ b + c) +# +# The subsequent tutorial steps replace this entire computation with a single +# fused TT-Lang operation, showing how to take control of data movement and +# compute explicitly. + +import ttnn +import torch + + +def from_torch(tensor: torch.Tensor): + + # Upload a bfloat16 torch tensor to DRAM on the device in tiled layout. + + return ttnn.from_torch( + tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + ) + + +torch.manual_seed(42) + +device = ttnn.open_device(device_id=0) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + a = from_torch(a) + b = from_torch(b) + c = from_torch(c) + + # TT-NN dispatches three separate operations: matmul, add, relu. + # With a custom TT-Lang operation we can fuse all three into a single + # kernel, reducing DRAM traffic and operation-launch overhead. 
+ + y = ttnn.relu(ttnn.add(ttnn.matmul(a, b), c)) + + y = ttnn.to_torch(y) + + pcc = torch.corrcoef( + torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(device) diff --git a/examples/matmul-tutorial/step_1_single_node_single_tile_block.py b/examples/matmul-tutorial/step_1_single_node_single_tile_block.py new file mode 100644 index 000000000..a246a8daf --- /dev/null +++ b/examples/matmul-tutorial/step_1_single_node_single_tile_block.py @@ -0,0 +1,210 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 1: Single Node, Single-Tile Block +# ================================================ +# Introduces the core TT-Lang programming model for matmul: +# - @ttl.operation — declares an operation and the grid it runs on +# - @ttl.compute — the compute kernel: tile-level matrix multiply and add +# - @ttl.datamovement — DM kernels: move data between DRAM and L1 +# - ttl.make_dataflow_buffer_like — creates an in-L1 dataflow buffer (DFB) +# that synchronizes data passing between kernels +# - ttl.copy / tx.wait — initiates and awaits a transfer +# - ttl.math.fill — fills a block with a scalar value (used to zero the +# accumulator before the k-reduction loop) +# +# The operation fuses a @ b + c followed by relu into a single kernel, +# processing one 32×32 tile at a time. The outer m×n loop iterates over +# output tiles; the inner k loop accumulates partial products. + +import ttnn +import torch + + +def from_torch(tensor: torch.Tensor): + + # Upload a bfloat16 torch tensor to DRAM on the device in tiled layout. + + return ttnn.from_torch( + tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + ) + + +import ttl + +# Tenstorrent hardware operates on 32×32 tiles. 
Tensor dimensions in tile +# coordinates are obtained by dividing the element-count by TILE_SIZE. + +TILE_SIZE = 32 + + +# @ttl.operation marks a Python function as a TT-Lang operation. +# grid=(1, 1) means the operation runs on a single node (one Tensix core). +# The function signature lists the tensors the operation reads and writes; +# these live in DRAM and are passed by the host at call time. + + +@ttl.operation(grid=(1, 1)) +def __tutorial_operation( + a: ttnn.Tensor, + b: ttnn.Tensor, + c: ttnn.Tensor, + y: ttnn.Tensor, +) -> None: + + # Compute iteration counts in tile coordinates. + + m_tiles = a.shape[0] // TILE_SIZE + n_tiles = b.shape[1] // TILE_SIZE + k_tiles = a.shape[1] // TILE_SIZE + + # Dataflow buffers (DFBs) are L1 buffers shared between threads. + # shape=(1, 1) means each entry holds exactly one 32×32 tile. + # block_count=2 allocates two blocks, enabling double-buffering: while the + # compute kernel processes one entry, the DM kernel can fill the other. + # + # acc_dfb is the running accumulator for the k-reduction. It is both + # produced and consumed by the compute kernel in a ping-pong pattern: + # each k-step reads the previous partial sum (pre_acc_blk) and writes a + # new one (acc_blk), so block_count=2 allows the two slots to alternate. + + a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2) + b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2) + c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2) + acc_dfb = ttl.make_dataflow_buffer_like(y, shape=(1, 1), block_count=2) + y_dfb = ttl.make_dataflow_buffer_like(y, shape=(1, 1), block_count=2) + + # The DM reader runs concurrently with the compute kernel. + # For each output tile (m, n) it first reads the bias tile c[m, n], then + # streams all k input tiles for a and b into their respective DFBs. 
+ + @ttl.datamovement() + def read(): + for m_tile in range(m_tiles): + for n_tile in range(n_tiles): + + # Read the bias tile for this (m, n) output position first so + # it is available when the compute kernel finishes accumulating. + + with c_dfb.reserve() as c_blk: + tx_c = ttl.copy( + c[m_tile, n_tile], + c_blk, + ) + + tx_c.wait() + + for k_tile in range(k_tiles): + + # Stream a[m, k] and b[k, n] tiles into L1 for each step + # of the k-reduction. + + with ( + a_dfb.reserve() as a_blk, + b_dfb.reserve() as b_blk, + ): + tx_a = ttl.copy( + a[m_tile, k_tile], + a_blk, + ) + tx_b = ttl.copy( + b[k_tile, n_tile], + b_blk, + ) + + tx_a.wait() + tx_b.wait() + + # The compute kernel accumulates partial matmul products across k, then + # adds the bias and applies relu before writing the result to y_dfb. + + @ttl.compute() + def compute(): + for _ in range(m_tiles): + for _ in range(n_tiles): + + # Initialize the accumulator to zero before the k loop. + # ttl.math.fill produces a block expression; store() materializes + # it into acc_blk and pushes it so the k loop can consume it. + + with acc_dfb.reserve() as acc_blk: + acc_blk.store(ttl.math.fill(acc_blk, 0)) + + for _ in range(k_tiles): + + # Consume the previous partial sum (pre_acc_blk) along with + # the next a and b tiles, compute the updated partial sum, + # and push it back into acc_dfb for the next k-step. + + with ( + a_dfb.wait() as a_blk, + b_dfb.wait() as b_blk, + acc_dfb.wait() as pre_acc_blk, + ): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(pre_acc_blk + a_blk @ b_blk) + + # After k is exhausted, add the bias and apply relu in one step. + + with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk: + with y_dfb.reserve() as y_blk: + y_blk.store(ttl.math.relu(c_blk + acc_blk)) + + # The DM writer reads completed output tiles from y_dfb and writes them + # back to the output tensor in DRAM. 
+ + @ttl.datamovement() + def write(): + for m_tile in range(m_tiles): + for n_tile in range(n_tiles): + with y_dfb.wait() as y_blk: + tx = ttl.copy( + y_blk, + y[m_tile, n_tile], + ) + tx.wait() + + +def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor): + y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16)) + __tutorial_operation(a, b, c, y) + return y + + +torch.manual_seed(42) + +device = ttnn.open_device(device_id=0) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + a = from_torch(a) + b = from_torch(b) + c = from_torch(c) + + y = tutorial_operation(a, b, c) + + y = ttnn.to_torch(y) + + pcc = torch.corrcoef( + torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(device) diff --git a/examples/matmul-tutorial/step_2_single_node_multitile_block.py b/examples/matmul-tutorial/step_2_single_node_multitile_block.py new file mode 100644 index 000000000..c2e88e4e0 --- /dev/null +++ b/examples/matmul-tutorial/step_2_single_node_multitile_block.py @@ -0,0 +1,220 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 2: Single Node, Multi-Tile Block +# =============================================== +# Builds on Step 1 by processing multiple tiles per dataflow buffer entry +# instead of one tile at a time. +# +# New concepts introduced: +# - Multi-tile blocks: each DFB entry holds a granularity-sized patch of +# tiles. Fewer, larger memory transfers reduce per-transfer overhead and +# give the compute kernel more work per synchronization round-trip. 
+# - Asymmetric block shapes: a, b, and c have different tile dimensions +# (M×K, K×N, and M×N respectively), so their DFBs use matching shapes. +# +# Everything else (single node, same three-kernel structure) is identical to +# Step 1. The loop bodies are unchanged; only the DFB shapes and the tensor +# slice ranges differ. + +import ttnn +import torch + + +def from_torch(tensor: torch.Tensor): + return ttnn.from_torch( + tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + ) + + +import ttl + +TILE_SIZE = 32 + +# M_GRANULARITY, N_GRANULARITY, K_GRANULARITY control how many tiles fit along +# each matmul dimension per block. With all set to 4, each a-block is a 4×4 +# patch of tiles (128×128 elements), each b-block is 4×4, and each c/y-block +# is 4×4 in M×N space. + +M_GRANULARITY = 4 +N_GRANULARITY = 4 +K_GRANULARITY = 4 + + +@ttl.operation(grid=(1, 1)) +def __tutorial_operation( + a: ttnn.Tensor, + b: ttnn.Tensor, + c: ttnn.Tensor, + y: ttnn.Tensor, +) -> None: + m_tiles_per_block = M_GRANULARITY + n_tiles_per_block = N_GRANULARITY + k_tiles_per_block = K_GRANULARITY + + # m_blocks, n_blocks, k_blocks now count blocks, not individual tiles. 
+ + m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block + n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block + k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block + + # DFB shapes match the tile dimensions of each tensor operand: + # a: M×K → shape (m_tiles_per_block, k_tiles_per_block) + # b: K×N → shape (k_tiles_per_block, n_tiles_per_block) + # c, acc, y: M×N → shape (m_tiles_per_block, n_tiles_per_block) + + a_dfb = ttl.make_dataflow_buffer_like( + a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2 + ) + b_dfb = ttl.make_dataflow_buffer_like( + b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2 + ) + c_dfb = ttl.make_dataflow_buffer_like( + c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + acc_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + y_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + + @ttl.datamovement() + def read(): + for m_block in range(m_blocks): + + # Convert block index to tile index range for the tensor slice. + + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for n_block in range(n_blocks): + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + # Slice with a range to copy the entire M×N block in one transfer. 
+ + with c_dfb.reserve() as c_blk: + tx_c = ttl.copy( + c[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + c_blk, + ) + + tx_c.wait() + + for k_block in range(k_blocks): + start_k_tile = k_block * k_tiles_per_block + end_k_tile = (k_block + 1) * k_tiles_per_block + with ( + a_dfb.reserve() as a_blk, + b_dfb.reserve() as b_blk, + ): + tx_a = ttl.copy( + a[ + start_m_tile:end_m_tile, + start_k_tile:end_k_tile, + ], + a_blk, + ) + tx_b = ttl.copy( + b[ + start_k_tile:end_k_tile, + start_n_tile:end_n_tile, + ], + b_blk, + ) + + tx_a.wait() + tx_b.wait() + + # The compute kernel is unchanged in structure from Step 1. The hardware + # now operates on full multi-tile blocks per iteration rather than single + # tiles, amortizing synchronization overhead over more compute work. + + @ttl.compute() + def compute(): + for _ in range(m_blocks): + for _ in range(n_blocks): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(ttl.math.fill(acc_blk, 0)) + + for _ in range(k_blocks): + with ( + a_dfb.wait() as a_blk, + b_dfb.wait() as b_blk, + acc_dfb.wait() as pre_acc_blk, + ): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(pre_acc_blk + a_blk @ b_blk) + + with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk: + with y_dfb.reserve() as y_blk: + y_blk.store(ttl.math.relu(c_blk + acc_blk)) + + @ttl.datamovement() + def write(): + for m_block in range(m_blocks): + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for n_block in range(n_blocks): + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with y_dfb.wait() as y_blk: + tx = ttl.copy( + y_blk, + y[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + ) + tx.wait() + + +def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor): + y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16)) + __tutorial_operation(a, b, c, y) + return y + + +torch.manual_seed(42) + +device = 
ttnn.open_device(device_id=0) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + a = from_torch(a) + b = from_torch(b) + c = from_torch(c) + + y = tutorial_operation(a, b, c) + + y = ttnn.to_torch(y) + + pcc = torch.corrcoef( + torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(device) diff --git a/examples/matmul-tutorial/step_3_multinode.py b/examples/matmul-tutorial/step_3_multinode.py new file mode 100644 index 000000000..23368397e --- /dev/null +++ b/examples/matmul-tutorial/step_3_multinode.py @@ -0,0 +1,236 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 3: Multi-Node, Fixed Grid +# ======================================== +# Extends Step 2 by running the operation across a grid of nodes in parallel. +# +# New concepts introduced: +# - grid=(4, 4) — run the operation on a 4×4 grid of nodes (16 cores) +# - ttl.grid_size(dims=2) — query the (n, m) grid dimensions at runtime +# - ttl.node(dims=2) — query this node's (node_n, node_m) position +# +# Each node processes an independent rectangular region of the output tensor, +# partitioned along the M and N dimensions. The K dimension is not partitioned: +# every node iterates over all k_blocks to accumulate its full partial product. +# This requires tensor dimensions to be evenly divisible by the grid (see Step +# 4 for a version that handles remainders). 
import ttnn
import torch


def from_torch(tensor: torch.Tensor):
    # Upload a host torch tensor to device DRAM as a bfloat16 tensor in tile
    # layout. `device` is a module-level global that the script assigns after
    # this definition, so call this only once the device has been opened.

    return ttnn.from_torch(
        tensor,
        dtype=ttnn.bfloat16,
        layout=ttnn.TILE_LAYOUT,
        device=device,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
    )


# NOTE(review): ttl is imported here, after the helper above, rather than with
# the other imports. The placement is kept as-is in case import order matters;
# confirm before moving it.

import ttl

# Edge length of a tile in elements. Tensor extents are divided by this value
# to obtain tile counts.

TILE_SIZE = 32

# Tiles per block along M, N and K. The dataflow buffers below are shaped in
# these units, and each kernel loop iteration processes one block.

M_GRANULARITY = 4
N_GRANULARITY = 4
K_GRANULARITY = 4

# Grid the operation runs on, written as (grid_n, grid_m) to mirror the order
# in which ttl.grid_size(dims=2) is unpacked inside the operation. Keeping it
# in one constant lets the host-side shape check and the decorator agree.
# NOTE(review): the (n, m) order of the `grid=` argument is taken from the
# comments in this tutorial; the grid is square, so the check does not depend
# on it today.

GRID_SHAPE = (4, 4)


# grid=GRID_SHAPE, i.e. (4, 4), launches the operation body on every node of a
# 4-column × 4-row grid. All nodes execute the same code; they differentiate
# their work via ttl.node().


@ttl.operation(grid=GRID_SHAPE)
def __tutorial_operation(
    a: ttnn.Tensor,
    b: ttnn.Tensor,
    c: ttnn.Tensor,
    y: ttnn.Tensor,
) -> None:
    # Computes y = relu(a @ b + c), with a: M×K, b: K×N, c and y: M×N.
    # The result is written into y; nothing is returned.

    m_tiles_per_block = M_GRANULARITY
    n_tiles_per_block = N_GRANULARITY
    k_tiles_per_block = K_GRANULARITY

    # Total block counts over the whole tensor. Both divisions floor, so any
    # remainder would be dropped silently; the host wrapper rejects such
    # shapes before launching.

    m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block
    n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block
    k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block

    # ttl.grid_size returns (grid_n, grid_m) matching the (n, m) convention
    # used by ttl.node. The grid is partitioned so each node handles an
    # independent slice of the M×N output space.

    grid_n, grid_m = ttl.grid_size(dims=2)

    # Divide the total block count evenly across the grid.
    # Assumes the tensor is evenly divisible by the grid size (enforced on the
    # host by _validate_operands).

    m_blocks_per_node = m_blocks // grid_m
    n_blocks_per_node = n_blocks // grid_n

    # One dataflow buffer per stream between kernels, each shaped as a single
    # block of its operand. acc_dfb is shaped like an output block and carries
    # the running sum between K iterations inside the compute kernel.
    # NOTE(review): block_count=2 presumably gives each buffer two block-sized
    # slots; the accumulate step below holds one acc block while reserving
    # another, so it relies on at least two. Confirm against the TT-Lang docs.

    a_dfb = ttl.make_dataflow_buffer_like(
        a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2
    )
    b_dfb = ttl.make_dataflow_buffer_like(
        b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2
    )
    c_dfb = ttl.make_dataflow_buffer_like(
        c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
    )
    acc_dfb = ttl.make_dataflow_buffer_like(
        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
    )
    y_dfb = ttl.make_dataflow_buffer_like(
        y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2
    )

    @ttl.datamovement()
    def read():

        # ttl.node() returns the zero-based coordinates of this specific node.
        # node_n and node_m are used to offset into the global tensor.

        node_n, node_m = ttl.node(dims=2)

        for local_m_block in range(m_blocks_per_node):

            # Map local block index to global block index, then to the tile
            # range used to slice the tensor.

            m_block = node_m * m_blocks_per_node + local_m_block
            start_m_tile = m_block * m_tiles_per_block
            end_m_tile = (m_block + 1) * m_tiles_per_block

            for local_n_block in range(n_blocks_per_node):
                n_block = node_n * n_blocks_per_node + local_n_block
                start_n_tile = n_block * n_tiles_per_block
                end_n_tile = (n_block + 1) * n_tiles_per_block

                # One bias block per output block, issued before the K loop.

                with c_dfb.reserve() as c_blk:
                    tx_c = ttl.copy(
                        c[
                            start_m_tile:end_m_tile,
                            start_n_tile:end_n_tile,
                        ],
                        c_blk,
                    )

                    tx_c.wait()

                # All nodes iterate over the full k dimension to accumulate
                # their partial matmul result independently.

                for k_block in range(k_blocks):
                    start_k_tile = k_block * k_tiles_per_block
                    end_k_tile = (k_block + 1) * k_tiles_per_block
                    with (
                        a_dfb.reserve() as a_blk,
                        b_dfb.reserve() as b_blk,
                    ):
                        tx_a = ttl.copy(
                            a[
                                start_m_tile:end_m_tile,
                                start_k_tile:end_k_tile,
                            ],
                            a_blk,
                        )
                        tx_b = ttl.copy(
                            b[
                                start_k_tile:end_k_tile,
                                start_n_tile:end_n_tile,
                            ],
                            b_blk,
                        )

                        # Both copies are issued before either is waited on.

                        tx_a.wait()
                        tx_b.wait()

    # The compute kernel iterates over the blocks assigned to this node.
    # It does not need to know its node coordinates: the DM kernels already
    # stream only the relevant tiles into the DFBs.

    @ttl.compute()
    def compute():
        for _ in range(m_blocks_per_node):
            for _ in range(n_blocks_per_node):

                # Seed the accumulator for this output block with zeros.

                with acc_dfb.reserve() as acc_blk:
                    acc_blk.store(ttl.math.fill(acc_blk, 0))

                # Each K step takes the previous accumulator block and stores
                # previous + a_blk @ b_blk into a newly reserved one.

                for _ in range(k_blocks):
                    with (
                        a_dfb.wait() as a_blk,
                        b_dfb.wait() as b_blk,
                        acc_dfb.wait() as pre_acc_blk,
                    ):
                        with acc_dfb.reserve() as acc_blk:
                            acc_blk.store(pre_acc_blk + a_blk @ b_blk)

                # Epilogue: add the bias block and apply relu into the output
                # buffer.

                with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk:
                    with y_dfb.reserve() as y_blk:
                        y_blk.store(ttl.math.relu(c_blk + acc_blk))

    @ttl.datamovement()
    def write():

        # Walks the same (m_block, n_block) sequence as read(), so each block
        # taken from y_dfb is copied to the matching region of y.

        node_n, node_m = ttl.node(dims=2)

        for local_m_block in range(m_blocks_per_node):
            m_block = node_m * m_blocks_per_node + local_m_block
            start_m_tile = m_block * m_tiles_per_block
            end_m_tile = (m_block + 1) * m_tiles_per_block

            for local_n_block in range(n_blocks_per_node):
                n_block = node_n * n_blocks_per_node + local_n_block
                start_n_tile = n_block * n_tiles_per_block
                end_n_tile = (n_block + 1) * n_tiles_per_block

                with y_dfb.wait() as y_blk:
                    tx = ttl.copy(
                        y_blk,
                        y[
                            start_m_tile:end_m_tile,
                            start_n_tile:end_n_tile,
                        ],
                    )
                    tx.wait()


def _validate_operands(a, b, c) -> None:
    # Reject shapes the fixed-grid operation cannot cover completely.
    #
    # The operation floors every division when turning extents into tiles,
    # blocks and per-node block counts. A non-divisible extent would therefore
    # leave part of the output at its initial zeros without any error. Raise
    # ValueError up front instead.

    m, k = a.shape[0], a.shape[1]
    n = b.shape[1]

    if b.shape[0] != k:
        raise ValueError(
            f"inner dimensions differ: a is {m}x{k} but b has {b.shape[0]} rows"
        )
    if c.shape[0] != m or c.shape[1] != n:
        raise ValueError(
            f"c must be {m}x{n} to match a @ b, got {c.shape[0]}x{c.shape[1]}"
        )

    grid_n, grid_m = GRID_SHAPE

    # M and N are split across blocks and then across grid nodes; K is only
    # split into blocks because every node walks the full K range.

    required_multiples = (
        ("M", m, TILE_SIZE * M_GRANULARITY * grid_m),
        ("N", n, TILE_SIZE * N_GRANULARITY * grid_n),
        ("K", k, TILE_SIZE * K_GRANULARITY),
    )
    for dim_name, extent, multiple in required_multiples:
        if extent % multiple != 0:
            raise ValueError(
                f"{dim_name}={extent} must be a multiple of {multiple} "
                f"for grid {GRID_SHAPE}"
            )


def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor):
    # Host entry point: check the shapes, allocate a zero-initialised M×N
    # bfloat16 output on the device, run the operation into it, and return it.
    # Raises ValueError when the shapes do not divide evenly (see
    # _validate_operands).

    _validate_operands(a, b, c)
    y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16))
    __tutorial_operation(a, b, c, y)
    return y


# Fixed seed so the random inputs generated by the script are reproducible.

torch.manual_seed(42)

+device = ttnn.open_device(device_id=0) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + a = from_torch(a) + b = from_torch(b) + c = from_torch(c) + + y = tutorial_operation(a, b, c) + + y = ttnn.to_torch(y) + + pcc = torch.corrcoef( + torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(device) diff --git a/examples/matmul-tutorial/step_4_multinode_grid_auto.py b/examples/matmul-tutorial/step_4_multinode_grid_auto.py new file mode 100644 index 000000000..aa87b49e0 --- /dev/null +++ b/examples/matmul-tutorial/step_4_multinode_grid_auto.py @@ -0,0 +1,240 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 4: Multi-Node, Auto Grid +# ======================================= +# Extends Step 3 by removing the hard-coded grid size and handling tensor +# dimensions that are not evenly divisible by the grid. +# +# New concepts introduced: +# - grid="auto" — the compiler picks the largest grid available in the +# hardware; the operation must not assume any specific +# grid dimensions +# - ceiling division — ensures every block is assigned to a node even when +# the block count doesn't divide evenly across the grid +# - bounds checking — nodes at the trailing edge of the grid may have fewer +# blocks to process; guard all per-block work with +# `if m_block < m_blocks` / `if n_block < n_blocks` +# +# Because all three kernels must agree on which blocks to process, the bounds +# check appears in every kernel function. 
+ +import ttnn +import torch + + +def from_torch(tensor: torch.Tensor): + return ttnn.from_torch( + tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + ) + + +import ttl + +TILE_SIZE = 32 +M_GRANULARITY = 4 +N_GRANULARITY = 4 +K_GRANULARITY = 4 + + +# grid="auto" asks the compiler to select the grid at compile time based on +# available hardware resources. The operation body must work correctly for any +# grid the compiler may choose. + + +@ttl.operation(grid="auto") +def __tutorial_operation( + a: ttnn.Tensor, + b: ttnn.Tensor, + c: ttnn.Tensor, + y: ttnn.Tensor, +) -> None: + m_tiles_per_block = M_GRANULARITY + n_tiles_per_block = N_GRANULARITY + k_tiles_per_block = K_GRANULARITY + + # Total block counts across the entire tensor (not per-node). + + m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block + n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block + k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block + + grid_n, grid_m = ttl.grid_size(dims=2) + + # Ceiling division: -(-x // y) is a concise Python idiom for ceil(x / y). + # This ensures every block is covered even when m_blocks or n_blocks is not + # a multiple of the grid size. Nodes in the last row/column of the grid + # may receive fewer blocks and rely on the bounds checks below to skip + # out-of-range work. 
+ + m_blocks_per_node = -(-m_blocks // grid_m) # divceil + n_blocks_per_node = -(-n_blocks // grid_n) # divceil + + a_dfb = ttl.make_dataflow_buffer_like( + a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2 + ) + b_dfb = ttl.make_dataflow_buffer_like( + b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2 + ) + c_dfb = ttl.make_dataflow_buffer_like( + c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + acc_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + y_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + + @ttl.datamovement() + def read(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + + # Skip if this node was assigned more iterations than there are + # actual blocks (happens at the trailing edge of the grid). + + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with c_dfb.reserve() as c_blk: + tx_c = ttl.copy( + c[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + c_blk, + ) + + tx_c.wait() + + for k_block in range(k_blocks): + start_k_tile = k_block * k_tiles_per_block + end_k_tile = (k_block + 1) * k_tiles_per_block + with ( + a_dfb.reserve() as a_blk, + b_dfb.reserve() as b_blk, + ): + tx_a = ttl.copy( + a[ + start_m_tile:end_m_tile, + start_k_tile:end_k_tile, + ], + a_blk, + ) + tx_b = ttl.copy( + b[ + start_k_tile:end_k_tile, + start_n_tile:end_n_tile, + ], + b_blk, + ) + + tx_a.wait() + tx_b.wait() + + @ttl.compute() + def compute(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in 
range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + with acc_dfb.reserve() as acc_blk: + acc_blk.store(ttl.math.fill(acc_blk, 0)) + + for _ in range(k_blocks): + with ( + a_dfb.wait() as a_blk, + b_dfb.wait() as b_blk, + acc_dfb.wait() as pre_acc_blk, + ): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(pre_acc_blk + a_blk @ b_blk) + + with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk: + with y_dfb.reserve() as y_blk: + y_blk.store(ttl.math.relu(c_blk + acc_blk)) + + @ttl.datamovement() + def write(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with y_dfb.wait() as y_blk: + tx = ttl.copy( + y_blk, + y[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + ) + tx.wait() + + +def tutorial_operation(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor): + y = from_torch(torch.zeros((a.shape[0], b.shape[1]), dtype=torch.bfloat16)) + __tutorial_operation(a, b, c, y) + return y + + +torch.manual_seed(42) + +device = ttnn.open_device(device_id=0) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + a = from_torch(a) + b = from_torch(b) + c = from_torch(c) + + y = tutorial_operation(a, b, c) + + y = ttnn.to_torch(y) + + pcc = torch.corrcoef( + 
torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(device) diff --git a/examples/matmul-tutorial/step_5_multidevice_shard_m.py b/examples/matmul-tutorial/step_5_multidevice_shard_m.py new file mode 100644 index 000000000..378f7b2e2 --- /dev/null +++ b/examples/matmul-tutorial/step_5_multidevice_shard_m.py @@ -0,0 +1,246 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 5: Multi-Device, Shard M +# ======================================= +# Extends Step 4 to run across multiple devices using SPMD (Single-Program +# Multiple-Data) mode. The TT-Lang operation itself is unchanged; only the +# tensor distribution across devices differs. +# +# New concepts introduced: +# - ttnn.MeshShape / ttnn.open_mesh_device — open a 1D mesh of all available +# devices +# - ttnn.ShardTensorToMesh(dim=0) — split a tensor along the M dimension so +# each device receives M/n_devices rows +# - ttnn.ReplicateTensorToMesh — send the same tensor to every device +# - ttnn.ConcatMeshToTensor(dim=0) — gather per-device output tensors back to +# the host by concatenating along M +# +# Sharding strategy: a and c are sharded along M (rows), b is replicated. +# Each device computes its portion of the M×N output independently with no +# inter-device communication required. The host concatenates the results. + +import ttnn +import torch + + +def from_torch(tensor: torch.Tensor, mesh_mapper): + + # Upload a bfloat16 torch tensor to DRAM on all mesh devices, applying the + # given mapper to determine how the tensor is distributed. 
+ + return ttnn.from_torch( + tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=mesh_device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + mesh_mapper=mesh_mapper, + ) + + +import ttl + +TILE_SIZE = 32 +M_GRANULARITY = 4 +N_GRANULARITY = 4 +K_GRANULARITY = 4 + + +# The TT-Lang operation body is identical to Step 4. grid="auto" applies +# independently to each device in SPMD mode; each device fills its own grid. + + +@ttl.operation(grid="auto") +def tutorial_operation( + a: ttnn.Tensor, + b: ttnn.Tensor, + c: ttnn.Tensor, + y: ttnn.Tensor, +) -> None: + m_tiles_per_block = M_GRANULARITY + n_tiles_per_block = N_GRANULARITY + k_tiles_per_block = K_GRANULARITY + + m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block + n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block + k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block + + grid_n, grid_m = ttl.grid_size(dims=2) + + m_blocks_per_node = -(-m_blocks // grid_m) # divceil + n_blocks_per_node = -(-n_blocks // grid_n) # divceil + + a_dfb = ttl.make_dataflow_buffer_like( + a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2 + ) + b_dfb = ttl.make_dataflow_buffer_like( + b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2 + ) + c_dfb = ttl.make_dataflow_buffer_like( + c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + acc_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + y_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + + @ttl.datamovement() + def read(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + 
start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with c_dfb.reserve() as c_blk: + tx_c = ttl.copy( + c[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + c_blk, + ) + + tx_c.wait() + + for k_block in range(k_blocks): + start_k_tile = k_block * k_tiles_per_block + end_k_tile = (k_block + 1) * k_tiles_per_block + with ( + a_dfb.reserve() as a_blk, + b_dfb.reserve() as b_blk, + ): + tx_a = ttl.copy( + a[ + start_m_tile:end_m_tile, + start_k_tile:end_k_tile, + ], + a_blk, + ) + tx_b = ttl.copy( + b[ + start_k_tile:end_k_tile, + start_n_tile:end_n_tile, + ], + b_blk, + ) + + tx_a.wait() + tx_b.wait() + + @ttl.compute() + def compute(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + with acc_dfb.reserve() as acc_blk: + acc_blk.store(ttl.math.fill(acc_blk, 0)) + + for _ in range(k_blocks): + with ( + a_dfb.wait() as a_blk, + b_dfb.wait() as b_blk, + acc_dfb.wait() as pre_acc_blk, + ): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(pre_acc_blk + a_blk @ b_blk) + + with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk: + with y_dfb.reserve() as y_blk: + y_blk.store(ttl.math.relu(c_blk + acc_blk)) + + @ttl.datamovement() + def write(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with y_dfb.wait() as y_blk: + tx = ttl.copy( + 
y_blk, + y[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + ) + tx.wait() + + +torch.manual_seed(42) + +n_devices = ttnn.GetNumAvailableDevices() +assert n_devices > 0 and ( + n_devices & (n_devices - 1) == 0 +), "Number of available devices must be power of 2 " +ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D) + +# Open a 1D mesh of all available devices. Each device will process an +# independent M/n_devices slice of the output rows. + +mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices)) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + # Distribute tensors across devices: + # a: sharded along M (each device gets M/n_devices rows) + # b: replicated on every device (all devices need the full K×N matrix) + # c: sharded along M to match the corresponding rows of a + + a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=0)) + b = from_torch(b, ttnn.ReplicateTensorToMesh(mesh_device)) + c = from_torch(c, ttnn.ShardTensorToMesh(mesh_device, dim=0)) + + y = torch.zeros((M, N), dtype=torch.bfloat16) + y = from_torch(y, ttnn.ShardTensorToMesh(mesh_device, dim=0)) + + tutorial_operation(a, b, c, y) + + # Gather per-device output shards back to the host by concatenating along M. 
+ + y = ttnn.to_torch(y, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0)) + + pcc = torch.corrcoef( + torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(mesh_device) diff --git a/examples/matmul-tutorial/step_6_multidevice_shard_k.py b/examples/matmul-tutorial/step_6_multidevice_shard_k.py new file mode 100644 index 000000000..2cd2cba88 --- /dev/null +++ b/examples/matmul-tutorial/step_6_multidevice_shard_k.py @@ -0,0 +1,260 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 6: Multi-Device, Shard K +# ======================================== +# Changes the sharding strategy from Step 5: instead of splitting M across +# devices, this step splits the K (reduction) dimension. +# +# New concepts introduced: +# - K-sharding: a is sharded along K (dim=1), b is sharded along K (dim=0). +# Each device computes a partial product a_i @ b_i over its K slice. +# - Partial results: because K is split, no single device has the full dot +# product. Each device produces a partial sum y_i = a_i @ b_i + c_i +# (where only device 0 carries the real bias c; the rest use zeros). +# - Host-side reduction: the host manually sums partial_ys across devices +# and applies relu after the reduction. +# +# The TT-Lang operation body is mostly unchanged from Step 4. The kernel produces +# a partial output (no relu) and writes it to y; the relu is deferred to the +# host so it can be applied after the cross-device reduction. 
+ +import ttnn +import torch + + +def from_torch(tensor: torch.Tensor, mesh_mapper): + return ttnn.from_torch( + tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=mesh_device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + mesh_mapper=mesh_mapper, + ) + + +import ttl + +TILE_SIZE = 32 +M_GRANULARITY = 4 +N_GRANULARITY = 4 +K_GRANULARITY = 4 + + +# The operation body is identical to Step 4 except relu is removed from the +# final store — the kernel now writes the raw c + acc result so the host can +# sum partial outputs across devices before activating. + + +@ttl.operation(grid="auto") +def tutorial_operation( + a: ttnn.Tensor, + b: ttnn.Tensor, + c: ttnn.Tensor, + y: ttnn.Tensor, +) -> None: + m_tiles_per_block = M_GRANULARITY + n_tiles_per_block = N_GRANULARITY + k_tiles_per_block = K_GRANULARITY + + m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block + n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block + k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block + + grid_n, grid_m = ttl.grid_size(dims=2) + + m_blocks_per_node = -(-m_blocks // grid_m) # divceil + n_blocks_per_node = -(-n_blocks // grid_n) # divceil + + a_dfb = ttl.make_dataflow_buffer_like( + a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2 + ) + b_dfb = ttl.make_dataflow_buffer_like( + b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2 + ) + c_dfb = ttl.make_dataflow_buffer_like( + c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + acc_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + y_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + + @ttl.datamovement() + def read(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * 
m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with c_dfb.reserve() as c_blk: + tx_c = ttl.copy( + c[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + c_blk, + ) + + tx_c.wait() + + for k_block in range(k_blocks): + start_k_tile = k_block * k_tiles_per_block + end_k_tile = (k_block + 1) * k_tiles_per_block + with ( + a_dfb.reserve() as a_blk, + b_dfb.reserve() as b_blk, + ): + tx_a = ttl.copy( + a[ + start_m_tile:end_m_tile, + start_k_tile:end_k_tile, + ], + a_blk, + ) + tx_b = ttl.copy( + b[ + start_k_tile:end_k_tile, + start_n_tile:end_n_tile, + ], + b_blk, + ) + + tx_a.wait() + tx_b.wait() + + @ttl.compute() + def compute(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + with acc_dfb.reserve() as acc_blk: + acc_blk.store(ttl.math.fill(acc_blk, 0)) + + for _ in range(k_blocks): + with ( + a_dfb.wait() as a_blk, + b_dfb.wait() as b_blk, + acc_dfb.wait() as pre_acc_blk, + ): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(pre_acc_blk + a_blk @ b_blk) + + with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk: + with y_dfb.reserve() as y_blk: + y_blk.store(c_blk + acc_blk) + + @ttl.datamovement() + def write(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + 
start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with y_dfb.wait() as y_blk: + tx = ttl.copy( + y_blk, + y[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + ) + tx.wait() + + +torch.manual_seed(42) + +n_devices = ttnn.GetNumAvailableDevices() +assert n_devices > 0 and ( + n_devices & (n_devices - 1) == 0 +), "Number of available devices must be power of 2 " +ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D) +mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices)) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + # Distribute tensors across devices for K-sharding: + # a: sharded along K (dim=1) — each device gets M×(K/n_devices) columns + # b: sharded along K (dim=0) — each device gets (K/n_devices)×N rows + + a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=1)) + b = from_torch(b, ttnn.ShardTensorToMesh(mesh_device, dim=0)) + + # The bias c should only be added once, not once per device. Build a + # stacked tensor of shape (M * n_devices, N): device 0 gets the real c, + # all other devices get zeros. After sharding along dim=0, each device + # receives its M×N slice: c for device 0, zeros for the rest. + + replicated_cs = torch.zeros((M * n_devices, N), dtype=torch.bfloat16) + replicated_cs[:M, :] = c + replicated_cs = from_torch( + replicated_cs, ttnn.ShardTensorToMesh(mesh_device, dim=0) + ) + + # partial_ys collects the per-device output: y_i = a_i @ b_i + c_i. + # These are partial sums that must be reduced on the host. 
+ + partial_ys = torch.zeros((M * n_devices, N), dtype=torch.bfloat16) + partial_ys = from_torch(partial_ys, ttnn.ShardTensorToMesh(mesh_device, dim=0)) + + tutorial_operation(a, b, replicated_cs, partial_ys) + + partial_ys = ttnn.to_torch( + partial_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0) + ) + + # Sum the partial products from each device to recover the full matmul + # result, then apply relu on the host. + + y = torch.zeros((M, N)) + + for i in range(n_devices): + y += partial_ys[i * M : (i + 1) * M, :] + + y = torch.relu(y) + + pcc = torch.corrcoef( + torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(mesh_device) diff --git a/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py b/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py new file mode 100644 index 000000000..3a5259372 --- /dev/null +++ b/examples/matmul-tutorial/step_7_multidevice_shard_k_all_reduce.py @@ -0,0 +1,248 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# +# Tutorial Step 7: Multi-Device, Shard K with All-Reduce +# ======================================================= +# Replaces the host-side manual reduction from Step 6 with an on-device +# all-reduce, keeping the final result on the mesh rather than pulling it +# to the host for summation. +# +# New concepts introduced: +# - ttnn.all_reduce — sums partial_ys across all devices in-place using the +# TT-Fabric interconnect; each device ends up with the fully reduced M×N +# result (the result is replicated across all devices) +# - Post-reduce activation: relu is applied on-device after all_reduce, +# replacing the host-side relu from Step 6 +# +# The TT-Lang operation body and the K-sharding setup are identical to Step 6. 
+# The only change is in the host code: ttnn.all_reduce + ttnn.relu replace the +# manual Python loop that summed partial outputs. + +import ttnn +import torch + + +def from_torch(tensor: torch.Tensor, mesh_mapper): + return ttnn.from_torch( + tensor, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=mesh_device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + mesh_mapper=mesh_mapper, + ) + + +import ttl + +TILE_SIZE = 32 +M_GRANULARITY = 4 +N_GRANULARITY = 4 +K_GRANULARITY = 4 + + +@ttl.operation(grid="auto") +def tutorial_operation( + a: ttnn.Tensor, + b: ttnn.Tensor, + c: ttnn.Tensor, + y: ttnn.Tensor, +) -> None: + m_tiles_per_block = M_GRANULARITY + n_tiles_per_block = N_GRANULARITY + k_tiles_per_block = K_GRANULARITY + + m_blocks = a.shape[0] // TILE_SIZE // m_tiles_per_block + n_blocks = b.shape[1] // TILE_SIZE // n_tiles_per_block + k_blocks = a.shape[1] // TILE_SIZE // k_tiles_per_block + + grid_n, grid_m = ttl.grid_size(dims=2) + + m_blocks_per_node = -(-m_blocks // grid_m) # divceil + n_blocks_per_node = -(-n_blocks // grid_n) # divceil + + a_dfb = ttl.make_dataflow_buffer_like( + a, shape=(m_tiles_per_block, k_tiles_per_block), block_count=2 + ) + b_dfb = ttl.make_dataflow_buffer_like( + b, shape=(k_tiles_per_block, n_tiles_per_block), block_count=2 + ) + c_dfb = ttl.make_dataflow_buffer_like( + c, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + acc_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + y_dfb = ttl.make_dataflow_buffer_like( + y, shape=(m_tiles_per_block, n_tiles_per_block), block_count=2 + ) + + @ttl.datamovement() + def read(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * 
n_blocks_per_node + local_n_block + if n_block < n_blocks: + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with c_dfb.reserve() as c_blk: + tx_c = ttl.copy( + c[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + c_blk, + ) + + tx_c.wait() + + for k_block in range(k_blocks): + start_k_tile = k_block * k_tiles_per_block + end_k_tile = (k_block + 1) * k_tiles_per_block + with ( + a_dfb.reserve() as a_blk, + b_dfb.reserve() as b_blk, + ): + tx_a = ttl.copy( + a[ + start_m_tile:end_m_tile, + start_k_tile:end_k_tile, + ], + a_blk, + ) + tx_b = ttl.copy( + b[ + start_k_tile:end_k_tile, + start_n_tile:end_n_tile, + ], + b_blk, + ) + + tx_a.wait() + tx_b.wait() + + @ttl.compute() + def compute(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + with acc_dfb.reserve() as acc_blk: + acc_blk.store(ttl.math.fill(acc_blk, 0)) + + for _ in range(k_blocks): + with ( + a_dfb.wait() as a_blk, + b_dfb.wait() as b_blk, + acc_dfb.wait() as pre_acc_blk, + ): + with acc_dfb.reserve() as acc_blk: + acc_blk.store(pre_acc_blk + a_blk @ b_blk) + + with c_dfb.wait() as c_blk, acc_dfb.wait() as acc_blk: + with y_dfb.reserve() as y_blk: + y_blk.store(c_blk + acc_blk) + + @ttl.datamovement() + def write(): + node_n, node_m = ttl.node(dims=2) + + for local_m_block in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m_block + if m_block < m_blocks: + start_m_tile = m_block * m_tiles_per_block + end_m_tile = (m_block + 1) * m_tiles_per_block + + for local_n_block in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n_block + if n_block < n_blocks: + start_n_tile = n_block * n_tiles_per_block + end_n_tile = (n_block + 1) * n_tiles_per_block + + with 
y_dfb.wait() as y_blk: + tx = ttl.copy( + y_blk, + y[ + start_m_tile:end_m_tile, + start_n_tile:end_n_tile, + ], + ) + tx.wait() + + +torch.manual_seed(42) + +n_devices = ttnn.GetNumAvailableDevices() +assert n_devices > 1 and ( + n_devices & (n_devices - 1) == 0 +), "Number of available devices must be >1 and be power of 2 " +ttnn.set_fabric_config(ttnn.FabricConfig.FABRIC_1D) +mesh_device = ttnn.open_mesh_device(ttnn.MeshShape(1, n_devices)) + +try: + M, K, N = 8192, 8192, 8192 + + a = torch.randn((M, K), dtype=torch.bfloat16) + b = torch.randn((K, N), dtype=torch.bfloat16) + c = torch.randn((M, N), dtype=torch.bfloat16) + + expected_y = torch.relu(a @ b + c) + + # K-sharding setup is identical to Step 6. + + a = from_torch(a, ttnn.ShardTensorToMesh(mesh_device, dim=1)) + b = from_torch(b, ttnn.ShardTensorToMesh(mesh_device, dim=0)) + + replicated_cs = torch.zeros((M * n_devices, N), dtype=torch.bfloat16) + replicated_cs[:M, :] = c + replicated_cs = from_torch( + replicated_cs, ttnn.ShardTensorToMesh(mesh_device, dim=0) + ) + + partial_ys = torch.zeros((M * n_devices, N), dtype=torch.bfloat16) + partial_ys = from_torch(partial_ys, ttnn.ShardTensorToMesh(mesh_device, dim=0)) + + tutorial_operation(a, b, replicated_cs, partial_ys) + + # ttnn.all_reduce sums partial_ys across all devices using TT-Fabric, + # producing a fully reduced M×N result replicated on every device. + # ttnn.relu is then applied on-device, replacing the host-side loop from + # Step 6. + + replicated_ys = ttnn.all_reduce(partial_ys) + replicated_ys = ttnn.relu(replicated_ys) + + # Because the result is replicated, all devices hold the correct answer. + # Verify each device's copy against the expected output. 
+ + replicated_ys = ttnn.to_torch( + replicated_ys, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=0) + ) + + for i in range(n_devices): + y = replicated_ys[i * M : (i + 1) * M, :] + + pcc = torch.corrcoef( + torch.stack([y.flatten().float(), expected_y.flatten().float()]) + )[0, 1].item() + + print(f"PCC {pcc:.6f}") + + assert pcc > 0.99 + +finally: + ttnn.close_device(mesh_device) From 2544cb1dc169e603b54ab88d24141a38733032e2 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Thu, 9 Apr 2026 20:59:22 -0700 Subject: [PATCH 02/31] Allow subblocking for matmul accumulating computes SubblockComputeForDST previously skipped all accumulating computes. Matmul K accumulates in-place in DST without consuming DST slots, so subblocking the parallel (M, N) dims is safe. LowerMatmulBlock handles the subblocked compute before LowerToLoops sees it. --- .../TTL/Transforms/ConvertTTLComputeToSCF.cpp | 11 ++++++++--- .../Transforms/TTLSubblockComputeForDST.cpp | 18 ++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp index 97aa10d3e..1625fecaa 100644 --- a/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp +++ b/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp @@ -373,14 +373,19 @@ struct LowerComputeToLoops : OpRewritePattern { }) .wasInterrupted(); - assert(!(isSubblocked && isAccumulating) && - "SubblockComputeForDST must skip accumulating computes"); - SmallVector iterTypes; for (Attribute attr : op.getIteratorTypes()) { iterTypes.push_back(mlir::cast(attr)); } + // Subblocked accumulating computes (matmul K>1 with output > DST) are + // handled by LowerMatmulBlock, which generates the K reduction loop + // with per-K DstSections. By the time LowerToLoops runs, the ComputeOp + // has been replaced. This assert catches unexpected cases. 
+ assert(!(isSubblocked && isAccumulating) && + "subblocked accumulating computes should be handled by " + "LowerMatmulBlock before LowerToLoops"); + // Side-effect-only loops: no iter_args, no tensor.insert, no scf.yield // with tensor values. Stores are explicit side effects (tile_store). bool processingFailed = false; diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp index c599257ec..493a607e5 100644 --- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp +++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp @@ -103,24 +103,30 @@ struct TTLSubblockComputeForDSTPass func::FuncOp funcOp = getOperation(); // Collect compute ops to subblock (avoid modifying while walking). - // Skip accumulating computes -- subblocking would break reduction - // accumulation by splitting the reduction loop across subblocks. + // Skip non-matmul accumulating computes (e.g., reduce_tile) because + // subblocking would break their reduction accumulation semantics. + // Matmul accumulating computes are safe: K accumulates in-place in + // DST without consuming DST slots (effectiveTiles already excludes + // reduction dims for matmul -- see hasMatmulBlock logic below). 
SmallVector opsToSubblock; funcOp.walk([&](ComputeOp computeOp) { auto unrollAttr = computeOp->getAttrOfType(kUnrollFactorAttrName); if (unrollAttr && unrollAttr.getInt() > 1) { bool hasAccumulating = false; + bool hasMatmulBlock = false; computeOp.getBody().walk([&](Operation *op) { if (op->hasTrait()) { hasAccumulating = true; - return WalkResult::interrupt(); } - return WalkResult::advance(); + if (isa(op)) { + hasMatmulBlock = true; + } }); - if (!hasAccumulating) { - opsToSubblock.push_back(computeOp); + if (hasAccumulating && !hasMatmulBlock) { + return; } + opsToSubblock.push_back(computeOp); } }); From 01cb9b6c5a53a541eb4b1176df9a041bce55b9dc Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Thu, 9 Apr 2026 23:07:39 -0700 Subject: [PATCH 03/31] initial pack_reconfig_l1_acc support --- include/ttlang/Dialect/TTL/Passes.td | 18 +++++ .../Transforms/TTKernelCombinePackTiles.cpp | 14 ++++ lib/Dialect/TTL/Pipelines/TTLPipelines.cpp | 1 + lib/Dialect/TTL/Transforms/CMakeLists.txt | 1 + .../Transforms/TTLAnnotateReductionLoops.cpp | 71 +++++++++++++++++++ .../Transforms/TTLSubblockComputeForDST.cpp | 51 +++++++++++-- python/ttl/ttl_api.py | 1 + test/python/test_matmul_k_accumulation.py | 23 +++++- .../TTL/Transforms/subblock_matmul.mlir | 47 ++++++------ 9 files changed, 196 insertions(+), 31 deletions(-) create mode 100644 lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index 71035366e..8e8765181 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -46,6 +46,24 @@ def TTKernelInsertL1Accumulation ]; } +def TTLAnnotateReductionLoops + : Pass<"ttl-annotate-reduction-loops", "::mlir::func::FuncOp"> { + let summary = "Annotate user-written scf.for loops as reduction loops"; + let description = [{ + Detects user-written `scf.for` loops where all iterations store to the + same CB slot (reserved before the loop, pushed after) and 
annotates them + with `ttl.reduction_loop`. This enables `TTKernelInsertL1Accumulation` + to insert `pack_reconfig_l1_acc` guards so that packs accumulate across + iterations instead of overwriting. + + Targets the pattern: + cb_reserve -> scf.for { compute(store to reserved CB) } -> cb_push + }]; + let dependentDialects = [ + "::mlir::scf::SCFDialect" + ]; +} + def TTLConvertTTLToCompute : Pass<"convert-ttl-to-compute", "::mlir::func::FuncOp"> { let summary = "Lower TTL elementwise tensor ops to ttl.compute with tile ops"; diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp index 1fa6a3503..5d0c5ad5f 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp @@ -11,12 +11,14 @@ // //===----------------------------------------------------------------------===// +#include "ttlang/Dialect/TTL/IR/TTL.h" #include "ttlang/Dialect/TTL/Passes.h" #include "ttmlir/Dialect/TTKernel/IR/TTKernel.h" #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" @@ -63,6 +65,18 @@ struct TTKernelCombinePackTilesPass void runOnOperation() override { getOperation().walk([](Block *block) { + // Skip blocks inside reduction loops: pack_tile_block is + // incompatible with L1 accumulation (pack_reconfig_l1_acc). + // L1 acc requires individual pack_tile calls so each K iteration + // can independently add to the existing L1 value. + for (Operation *parent = block->getParentOp(); parent; + parent = parent->getParentOp()) { + if (auto forOp = dyn_cast(parent)) { + if (forOp->hasAttr(kReductionLoopAttrName)) { + return; + } + } + } // Collect all combinable runs first, then replace them. 
Replacing // during iteration would invalidate the block's operation list. SmallVector> runs; diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp index 81be3dad8..d059fc090 100644 --- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp +++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp @@ -18,6 +18,7 @@ namespace mlir::tt::ttl { void createTTLToTTKernelPipeline(OpPassManager &pm, const TTLToTTKernelPipelineOptions &options) { + pm.addPass(createTTLAnnotateReductionLoops()); pm.addPass(createTTLConvertTTLToCompute()); { TTLSetComputeKernelConfigOptions configOpts; diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt index a5e2b3fb9..40d01413d 100644 --- a/lib/Dialect/TTL/Transforms/CMakeLists.txt +++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_dialect_library(TTLangTTLTransforms LowerDPrintToEmitC.cpp LowerSignpostToEmitC.cpp TTLAnnotateCBAssociations.cpp + TTLAnnotateReductionLoops.cpp TTLDumpCBFlowGraph.cpp TTLLowerMatmulBlock.cpp TTLAssignDST.cpp diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp new file mode 100644 index 000000000..b7d090781 --- /dev/null +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +//===----------------------------------------------------------------------===// +// TTL Annotate Reduction Loops +//===----------------------------------------------------------------------===// +// +// Detects user-written scf.for loops that accumulate into the same CB slot +// (reserve before loop, store inside, push after) and annotates them with +// kReductionLoopAttrName for L1 accumulation. 
+// +//===----------------------------------------------------------------------===// + +#include "ttlang/Dialect/TTL/IR/TTL.h" +#include "ttlang/Dialect/TTL/IR/TTLOps.h" +#include "ttlang/Dialect/TTL/Passes.h" + +#include "mlir/Dialect/SCF/IR/SCF.h" + +#define DEBUG_TYPE "ttl-annotate-reduction-loops" + +namespace mlir::tt::ttl { + +#define GEN_PASS_DEF_TTLANNOTATEREDUCTIONLOOPS +#include "ttlang/Dialect/TTL/Passes.h.inc" + +namespace { + +struct TTLAnnotateReductionLoopsPass + : public impl::TTLAnnotateReductionLoopsBase< + TTLAnnotateReductionLoopsPass> { + void runOnOperation() override { + func::FuncOp func = getOperation(); + + func.walk([&](scf::ForOp forOp) { + // Skip loops already annotated (from compiler-generated tile loops). + if (forOp->hasAttr(kReductionLoopAttrName) || + forOp->hasAttr(kTileLoopStrideAttrName) || + forOp->hasAttr(kSubblockLoopStrideAttrName)) { + return; + } + + // Check if the loop body contains a store (ttl.store) targeting a + // CB that was reserved (ttl.cb_reserve) before the loop. + bool hasReductionStore = false; + forOp.getBody()->walk([&](StoreOp store) { + Value view = store.getView(); + // Trace through attach_cb to find the cb_reserve. + if (auto attachCB = view.getDefiningOp()) { + view = attachCB.getTensor(); + } + if (auto reserve = view.getDefiningOp()) { + // The cb_reserve must be OUTSIDE the for loop (before it). 
+ if (!forOp->isAncestor(reserve)) { + hasReductionStore = true; + } + } + }); + + if (hasReductionStore) { + forOp->setAttr(kReductionLoopAttrName, + OpBuilder(forOp).getUnitAttr()); + } + }); + } +}; + +} // namespace + +} // namespace mlir::tt::ttl diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp index 493a607e5..5903dbbdc 100644 --- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp +++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp @@ -216,13 +216,47 @@ struct TTLSubblockComputeForDSTPass SmallVector parallelSubblockSizes = computeMultiDimSubblockSizes(parallelDimSizes, parallelBudget); - // Expand back to full-rank subblock sizes: reduction dims get their full - // size, parallel dims get the computed subblock size. + // Expand back to full-rank subblock sizes. Parallel dims get the + // computed subblock size. For matmul when the parallel output exceeds + // DST capacity, reduction (K) dims are tiled to 1 for L1 accumulation: + // each K step packs to L1 independently, and TTKernelInsertL1Accumulation + // inserts pack_reconfig_l1_acc guards. When the output fits in DST, K + // stays at full size for DST accumulation (higher precision, fewer packs). + bool tileKToOne = false; + if (hasMatmulBlock) { + // Only tile K for standalone matmul (no accumulator). The fused + // prev + a @ b pattern has an accumulator operand that requires + // DST accumulation semantics (copy_tile + matmul_block). Tiling + // K would break the accumulator reload logic. 
+ bool hasAccumulator = false; + computeOp.getBody().walk([&](TileMatmulBlockOp mmOp) { + if (mmOp.getAccumulator()) { + hasAccumulator = true; + } + }); + if (!hasAccumulator) { + int64_t parallelProduct = 1; + for (auto sz : parallelSubblockSizes) { + parallelProduct *= sz; + } + // Tile K to 1 when: (1) subblocking IS needed (parallel output + // exceeds DST), (2) the subblock is strictly smaller than the + // full output, and (3) the subblock is non-trivial (> 1 tile). + // When the subblock degenerates to 1x1 (e.g., prime dimensions), + // K tiling provides no benefit -- the per-tile DST accumulation + // path handles it via generateAccumulatingLoops. + tileKToOne = parallelProduct > 1 && + parallelProduct < effectiveTiles && + effectiveTiles > unrollFactor; + } + } SmallVector subblockSizes(rank); int64_t parallelIdx = 0; for (int64_t d = 0; d < rank; ++d) { if (iterTypes[d] == utils::IteratorType::parallel) { subblockSizes[d] = parallelSubblockSizes[parallelIdx++]; + } else if (tileKToOne) { + subblockSizes[d] = 1; } else { subblockSizes[d] = dimSizes[d]; } @@ -303,11 +337,14 @@ struct TTLSubblockComputeForDSTPass // can distinguish subblock loops from tile loops and compute correct // CB offsets (both linearized and per-dimension). for (size_t i = 0; i < subblockedDims.size(); ++i) { - loopNest.loops[i]->setAttr( - kSubblockLoopStrideAttrName, - b.getIndexAttr(blockStrides[subblockedDims[i]])); - loopNest.loops[i]->setAttr(kSubblockDimAttrName, - b.getIndexAttr(subblockedDims[i])); + int64_t dim = subblockedDims[i]; + loopNest.loops[i]->setAttr(kSubblockLoopStrideAttrName, + b.getIndexAttr(blockStrides[dim])); + loopNest.loops[i]->setAttr(kSubblockDimAttrName, b.getIndexAttr(dim)); + // Mark reduction dimension loops for L1 accumulation insertion. 
+ if (iterTypes[dim] == utils::IteratorType::reduction) { + loopNest.loops[i]->setAttr(kReductionLoopAttrName, b.getUnitAttr()); + } } // Precompute per-output subblock info: shape, tile count, and whether diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py index 9cf005fa0..2d6603df1 100644 --- a/python/ttl/ttl_api.py +++ b/python/ttl/ttl_api.py @@ -1203,6 +1203,7 @@ def _compile_kernel( assign_dst_pass = f"ttl-assign-dst{{enable-fpu-binary-ops={fpu_flag}}}" pipeline_passes = [ + "func.func(ttl-annotate-reduction-loops)", "func.func(convert-ttl-to-compute)", set_compute_config_pass, f"func.func({assign_dst_pass})", diff --git a/test/python/test_matmul_k_accumulation.py b/test/python/test_matmul_k_accumulation.py index a5d79d9d3..4ed632865 100644 --- a/test/python/test_matmul_k_accumulation.py +++ b/test/python/test_matmul_k_accumulation.py @@ -175,13 +175,30 @@ def test_matmul_k_accumulation_streaming(k_tiles, block_n, device): @pytest.mark.parametrize("k_tiles", K_TILES, ids=[f"K{k}" for k in K_TILES]) @pytest.mark.requires_device def test_matmul_k_accumulation_single_fill(k_tiles, block_n, device): - """Kt>1 single-fill accumulation: tighter bounds (f32 DST).""" + """Kt>1 single-fill accumulation. + + When the output block fits in DST (block_n <= 4 for f32), matmul_block + accumulates all K tiles in f32 DST with one bf16 truncation at the end + (tighter bounds). When the output exceeds DST capacity (block_n > 4 + for f32), the compiler tiles K to 1 for L1 accumulation, producing one + bf16 truncation per K step (same bounds as the streaming test). + """ scale = math.sqrt(k_tiles) + # DST capacity with fp32_dest_acc_en=true is 4. Output block is + # 1 x block_n. When block_n > 4, L1 acc activates with per-K-step + # bf16 truncation, requiring relaxed error bounds. 
+ uses_l1_acc = block_n > 4 + if uses_l1_acc: + max_err = 0.5 * scale + mean_err = 0.05 * scale + else: + max_err = 0.1 * scale + mean_err = 0.01 * scale _run( _make_matmul_kn, k_tiles, block_n, device, - max_err_limit=0.1 * scale, - mean_err_limit=0.01 * scale, + max_err_limit=max_err, + mean_err_limit=mean_err, ) diff --git a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir index 9fb0212c9..aa4a891f8 100644 --- a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir +++ b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir @@ -1,35 +1,40 @@ // Tests for ttl-subblock-compute-for-dst with matmul computes. // Matmul K (reduction) accumulates in-place in DST, so only M*N parallel -// tiles count toward the DST budget. Subblocking partitions the M*N output -// space while keeping K whole in each subblock. +// tiles count toward the DST budget. When the parallel output exceeds DST, +// subblocking partitions M*N AND tiles K to 1 for L1 accumulation. // RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(convert-ttl-to-compute, ttl-set-compute-kernel-config, ttl-assign-dst{enable-fpu-binary-ops=0}, ttl-subblock-compute-for-dst))' --split-input-file | FileCheck %s // ----- -// Purpose: M*N=16 exceeds f32 DST capacity (4). K=3 is excluded from the -// budget, so subblocking partitions the 4x4 output into 1x4 strips. -// Loop on M (dim 0): 0 to 4 step 1. K (dim 2) stays at 3 in each subblock. +// Purpose: M*N=16 exceeds f32 DST capacity (4). Subblocking partitions the +// 4x4 output into 1x4 strips AND tiles K from 3 to 1. The K loop is +// annotated with ttl.reduction_loop for L1 accumulation. +// Loops: M (dim 0) 0..4 step 1, K (dim 2) 0..3 step 1. -// CHECK-LABEL: func.func @matmul_subblock_k_excluded +// CHECK-LABEL: func.func @matmul_subblock_k_tiled // CHECK-SAME: fp32_dest_acc_en = true // Outer subblock loop over M dimension. 
-// CHECK: scf.for %[[IV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} { -// A sliced on M, K kept whole: [iv, 0] [1, 3]. -// CHECK: tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 3] [1, 1] -// B not sliced (full [3, 4]). -// CHECK: tensor.extract_slice {{.*}}[0, 0] [3, 4] [1, 1] -// Output sliced on M: [iv, 0] [1, 4]. -// CHECK: tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 4] [1, 1] -// Inner compute on subblock [1, 4, 3] (M=1, N=4, K=3). -// CHECK: ttl.compute -// CHECK-SAME: tensor<1x3x!ttcore.tile<32x32, bf16>> -// CHECK-SAME: tensor<3x4x!ttcore.tile<32x32, bf16>> -// CHECK-SAME: tensor<1x4x!ttcore.tile<32x32, bf16>> -// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"] -// CHECK: ttl.tile_matmul_block +// CHECK: scf.for %[[MIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} { +// Inner K reduction loop. +// CHECK: scf.for %[[KIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} { +// A sliced on M and K: [miv, kiv] [1, 1]. +// CHECK: tensor.extract_slice {{.*}}[%[[MIV]], %[[KIV]]] [1, 1] [1, 1] +// B sliced on K: [kiv, 0] [1, 4]. +// CHECK: tensor.extract_slice {{.*}}[%[[KIV]], 0] [1, 4] [1, 1] +// Output sliced on M: [miv, 0] [1, 4]. +// CHECK: tensor.extract_slice {{.*}}[%[[MIV]], 0] [1, 4] [1, 1] +// Inner compute on subblock [1, 4, 1] (M=1, N=4, K=1). +// CHECK: ttl.compute +// CHECK-SAME: tensor<1x1x!ttcore.tile<32x32, bf16>> +// CHECK-SAME: tensor<1x4x!ttcore.tile<32x32, bf16>> +// CHECK-SAME: tensor<1x4x!ttcore.tile<32x32, bf16>> +// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"] +// CHECK: ttl.tile_matmul_block +// K loop annotated for L1 accumulation. 
+// CHECK: } {{{.*}}ttl.reduction_loop{{.*}}} // CHECK: } -func.func @matmul_subblock_k_excluded( +func.func @matmul_subblock_k_tiled( %arg0: tensor<4x3x!ttcore.tile<32x32, bf16>>, %arg1: tensor<3x4x!ttcore.tile<32x32, bf16>>) -> tensor<4x4x!ttcore.tile<32x32, bf16>> { %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[4, 3], !ttcore.tile<32x32, bf16>, 2> From 513cc4d7dfc6d063db5f93d3d3446edef13ccc61 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Thu, 9 Apr 2026 23:07:50 -0700 Subject: [PATCH 04/31] preformat --- lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp | 3 +-- lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp index b7d090781..181318da1 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp @@ -59,8 +59,7 @@ struct TTLAnnotateReductionLoopsPass }); if (hasReductionStore) { - forOp->setAttr(kReductionLoopAttrName, - OpBuilder(forOp).getUnitAttr()); + forOp->setAttr(kReductionLoopAttrName, OpBuilder(forOp).getUnitAttr()); } }); } diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp index 5903dbbdc..cb32c9103 100644 --- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp +++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp @@ -245,8 +245,7 @@ struct TTLSubblockComputeForDSTPass // When the subblock degenerates to 1x1 (e.g., prime dimensions), // K tiling provides no benefit -- the per-tile DST accumulation // path handles it via generateAccumulatingLoops. 
- tileKToOne = parallelProduct > 1 && - parallelProduct < effectiveTiles && + tileKToOne = parallelProduct > 1 && parallelProduct < effectiveTiles && effectiveTiles > unrollFactor; } } From 64764602bdf6deb23a89af436db5c85c3ec41a3d Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Thu, 9 Apr 2026 23:13:02 -0700 Subject: [PATCH 05/31] add pytest for l1 accumulation --- test/python/test_matmul_l1_acc.py | 191 ++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 test/python/test_matmul_l1_acc.py diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py new file mode 100644 index 000000000..599c33ea7 --- /dev/null +++ b/test/python/test_matmul_l1_acc.py @@ -0,0 +1,191 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Matmul L1 accumulation: reserve once, store K times, push once. + +The compiler detects the scf.for loop storing to the same reserved CB +and annotates it as a reduction loop. TTKernelInsertL1Accumulation inserts +pack_reconfig_l1_acc guards so each K iteration packs additively to L1. + +Tests single-core and multicore configurations with various block sizes. 
+""" + +# REQUIRES: ttnn +# UNSUPPORTED: system-darwin +# RUN: %python -m pytest %s -v --tb=short + +import pytest +import torch +import ttl + +ttnn = pytest.importorskip("ttnn", exc_type=ImportError) + +from ttlang_test_utils import to_dram +from utils.correctness import assert_pcc + +TILE = 32 + + +def _make_l1_acc_kernel(block_m, block_n, grid="auto"): + """Matmul with L1 accumulation: reserve once, store K times, push once.""" + + @ttl.operation(grid=grid) + def kernel(a, b, out): + Mt = a.shape[0] // TILE + Kt = a.shape[1] // TILE + Nt = b.shape[1] // TILE + + M_num = Mt // block_m + N_num = Nt // block_n + + grid_n, grid_m = ttl.grid_size(dims=2) + m_per = -(-M_num // grid_m) + n_per = -(-N_num // grid_n) + + a_dfb = ttl.make_dataflow_buffer_like(a, shape=(block_m, 1), block_count=2) + b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, block_n), block_count=2) + out_dfb = ttl.make_dataflow_buffer_like( + out, shape=(block_m, block_n), block_count=2 + ) + + @ttl.compute() + def compute(): + node_n, node_m = ttl.node(dims=2) + for lm in range(m_per): + mb = node_m * m_per + lm + if mb < M_num: + for ln in range(n_per): + nb = node_n * n_per + ln + if nb < N_num: + out_blk = out_dfb.reserve() + for _ in range(Kt): + a_blk = a_dfb.wait() + b_blk = b_dfb.wait() + out_blk.store(a_blk @ b_blk) + a_blk.pop() + b_blk.pop() + out_blk.push() + + @ttl.datamovement() + def reader(): + node_n, node_m = ttl.node(dims=2) + for lm in range(m_per): + mb = node_m * m_per + lm + if mb < M_num: + m_off = mb * block_m + for ln in range(n_per): + nb = node_n * n_per + ln + if nb < N_num: + for kt in range(Kt): + with a_dfb.reserve() as blk: + ttl.copy( + a[ + m_off : m_off + block_m, + kt : kt + 1, + ], + blk, + ).wait() + + @ttl.datamovement() + def writer(): + node_n, node_m = ttl.node(dims=2) + for lm in range(m_per): + mb = node_m * m_per + lm + if mb < M_num: + m_off = mb * block_m + for ln in range(n_per): + nb = node_n * n_per + ln + if nb < N_num: + n_off = nb * block_n + 
for kt in range(Kt): + with b_dfb.reserve() as blk: + ttl.copy( + b[ + kt : kt + 1, + n_off : n_off + block_n, + ], + blk, + ).wait() + with out_dfb.wait() as blk: + ttl.copy( + blk, + out[ + m_off : m_off + block_m, + n_off : n_off + block_n, + ], + ).wait() + + return kernel + + +# Single-core tests (grid=(1,1)) +SINGLE_CORE_PARAMS = [ + # (block_m, block_n, Kt) + (2, 2, 2), # Output 2x2=4 fits in f32 DST + (2, 2, 4), # K=4 + (3, 3, 2), # Output 3x3=9 > f32 DST(4) + (4, 4, 4), # Output 4x4=16 > f32 DST(4) + (8, 8, 2), # Large output, small K + (8, 8, 8), # Large output, large K +] + + +@pytest.mark.parametrize( + "block_m,block_n,Kt", + SINGLE_CORE_PARAMS, + ids=[f"blk{m}x{n}_K{k}" for m, n, k in SINGLE_CORE_PARAMS], +) +@pytest.mark.requires_device +def test_l1_acc_single_core(block_m, block_n, Kt, device): + """L1 accumulation on single core with various block sizes.""" + M, K, N = block_m * TILE, Kt * TILE, block_n * TILE + a_torch = torch.randn(M, K, dtype=torch.bfloat16) + b_torch = torch.randn(K, N, dtype=torch.bfloat16) + golden = (a_torch.float() @ b_torch.float()).float() + + a = to_dram(a_torch, device) + b = to_dram(b_torch, device) + out = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device) + + kernel = _make_l1_acc_kernel(block_m, block_n, grid=(1, 1)) + kernel(a, b, out) + + result = ttnn.to_torch(out).float() + assert_pcc(golden, result, threshold=0.999) + + +# Multicore tests (grid="auto") with multiple output blocks +MULTI_CORE_PARAMS = [ + # (Mt, Kt, Nt, block_m, block_n) + (16, 4, 16, 8, 8), # 2x2 output blocks, K=4 + (32, 8, 32, 8, 8), # 4x4 output blocks, K=8 + (128, 128, 128, 8, 8), # 16x16 output blocks, K=128 (4096^3 shape) +] + + +@pytest.mark.parametrize( + "Mt,Kt,Nt,block_m,block_n", + MULTI_CORE_PARAMS, + ids=[ + f"tiles{mt}x{kt}x{nt}_blk{bm}x{bn}" for mt, kt, nt, bm, bn in MULTI_CORE_PARAMS + ], +) +@pytest.mark.requires_device +def test_l1_acc_multicore(Mt, Kt, Nt, block_m, block_n, device): + """L1 accumulation with multicore 
and multiple output blocks.""" + M, K, N = Mt * TILE, Kt * TILE, Nt * TILE + a_torch = torch.randn(M, K, dtype=torch.bfloat16) + b_torch = torch.randn(K, N, dtype=torch.bfloat16) + golden = (a_torch.float() @ b_torch.float()).float() + + a = to_dram(a_torch, device) + b = to_dram(b_torch, device) + out = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device) + + kernel = _make_l1_acc_kernel(block_m, block_n) + kernel(a, b, out) + + result = ttnn.to_torch(out).float() + assert_pcc(golden, result, threshold=0.999) From 455ad86a56ab5edfef88eaff7b3647eecfe4f0b5 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Thu, 9 Apr 2026 23:24:56 -0700 Subject: [PATCH 06/31] correct placement of pack_reconfig_l1_acc --- include/ttlang/Dialect/TTL/IR/TTL.h | 6 ++++ .../TTKernelInsertL1Accumulation.cpp | 30 ++++++++++++------- .../Transforms/TTLAnnotateReductionLoops.cpp | 2 +- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/include/ttlang/Dialect/TTL/IR/TTL.h b/include/ttlang/Dialect/TTL/IR/TTL.h index 72ef23f5e..f96a4f1eb 100644 --- a/include/ttlang/Dialect/TTL/IR/TTL.h +++ b/include/ttlang/Dialect/TTL/IR/TTL.h @@ -64,6 +64,12 @@ constexpr llvm::StringLiteral kTileLoopStrideAttrName("ttl.tile_loop_stride"); /// ComputeOp is lowered to loops. constexpr llvm::StringLiteral kReductionLoopAttrName("ttl.reduction_loop"); +/// Marks a user-written scf.for as an L1 accumulation loop. Each iteration +/// packs to the same CB slot; pack_reconfig_l1_acc makes subsequent +/// iterations additive. Distinct from kReductionLoopAttrName which marks +/// compiler-generated DST accumulation loops. +constexpr llvm::StringLiteral kL1AccLoopAttrName("ttl.l1_acc_loop"); + /// Output CB index on tile ops that need it for init insertion. 
constexpr llvm::StringLiteral kBcastOutputCBIndexAttrName("ttl.bcast_output_cb_index"); diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 3c40a1872..713854283 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -35,26 +35,36 @@ namespace ttk = mlir::tt::ttkernel; namespace { -/// Find the innermost enclosing reduction loop for an operation. -static scf::ForOp findInnermostReductionLoop(Operation *op) { +/// Find the enclosing loop that should control L1 accumulation. +/// Prefers kL1AccLoopAttrName (user-annotated) over kReductionLoopAttrName +/// (compiler-generated). Returns the innermost matching loop, but if a +/// kL1AccLoopAttrName loop exists, kReductionLoopAttrName loops inside +/// it are skipped (they use DST accumulation, not L1). +static scf::ForOp findL1AccLoop(Operation *op) { + scf::ForOp l1AccLoop; + scf::ForOp reductionLoop; for (Operation *parent = op->getParentOp(); parent; parent = parent->getParentOp()) { if (auto forOp = dyn_cast(parent)) { - if (forOp->hasAttr(kReductionLoopAttrName)) { - return forOp; + if (forOp->hasAttr(kL1AccLoopAttrName)) { + l1AccLoop = forOp; + } else if (forOp->hasAttr(kReductionLoopAttrName) && !reductionLoop) { + reductionLoop = forOp; } } } - return nullptr; + // User-annotated L1 acc loop takes priority. + return l1AccLoop ? l1AccLoop : reductionLoop; } -/// Find the outermost enclosing reduction loop for an operation. -static scf::ForOp findOutermostReductionLoop(Operation *op) { +/// Find the outermost enclosing L1 acc or reduction loop for the disable guard. 
+static scf::ForOp findOutermostL1AccLoop(Operation *op) { scf::ForOp outermost; for (Operation *parent = op->getParentOp(); parent; parent = parent->getParentOp()) { if (auto forOp = dyn_cast(parent)) { - if (forOp->hasAttr(kReductionLoopAttrName)) { + if (forOp->hasAttr(kL1AccLoopAttrName) || + forOp->hasAttr(kReductionLoopAttrName)) { outermost = forOp; } } @@ -72,7 +82,7 @@ struct TTKernelInsertL1AccumulationPass // avoids invalidation issues from modifying IR during iteration. SmallVector> targets; moduleOp->walk([&](ttk::TileRegsAcquireOp acquireOp) { - auto reductionLoop = findInnermostReductionLoop(acquireOp); + auto reductionLoop = findL1AccLoop(acquireOp); if (!reductionLoop) { return; } @@ -108,7 +118,7 @@ struct TTKernelInsertL1AccumulationPass ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag); // Disable L1 accumulation after the outermost reduction loop. - auto outermostLoop = findOutermostReductionLoop(acquireOp); + auto outermostLoop = findOutermostL1AccLoop(acquireOp); if (disabledLoops.insert(outermostLoop).second) { builder.setInsertionPointAfter(outermostLoop); Value disableFlag = arith::ConstantOp::create( diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp index 181318da1..7545aba6d 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp @@ -59,7 +59,7 @@ struct TTLAnnotateReductionLoopsPass }); if (hasReductionStore) { - forOp->setAttr(kReductionLoopAttrName, OpBuilder(forOp).getUnitAttr()); + forOp->setAttr(kL1AccLoopAttrName, OpBuilder(forOp).getUnitAttr()); } }); } From b51e5544d25e80c07559a74dced7e1be4fbd348a Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Fri, 10 Apr 2026 08:10:18 -0700 Subject: [PATCH 07/31] more packer accumulation fixes --- .../Transforms/TTKernelCombinePackTiles.cpp | 3 +- .../Transforms/TTKernelInsertInits.cpp | 3 +- .../TTKernelInsertL1Accumulation.cpp 
| 99 ++++++++++++------- .../Transforms/TTLSubblockComputeForDST.cpp | 52 ++-------- 4 files changed, 75 insertions(+), 82 deletions(-) diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp index 5d0c5ad5f..22eb9f2cc 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp @@ -72,7 +72,8 @@ struct TTKernelCombinePackTilesPass for (Operation *parent = block->getParentOp(); parent; parent = parent->getParentOp()) { if (auto forOp = dyn_cast(parent)) { - if (forOp->hasAttr(kReductionLoopAttrName)) { + if (forOp->hasAttr(kReductionLoopAttrName) || + forOp->hasAttr(kL1AccLoopAttrName)) { return; } } diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp index fb63e3e4f..b28ccded4 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp @@ -359,7 +359,8 @@ static Operation *hoistAboveCompilerLoops(Operation *op) { while (auto *parentOp = insertBefore->getParentOp()) { if (isa(parentOp) && (parentOp->hasAttr(kTileLoopStrideAttrName) || - parentOp->hasAttr(kSubblockLoopStrideAttrName))) { + parentOp->hasAttr(kSubblockLoopStrideAttrName) || + parentOp->hasAttr(kL1AccLoopAttrName))) { insertBefore = parentOp; } else { break; diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 713854283..646d1be64 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -35,26 +35,23 @@ namespace ttk = mlir::tt::ttkernel; namespace { -/// Find the enclosing loop that should control L1 accumulation. 
-/// Prefers kL1AccLoopAttrName (user-annotated) over kReductionLoopAttrName -/// (compiler-generated). Returns the innermost matching loop, but if a -/// kL1AccLoopAttrName loop exists, kReductionLoopAttrName loops inside -/// it are skipped (they use DST accumulation, not L1). +/// Find the enclosing loop that controls L1 accumulation. +/// Prefers kL1AccLoopAttrName (user-annotated). Falls back to innermost +/// kReductionLoopAttrName (compiler-generated, for reduce ops). static scf::ForOp findL1AccLoop(Operation *op) { - scf::ForOp l1AccLoop; - scf::ForOp reductionLoop; + scf::ForOp reductionFallback; for (Operation *parent = op->getParentOp(); parent; parent = parent->getParentOp()) { if (auto forOp = dyn_cast(parent)) { if (forOp->hasAttr(kL1AccLoopAttrName)) { - l1AccLoop = forOp; - } else if (forOp->hasAttr(kReductionLoopAttrName) && !reductionLoop) { - reductionLoop = forOp; + return forOp; + } + if (forOp->hasAttr(kReductionLoopAttrName) && !reductionFallback) { + reductionFallback = forOp; } } } - // User-annotated L1 acc loop takes priority. - return l1AccLoop ? l1AccLoop : reductionLoop; + return reductionFallback; } /// Find the outermost enclosing L1 acc or reduction loop for the disable guard. @@ -78,49 +75,77 @@ struct TTKernelInsertL1AccumulationPass void runOnOperation() override { auto moduleOp = getOperation(); - // Collect all acquire ops inside reduction loops. Collecting first - // avoids invalidation issues from modifying IR during iteration. - SmallVector> targets; + // Collect L1 acc loops (kL1AccLoopAttrName or kReductionLoopAttrName) + // that contain pack_tile activity. + SmallVector l1AccLoops; + llvm::SmallDenseSet seenLoops; moduleOp->walk([&](ttk::TileRegsAcquireOp acquireOp) { - auto reductionLoop = findL1AccLoop(acquireOp); - if (!reductionLoop) { + auto loop = findL1AccLoop(acquireOp); + if (!loop || !seenLoops.insert(loop).second) { return; } - // L1 accumulation uses additive packing -- only valid for sum - // reductions. 
Max reductions require DST accumulation (Phase 2) - // where the hardware max operation accumulates across iterations. bool hasMaxReduce = false; - reductionLoop->walk([&](ttk::ReduceTileOp reduceOp) { + loop->walk([&](ttk::ReduceTileOp reduceOp) { if (reduceOp.getReduceType() == ttk::ReduceType::Max) { hasMaxReduce = true; } }); if (!hasMaxReduce) { - targets.emplace_back(acquireOp, reductionLoop); + l1AccLoops.push_back(loop); } }); + // Insert pack_reconfig_l1_acc matching the tt-metal minimal_matmul + // pattern: enable at the END of the first K iteration (after all + // DstSections complete), disable after the loop. The enable guard + // uses `if (k == lb)` so it fires once when the first iteration + // finishes, and L1 acc stays enabled for all subsequent iterations. llvm::SmallDenseSet disabledLoops; - for (auto [acquireOp, reductionLoop] : targets) { - OpBuilder builder(acquireOp->getContext()); - builder.setInsertionPointAfter(acquireOp); - Location loc = acquireOp.getLoc(); - - // Guard: if (loop_iv != lower_bound) pack_reconfig_l1_acc(1) - Value loopIV = reductionLoop.getInductionVar(); - Value loopLB = reductionLoop.getLowerBound(); - Value notFirstIter = arith::CmpIOp::create( - builder, loc, arith::CmpIPredicate::ne, loopIV, loopLB); - auto ifOp = scf::IfOp::create(builder, loc, notFirstIter); + for (scf::ForOp loop : l1AccLoops) { + OpBuilder builder(loop->getContext()); + Location loc = loop.getLoc(); + + // Disable L1 acc before the loop to ensure clean state. 
+ builder.setInsertionPoint(loop); + Value disablePre = arith::ConstantOp::create( + builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); + ttk::PackReconfigL1AccOp::create(builder, loc, disablePre); + + // Enable at end of first iteration, matching tt-metal: + // if (k_block == 0) { PACK((llk_pack_reconfig_l1_acc(1))); } + Operation *yield = loop.getBody()->getTerminator(); + builder.setInsertionPoint(yield); + Value loopIV = loop.getInductionVar(); + Value loopLB = loop.getLowerBound(); + Value isFirstIter = arith::CmpIOp::create( + builder, loc, arith::CmpIPredicate::eq, loopIV, loopLB); + auto ifOp = scf::IfOp::create(builder, loc, isFirstIter); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); Value enableFlag = arith::ConstantOp::create( builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1)); ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag); - // Disable L1 accumulation after the outermost reduction loop. - auto outermostLoop = findOutermostL1AccLoop(acquireOp); - if (disabledLoops.insert(outermostLoop).second) { - builder.setInsertionPointAfter(outermostLoop); + // Disable after each L1 acc loop to prevent L1 acc state from + // leaking into outer loops or subsequent code. + if (disabledLoops.insert(loop.getOperation()).second) { + // For the outermost loop, place disable after cb_push_back. + // For inner loops, place directly after the loop. + auto outermostLoop = findOutermostL1AccLoop(loop); + bool isOutermost = !outermostLoop || outermostLoop == loop; + if (isOutermost) { + // Scan forward for cb_push_back. 
+ Operation *insertPoint = loop->getNextNode(); + while (insertPoint && !isa(insertPoint)) { + insertPoint = insertPoint->getNextNode(); + } + if (insertPoint) { + builder.setInsertionPointAfter(insertPoint); + } else { + builder.setInsertionPointAfter(loop); + } + } else { + builder.setInsertionPointAfter(loop); + } Value disableFlag = arith::ConstantOp::create( builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); ttk::PackReconfigL1AccOp::create(builder, loc, disableFlag); diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp index cb32c9103..f6c082780 100644 --- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp +++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp @@ -216,46 +216,15 @@ struct TTLSubblockComputeForDSTPass SmallVector parallelSubblockSizes = computeMultiDimSubblockSizes(parallelDimSizes, parallelBudget); - // Expand back to full-rank subblock sizes. Parallel dims get the - // computed subblock size. For matmul when the parallel output exceeds - // DST capacity, reduction (K) dims are tiled to 1 for L1 accumulation: - // each K step packs to L1 independently, and TTKernelInsertL1Accumulation - // inserts pack_reconfig_l1_acc guards. When the output fits in DST, K - // stays at full size for DST accumulation (higher precision, fewer packs). - bool tileKToOne = false; - if (hasMatmulBlock) { - // Only tile K for standalone matmul (no accumulator). The fused - // prev + a @ b pattern has an accumulator operand that requires - // DST accumulation semantics (copy_tile + matmul_block). Tiling - // K would break the accumulator reload logic. 
- bool hasAccumulator = false; - computeOp.getBody().walk([&](TileMatmulBlockOp mmOp) { - if (mmOp.getAccumulator()) { - hasAccumulator = true; - } - }); - if (!hasAccumulator) { - int64_t parallelProduct = 1; - for (auto sz : parallelSubblockSizes) { - parallelProduct *= sz; - } - // Tile K to 1 when: (1) subblocking IS needed (parallel output - // exceeds DST), (2) the subblock is strictly smaller than the - // full output, and (3) the subblock is non-trivial (> 1 tile). - // When the subblock degenerates to 1x1 (e.g., prime dimensions), - // K tiling provides no benefit -- the per-tile DST accumulation - // path handles it via generateAccumulatingLoops. - tileKToOne = parallelProduct > 1 && parallelProduct < effectiveTiles && - effectiveTiles > unrollFactor; - } - } + // Reduction dims keep their full size. For matmul, K accumulates + // in-place in DST via matmul_block(kt=K_block). L1 accumulation + // across user-managed outer K iterations is handled separately by + // TTKernelInsertL1Accumulation (kL1AccLoopAttrName). SmallVector subblockSizes(rank); int64_t parallelIdx = 0; for (int64_t d = 0; d < rank; ++d) { if (iterTypes[d] == utils::IteratorType::parallel) { subblockSizes[d] = parallelSubblockSizes[parallelIdx++]; - } else if (tileKToOne) { - subblockSizes[d] = 1; } else { subblockSizes[d] = dimSizes[d]; } @@ -336,14 +305,11 @@ struct TTLSubblockComputeForDSTPass // can distinguish subblock loops from tile loops and compute correct // CB offsets (both linearized and per-dimension). for (size_t i = 0; i < subblockedDims.size(); ++i) { - int64_t dim = subblockedDims[i]; - loopNest.loops[i]->setAttr(kSubblockLoopStrideAttrName, - b.getIndexAttr(blockStrides[dim])); - loopNest.loops[i]->setAttr(kSubblockDimAttrName, b.getIndexAttr(dim)); - // Mark reduction dimension loops for L1 accumulation insertion. 
- if (iterTypes[dim] == utils::IteratorType::reduction) { - loopNest.loops[i]->setAttr(kReductionLoopAttrName, b.getUnitAttr()); - } + loopNest.loops[i]->setAttr( + kSubblockLoopStrideAttrName, + b.getIndexAttr(blockStrides[subblockedDims[i]])); + loopNest.loops[i]->setAttr(kSubblockDimAttrName, + b.getIndexAttr(subblockedDims[i])); } // Precompute per-output subblock info: shape, tile count, and whether From ec703c780b00cc289954a82a30e424ed5f654a0f Mon Sep 17 00:00:00 2001 From: Alex Richins Date: Fri, 10 Apr 2026 10:03:35 -0700 Subject: [PATCH 08/31] matmul cleanup and 2d mcast (#465) ### Problem description The existing matmul examples had stale patterns from before the `acc=True` removal, and `split_work_to_nodes` in `utils/block_allocation.py` had a bug that produced incorrect block parameters for certain matrix shapes. This change also adds a 2D multicast matmul example and generally cleans up and fixes the tests in metal_examples. ### What's changed - **API updates across metal examples**: Updated all existing metal matmul examples (`1d_mcast_matmul`, `multinode_matmul`, `multinode_reuse_matmul`, `single_node_matmul`) to follow the ttnn API names that were changed back (e.g. `CoreRangeSet`, `core_ranges`) and the updated metal header paths (`api/compute/...`). - **ttlang matmul accumulation pattern**: Updated all ttlang matmul examples to use the explicit `+=` accumulation pattern, replacing the removed `store(..., acc=True)` API. Accumulation is now expressed as `acc = ttl.math.fill(out_blk, 0)` followed by `acc += a_blk @ b_blk` in the K loop, with a final `out_blk.store(acc)`. - **2D mcast matmul (metal)**: Added a metal reference implementation under `examples/metal_examples/2d_mcast_matmul/metal/` - **2D mcast matmul (tt-lang)**: Added a tt-lang implementation under `examples/metal_examples/2d_mcast_matmul/ttlang/`. Uses `ttl.Pipe` and `ttl.PipeNet` to express the A (row-wise) and B (column-wise) multicast patterns, with `get_large_matmul_params` for block parameter selection. 
### Checklist - [ ] New/Existing tests provide coverage for changes - `test_block_allocation.py` updated with new coverage for the block allocation fix - 2D mcast matmul example manually tested on hardware and in sim --- .../1d_mcast_matmul/metal/1d_matmul_metal.py | 58 +-- .../metal/kernels/reuse_compute.cpp | 8 +- .../2d_mcast_matmul/metal/2d_mcast_matmul.py | 432 ++++++++++++++++++ .../metal/kernels/bmm_large_block_zm.cpp | 107 +++++ ..._tile_layout_in0_receiver_in1_receiver.cpp | 106 +++++ ...mm_tile_layout_in0_receiver_in1_sender.cpp | 156 +++++++ ...mm_tile_layout_in0_sender_in1_receiver.cpp | 155 +++++++ ..._bmm_tile_layout_in0_sender_in1_sender.cpp | 200 ++++++++ .../metal/kernels/writer_bmm_tile_layout.cpp | 57 +++ .../2d_mcast_matmul/ttlang/2d_mcast_matmul.py | 182 ++++++++ .../metal/kernels/mm_compute.cpp | 4 +- .../metal/multinode_matmul.py | 104 +++-- .../ttlang/multinode_matmul.py | 28 +- .../metal/kernels/bmm_large_block_zm.cpp | 4 +- .../metal/multinode_reuse_matmul.py | 102 +++-- .../ttlang/multinode_reuse_matmul.py | 3 +- .../metal/kernels/mm_compute.cpp | 4 +- .../metal/single_node_matmul.py | 26 +- .../ttlang/single_node_matmul.py | 4 +- examples/tt_upsample.py | 12 +- python/utils/__init__.py | 7 +- python/utils/block_allocation.py | 60 +-- test/python/test_block_allocation.py | 100 ++-- 23 files changed, 1688 insertions(+), 231 deletions(-) create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp create mode 100644 
examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp create mode 100644 examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp create mode 100644 examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py diff --git a/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py b/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py index f5b6a0157..22429529b 100644 --- a/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py +++ b/examples/metal_examples/1d_mcast_matmul/metal/1d_matmul_metal.py @@ -175,11 +175,11 @@ def test_1d_matmul_metal( ), "1D matmul requires multiple blocks to use all 4 kernels" # Single sender node at (0, 0) broadcasts to all other nodes - in0_sender_node = ttnn.NodeRangeSet( - [ttnn.NodeRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 0))] + in0_sender_node = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 0))] ) # All compute nodes (entire grid used for computation) - all_nodes = ttnn.num_nodes_to_noderangeset( + all_nodes = ttnn.num_cores_to_corerangeset( num_worker_nodes, ttnn.CoreCoord(num_nodes_x, num_nodes_y), row_wise=True ) # Receiver nodes are all nodes except the single sender node (0,0) @@ -218,22 +218,22 @@ def test_1d_matmul_metal( block_count = 2 a_cb_descriptor = ttnn.CBDescriptor( total_size=block_count * cb_page_size * (block_m * block_k), - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[a_cb_format], ) b_cb_descriptor = ttnn.CBDescriptor( total_size=block_count * cb_page_size * (block_n * block_k), - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[b_cb_format], ) out_cb_descriptor = ttnn.CBDescriptor( total_size=cb_page_size * (block_m * block_n), - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[out_cb_format], ) intermediate_cb_descriptor = ttnn.CBDescriptor( total_size=cb_page_size * (block_m * block_n), - 
node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[intermediate_cb_format], ) in0_sender_semaphore_id = 0 @@ -346,19 +346,17 @@ def test_1d_matmul_metal( -(-num_worker_nodes // num_nodes_x) if num_nodes_x < num_worker_nodes else 1 ) - in0_sender_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)] - in0_receiver_rt_args = [ - [[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes) - ] - in1_writer_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)] - compute_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)] + in0_sender_rt_args = [] + in0_receiver_rt_args = [] + in1_writer_rt_args = [] + compute_rt_args = [] total_receivers = num_worker_nodes - 1 print( f"1D matmul: Single sender at (0,0) multicasts to {total_receivers} receivers, across a grid of {num_x_nodes} x {num_y_nodes} nodes" ) - noc_of_sender = device.worker_node_from_logical_node(ttnn.CoreCoord(0, 0)) + noc_of_sender = device.worker_core_from_logical_core(ttnn.CoreCoord(0, 0)) # Assign work to nodes worker_node_idx = 0 @@ -366,16 +364,17 @@ def test_1d_matmul_metal( for output_idx_x in range(num_x_nodes): if worker_node_idx >= num_worker_nodes: break + core = ttnn.CoreCoord(output_idx_x, output_idx_y) # in0 sender args (only for node (0,0)) # Single sender multicasts to all other nodes in the grid if output_idx_x == 0 and output_idx_y == 0: # NOTE: multicast nocs require perfect rectangular node regions # so when num_worker_nodes % num_nodes_x != 0, the last row of nodes will be multicasted to, but not utilized - mcast_end_node_noc = device.worker_node_from_logical_node( + mcast_end_node_noc = device.worker_core_from_logical_core( ttnn.CoreCoord(num_x_nodes - 1, num_y_nodes - 1) ) - in0_sender_rt_args[output_idx_x][output_idx_y] = [ + sender_args = [ a_tensor.buffer_address(), 0, noc_of_sender.x, @@ -383,39 +382,42 @@ def test_1d_matmul_metal( mcast_end_node_noc.x, mcast_end_node_noc.y, ] + 
in0_sender_rt_args.append((core, sender_args)) print( f"IN0_SENDER - RUNTIME_ARGS for node ({output_idx_x}, {output_idx_y}), worker: {worker_node_idx}" ) print( - f"IN0_SENDER_CORE - RUNTIME_ARGS ({len(in0_sender_rt_args[output_idx_x][output_idx_y])} args): {', '.join(map(str, in0_sender_rt_args[output_idx_x][output_idx_y]))}" + f"IN0_SENDER_CORE - RUNTIME_ARGS ({len(sender_args)} args): {', '.join(map(str, sender_args))}" ) # in0 receiver args (for all nodes except (0,0)) if not (output_idx_x == 0 and output_idx_y == 0): - in0_receiver_rt_args[output_idx_x][output_idx_y] = [ + receiver_args = [ noc_of_sender.x, noc_of_sender.y, ] + in0_receiver_rt_args.append((core, receiver_args)) print( f"IN0_RECEIVER - RUNTIME_ARGS for node ({output_idx_x}, {output_idx_y}), worker: {worker_node_idx}" ) print( - f"IN0_RECEIVER_CORE - RUNTIME_ARGS ({len(in0_receiver_rt_args[output_idx_x][output_idx_y])} args): {', '.join(map(str, in0_receiver_rt_args[output_idx_x][output_idx_y]))}" + f"IN0_RECEIVER_CORE - RUNTIME_ARGS ({len(receiver_args)} args): {', '.join(map(str, receiver_args))}" ) # in1 reader + writer args (all nodes) - in1_writer_rt_args[output_idx_x][output_idx_y] = [ + in1_writer_args = [ b_tensor.buffer_address(), worker_node_idx * n_blocks_per_node * block_n, output_tensor.buffer_address(), worker_node_idx * n_blocks_per_node * block_n, ] + in1_writer_rt_args.append((core, in1_writer_args)) print( f"IN1_SENDER_WRITER - RUNTIME_ARGS for node ({output_idx_x}, {output_idx_y}), worker: {worker_node_idx}" ) print( - f"IN1_SENDER_WRITER_CORE - RUNTIME_ARGS ({len(in1_writer_rt_args[output_idx_x][output_idx_y])} args): {', '.join(map(str, in1_writer_rt_args[output_idx_x][output_idx_y]))}" + f"IN1_SENDER_WRITER_CORE - RUNTIME_ARGS ({len(in1_writer_args)} args): {', '.join(map(str, in1_writer_args))}" ) worker_node_idx += 1 @@ -427,7 +429,7 @@ def test_1d_matmul_metal( in0_sender_kernel_descriptor = ttnn.KernelDescriptor( 
kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/sender_in0_interleaved.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=in0_sender_node, + core_ranges=in0_sender_node, compile_time_args=in0_sender_compile_time_args, runtime_args=in0_sender_rt_args, config=ttnn.ReaderConfigDescriptor(), @@ -436,7 +438,7 @@ def test_1d_matmul_metal( in0_receiver_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/reciever_in0_interleaved.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=in0_receiver_nodes, + core_ranges=in0_receiver_nodes, compile_time_args=in0_receiver_compile_time_args, runtime_args=in0_receiver_rt_args, config=ttnn.ReaderConfigDescriptor(), @@ -445,7 +447,7 @@ def test_1d_matmul_metal( in1_writer_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/reader_in1_writer_out_interleaved.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=in1_writer_compile_time_args, runtime_args=in1_writer_rt_args, config=ttnn.WriterConfigDescriptor(), @@ -454,7 +456,7 @@ def test_1d_matmul_metal( compute_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=compute_compile_time_args, runtime_args=compute_rt_args, config=computeConfig, @@ -465,12 +467,12 @@ def test_1d_matmul_metal( ttnn.SemaphoreDescriptor( id=in0_sender_semaphore_id, initial_value=0, - node_ranges=in0_sender_node, + core_ranges=in0_sender_node, ), ttnn.SemaphoreDescriptor( id=in0_receiver_semaphore_id, initial_value=0, - node_ranges=all_nodes, + core_ranges=all_nodes, ), ] @@ -503,6 +505,6 @@ def test_1d_matmul_metal( torch_output = 
torch.matmul(a_tensor_torch, b_tensor_torch) assert_with_ulp(torch_output, metal_output) - print("test passed.") + print("Test passed!") ttnn.close_device(device) diff --git a/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp b/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp index fd4834d32..ceed82689 100644 --- a/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp +++ b/examples/metal_examples/1d_mcast_matmul/metal/kernels/reuse_compute.cpp @@ -4,10 +4,10 @@ #include -#include "compute_kernel_api/matmul.h" -#include "compute_kernel_api/pack_untilize.h" -#include "compute_kernel_api/tile_move_copy.h" -#include "compute_kernel_api/transpose_wh.h" +#include "api/compute/matmul.h" +#include "api/compute/pack_untilize.h" +#include "api/compute/tile_move_copy.h" +#include "api/compute/transpose_wh.h" #include "internal/mod_div_lib.h" namespace NAMESPACE { diff --git a/examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py b/examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py new file mode 100644 index 000000000..b09efb839 --- /dev/null +++ b/examples/metal_examples/2d_mcast_matmul/metal/2d_mcast_matmul.py @@ -0,0 +1,432 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch +import ttnn +from ttl.utils.block_allocation import get_large_matmul_params +from ttl.utils.correctness import assert_with_ulp + + +@pytest.mark.parametrize("M,K,N", [(3584, 768, 3072)]) +def test_2d_mcast_matmul(M, K, N): + device = ttnn.open_device(device_id=0) + Mt = M // ttnn.TILE_SIZE + Kt = K // ttnn.TILE_SIZE + Nt = N // ttnn.TILE_SIZE + in0_block_w = 2 + + device_grid = device.compute_with_storage_grid_size() + print(f"Device compute_with_storage_grid_size: ({device_grid.x}, {device_grid.y})") + num_nodes_x = device_grid.x + num_nodes_y = device_grid.y + + block_params = get_large_matmul_params( + Mt, Nt, num_nodes_y, num_nodes_x, 
in0_block_w + ) + per_node_M = block_params.block_h + per_node_N = block_params.block_w + out_subblock_h = block_params.subblock_h + out_subblock_w = block_params.subblock_w + assert per_node_M != 0, "get_large_matmul_params was not able to find a solution" + print( + f"per_node_M: {per_node_M}, per_node_N: {per_node_N}, out_subblock_h: {out_subblock_h}, out_subblock_w: {out_subblock_w}" + ) + assert Mt % per_node_M == 0, "per_node_M must divide Mt" + assert Nt % per_node_N == 0, "per_node_N must divide Nt" + assert Kt % in0_block_w == 0, "in0_block_w must divide Kt" + + num_blocks_y = Mt // per_node_M + num_blocks_x = Nt // per_node_N + assert ( + num_blocks_x <= num_nodes_x and num_blocks_y <= num_nodes_y + ), "number of total blocks must be less than or equal to num nodes in each dimension" + assert ( + num_blocks_x >= 2 and num_blocks_y >= 2 + ), "2D mcast requires at least a 2x2 node grid" + + num_active_x = num_blocks_x + num_active_y = num_blocks_y + + all_nodes = ttnn.CoreRangeSet( + [ + ttnn.CoreRange( + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(num_active_x - 1, num_active_y - 1), + ) + ] + ) + left_column = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, num_active_y - 1))] + ) + all_except_left_column = ttnn.CoreRangeSet( + [ + ttnn.CoreRange( + ttnn.CoreCoord(1, 0), + ttnn.CoreCoord(num_active_x - 1, num_active_y - 1), + ) + ] + ) + in0_sender_in1_sender = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(0, 0))] + ) + in0_sender_in1_receiver = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(0, 1), ttnn.CoreCoord(0, num_active_y - 1))] + ) + in0_receiver_in1_sender = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(num_active_x - 1, 0))] + ) + in0_receiver_in1_receiver = ttnn.CoreRangeSet( + [ + ttnn.CoreRange( + ttnn.CoreCoord(1, 1), + ttnn.CoreCoord(num_active_x - 1, num_active_y - 1), + ) + ] + ) + + dram_memory_config = ttnn.DRAM_MEMORY_CONFIG + a_tensor = ttnn.rand( + 
(M, K), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=dram_memory_config, + ) + b_tensor = ttnn.rand( + (K, N), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=dram_memory_config, + ) + output_tensor = ttnn.empty( + (M, N), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=dram_memory_config, + ) + dtype_size = 2 # bfloat16 + cb_page_size = dtype_size * ttnn.TILE_SIZE * ttnn.TILE_SIZE + + a_cb = 0 + b_cb = 1 + out_cb = 16 + intermediate_cb = 24 + + a_cb_format = ttnn.CBFormatDescriptor( + buffer_index=a_cb, + data_format=ttnn.bfloat16, + page_size=cb_page_size, + ) + b_cb_format = ttnn.CBFormatDescriptor( + buffer_index=b_cb, + data_format=ttnn.bfloat16, + page_size=cb_page_size, + ) + out_cb_format = ttnn.CBFormatDescriptor( + buffer_index=out_cb, + data_format=ttnn.bfloat16, + page_size=cb_page_size, + ) + intermediate_cb_format = ttnn.CBFormatDescriptor( + buffer_index=intermediate_cb, + data_format=ttnn.bfloat16, + page_size=cb_page_size, + ) + + in0_block_tiles = per_node_M * in0_block_w + in1_block_tiles = per_node_N * in0_block_w + out_block_tiles = per_node_M * per_node_N + buffer_factor = 2 + a_cb_descriptor = ttnn.CBDescriptor( + total_size=buffer_factor * cb_page_size * in0_block_tiles, + core_ranges=all_nodes, + format_descriptors=[a_cb_format], + ) + b_cb_descriptor = ttnn.CBDescriptor( + total_size=buffer_factor * cb_page_size * in1_block_tiles, + core_ranges=all_nodes, + format_descriptors=[b_cb_format], + ) + out_cb_descriptor = ttnn.CBDescriptor( + total_size=cb_page_size * out_block_tiles, + core_ranges=all_nodes, + format_descriptors=[out_cb_format], + ) + intermediate_cb_descriptor = ttnn.CBDescriptor( + total_size=cb_page_size * out_block_tiles, + core_ranges=all_nodes, + format_descriptors=[intermediate_cb_format], + ) + + # 4 semaphores for 2D mcast handshake + in0_mcast_sender_semaphore_id = 0 + in0_mcast_receiver_semaphore_id = 1 + 
in1_mcast_sender_semaphore_id = 2 + in1_mcast_receiver_semaphore_id = 3 + + semaphore_descriptors = [ + ttnn.SemaphoreDescriptor( + id=in0_mcast_sender_semaphore_id, + initial_value=0, + core_ranges=all_nodes, + ), + ttnn.SemaphoreDescriptor( + id=in0_mcast_receiver_semaphore_id, + initial_value=0, + core_ranges=all_nodes, + ), + ttnn.SemaphoreDescriptor( + id=in1_mcast_sender_semaphore_id, + initial_value=0, + core_ranges=all_nodes, + ), + ttnn.SemaphoreDescriptor( + id=in1_mcast_receiver_semaphore_id, + initial_value=0, + core_ranges=all_nodes, + ), + ] + + # Compute kernel compile time args + in0_num_subblocks = per_node_M // out_subblock_h + in0_block_num_tiles = out_subblock_h * in0_block_w * in0_num_subblocks + in0_subblock_num_tiles = out_subblock_h * in0_block_w + + in1_num_subblocks = per_node_N // out_subblock_w + in1_block_num_tiles_compute = out_subblock_w * in0_block_w * in1_num_subblocks + in1_per_node_w = out_subblock_w * in1_num_subblocks + + num_blocks = Kt // in0_block_w + out_subblock_num_tiles = out_subblock_h * out_subblock_w + + compute_compile_time_args = [ + in0_block_w, + in0_num_subblocks, + in0_block_num_tiles, + in0_subblock_num_tiles, + in1_num_subblocks, + in1_block_num_tiles_compute, + in1_per_node_w, + num_blocks, + out_subblock_h, + out_subblock_w, + out_subblock_num_tiles, + ] + reader_compile_time_args = ttnn.TensorAccessorArgs(a_tensor).get_compile_time_args() + reader_compile_time_args.extend( + ttnn.TensorAccessorArgs(b_tensor).get_compile_time_args() + ) + writer_compile_time_args = ttnn.TensorAccessorArgs( + output_tensor + ).get_compile_time_args() + + reader_rt_args_corner = [] + reader_rt_args_left = [] + reader_rt_args_top = [] + reader_rt_args_interior = [] + writer_rt_args_left = [] + writer_rt_args_rest = [] + + for node_idx_y in range(num_active_y): + for node_idx_x in range(num_active_x): + left_node = ttnn.CoreCoord(0, node_idx_y) + left_node_plus_one = ttnn.CoreCoord(1, node_idx_y) + right_node = 
ttnn.CoreCoord(num_active_x - 1, node_idx_y) + top_node = ttnn.CoreCoord(node_idx_x, 0) + top_node_plus_one = ttnn.CoreCoord(node_idx_x, 1) + bottom_node = ttnn.CoreCoord(node_idx_x, num_active_y - 1) + + left_phys = device.worker_core_from_logical_core(left_node) + left_plus_one_phys = device.worker_core_from_logical_core( + left_node_plus_one + ) + right_phys = device.worker_core_from_logical_core(right_node) + top_phys = device.worker_core_from_logical_core(top_node) + top_plus_one_phys = device.worker_core_from_logical_core(top_node_plus_one) + bottom_phys = device.worker_core_from_logical_core(bottom_node) + + core = ttnn.CoreCoord(node_idx_x, node_idx_y) + mm_reader_args = [ + a_tensor.buffer_address(), + Kt * per_node_M * node_idx_y, # in0 start tile + 1, # in0 stride w + Kt, # in0 stride h + in0_block_w, # in0 next block stride + in0_block_w, # in0 block w + per_node_M, # in0 block h + in0_block_w * per_node_M, # in0 block num tiles + b_tensor.buffer_address(), + per_node_N * node_idx_x, # in1 start tile + 1, # in1 stride w + Nt, # in1 stride h + in0_block_w * Nt, # in1 next block stride + per_node_N, # in1 block w + in0_block_w, # in1 block h + per_node_N * in0_block_w, # in1 block num tiles + Kt // in0_block_w, # num blocks + # in0 mcast args (rightward from left column) + # NOTE: Physical NOC coords may be inverted from logical coords. + # The kernel passes (end, start) to get_noc_multicast_addr to + # produce the correct physical bounding box for the NOC in use. + right_phys.x, # in0_mcast_dest_noc_start + right_phys.y, + left_plus_one_phys.x, # in0_mcast_dest_noc_end + left_plus_one_phys.y, + num_active_x - 1, # in0 mcast num dests + left_phys.x, + left_phys.y, + in0_mcast_sender_semaphore_id, + in0_mcast_receiver_semaphore_id, + # in1 mcast args (downward from top row) + # NOTE: Same start/end convention as in0 above. 
+ bottom_phys.x, # in1_mcast_dest_noc_start + bottom_phys.y, + top_plus_one_phys.x, # in1_mcast_dest_noc_end + top_plus_one_phys.y, + num_active_y - 1, # in1 mcast num dests + top_phys.x, + top_phys.y, + in1_mcast_sender_semaphore_id, + in1_mcast_receiver_semaphore_id, + ] + + writer_args = [ + output_tensor.buffer_address(), + node_idx_x * per_node_N + node_idx_y * per_node_M * Nt, + 1, # stride w + Nt, # stride h + out_subblock_w, # next subblock stride w + out_subblock_h * Nt, # next subblock stride h + out_subblock_w, + out_subblock_h, + out_subblock_w * out_subblock_h, + per_node_N // out_subblock_w, # num subblocks w + per_node_M // out_subblock_h, # num subblocks h + ] + + if node_idx_x == 0 and node_idx_y == 0: + reader_rt_args_corner.append((core, mm_reader_args)) + writer_rt_args_left.append((core, writer_args)) + elif node_idx_x == 0: + reader_rt_args_left.append((core, mm_reader_args)) + writer_rt_args_left.append((core, writer_args)) + elif node_idx_y == 0: + reader_rt_args_top.append((core, mm_reader_args)) + writer_rt_args_rest.append((core, writer_args)) + else: + reader_rt_args_interior.append((core, mm_reader_args)) + writer_rt_args_rest.append((core, writer_args)) + + # Left column (in0 senders): reader on RISCV_1/NOC0, writer on RISCV_0/NOC1 + # Non-left column (in0 receivers): reader on RISCV_1/NOC1, writer on RISCV_0/NOC0 + reader_config_noc0 = ttnn.DataMovementConfigDescriptor( + processor=ttnn.DataMovementProcessor.RISCV_1, + noc=ttnn.NOC.RISCV_0_default, + ) + reader_config_noc1 = ttnn.DataMovementConfigDescriptor( + processor=ttnn.DataMovementProcessor.RISCV_1, + noc=ttnn.NOC.RISCV_1_default, + ) + writer_config_noc0 = ttnn.DataMovementConfigDescriptor( + processor=ttnn.DataMovementProcessor.RISCV_0, + noc=ttnn.NOC.RISCV_0_default, + ) + writer_config_noc1 = ttnn.DataMovementConfigDescriptor( + processor=ttnn.DataMovementProcessor.RISCV_0, + noc=ttnn.NOC.RISCV_1_default, + ) + + computeConfig = ttnn.ComputeConfigDescriptor() + 
computeConfig.math_fidelity = ttnn.MathFidelity.HiFi4 + + reader_corner = ttnn.KernelDescriptor( + kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp", + source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, + core_ranges=in0_sender_in1_sender, + compile_time_args=reader_compile_time_args, + runtime_args=reader_rt_args_corner, + config=reader_config_noc0, + ) + reader_left = ttnn.KernelDescriptor( + kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp", + source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, + core_ranges=in0_sender_in1_receiver, + compile_time_args=reader_compile_time_args, + runtime_args=reader_rt_args_left, + config=reader_config_noc0, + ) + reader_top = ttnn.KernelDescriptor( + kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp", + source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, + core_ranges=in0_receiver_in1_sender, + compile_time_args=reader_compile_time_args, + runtime_args=reader_rt_args_top, + config=reader_config_noc1, + ) + reader_interior = ttnn.KernelDescriptor( + kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp", + source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, + core_ranges=in0_receiver_in1_receiver, + compile_time_args=reader_compile_time_args, + runtime_args=reader_rt_args_interior, + config=reader_config_noc1, + ) + writer_left_col = ttnn.KernelDescriptor( + kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp", + source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, + core_ranges=left_column, + compile_time_args=writer_compile_time_args, + runtime_args=writer_rt_args_left, + config=writer_config_noc1, + ) + writer_rest = ttnn.KernelDescriptor( + 
kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp", + source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, + core_ranges=all_except_left_column, + compile_time_args=writer_compile_time_args, + runtime_args=writer_rt_args_rest, + config=writer_config_noc0, + ) + compute_kernel = ttnn.KernelDescriptor( + kernel_source="examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp", + source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, + core_ranges=all_nodes, + compile_time_args=compute_compile_time_args, + runtime_args=[], + config=computeConfig, + ) + + program_descriptor = ttnn.ProgramDescriptor( + kernels=[ + reader_corner, + reader_left, + reader_top, + reader_interior, + writer_left_col, + writer_rest, + compute_kernel, + ], + semaphores=semaphore_descriptors, + cbs=[ + a_cb_descriptor, + b_cb_descriptor, + out_cb_descriptor, + intermediate_cb_descriptor, + ], + ) + + print("Launching generic_op...") + output = ttnn.generic_op([a_tensor, b_tensor, output_tensor], program_descriptor) + print("Completed generic_op.") + metal_output = ttnn.to_torch(output).to(torch.bfloat16) + + a_tensor_torch = ttnn.to_torch(a_tensor).to(torch.bfloat16) + b_tensor_torch = ttnn.to_torch(b_tensor).to(torch.bfloat16) + torch_output = torch.matmul(a_tensor_torch, b_tensor_torch) + + assert_with_ulp(torch_output, metal_output) + + ttnn.close_device(device) diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp new file mode 100644 index 000000000..a2cb6cc37 --- /dev/null +++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/bmm_large_block_zm.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +// +// Block matrix multiply compute kernel with z-major ordering. +// Adapted from tt-metal programming example (batch loop removed). 
+
+#include <cstdint>
+
+#include "api/compute/matmul.h"
+#include "api/compute/tile_move_copy.h"
+
+namespace NAMESPACE {
+void MAIN {
+  uint32_t in0_block_w = get_compile_time_arg_val(0);
+  uint32_t in0_num_subblocks = get_compile_time_arg_val(1);
+  uint32_t in0_block_num_tiles = get_compile_time_arg_val(2);
+  uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3);
+  uint32_t in1_num_subblocks = get_compile_time_arg_val(4);
+  uint32_t in1_block_num_tiles = get_compile_time_arg_val(5);
+  uint32_t in1_per_core_w = get_compile_time_arg_val(6);
+  uint32_t num_blocks = get_compile_time_arg_val(7);
+  uint32_t out_subblock_h = get_compile_time_arg_val(8);
+  uint32_t out_subblock_w = get_compile_time_arg_val(9);
+  uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10);
+  // Inputs arrive in CBs c_0 (in0) and c_1 (in1); results are packed to c_16.
+  mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16);
+  // With several K blocks, partial sums spill to c_24 between blocks.
+  bool spill = num_blocks > 1;
+  bool enable_reload = false;
+  uint32_t out_num_tiles_to_wait = out_subblock_num_tiles;
+
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    bool last_out = block == (num_blocks - 1);
+    // Wait until this K block of in0 and in1 has been pushed to the CBs.
+    cb_wait_front(tt::CBIndex::c_0, in0_block_num_tiles);
+    cb_wait_front(tt::CBIndex::c_1, in1_block_num_tiles);
+    int in0_index_subblock_offset = 0;
+    for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks;
+         in0_subblock++) {
+      int in1_index_subblock_offset = 0;
+      for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks;
+           in1_subblock++) {
+        acquire_dst();
+        // After the first K block, resume from the partials spilled to c_24.
+        if (enable_reload) {
+          copy_tile_to_dst_init_short(tt::CBIndex::c_24);
+          cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles);
+          for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
+            copy_tile(tt::CBIndex::c_24, i, i);
+          }
+          cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles);
+          mm_init_short(tt::CBIndex::c_0, tt::CBIndex::c_1);
+        }
+        // Accumulate one output subblock over this block's in0_block_w tiles.
+        int dst_index = 0;
+        int in0_index_h_offset = 0;
+        for (uint32_t h = 0; h < out_subblock_h; h++) {
+          for (uint32_t w = 0; w < out_subblock_w; w++) {
+            int in1_index_inner_dim_offset = 0;
+            for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) {
+              int in0_index =
+                  in0_index_subblock_offset + in0_index_h_offset + inner_dim;
+              int in1_index =
+                  in1_index_subblock_offset + in1_index_inner_dim_offset + w;
+              matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, in0_index,
+                           in1_index, dst_index);
+              in1_index_inner_dim_offset += in1_per_core_w;
+            }
+            dst_index++;
+          }
+          in0_index_h_offset += in0_block_w;
+        }
+        // Last K block: emit final tiles to c_16; otherwise spill to c_24.
+        if (last_out) {
+          cb_reserve_back(tt::CBIndex::c_16, out_subblock_num_tiles);
+          for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
+            pack_tile(i, tt::CBIndex::c_16);
+          }
+          cb_push_back(tt::CBIndex::c_16, out_subblock_num_tiles);
+        } else {
+          if (block == 0) { // NOTE(review): reserve only, no push; confirm
+            cb_reserve_back(tt::CBIndex::c_16, out_num_tiles_to_wait);
+            out_num_tiles_to_wait += out_subblock_num_tiles;
+          }
+          cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles);
+          for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
+            pack_tile(i, tt::CBIndex::c_24);
+          }
+          cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles);
+        }
+
+        release_dst();
+        in1_index_subblock_offset += out_subblock_w;
+      }
+      in0_index_subblock_offset += in0_subblock_num_tiles;
+    }
+    // From the second block on, every subblock starts by reloading c_24.
+    if (spill) {
+      enable_reload = true;
+    }
+    // Release this K block's input tiles so the next block can be staged.
+    cb_pop_front(tt::CBIndex::c_0, in0_block_num_tiles);
+    cb_pop_front(tt::CBIndex::c_1, in1_block_num_tiles);
+  }
+}
+} // namespace NAMESPACE
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp
new file mode 100644
index 000000000..59353e110
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_receiver.cpp
@@ -0,0 +1,106 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Interior cores: pure receiver for both in0 (from left column) and in1
+// (from top row).
Signals readiness via semaphore and waits for data.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() {
+  // in0 tensor args (unused but kept for uniform arg layout)
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args (unused but kept for uniform arg layout)
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args: number of K blocks streamed through the CBs
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args (only the sender coordinates and semaphores are used here)
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args (only the sender coordinates and semaphores are used here)
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0;
+  constexpr uint32_t cb_id_in1 = 1;
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: receive via multicast from left column --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    // Clear the local flag first so a stale VALID cannot satisfy the wait.
+    noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID);
+    // Tell the in0 sender that this core has room for the next block.
+    uint64_t in0_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y,
+                     in0_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1);
+    // Wait for the sender's semaphore multicast to set the flag to VALID.
+    noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID);
+    // Publish the received tiles to the compute kernel.
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: receive via multicast from top row --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    // Same handshake as in0, against the in1 sender and semaphores.
+    noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID);
+
+    uint64_t in1_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y,
+                     in1_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1);
+
+    noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp
b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp
new file mode 100644
index 000000000..73ae2fa67
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_receiver_in1_sender.cpp
@@ -0,0 +1,156 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Top row cores (excluding corner): receives in0 via multicast from the left
+// column. Reads in1 from DRAM and multicasts downward along the column.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() {
+  // in0 tensor args (unused on this core but kept for uniform arg layout)
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args: number of K blocks streamed through the CBs
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0;
+  constexpr uint32_t cb_id_in1 = 1;
+
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_in1);
+
+  uint32_t l1_write_addr_in1;
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+  // Preset to VALID: this word is what gets multicast to in1 receivers.
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+  *(in1_mcast_receiver_semaphore_addr_ptr) = VALID;
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_sender_semaphore_addr);
+  // in1's accessor args follow in0's in the compile-time argument list.
+  constexpr auto s0_args = TensorAccessorArgs<0>();
+  constexpr auto s1_args =
+      TensorAccessorArgs<s0_args.next_compile_time_args_offset()>();
+  const auto s1 =
+      TensorAccessor(s1_args, in1_tensor_addr, single_tile_size_bytes);
+
+  uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id;
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: receive via multicast from left column --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    // Clear the local flag first so a stale VALID cannot satisfy the wait.
+    noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID);
+    // Tell the in0 sender that this core has room for the next block.
+    uint64_t in0_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y,
+                     in0_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1);
+    // Wait for the sender's semaphore multicast to set the flag to VALID.
+    noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID);
+
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: read from DRAM and multicast downward --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    l1_write_addr_in1 = get_write_ptr(cb_id_in1);
+
+    uint32_t in1_start_address = l1_write_addr_in1;
+    uint32_t in1_block_size_bytes = 0;
+    // Gather an in1_block_h x in1_block_w tile block into the CB, row by row.
+    uint32_t in1_tensor_row_start_tile_id =
+        in1_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in1_block_h; h++) {
+      uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in1_block_w; w++) {
+        noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr_in1);
+        l1_write_addr_in1 += single_tile_size_bytes;
+        in1_tensor_tile_id += in1_tensor_stride_w;
+        in1_block_size_bytes += single_tile_size_bytes;
+      }
+      in1_tensor_row_start_tile_id += in1_tensor_stride_h;
+    }
+    in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride;
+
+    noc_async_read_barrier();
+    // Wait for every receiver to check in, then re-arm for the next block.
+    noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr,
+                       in1_mcast_num_dests);
+    noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0);
+    // NOTE(review): unlike the in0 senders, start/end are not swapped here.
+    uint64_t in1_multicast_data_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y, in1_start_address);
+    noc_async_write_multicast(in1_start_address, in1_multicast_data_addr,
+                              in1_block_size_bytes, in1_mcast_num_dests);
+
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+    // Multicast the local VALID word to release the waiting receivers.
+    uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y,
+        in1_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr,
+                                in1_mcast_receiver_semaphore_noc_addr,
+                                in1_mcast_num_dests);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp
new file mode 100644
index 000000000..02e045d6d
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_receiver.cpp
@@ -0,0 +1,155 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Left column cores (excluding corner): reads in0 from DRAM and multicasts
+// rightward along the row. Receives in1 via multicast from the top row.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() {
+  // in0 tensor args
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args (unused on this core but kept for uniform arg layout)
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args: number of K blocks streamed through the CBs
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0;
+  constexpr uint32_t cb_id_in1 = 1;
+
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0);
+
+  uint32_t l1_write_addr_in0;
+  // Preset to VALID: this word is what gets multicast to in0 receivers.
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+  *(in0_mcast_receiver_semaphore_addr_ptr) = VALID;
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_sender_semaphore_addr);
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+
+  constexpr auto s0_args = TensorAccessorArgs<0>();
+  const auto s0 =
+      TensorAccessor(s0_args, in0_tensor_addr, single_tile_size_bytes);
+
+  uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id;
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: read from DRAM and multicast rightward --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    l1_write_addr_in0 = get_write_ptr(cb_id_in0);
+
+    uint32_t in0_start_address = l1_write_addr_in0;
+    uint32_t in0_block_size_bytes = 0;
+    // Gather an in0_block_h x in0_block_w tile block into the CB, row by row.
+    uint32_t in0_tensor_row_start_tile_id =
+        in0_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in0_block_h; h++) {
+      uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in0_block_w; w++) {
+        noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr_in0);
+        l1_write_addr_in0 += single_tile_size_bytes;
+        in0_tensor_tile_id += in0_tensor_stride_w;
+        in0_block_size_bytes += single_tile_size_bytes;
+      }
+      in0_tensor_row_start_tile_id += in0_tensor_stride_h;
+    }
+    in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride;
+
+    noc_async_read_barrier();
+    // Wait for every receiver to check in, then re-arm for the next block.
+    noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr,
+                       in0_mcast_num_dests);
+    noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0);
+    // Receivers get the block at the same L1 address as the local copy.
+    uint64_t in0_multicast_data_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_start_address);
+    noc_async_write_multicast(in0_start_address, in0_multicast_data_addr,
+                              in0_block_size_bytes, in0_mcast_num_dests);
+
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+    // Multicast the local VALID word to release the waiting receivers.
+    uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr,
+                                in0_mcast_receiver_semaphore_noc_addr,
+                                in0_mcast_num_dests);
+
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: receive via multicast from top row --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    // Clear the local flag first so a stale VALID cannot satisfy the wait.
+    noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID);
+    // Tell the in1 sender that this core has room for the next block.
+    uint64_t in1_mcast_sender_semaphore_noc_addr =
+        get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y,
+                     in1_mcast_sender_semaphore_addr);
+    noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1);
+    // Wait for the sender's semaphore multicast to set the flag to VALID.
+    noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp
new file mode 100644
index 000000000..2e2f6c68c
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/reader_bmm_tile_layout_in0_sender_in1_sender.cpp
@@ -0,0 +1,200 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Corner core (0,0): reads both in0 and in1 from DRAM, multicasts
+// in0 rightward along the row and in1 downward along the column.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+#include "hostdevcommon/common_values.hpp"
+#include <stdint.h>
+
+void kernel_main() {
+  // in0 tensor args
+  uint32_t in0_tensor_addr = get_arg_val<uint32_t>(0);
+  uint32_t in0_tensor_start_tile_id = get_arg_val<uint32_t>(1);
+  uint32_t in0_tensor_stride_w = get_arg_val<uint32_t>(2);
+  uint32_t in0_tensor_stride_h = get_arg_val<uint32_t>(3);
+  uint32_t in0_tensor_next_block_stride = get_arg_val<uint32_t>(4);
+
+  // in0 block args
+  uint32_t in0_block_w = get_arg_val<uint32_t>(5);
+  uint32_t in0_block_h = get_arg_val<uint32_t>(6);
+  uint32_t in0_block_num_tiles = get_arg_val<uint32_t>(7);
+
+  // in1 tensor args
+  uint32_t in1_tensor_addr = get_arg_val<uint32_t>(8);
+  uint32_t in1_tensor_start_tile_id = get_arg_val<uint32_t>(9);
+  uint32_t in1_tensor_stride_w = get_arg_val<uint32_t>(10);
+  uint32_t in1_tensor_stride_h = get_arg_val<uint32_t>(11);
+  uint32_t in1_tensor_next_block_stride = get_arg_val<uint32_t>(12);
+
+  // in1 block args
+  uint32_t in1_block_w = get_arg_val<uint32_t>(13);
+  uint32_t in1_block_h = get_arg_val<uint32_t>(14);
+  uint32_t in1_block_num_tiles = get_arg_val<uint32_t>(15);
+
+  // in0/in1 common args: number of K blocks streamed through the CBs
+  uint32_t num_blocks = get_arg_val<uint32_t>(16);
+
+  // in0 mcast args
+  uint32_t in0_mcast_dest_noc_start_x = get_arg_val<uint32_t>(17);
+  uint32_t in0_mcast_dest_noc_start_y = get_arg_val<uint32_t>(18);
+  uint32_t in0_mcast_dest_noc_end_x = get_arg_val<uint32_t>(19);
+  uint32_t in0_mcast_dest_noc_end_y = get_arg_val<uint32_t>(20);
+  uint32_t in0_mcast_num_dests = get_arg_val<uint32_t>(21);
+  uint32_t in0_mcast_sender_noc_x = get_arg_val<uint32_t>(22);
+  uint32_t in0_mcast_sender_noc_y = get_arg_val<uint32_t>(23);
+  uint32_t in0_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(24));
+  uint32_t in0_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(25));
+
+  // in1 mcast args
+  uint32_t in1_mcast_dest_noc_start_x = get_arg_val<uint32_t>(26);
+  uint32_t in1_mcast_dest_noc_start_y = get_arg_val<uint32_t>(27);
+  uint32_t in1_mcast_dest_noc_end_x = get_arg_val<uint32_t>(28);
+  uint32_t in1_mcast_dest_noc_end_y = get_arg_val<uint32_t>(29);
+  uint32_t in1_mcast_num_dests = get_arg_val<uint32_t>(30);
+  uint32_t in1_mcast_sender_noc_x = get_arg_val<uint32_t>(31);
+  uint32_t in1_mcast_sender_noc_y = get_arg_val<uint32_t>(32);
+  uint32_t in1_mcast_sender_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(33));
+  uint32_t in1_mcast_receiver_semaphore_addr =
+      get_semaphore(get_arg_val<uint32_t>(34));
+
+  constexpr uint32_t cb_id_in0 = 0;
+  constexpr uint32_t cb_id_in1 = 1;
+
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0);
+
+  uint32_t l1_write_addr_in0;
+  uint32_t l1_write_addr_in1;
+  // Both receiver flags are preset to VALID: they are the multicast sources.
+  volatile tt_l1_ptr uint32_t *in0_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_receiver_semaphore_addr);
+  *(in0_mcast_receiver_semaphore_addr_ptr) = VALID;
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_receiver_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_receiver_semaphore_addr);
+  *(in1_mcast_receiver_semaphore_addr_ptr) = VALID;
+
+  volatile tt_l1_ptr uint32_t *in0_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in0_mcast_sender_semaphore_addr);
+
+  volatile tt_l1_ptr uint32_t *in1_mcast_sender_semaphore_addr_ptr =
+      reinterpret_cast<volatile tt_l1_ptr uint32_t *>(
+          in1_mcast_sender_semaphore_addr);
+  // in1's accessor args follow in0's in the compile-time argument list.
+  constexpr auto s0_args = TensorAccessorArgs<0>();
+  const auto s0 =
+      TensorAccessor(s0_args, in0_tensor_addr, single_tile_size_bytes);
+  constexpr auto s1_args =
+      TensorAccessorArgs<s0_args.next_compile_time_args_offset()>();
+  const auto s1 =
+      TensorAccessor(s1_args, in1_tensor_addr, single_tile_size_bytes);
+
+  uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id;
+  uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id;
+  for (uint32_t block = 0; block < num_blocks; block++) {
+    // -- in0: read from DRAM and multicast rightward --
+    cb_reserve_back(cb_id_in0, in0_block_num_tiles);
+    l1_write_addr_in0 = get_write_ptr(cb_id_in0);
+
+    uint32_t in0_start_address = l1_write_addr_in0;
+    uint32_t in0_block_size_bytes = 0;
+    // Gather an in0_block_h x in0_block_w tile block into the CB, row by row.
+    uint32_t in0_tensor_row_start_tile_id =
+        in0_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in0_block_h; h++) {
+      uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in0_block_w; w++) {
+        noc_async_read_tile(in0_tensor_tile_id, s0, l1_write_addr_in0);
+        l1_write_addr_in0 += single_tile_size_bytes;
+        in0_tensor_tile_id += in0_tensor_stride_w;
+        in0_block_size_bytes += single_tile_size_bytes;
+      }
+      in0_tensor_row_start_tile_id += in0_tensor_stride_h;
+    }
+    in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride;
+
+    noc_async_read_barrier();
+    // Wait for every receiver to check in, then re-arm for the next block.
+    noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr,
+                       in0_mcast_num_dests);
+    noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0);
+    // Receivers get the block at the same L1 address as the local copy.
+    uint64_t in0_multicast_data_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_start_address);
+    noc_async_write_multicast(in0_start_address, in0_multicast_data_addr,
+                              in0_block_size_bytes, in0_mcast_num_dests);
+
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+    // Multicast the local VALID word to release the waiting receivers.
+    uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in0_mcast_dest_noc_end_x, in0_mcast_dest_noc_end_y,
+        in0_mcast_dest_noc_start_x, in0_mcast_dest_noc_start_y,
+        in0_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr,
+                                in0_mcast_receiver_semaphore_noc_addr,
+                                in0_mcast_num_dests);
+
+    cb_push_back(cb_id_in0, in0_block_num_tiles);
+
+    // -- in1: read from DRAM and multicast downward --
+    cb_reserve_back(cb_id_in1, in1_block_num_tiles);
+    l1_write_addr_in1 = get_write_ptr(cb_id_in1);
+
+    uint32_t in1_start_address = l1_write_addr_in1;
+    uint32_t in1_block_size_bytes = 0;
+    // Gather an in1_block_h x in1_block_w tile block into the CB, row by row.
+    uint32_t in1_tensor_row_start_tile_id =
+        in1_tensor_current_block_start_tile_id;
+    for (uint32_t h = 0; h < in1_block_h; h++) {
+      uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id;
+      for (uint32_t w = 0; w < in1_block_w; w++) {
+        noc_async_read_tile(in1_tensor_tile_id, s1, l1_write_addr_in1);
+        l1_write_addr_in1 += single_tile_size_bytes;
+        in1_tensor_tile_id += in1_tensor_stride_w;
+        in1_block_size_bytes += single_tile_size_bytes;
+      }
+      in1_tensor_row_start_tile_id += in1_tensor_stride_h;
+    }
+    in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride;
+
+    noc_async_read_barrier();
+    // Same sender handshake as in0, against the in1 semaphores.
+    noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr,
+                       in1_mcast_num_dests);
+    noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0);
+
+    uint64_t in1_multicast_data_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y,
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_start_address);
+    noc_async_write_multicast(in1_start_address, in1_multicast_data_addr,
+                              in1_block_size_bytes, in1_mcast_num_dests);
+
+#ifdef ARCH_BLACKHOLE
+    noc_async_writes_flushed();
+#endif
+
+    uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr(
+        in1_mcast_dest_noc_end_x, in1_mcast_dest_noc_end_y,
+        in1_mcast_dest_noc_start_x, in1_mcast_dest_noc_start_y,
+        in1_mcast_receiver_semaphore_addr);
+    noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr,
+                                in1_mcast_receiver_semaphore_noc_addr,
+                                in1_mcast_num_dests);
+
+    cb_push_back(cb_id_in1, in1_block_num_tiles);
+  }
+}
diff --git a/examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp b/examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp
new file mode 100644
index 000000000..d7a5565bc
--- /dev/null
+++ b/examples/metal_examples/2d_mcast_matmul/metal/kernels/writer_bmm_tile_layout.cpp
@@ -0,0 +1,57 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Writer kernel for block matmul output.
+// Adapted from tt-metal programming example (batch loop removed).
+
+#include "api/dataflow/dataflow_api.h"
+// Writer: drains the output CB one subblock at a time into the output tensor.
+void kernel_main() {
+  uint32_t out_tensor_addr = get_arg_val<uint32_t>(0);  // handed to TensorAccessor
+  uint32_t out_tensor_start_tile_id = get_arg_val<uint32_t>(1);  // first tile id written
+  uint32_t out_tensor_stride_w = get_arg_val<uint32_t>(2);  // tile-id step per tile in a row
+  uint32_t out_tensor_stride_h = get_arg_val<uint32_t>(3);  // tile-id step per subblock row
+  uint32_t out_tensor_next_subblock_stride_w = get_arg_val<uint32_t>(4);  // step to next subblock (w)
+  uint32_t out_tensor_next_subblock_stride_h = get_arg_val<uint32_t>(5);  // step to next subblock row
+  // Subblock geometry (runtime args 6-10).
+  uint32_t out_subblock_w = get_arg_val<uint32_t>(6);
+  uint32_t out_subblock_h = get_arg_val<uint32_t>(7);
+  uint32_t out_subblock_tile_count = get_arg_val<uint32_t>(8);  // tiles waited on / popped per subblock
+  uint32_t out_num_subblocks_w = get_arg_val<uint32_t>(9);
+  uint32_t out_num_subblocks_h = get_arg_val<uint32_t>(10);
+
+  constexpr uint32_t cb_id_out0 = 16;  // circular buffer this kernel drains
+
+  const uint32_t single_tile_size_bytes = get_tile_size(cb_id_out0);  // also the L1 read-pointer step
+  // Accessor for the output tensor; noc_async_write_tile addresses it by tile id.
+  constexpr auto s_args = TensorAccessorArgs<0>();
+  const auto s =
+      TensorAccessor(s_args, out_tensor_addr, single_tile_size_bytes);
+
+  uint32_t out_tensor_sbh_start_tile_id = out_tensor_start_tile_id;
+  for (uint32_t sbh = 0; sbh < out_num_subblocks_h; sbh++) {
+    uint32_t out_tensor_sbw_start_tile_id = out_tensor_sbh_start_tile_id;
+    for (uint32_t sbw = 0; sbw < out_num_subblocks_w; sbw++) {
+      uint32_t out_tensor_sb_row_start_tile_id = out_tensor_sbw_start_tile_id;
+      // Wait until a whole subblock is available at the front of the CB.
+      cb_wait_front(cb_id_out0, out_subblock_tile_count);
+      uint32_t l1_read_addr = get_read_ptr(cb_id_out0);
+
+      for (uint32_t h = 0; h < out_subblock_h; h++) {
+        uint32_t out_tensor_tile_id = out_tensor_sb_row_start_tile_id;
+        for (uint32_t w = 0; w < out_subblock_w; w++) {
+          noc_async_write_tile(out_tensor_tile_id, s, l1_read_addr);
+          l1_read_addr += single_tile_size_bytes;  // tiles are read back-to-back from L1
+          out_tensor_tile_id += out_tensor_stride_w;
+        }
+        out_tensor_sb_row_start_tile_id += out_tensor_stride_h;
+      }
+      // Let the issued writes complete before the subblock is popped from the CB.
+      noc_async_write_barrier();
+      cb_pop_front(cb_id_out0, out_subblock_tile_count);
+      out_tensor_sbw_start_tile_id += out_tensor_next_subblock_stride_w;
+    }
+    out_tensor_sbh_start_tile_id += out_tensor_next_subblock_stride_h;
+  }
+}
diff --git
a/examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py b/examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py new file mode 100644 index 000000000..1e67ff5d6 --- /dev/null +++ b/examples/metal_examples/2d_mcast_matmul/ttlang/2d_mcast_matmul.py @@ -0,0 +1,182 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+import ttnn
+
+import ttl
+from utils.block_allocation import get_large_matmul_params
+from utils.correctness import assert_with_ulp
+
+
+@ttl.operation(grid=("auto"))  # ("auto") is the plain string "auto"; the parentheses do not make a tuple
+def tt_lang_2d_mcast_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor):
+    assert a.shape[1] == b.shape[0], "Incompatible matrix shapes for multiplication."
+    assert a.shape[0] == out.shape[0], "Output matrix has incorrect number of rows."
+    assert b.shape[1] == out.shape[1], "Output matrix has incorrect number of columns."
+    M = a.shape[0]
+    N = b.shape[1]
+    K = a.shape[1]
+    Mt = M // ttnn.TILE_SIZE  # Mt/Kt/Nt: matrix dimensions counted in tiles
+    Kt = K // ttnn.TILE_SIZE
+    Nt = N // ttnn.TILE_SIZE
+
+    K_block_size = 2  # K tiles consumed per reader/compute step
+
+    num_nodes_x, num_nodes_y = ttl.grid_size(dims=2)
+    block_params = get_large_matmul_params(
+        Mt, Nt, num_nodes_y, num_nodes_x, K_block_size
+    )
+    per_node_M = block_params.block_h  # output block height per node, in tiles
+    per_node_N = block_params.block_w  # output block width per node, in tiles
+    assert per_node_M != 0, "get_large_matmul_params was not able to find a solution"
+    print(f"per_node_M: {per_node_M}, per_node_N: {per_node_N}")
+    assert Mt % per_node_M == 0, "per_node_M must divide Mt"
+    assert Nt % per_node_N == 0, "per_node_N must divide Nt"
+    assert Kt % K_block_size == 0, "K_block_size must divide Kt"
+    num_blocks_y = Mt // per_node_M
+    num_blocks_x = Nt // per_node_N
+    assert (
+        num_blocks_x <= num_nodes_x and num_blocks_y <= num_nodes_y
+    ), "number of total blocks must be less than or equal to num nodes"
+    assert (
+        num_blocks_x >= 2 and num_blocks_y >= 2
+    ), "2D mcast requires at least a 2x2 active node grid"
+
+    num_active_x = num_blocks_x  # one node per output block along each axis
+    num_active_y = num_blocks_y
+    num_blocks_k = Kt // K_block_size  # accumulation steps over K
+
+    block_count = 2  # a/b dataflow buffers hold two blocks each
+    a_dfb = ttl.make_dataflow_buffer_like(
+        a, shape=(per_node_M, K_block_size), block_count=block_count
+    )
+    b_dfb = ttl.make_dataflow_buffer_like(
+        b, shape=(K_block_size, per_node_N), block_count=block_count
+    )
+    # non buffered output, matching metal implementation
+    out_dfb = ttl.make_dataflow_buffer_like(
+        out, shape=(per_node_M, per_node_N), block_count=1
+    )
+
+    # A multicast: left column (x=0) reads from DRAM and multicasts rightward along each row
+    a_pipes = [
+        ttl.Pipe((0, y), (slice(1, num_active_x), y)) for y in range(num_active_y)
+    ]
+    a_mcast_net = ttl.PipeNet(a_pipes)
+
+    # B multicast: top row (y=0) reads from DRAM and multicasts downward along each column
+    b_pipes = [
+        ttl.Pipe((x, 0), (x, slice(1, num_active_y))) for x in range(num_active_x)
+    ]
+    b_mcast_net = ttl.PipeNet(b_pipes)
+
+    @ttl.compute()
+    def mm_compute():
+        node_x, node_y = ttl.node(dims=2)
+        out_row = per_node_M * node_y  # first output tile row owned by this node
+        out_col = per_node_N * node_x  # first output tile column owned by this node
+        if (out_row < Mt) and (out_col < Nt):  # nodes past the active block grid do nothing
+            with out_dfb.reserve() as out_blk:
+                acc = ttl.math.fill(out_blk, 0)  # presumably a zeroed accumulator shaped like out_blk - confirm
+                for _ in range(num_blocks_k):
+                    with (
+                        a_dfb.wait() as a_blk,
+                        b_dfb.wait() as b_blk,
+                    ):
+                        acc += a_blk @ b_blk  # add this K block's partial product
+                out_blk.store(acc)
+
+    @ttl.datamovement()
+    def mm_reader():
+        node_x, node_y = ttl.node(dims=2)
+        out_row = per_node_M * node_y
+        out_col = per_node_N * node_x
+        if (out_row < Mt) and (out_col < Nt):  # same activity guard as mm_compute
+            for block_k in range(num_blocks_k):
+                k = block_k * K_block_size  # first K tile of this block
+
+                # A: left column reads from DRAM and multicasts, other columns receive
+                with a_dfb.reserve() as a_blk:
+
+                    def a_pipe_src(pipe):
+                        in_rd = ttl.copy(
+                            a[
+                                out_row : (out_row + per_node_M),
+                                k : (k + K_block_size),
+                            ],
+                            a_blk,
+                        )
+                        in_rd.wait()  # the read finishes before the block is forwarded
+                        mcast_wr = ttl.copy(a_blk, pipe)
+                        mcast_wr.wait()
+
+                    def a_pipe_dst(pipe):
+                        mcast_rd = ttl.copy(pipe, a_blk)
+                        mcast_rd.wait()
+
+                    a_mcast_net.if_src(a_pipe_src)
+                    a_mcast_net.if_dst(a_pipe_dst)
+
+                # B: top row reads from DRAM and multicasts, other rows receive
+                with b_dfb.reserve() as b_blk:
+
+                    def b_pipe_src(pipe):
+                        in_rd = ttl.copy(
+                            b[
+                                k : (k + K_block_size),
+                                out_col : (out_col + per_node_N),
+                            ],
+                            b_blk,
+                        )
+                        in_rd.wait()  # the read finishes before the block is forwarded
+                        mcast_wr = ttl.copy(b_blk, pipe)
+                        mcast_wr.wait()
+
+                    def b_pipe_dst(pipe):
+                        mcast_rd = ttl.copy(pipe, b_blk)
+                        mcast_rd.wait()
+
+                    b_mcast_net.if_src(b_pipe_src)
+                    b_mcast_net.if_dst(b_pipe_dst)
+
+    @ttl.datamovement()
+    def mm_writer():
+        node_x, node_y = ttl.node(dims=2)
+        out_row = per_node_M * node_y
+        out_col = per_node_N * node_x
+        if (out_row < Mt) and (out_col < Nt):  # same activity guard as mm_compute
+            with out_dfb.wait() as out_blk:
+                out_wr = ttl.copy(
+                    out_blk,
+                    out[
+                        out_row : (out_row + per_node_M),
+                        out_col : (out_col + per_node_N),
+                    ],
+                )
+                out_wr.wait()
+
+
+@pytest.mark.parametrize("M,K,N", [(3584, 768, 3072)])
+def test_2d_mcast_matmul_tt_lang(M, K, N):
+    """Test 2D multicast matmul operation."""
+    device = ttnn.open_device(device_id=0)
+    a = ttnn.rand((M, K), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)  # NOTE(review): no device= passed - confirm placement
+    b = ttnn.rand((K, N), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)
+    c = ttnn.empty((M, N), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT)
+
+    tt_lang_2d_mcast_matmul(a, b, c)  # c is passed as the `out` tensor and read back below
+
+    golden = torch.matmul(
+        ttnn.to_torch(a).to(torch.bfloat16), ttnn.to_torch(b).to(torch.bfloat16)
+    )
+    result = ttnn.to_torch(c).to(torch.bfloat16)
+    assert_with_ulp(golden, result)
+    print("Test passed!")
+
+    ttnn.close_device(device)  # NOTE(review): not reached if an assertion above raises; consider try/finally
+
+
+if __name__ == "__main__":
+    test_2d_mcast_matmul_tt_lang(3584, 768, 3072)
diff --git a/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp b/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp index 3dc7cdd89..543638ef3 100644 --- a/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp +++ b/examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "compute_kernel_api/matmul.h" -#include
"compute_kernel_api/tile_move_copy.h" +#include "api/compute/matmul.h" +#include "api/compute/tile_move_copy.h" #include "hostdevcommon/kernel_structs.h" #include diff --git a/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py b/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py index 1cb65c431..3caea924b 100644 --- a/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py +++ b/examples/metal_examples/multinode_matmul/metal/multinode_matmul.py @@ -28,8 +28,8 @@ def test_multinode_matmul(M, K, N): device_node_size = device.compute_with_storage_grid_size() upper_bound_node = ttnn.CoreCoord(device_node_size.x - 1, device_node_size.y - 1) - device_node_grid = ttnn.NodeRangeSet( - [ttnn.NodeRange(ttnn.CoreCoord(0, 0), upper_bound_node)] + device_node_grid = ttnn.CoreRangeSet( + [ttnn.CoreRange(ttnn.CoreCoord(0, 0), upper_bound_node)] ) print( f"node_grid: {device_node_grid}, num_output_tiles_total: {num_output_tiles_total}" @@ -92,17 +92,17 @@ def test_multinode_matmul(M, K, N): a_cb_descriptor = ttnn.CBDescriptor( total_size=cb_total_size, - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[a_cb_format], ) b_cb_descriptor = ttnn.CBDescriptor( total_size=cb_total_size, - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[b_cb_format], ) out_cb_descriptor = ttnn.CBDescriptor( total_size=cb_total_size, - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[out_cb_format], ) @@ -121,9 +121,9 @@ def test_multinode_matmul(M, K, N): # as the larger one to enable indexing in num_x_nodes = upper_bound_node.x + 1 num_y_nodes = upper_bound_node.y + 1 - reader_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)] - writer_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)] - compute_rt_args = [[[] for _ in range(num_y_nodes)] for _ in range(num_x_nodes)] + reader_rt_args = [] + writer_rt_args = [] + compute_rt_args = [] current_tile = 0 for 
node_range in node_group_1.ranges(): for x in range(node_range.start.x, node_range.end.x + 1): @@ -131,21 +131,32 @@ def test_multinode_matmul(M, K, N): print( f"Assigning node ({x},{y}) tile {current_tile} work_per_node1 {work_per_node1}" ) - reader_rt_args[x][y] = [ - a_tensor.buffer_address(), - b_tensor.buffer_address(), - Mt, - Kt, - Nt, - current_tile, - work_per_node1, - ] - writer_rt_args[x][y] = [ - output_tensor.buffer_address(), - work_per_node1, - current_tile, - ] - compute_rt_args[x][y] = [work_per_node1, Kt] + core = ttnn.CoreCoord(x, y) + reader_rt_args.append( + ( + core, + [ + a_tensor.buffer_address(), + b_tensor.buffer_address(), + Mt, + Kt, + Nt, + current_tile, + work_per_node1, + ], + ) + ) + writer_rt_args.append( + ( + core, + [ + output_tensor.buffer_address(), + work_per_node1, + current_tile, + ], + ) + ) + compute_rt_args.append((core, [work_per_node1, Kt])) current_tile += work_per_node1 for node_range in node_group_2.ranges(): @@ -154,21 +165,32 @@ def test_multinode_matmul(M, K, N): print( f"Assigning node ({x},{y}) tile {current_tile} work_per_node2 {work_per_node2}" ) - reader_rt_args[x][y] = [ - a_tensor.buffer_address(), - b_tensor.buffer_address(), - Mt, - Kt, - Nt, - current_tile, - work_per_node2, - ] - writer_rt_args[x][y] = [ - output_tensor.buffer_address(), - work_per_node2, - current_tile, - ] - compute_rt_args[x][y] = [work_per_node2, Kt] + core = ttnn.CoreCoord(x, y) + reader_rt_args.append( + ( + core, + [ + a_tensor.buffer_address(), + b_tensor.buffer_address(), + Mt, + Kt, + Nt, + current_tile, + work_per_node2, + ], + ) + ) + writer_rt_args.append( + ( + core, + [ + output_tensor.buffer_address(), + work_per_node2, + current_tile, + ], + ) + ) + compute_rt_args.append((core, [work_per_node2, Kt])) current_tile += work_per_node2 # Compute config init can't handle options, set here @@ -180,7 +202,7 @@ def test_multinode_matmul(M, K, N): reader_kernel_descriptor = ttnn.KernelDescriptor( 
kernel_source="examples/metal_examples/multinode_matmul/metal/kernels/mm_reader.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=reader_compile_time_args, runtime_args=reader_rt_args, config=ttnn.ReaderConfigDescriptor(), @@ -188,7 +210,7 @@ def test_multinode_matmul(M, K, N): writer_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/multinode_matmul/metal/kernels/mm_writer.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=writer_compile_time_args, runtime_args=writer_rt_args, config=ttnn.WriterConfigDescriptor(), @@ -196,7 +218,7 @@ def test_multinode_matmul(M, K, N): compute_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/multinode_matmul/metal/kernels/mm_compute.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=[], runtime_args=compute_rt_args, config=computeConfig, diff --git a/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py b/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py index 5da5e1aba..6053d8312 100644 --- a/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py +++ b/examples/metal_examples/multinode_matmul/ttlang/multinode_matmul.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC # # SPDX-License-Identifier: Apache-2.0 -# up to tt-lang spec, not intended to compile or run currently import pytest import torch @@ -9,21 +8,10 @@ import ttl from utils.correctness import assert_with_ulp -from utils.block_allocation import split_work_to_nodes +from utils.block_allocation import get_number_of_nodes_from_ranges, split_work_to_nodes -def get_number_of_nodes(grid_range): - total_nodes = 0 - if len(grid_range) != 0: - start = grid_range[0] - end = grid_range[1] - x_range = end[0] - start[0] 
+ 1 - y_range = end[1] - start[1] + 1 - total_nodes += x_range * y_range - return total_nodes - - -@ttl.operation(grid=(13, 10)) +@ttl.operation(grid=("auto")) def tt_lang_multinode_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor): assert a.shape[1] == b.shape[0], "Incompatible matrix shapes for multiplication." assert a.shape[0] == out.shape[0], "Output matrix has incorrect number of rows." @@ -44,15 +32,15 @@ def tt_lang_multinode_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor): print(f"num_output_tiles_total: {num_output_tiles_total}") all_nodes, node_group_1, node_group_2, work_per_node1, work_per_node2 = ( split_work_to_nodes( - ttl.grid_size(dims=2), num_output_tiles_total, row_wise=True + (ttl.grid_size(dims=1),), num_output_tiles_total, row_wise=True ) ) print( f"all_nodes: {all_nodes}, node_group_1: {node_group_1}, node_group_2: {node_group_2}, work_per_node1: {work_per_node1}, work_per_node2: {work_per_node2}" ) - num_nodes_group_1 = get_number_of_nodes(node_group_1) - num_nodes_group_2 = get_number_of_nodes(node_group_2) + num_nodes_group_1 = get_number_of_nodes_from_ranges(node_group_1) + num_nodes_group_2 = get_number_of_nodes_from_ranges(node_group_2) def get_tiles_per_node(node_id): if node_id < num_nodes_group_1: @@ -112,7 +100,7 @@ def mm_writer(): out_wr.wait() -@pytest.mark.parametrize("M,K,N", [(256, 256, 256), (512, 512, 512)]) +@pytest.mark.parametrize("M,K,N", [(640, 640, 640)]) def test_multinode_matmul_tt_lang(M, K, N): """Test multinode matmul operation.""" device = ttnn.open_device(device_id=0) @@ -127,10 +115,12 @@ def test_multinode_matmul_tt_lang(M, K, N): ) result = ttnn.to_torch(c).to(torch.bfloat16) assert_with_ulp(golden, result) + print("Test passed!") ttnn.close_device(device) if __name__ == "__main__": - # TODO: This won't work with 256, 256, 256 + test_multinode_matmul_tt_lang(256, 256, 256) + test_multinode_matmul_tt_lang(512, 512, 512) test_multinode_matmul_tt_lang(640, 640, 640) diff --git 
a/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp b/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp index 5d8f71592..eb7518b7e 100644 --- a/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp +++ b/examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp @@ -4,8 +4,8 @@ #include -#include "compute_kernel_api/matmul.h" -#include "compute_kernel_api/tile_move_copy.h" +#include "api/compute/matmul.h" +#include "api/compute/tile_move_copy.h" namespace NAMESPACE { void MAIN { diff --git a/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py b/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py index b926571d8..a65636ded 100644 --- a/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py +++ b/examples/metal_examples/multinode_reuse_matmul/metal/multinode_reuse_matmul.py @@ -44,9 +44,9 @@ def test_metal_matmul(M, K, N): assert ( num_blocks_x <= num_nodes_x and num_blocks_y <= num_nodes_y ), "number of total blocks must be less than or equal to num nodes in each dimension" - all_nodes = ttnn.NodeRangeSet( + all_nodes = ttnn.CoreRangeSet( [ - ttnn.NodeRange( + ttnn.CoreRange( ttnn.CoreCoord(0, 0), ttnn.CoreCoord(num_blocks_x - 1, num_blocks_y - 1) ) ] @@ -105,24 +105,24 @@ def test_metal_matmul(M, K, N): block_count = 2 a_cb_descriptor = ttnn.CBDescriptor( total_size=block_count * cb_page_size * (per_node_M * K_block_size), - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[a_cb_format], ) b_cb_descriptor = ttnn.CBDescriptor( total_size=block_count * cb_page_size * (per_node_N * K_block_size), - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[b_cb_format], ) # example has output cb not double buffered out_cb_descriptor = ttnn.CBDescriptor( total_size=cb_page_size * (per_node_M * per_node_N), - node_ranges=all_nodes, + 
core_ranges=all_nodes, format_descriptors=[out_cb_format], ) # needs to be generated by compiler as tt-lang does not have a notion of the dst register, which is the only thing that this cb interacts with intermediate_cb_descriptor = ttnn.CBDescriptor( total_size=cb_page_size * (per_node_M * per_node_N), - node_ranges=all_nodes, + core_ranges=all_nodes, format_descriptors=[intermediate_cb_format], ) @@ -164,9 +164,9 @@ def test_metal_matmul(M, K, N): output_tensor ).get_compile_time_args() - reader_rt_args = [[[] for _ in range(num_nodes_y)] for _ in range(num_nodes_x)] - writer_rt_args = [[[] for _ in range(num_nodes_y)] for _ in range(num_nodes_x)] - compute_rt_args = [[[] for _ in range(num_nodes_y)] for _ in range(num_nodes_x)] + reader_rt_args = [] + writer_rt_args = [] + compute_rt_args = [] current_blk = 0 print( f"num_blocks_x: {num_blocks_x}, num_blocks_y: {num_blocks_y}, output tiles is {Mt}x{Nt}" @@ -180,39 +180,51 @@ def test_metal_matmul(M, K, N): for output_idx_x in range(num_blocks_x): node_x = current_blk % num_nodes_x node_y = current_blk // num_nodes_x - reader_rt_args[node_x][node_y] = [ - a_tensor.buffer_address(), # a_tensor_addr - Kt * per_node_M * output_idx_y, # a_tensor_start_tile_id - 1, # a_tensor_stride_w - Kt, # a_tensor_stride_h - K_block_size, # a_tensor_next_block_stride - K_block_size, # K_block_size - per_node_M, # a_block_h - K_block_size * per_node_M, # a_block_num_tiles - b_tensor.buffer_address(), # b_tensor_addr - per_node_N * output_idx_x, # b_tensor_start_tile_id - 1, # b_tensor_stride_w - Nt, # b_tensor_stride_h - K_block_size * Nt, # b_tensor_next_block_stride - per_node_N, # b_block_w - K_block_size, # b_block_h - per_node_N * K_block_size, # b_block_num_tiles - Kt // K_block_size, # num_blocks - ] - writer_rt_args[node_x][node_y] = [ - output_tensor.buffer_address(), # out_buffer_addr - (output_idx_x * per_node_N) - + (output_idx_y * per_node_M * Nt), # out_tensor_start_tile_id - 1, # out_tensor_stride_w - Nt, # 
out_tensor_stride_h - out_subblock_w, # out_tensor_next_subblock_stride_w - out_subblock_h * Nt, # out_tensor_next_subblock_stride_h - out_subblock_w, # out_subblock_w - out_subblock_h, # out_subblock_h - out_subblock_w * out_subblock_h, # out_subblocks_w * out_subblocks_h - per_node_N // out_subblock_w, # out_num_subblocks_w - per_node_M // out_subblock_h, # out_num_subblocks_h - ] + core = ttnn.CoreCoord(node_x, node_y) + reader_rt_args.append( + ( + core, + [ + a_tensor.buffer_address(), # a_tensor_addr + Kt * per_node_M * output_idx_y, # a_tensor_start_tile_id + 1, # a_tensor_stride_w + Kt, # a_tensor_stride_h + K_block_size, # a_tensor_next_block_stride + K_block_size, # K_block_size + per_node_M, # a_block_h + K_block_size * per_node_M, # a_block_num_tiles + b_tensor.buffer_address(), # b_tensor_addr + per_node_N * output_idx_x, # b_tensor_start_tile_id + 1, # b_tensor_stride_w + Nt, # b_tensor_stride_h + K_block_size * Nt, # b_tensor_next_block_stride + per_node_N, # b_block_w + K_block_size, # b_block_h + per_node_N * K_block_size, # b_block_num_tiles + Kt // K_block_size, # num_blocks + ], + ) + ) + writer_rt_args.append( + ( + core, + [ + output_tensor.buffer_address(), # out_buffer_addr + (output_idx_x * per_node_N) + + (output_idx_y * per_node_M * Nt), # out_tensor_start_tile_id + 1, # out_tensor_stride_w + Nt, # out_tensor_stride_h + out_subblock_w, # out_tensor_next_subblock_stride_w + out_subblock_h * Nt, # out_tensor_next_subblock_stride_h + out_subblock_w, # out_subblock_w + out_subblock_h, # out_subblock_h + out_subblock_w + * out_subblock_h, # out_subblocks_w * out_subblocks_h + per_node_N // out_subblock_w, # out_num_subblocks_w + per_node_M // out_subblock_h, # out_num_subblocks_h + ], + ) + ) print( f"node {node_x},{node_y} assigned start out block slice [{output_idx_x * per_node_N}:{(output_idx_x + 1) * per_node_N}]x[{output_idx_y * per_node_M}:{(output_idx_y + 1) * per_node_M}]" ) @@ -224,7 +236,7 @@ def test_metal_matmul(M, K, N): 
reader_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/multinode_reuse_matmul/metal/kernels/reader_bmm_tile_layout.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=reader_compile_time_args, runtime_args=reader_rt_args, config=ttnn.ReaderConfigDescriptor(), @@ -232,7 +244,7 @@ def test_metal_matmul(M, K, N): writer_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/multinode_reuse_matmul/metal/kernels/writer_bmm_tile_layout.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=writer_compile_time_args, runtime_args=writer_rt_args, config=ttnn.WriterConfigDescriptor(), @@ -240,7 +252,7 @@ def test_metal_matmul(M, K, N): compute_kernel_descriptor = ttnn.KernelDescriptor( kernel_source="examples/metal_examples/multinode_reuse_matmul/metal/kernels/bmm_large_block_zm.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=all_nodes, + core_ranges=all_nodes, compile_time_args=compute_compile_time_args, runtime_args=compute_rt_args, config=computeConfig, diff --git a/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py b/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py index 3e51ed98d..60f230c18 100644 --- a/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py +++ b/examples/metal_examples/multinode_reuse_matmul/ttlang/multinode_reuse_matmul.py @@ -10,7 +10,7 @@ from utils.correctness import assert_with_ulp -@ttl.operation(grid=(13, 10)) +@ttl.operation(grid=("auto")) def tt_lang_multinode_reuse_matmul(a: ttnn.Tensor, b: ttnn.Tensor, out: ttnn.Tensor): assert a.shape[1] == b.shape[0], "Incompatible matrix shapes for multiplication." assert a.shape[0] == out.shape[0], "Output matrix has incorrect number of rows." 
@@ -123,6 +123,7 @@ def test_multinode_reuse_matmul_tt_lang(M, K, N): ) result = ttnn.to_torch(c).to(torch.bfloat16) assert_with_ulp(golden, result) + print("Test passed!") ttnn.close_device(device) diff --git a/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp b/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp index 985b074f2..6d5b82501 100644 --- a/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp +++ b/examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "compute_kernel_api/matmul.h" -#include "compute_kernel_api/tile_move_copy.h" +#include "api/compute/matmul.h" +#include "api/compute/tile_move_copy.h" #include "hostdevcommon/kernel_structs.h" #include diff --git a/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py b/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py index 626f54a04..49fe95298 100644 --- a/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py +++ b/examples/metal_examples/single_node_matmul/metal/single_node_matmul.py @@ -63,22 +63,22 @@ def test_singlenode_matmul_metal(M, K, N): # single node grid node = ttnn.CoreCoord(0, 0) - node_grid = ttnn.NodeRangeSet([ttnn.NodeRange(node, node)]) + node_grid = ttnn.CoreRangeSet([ttnn.CoreRange(node, node)]) dfb_block_count = 2 cb_total_size = dfb_block_count * cb_page_size a_cb_descriptor = ttnn.CBDescriptor( total_size=cb_total_size, - node_ranges=node_grid, + core_ranges=node_grid, format_descriptors=[a_cb_format], ) b_cb_descriptor = ttnn.CBDescriptor( total_size=cb_total_size, - node_ranges=node_grid, + core_ranges=node_grid, format_descriptors=[b_cb_format], ) out_cb_descriptor = ttnn.CBDescriptor( total_size=cb_total_size, - node_ranges=node_grid, + core_ranges=node_grid, format_descriptors=[out_cb_format], ) @@ -100,27 +100,27 @@ def test_singlenode_matmul_metal(M, K, N): 
computeConfig.math_approx_mode = False reader_kernel_descriptor = ttnn.KernelDescriptor( - kernel_source="examples/metal_examples/singlenode_matmul/metal/kernels/mm_reader.cpp", + kernel_source="examples/metal_examples/single_node_matmul/metal/kernels/mm_reader.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=node_grid, + core_ranges=node_grid, compile_time_args=reader_compile_time_args, - runtime_args=[[reader_rt_args]], + runtime_args=[(node, reader_rt_args)], config=ttnn.ReaderConfigDescriptor(), ) writer_kernel_descriptor = ttnn.KernelDescriptor( - kernel_source="examples/metal_examples/singlenode_matmul/metal/kernels/mm_writer.cpp", + kernel_source="examples/metal_examples/single_node_matmul/metal/kernels/mm_writer.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=node_grid, + core_ranges=node_grid, compile_time_args=writer_compile_time_args, - runtime_args=[[writer_rt_args]], + runtime_args=[(node, writer_rt_args)], config=ttnn.WriterConfigDescriptor(), ) compute_kernel_descriptor = ttnn.KernelDescriptor( - kernel_source="examples/metal_examples/singlenode_matmul/metal/kernels/mm_compute.cpp", + kernel_source="examples/metal_examples/single_node_matmul/metal/kernels/mm_compute.cpp", source_type=ttnn.KernelDescriptor.SourceType.FILE_PATH, - node_ranges=node_grid, + core_ranges=node_grid, compile_time_args=compute_compile_time_args, - runtime_args=[[[]]], + runtime_args=[], config=computeConfig, ) diff --git a/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py b/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py index f93dc64dd..fc022dd5f 100644 --- a/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py +++ b/examples/metal_examples/single_node_matmul/ttlang/single_node_matmul.py @@ -1,9 +1,6 @@ # SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC # # SPDX-License-Identifier: Apache-2.0 -# up to tt-lang spec, not intended to compile or run 
currently -import sys -from pathlib import Path import ttnn import pytest import torch @@ -76,6 +73,7 @@ def test_singlenode_matmul_tt_lang(): ) result = ttnn.to_torch(c).to(torch.bfloat16) assert_with_ulp(golden, result) + print("Test passed!") ttnn.close_device(device) diff --git a/examples/tt_upsample.py b/examples/tt_upsample.py index c1d785379..75e66d311 100644 --- a/examples/tt_upsample.py +++ b/examples/tt_upsample.py @@ -9,7 +9,7 @@ import ttl from utils.correctness import assert_with_ulp -from utils.block_allocation import split_work_to_nodes +from utils.block_allocation import get_number_of_nodes_from_ranges, split_work_to_nodes @ttl.operation(grid=(8, 8)) @@ -33,12 +33,8 @@ def tt_lang_upsample_nearest_rowwise_interleaved( f"all_cores: {all_cores}, core_group_1: {core_group_1}, core_group_2: {core_group_2}, work_per_core1: {work_per_core1}, work_per_core2: {work_per_core2}" ) - num_cores_group_1 = ( - core_group_1[1][-1] - core_group_1[0][-1] + 1 if core_group_1 else 0 - ) - num_cores_group_2 = ( - core_group_2[1][-1] - core_group_2[0][-1] + 1 if core_group_2 else 0 - ) + num_cores_group_1 = get_number_of_nodes_from_ranges(core_group_1) + num_cores_group_2 = get_number_of_nodes_from_ranges(core_group_2) def get_work_per_core(core_id): if core_id < num_cores_group_1: @@ -129,8 +125,6 @@ def test_tt_lang_upsample_nearest_rowwise_interleaved(input_shape, scale_factor) ) golden_tensor = ttnn.upsample(input_tensor, scale_factor) - print(f"golden_tensor: {golden_tensor}") - print(f"output_tensor: {output_tensor}") assert_with_ulp(output_tensor.to_torch(), golden_tensor.to_torch(), ulp_threshold=1) print("Test passed!") diff --git a/python/utils/__init__.py b/python/utils/__init__.py index ebb199bfa..ead3f17b7 100644 --- a/python/utils/__init__.py +++ b/python/utils/__init__.py @@ -4,12 +4,17 @@ """Utility functions for tt-lang.""" -from .block_allocation import get_large_matmul_params, split_work_to_nodes +from .block_allocation import ( + 
get_large_matmul_params, + get_number_of_nodes_from_ranges, + split_work_to_nodes, +) from .correctness import assert_allclose, assert_pcc, assert_with_ulp __all__ = [ # block_allocation "split_work_to_nodes", + "get_number_of_nodes_from_ranges", "get_large_matmul_params", # correctness "assert_pcc", diff --git a/python/utils/block_allocation.py b/python/utils/block_allocation.py index 8729051fe..23ed81358 100644 --- a/python/utils/block_allocation.py +++ b/python/utils/block_allocation.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import itertools import math -from collections import namedtuple from typing import List, Tuple from collections import namedtuple @@ -22,6 +21,19 @@ def get_number_of_nodes(grid: Tuple[int, ...]) -> int: return node_count
+def get_number_of_nodes_from_ranges(
+    ranges: List[Tuple[Tuple[int, ...], Tuple[int, ...]]],
+) -> int:
+    """Count the total number of nodes across a list of rectangular ranges."""
+    total = 0  # an empty `ranges` list therefore yields 0
+    for start, end in ranges:  # each range is a (start_coord, end_coord) pair
+        count = 1
+        for s, e in zip(start, end):  # zip stops at the shorter of the two coordinates
+            count *= e - s + 1  # both bounds are counted, hence the + 1
+        total += count  # ranges are summed as-is; overlapping ranges would be double-counted
+    return total
+
+
 def filter_factor_pairs_by_2d_grid( factor_pairs: list[Tuple[int, int]], grid: Tuple[int, int] ) -> list[Tuple[int, int]]: @@ -153,13 +165,15 @@ def split_work_to_nodes( grid_size: Tuple[int, ...], units_to_divide: int, row_wise: bool = True ) -> Tuple[ int, - Tuple[Tuple[int, ...], Tuple[int, ...]], - Tuple[Tuple[int, ...], Tuple[int, ...]], + List[Tuple[Tuple[int, ...], Tuple[int, ...]]], + List[Tuple[Tuple[int, ...], Tuple[int, ...]]], int, int, ]: - """Splits work units among nodes in a from a single device grid. - currently can produce work splits that cannot map to CoreRanges directly, particlarily in 1-d grids + """Splits work units among nodes from a single device grid. + + Matches the semantics of ttnn.split_work_to_cores: each group is a list of + rectangular (start, end) coordinate ranges (like a CoreRangeSet). Args: grid_size: A tuple representing the dimensions of the node grid.
@@ -167,14 +181,14 @@ def split_work_to_nodes( row_wise: If True, split work in a row-wise manner; otherwise, column-wise. Returns: A tuple containing: - - total number of nodes - - node group 1 as a tuple of tuples, start coord to end coord rectangle [inclusive, inclusive] - - node group 2 as a tuple of tuples, start coord to end coord rectangle [inclusive, inclusive] + - total number of nodes used + - node group 1 as a list of (start_coord, end_coord) ranges [inclusive] + - node group 2 as a list of (start_coord, end_coord) ranges [inclusive] - work units per node in group 1 - work units per node in group 2 """ if units_to_divide == 0: - return (0, (), (), 0, 0) + return (0, [], [], 0, 0) simplified_grid_size = remove_leading_ones(grid_size) assert len(simplified_grid_size) <= 2, "only supports grids with a single device" total_nodes = get_number_of_nodes(grid_size) @@ -185,14 +199,12 @@ def split_work_to_nodes( ): # more nodes than work units, assign 1 unit to first N nodes if len(simplified_grid_size) == 1: end_coord = ((0,) * (len(grid_size) - 1)) + (units_to_divide - 1,) + return (units_to_divide, [(start_coord, end_coord)], [], 1, 0) elif len(simplified_grid_size) == 2: ranges = num_nodes_to_grid_ranges( start_coord, units_to_divide, grid_size, row_wise ) - end_coord = ((0,) * (len(grid_size) - 2)) + ranges[-1][ - 1 - ] # Last range's end coordinate - return (units_to_divide, (start_coord, end_coord), (), 1, 0) + return (units_to_divide, ranges, [], 1, 0) else: # more work units than nodes, divide work as evenly as possible if len(simplified_grid_size) == 1: @@ -202,8 +214,8 @@ def split_work_to_nodes( if remaining_work == 0: return ( total_nodes, - ((0,) * len(grid_size), end_coord_all), - (), + [((0,) * len(grid_size), end_coord_all)], + [], work_per_node, 0, ) @@ -211,8 +223,8 @@ def split_work_to_nodes( start_coord_2 = ((0,) * (len(grid_size) - 1)) + (remaining_work,) return ( total_nodes, - ((0,) * len(grid_size), end_coord_1), - (start_coord_2, 
end_coord_all), + [((0,) * len(grid_size), end_coord_1)], + [(start_coord_2, end_coord_all)], work_per_node + 1, work_per_node, ) @@ -234,7 +246,7 @@ def split_work_to_nodes( num_nodes_y = grid_size[-2] prefix = (0,) * (len(grid_size) - 2) end_coord = prefix + (num_nodes_y - 1, num_nodes_x - 1) - return (total_nodes, (start_coord, end_coord), (), work_per_node, 0) + return (total_nodes, [(start_coord, end_coord)], [], work_per_node, 0) # Uneven division - need two groups else: @@ -280,18 +292,10 @@ def split_work_to_nodes( start_coord_group2, num_nodes_group2, grid_size, row_wise ) - # For simplified return, we'll return the bounding boxes - # Group 1: from (0,0,...) to last coord of group 1 - group1_bbox = (start_coord, last_coord_group1) - - # Group 2: from start to last coord of group 2 - last_coord_group2 = group2_ranges[-1][1] - group2_bbox = (start_coord_group2, last_coord_group2) - return ( total_nodes, - group1_bbox, - group2_bbox, + group1_ranges, + group2_ranges, work_per_node + 1, work_per_node, ) diff --git a/test/python/test_block_allocation.py b/test/python/test_block_allocation.py index c74f1b01c..8f472acd7 100644 --- a/test/python/test_block_allocation.py +++ b/test/python/test_block_allocation.py @@ -14,6 +14,7 @@ from ttl.utils.block_allocation import ( get_large_matmul_params, + get_number_of_nodes_from_ranges, split_work_to_nodes, ) @@ -29,6 +30,32 @@ def extract_coords_from_ttnn_corerangeset(core_range_set): return coords +@pytest.mark.parametrize( + "ranges,expected", + [ + # Empty range list + ([], 0), + # Single 1D range + ([((0,), (4,))], 5), + # Single 2D range: full rectangle + ([((0, 0), (3, 7))], 32), + # Single point + ([((2, 3), (2, 3))], 1), + # Multiple 2D ranges: L-shape (4 full rows + partial row) + ([((0, 0), (3, 7)), ((4, 0), (4, 3))], 36), + # Multiple 1D ranges (disjoint) + ([((0,), (2,)), ((4,), (6,))], 6), + # Multiple 2D ranges: partial row + full rows + partial row + ([((0, 4), (0, 7)), ((1, 0), (2, 7)), ((3, 0), (3, 2))], 
4 + 16 + 3), + # 3D range with leading dimension + ([((0, 0, 0), (0, 2, 4))], 15), + ], +) +def test_get_number_of_nodes_from_ranges(ranges, expected): + """Test get_number_of_nodes_from_ranges with known inputs and expected counts.""" + assert get_number_of_nodes_from_ranges(ranges) == expected + + @pytest.mark.parametrize( "grid_size_tuple,units,row_wise", [ @@ -36,6 +63,7 @@ def extract_coords_from_ttnn_corerangeset(core_range_set): ((8, 8), 100, True), ((8, 8), 100, False), ((8, 8), 65, True), + ((8, 8), 65, False), ((8, 8), 129, True), # Test even distribution ((8, 8), 64, True), @@ -43,6 +71,7 @@ def extract_coords_from_ttnn_corerangeset(core_range_set): # Test with different grid sizes ((4, 8), 50, True), ((7, 9), 100, False), + ((7, 9), 100, True), # Test fewer units than cores ((8, 8), 10, True), ((8, 8), 20, False), @@ -50,16 +79,21 @@ def extract_coords_from_ttnn_corerangeset(core_range_set): # Test edge cases ((8, 8), 63, True), ((8, 8), 127, True), + # 2D grids that force multiple CoreRanges per group (L-shapes) + ((13, 10), 200, True), + ((13, 10), 200, False), + ((5, 7), 50, True), + ((3, 12), 40, False), + # Small grids with multi-range groups + ((2, 3), 10, True), + ((3, 2), 8, False), ], ) def test_split_work_to_nodes(grid_size_tuple, units, row_wise): """Compare results from split_work_to_nodes and ttnn.split_work_to_cores""" - # Call new function new_result = split_work_to_nodes(grid_size_tuple, units, row_wise) new_total, new_g1, new_g2, new_w1, new_w2 = new_result - # Call ttnn function - # Create CoreRangeSet from grid_size_tuple num_cores_x = grid_size_tuple[-1] num_cores_y = grid_size_tuple[-2] ttnn_grid = ttnn.CoreRangeSet( @@ -73,15 +107,14 @@ def test_split_work_to_nodes(grid_size_tuple, units, row_wise): ttnn_result = ttnn.split_work_to_cores(ttnn_grid, units, row_wise) ttnn_total, ttnn_all, ttnn_g1, ttnn_g2, ttnn_w1, ttnn_w2 = ttnn_result - # Extract coordinates from ttnn function ttnn_g1_coords = 
extract_coords_from_ttnn_corerangeset(ttnn_g1) ttnn_g2_coords = extract_coords_from_ttnn_corerangeset(ttnn_g2) - # Verify work distribution matches assert new_w1 == ttnn_w1, f"Work per core G1 mismatch: {new_w1} vs {ttnn_w1}" assert new_w2 == ttnn_w2, f"Work per core G2 mismatch: {new_w2} vs {ttnn_w2}" - # Calculate total cores in each group from ttnn + new_g1_num_cores = get_number_of_nodes_from_ranges(new_g1) + new_g2_num_cores = get_number_of_nodes_from_ranges(new_g2) ttnn_g1_num_cores = sum( (end[1] - start[1] + 1) * (end[0] - start[0] + 1) for start, end in ttnn_g1_coords @@ -91,36 +124,37 @@ def test_split_work_to_nodes(grid_size_tuple, units, row_wise): for start, end in ttnn_g2_coords ) - # Verify total work matches - new_total_work = ttnn_g1_num_cores * new_w1 + ttnn_g2_num_cores * new_w2 - ttnn_total_work = ttnn_g1_num_cores * ttnn_w1 + ttnn_g2_num_cores * ttnn_w2 assert ( - new_total_work == ttnn_total_work == units - ), f"Total work mismatch: {new_total_work} vs {ttnn_total_work} vs {units}" - - # Verify group 1 coordinates - if new_g1 and ttnn_g1_coords: - new_g1_start, new_g1_end = new_g1 - ttnn_g1_first_start = ttnn_g1_coords[0][0] - ttnn_g1_last_end = ttnn_g1_coords[-1][1] + new_g1_num_cores == ttnn_g1_num_cores + ), f"Group 1 core count mismatch: {new_g1_num_cores} vs {ttnn_g1_num_cores}" + assert ( + new_g2_num_cores == ttnn_g2_num_cores + ), f"Group 2 core count mismatch: {new_g2_num_cores} vs {ttnn_g2_num_cores}" + + new_total_work = new_g1_num_cores * new_w1 + new_g2_num_cores * new_w2 + assert new_total_work == units, f"Total work mismatch: {new_total_work} vs {units}" + + assert len(new_g1) == len( + ttnn_g1_coords + ), f"Group 1 range count mismatch: {len(new_g1)} vs {len(ttnn_g1_coords)}" + for i, (new_range, ttnn_range) in enumerate(zip(new_g1, ttnn_g1_coords)): + assert ( + new_range[0] == ttnn_range[0] + ), f"G1 range {i} start mismatch: {new_range[0]} vs {ttnn_range[0]}" + assert ( + new_range[1] == ttnn_range[1] + ), f"G1 range {i} end 
mismatch: {new_range[1]} vs {ttnn_range[1]}" + + assert len(new_g2) == len( + ttnn_g2_coords + ), f"Group 2 range count mismatch: {len(new_g2)} vs {len(ttnn_g2_coords)}" + for i, (new_range, ttnn_range) in enumerate(zip(new_g2, ttnn_g2_coords)): assert ( - new_g1_start == ttnn_g1_first_start and new_g1_end == ttnn_g1_last_end - ), f"Group 1 coordinates mismatch: new {new_g1_start} -> {new_g1_end}, ttnn {ttnn_g1_first_start} -> {ttnn_g1_last_end}" - - # Verify group 2 coordinates - if new_g2 and ttnn_g2_coords: - new_g2_start, new_g2_end = new_g2 - ttnn_g2_first_start = ttnn_g2_coords[0][0] - ttnn_g2_last_end = ttnn_g2_coords[-1][1] + new_range[0] == ttnn_range[0] + ), f"G2 range {i} start mismatch: {new_range[0]} vs {ttnn_range[0]}" assert ( - new_g2_start == ttnn_g2_first_start and new_g2_end == ttnn_g2_last_end - ), f"Group 2 coordinates mismatch: new {new_g2_start} -> {new_g2_end}, ttnn {ttnn_g2_first_start} -> {ttnn_g2_last_end}" - - # Check empty groups match - if not new_g1: - assert not ttnn_g1_coords, "Group 1 empty mismatch" - if not new_g2: - assert not ttnn_g2_coords, "Group 2 empty mismatch" + new_range[1] == ttnn_range[1] + ), f"G2 range {i} end mismatch: {new_range[1]} vs {ttnn_range[1]}" @pytest.mark.parametrize( From fe2bb68d5dcb53c6c3aadb25e7ec5ee4a8d72ba8 Mon Sep 17 00:00:00 2001 From: Alex Richins Date: Fri, 10 Apr 2026 12:36:17 -0700 Subject: [PATCH 09/31] consistent missing device error (#481) Problem description When users pass host tensors (e.g., after ttnn.from_device()) to a @ttl.operation, they get an opaque AttributeError: 'NoneType' object has no attribute 'compute_with_storage_grid_size' with no indication that the tensor needs to be on a device. 
This happens at two call sites: _resolve_grid when grid='auto' -- tries to query the device compute grid from a None device CompiledTTNNKernel.__call__ -- validates kernel grid against device compute grid on a None device closes #389 What's changed Added a shared _require_device(args) helper that scans tensor arguments for an on-device tensor and returns the device. When all tensors are on host, it raises a ValueError listing the host tensor shapes and showing how to fix it: ``` ValueError: No device found on any tensor argument. All ttnn tensor inputs are on host: arg[0]: Shape([32, 32]) Place tensors on device before calling the operation, e.g.: ttnn.to_device(tensor, device) ttnn.from_torch(tensor, ..., device=device) ``` Both crash sites (_resolve_grid and CompiledTTNNKernel.__call__) now use this helper instead of calling .device() without a None check. The five other .device() call sites in the file already had proper None guards and were not changed. The simulator path (python/sim/) is unaffected Checklist - [x] New/Existing tests provide coverage for changes --- python/ttl/ttl_api.py | 41 ++++++--- test/python/test_missing_device.py | 140 +++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 10 deletions(-) create mode 100644 test/python/test_missing_device.py diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py index 9cf005fa0..f15be2952 100644 --- a/python/ttl/ttl_api.py +++ b/python/ttl/ttl_api.py @@ -404,20 +404,41 @@ def _has_float32_args(args) -> bool: return False +def _require_device(args): + """Extract the device from tensor arguments, raising if none are on-device. + + Returns the first non-None device found. Raises ValueError with + a message listing which arguments are host tensors and suggesting + ttnn.to_device(). 
+ """ + for i, arg in enumerate(args): + if is_ttnn_tensor(arg): + device = arg.device() + if device is not None: + return device + host_args = [ + f" arg[{i}]: {arg.shape}" for i, arg in enumerate(args) if is_ttnn_tensor(arg) + ] + if not host_args: + raise ValueError("No device found: no ttnn tensor arguments were provided.") + raise ValueError( + "No device found on any tensor argument. " + "All ttnn tensor inputs are on host:\n" + + "\n".join(host_args) + + "\nPlace tensors on device before calling the operation, e.g.:\n" + " ttnn.to_device(tensor, device)\n" + " ttnn.from_torch(tensor, ..., device=device)" + ) + + def _resolve_grid(grid, args, kwargs): """Resolve grid, evaluating callable or 'auto' if needed.""" if callable(grid): return grid(*args, **kwargs) if grid == "auto": - for arg in args: - if is_ttnn_tensor(arg) and hasattr(arg, "device"): - device = arg.device() - device_grid = device.compute_with_storage_grid_size() - return (device_grid.x, device_grid.y) - raise ValueError( - "grid='auto' requires at least one ttnn tensor argument " - "to determine device compute grid" - ) + device = _require_device(args) + device_grid = device.compute_with_storage_grid_size() + return (device_grid.x, device_grid.y) return grid @@ -529,7 +550,7 @@ def __call__(self, *args): raise ValueError(f"Expected {self.num_tensors} tensors, got {len(args)}") # Validate grid against device's compute grid. 
- device = args[0].device() + device = _require_device(args) device_grid = device.compute_with_storage_grid_size() kernel_grid = self.core_ranges.bounding_box().grid_size() if kernel_grid.x > device_grid.x or kernel_grid.y > device_grid.y: diff --git a/test/python/test_missing_device.py b/test/python/test_missing_device.py new file mode 100644 index 000000000..a7d772ecf --- /dev/null +++ b/test/python/test_missing_device.py @@ -0,0 +1,140 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for missing device error handling. + +Verifies that meaningful error messages are produced when operations +receive host tensors instead of device tensors. +""" + +import pytest +import torch +import ttl + +ttnn = pytest.importorskip("ttnn", exc_type=ImportError) + + +@ttl.operation(grid="auto") +def nop_auto_grid(a): + @ttl.compute() + def compute_nop(): + pass + + @ttl.datamovement() + def dm_nop1(): + pass + + @ttl.datamovement() + def dm_nop2(): + pass + + +@ttl.operation(grid="auto") +def nop_auto_grid_2(a, b): + @ttl.compute() + def compute_nop(): + pass + + @ttl.datamovement() + def dm_nop1(): + pass + + @ttl.datamovement() + def dm_nop2(): + pass + + +@ttl.operation(grid=(1, 1)) +def nop_fixed_grid(a): + @ttl.compute() + def compute_nop(): + pass + + @ttl.datamovement() + def dm_nop1(): + pass + + @ttl.datamovement() + def dm_nop2(): + pass + + +def test_auto_grid_host_tensor(): + """ + grid='auto' with a host tensor should produce a clear error, not an AttributeError on NoneType. + """ + a_host = ttnn.from_torch( + torch.zeros(32, 32, dtype=torch.bfloat16), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + ) + + with pytest.raises(ValueError, match="No device found"): + nop_auto_grid(a_host) + + +def test_fixed_grid_host_tensor(): + """ + grid=(1,1) with a host tensor should produce a clear error, not an AttributeError on NoneType. 
+ """ + a_host = ttnn.from_torch( + torch.zeros(32, 32, dtype=torch.bfloat16), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + ) + + with pytest.raises(ValueError, match="No device found"): + nop_fixed_grid(a_host) + + +def test_auto_grid_no_ttnn_tensors(): + """ + grid='auto' with no ttnn tensors should report that none were provided. + The fixed-grid path hits _require_device at __call__ time (post-compile), + so it can't be reached without a valid ttnn tensor to compile against. + """ + with pytest.raises(ValueError, match="no ttnn tensor arguments were provided"): + nop_auto_grid(torch.zeros(32, 32, dtype=torch.bfloat16)) + + +def test_auto_grid_multiple_host_tensors(): + """Error message should list all host tensor arguments.""" + a_host = ttnn.from_torch( + torch.zeros(32, 32, dtype=torch.bfloat16), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + ) + b_host = ttnn.from_torch( + torch.zeros(64, 64, dtype=torch.bfloat16), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + ) + + with pytest.raises( + ValueError, match="All ttnn tensor inputs are on host" + ) as exc_info: + nop_auto_grid_2(a_host, b_host) + msg = str(exc_info.value) + assert "arg[0]" in msg + assert "arg[1]" in msg + + +def test_auto_grid_mixed_host_and_device(device): + """_require_device succeeds when at least one tensor is on-device.""" + from ttl.ttl_api import _require_device + + a_host = ttnn.from_torch( + torch.zeros(32, 32, dtype=torch.bfloat16), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + ) + b_device = ttnn.from_torch( + torch.zeros(32, 32, dtype=torch.bfloat16), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + ) + + assert _require_device((a_host, b_device)) is not None From 0350205f59de59c2ffd6889aa2041be9df968d76 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Fri, 10 Apr 2026 13:03:46 -0700 Subject: [PATCH 10/31] generalize the packer L1 accumulation guard placement --- .../TTKernelInsertL1Accumulation.cpp | 129 ++++++++++++------ 1 file 
changed, 85 insertions(+), 44 deletions(-) diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 646d1be64..803f034ba 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// // // Inserts pack_reconfig_l1_acc guards inside reduction loops. When a -// tile_regs_acquire is inside a reduction loop, the packer must switch -// to L1 accumulation mode from the second iteration onwards so that -// pack_tile adds to the existing L1 value instead of overwriting. +// tile_regs_release is inside a reduction loop, the packer is switched +// to L1 accumulation mode once after the first iteration's pack so that +// subsequent iterations add to the existing L1 value instead of +// overwriting. The L1 acc state persists across tile_regs boundaries. // // See docs/development/AccumulatingComputeLowering.md for design details. // @@ -95,60 +96,100 @@ struct TTKernelInsertL1AccumulationPass } }); - // Insert pack_reconfig_l1_acc matching the tt-metal minimal_matmul - // pattern: enable at the END of the first K iteration (after all - // DstSections complete), disable after the loop. The enable guard - // uses `if (k == lb)` so it fires once when the first iteration - // finishes, and L1 acc stays enabled for all subsequent iterations. + // L1 accumulation guard placement. For any loop that + // accumulates in L1 (matmul K loop or reduce loop), the pattern is: + // + // pack_reconfig_l1_acc(0) // disable before loop + // for (iv = lb; ...) 
{ + // [subblock 0: acquire...pack...release] + // [subblock N: acquire...pack...release] + // if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first + // // iteration's last pack + // } + // [cb_push_back if present] + // pack_reconfig_l1_acc(0) // disable after loop + // + // The L1 acc state persists across tile_regs boundaries, so the enable + // call only needs to happen once (after the first iteration completes + // all subblock packs). Disable guards are inserted once per outermost + // loop. + + // Find the top-level operation in each L1 acc loop body that contains + // the last tile_regs_release. The release may be nested inside subblock + // loops, so we find the enclosing top-level op to insert after. + auto findTopLevelAncestor = [](Operation *op, Block *loopBody) + -> Operation * { + while (op && op->getBlock() != loopBody) { + op = op->getParentOp(); + } + return op; + }; + + llvm::SmallDenseMap<Operation *, Operation *> enablePointPerLoop; + for (auto loop : l1AccLoops) { + Operation *lastTopLevel = nullptr; + loop->walk([&](ttk::TileRegsReleaseOp releaseOp) { + Operation *topLevel = + findTopLevelAncestor(releaseOp, loop.getBody()); + if (topLevel) { + lastTopLevel = topLevel; + } + }); + if (lastTopLevel) { + enablePointPerLoop[loop.getOperation()] = lastTopLevel; + } + } + llvm::SmallDenseSet<Operation *> disabledLoops; - for (scf::ForOp loop : l1AccLoops) { + for (auto loop : l1AccLoops) { + auto iter = enablePointPerLoop.find(loop.getOperation()); + if (iter == enablePointPerLoop.end()) { + continue; + } + Operation *enablePoint = iter->second; OpBuilder builder(loop->getContext()); - Location loc = loop.getLoc(); - - // Disable L1 acc before the loop to ensure clean state. 
- builder.setInsertionPoint(loop); - Value disablePre = arith::ConstantOp::create( - builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); - ttk::PackReconfigL1AccOp::create(builder, loc, disablePre); - - // Enable at end of first iteration, matching tt-metal: - // if (k_block == 0) { PACK((llk_pack_reconfig_l1_acc(1))); } - Operation *yield = loop.getBody()->getTerminator(); - builder.setInsertionPoint(yield); + Location loc = enablePoint->getLoc(); + + // Conditional enable after the last subblock/release on the first + // iteration. Placed after the top-level op containing the last + // release so all subblock packs in iteration 0 write without + // accumulation. + builder.setInsertionPointAfter(enablePoint); Value loopIV = loop.getInductionVar(); Value loopLB = loop.getLowerBound(); - Value isFirstIter = arith::CmpIOp::create( + Value firstIter = arith::CmpIOp::create( builder, loc, arith::CmpIPredicate::eq, loopIV, loopLB); - auto ifOp = scf::IfOp::create(builder, loc, isFirstIter); + auto ifOp = scf::IfOp::create(builder, loc, firstIter); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); Value enableFlag = arith::ConstantOp::create( builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1)); ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag); - // Disable after each L1 acc loop to prevent L1 acc state from - // leaking into outer loops or subsequent code. - if (disabledLoops.insert(loop.getOperation()).second) { - // For the outermost loop, place disable after cb_push_back. - // For inner loops, place directly after the loop. - auto outermostLoop = findOutermostL1AccLoop(loop); - bool isOutermost = !outermostLoop || outermostLoop == loop; - if (isOutermost) { - // Scan forward for cb_push_back. 
- Operation *insertPoint = loop->getNextNode(); - while (insertPoint && !isa<ttk::CBPushBackOp>(insertPoint)) { - insertPoint = insertPoint->getNextNode(); - } - if (insertPoint) { - builder.setInsertionPointAfter(insertPoint); - } else { - builder.setInsertionPointAfter(loop); - } + // Disable before and after the outermost L1 acc loop (once per loop). + auto outermostLoop = findOutermostL1AccLoop(loop); + if (!outermostLoop) { + outermostLoop = loop; + } + if (disabledLoops.insert(outermostLoop.getOperation()).second) { + // Disable before the loop. + builder.setInsertionPoint(outermostLoop); + Value disablePre = arith::ConstantOp::create( + builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); + ttk::PackReconfigL1AccOp::create(builder, loc, disablePre); + + // Disable after cb_push_back following the loop, or after the loop. + Operation *insertPoint = outermostLoop->getNextNode(); + while (insertPoint && !isa<ttk::CBPushBackOp>(insertPoint)) { + insertPoint = insertPoint->getNextNode(); + } + if (insertPoint) { + builder.setInsertionPointAfter(insertPoint); } else { - builder.setInsertionPointAfter(loop); + builder.setInsertionPointAfter(outermostLoop); } - Value disableFlag = arith::ConstantOp::create( + Value disablePost = arith::ConstantOp::create( builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); - ttk::PackReconfigL1AccOp::create(builder, loc, disableFlag); + ttk::PackReconfigL1AccOp::create(builder, loc, disablePost); } } } From f919130f7651edc9addc8e6236c0d29cbf816e0b Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Fri, 10 Apr 2026 13:04:40 -0700 Subject: [PATCH 11/31] add tests --- .../TTKernelInsertL1Accumulation.cpp | 7 +- test/python/matmul_l1_acc_multinode.py | 184 ++++++++++++++++++ test/python/test_matmul_l1_acc_multinode.py | 155 +++++++++++++++ .../Transforms/insert_l1_accumulation.mlir | 150 ++++++++++++++ 4 files changed, 492 insertions(+), 4 deletions(-) create mode 100644 test/python/matmul_l1_acc_multinode.py create mode 100644 
test/python/test_matmul_l1_acc_multinode.py create mode 100644 test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 803f034ba..11a2985ea 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -117,8 +117,8 @@ struct TTKernelInsertL1AccumulationPass // Find the top-level operation in each L1 acc loop body that contains // the last tile_regs_release. The release may be nested inside subblock // loops, so we find the enclosing top-level op to insert after. - auto findTopLevelAncestor = [](Operation *op, Block *loopBody) - -> Operation * { + auto findTopLevelAncestor = [](Operation *op, + Block *loopBody) -> Operation * { while (op && op->getBlock() != loopBody) { op = op->getParentOp(); } @@ -129,8 +129,7 @@ struct TTKernelInsertL1AccumulationPass for (auto loop : l1AccLoops) { Operation *lastTopLevel = nullptr; loop->walk([&](ttk::TileRegsReleaseOp releaseOp) { - Operation *topLevel = - findTopLevelAncestor(releaseOp, loop.getBody()); + Operation *topLevel = findTopLevelAncestor(releaseOp, loop.getBody()); if (topLevel) { lastTopLevel = topLevel; } diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py new file mode 100644 index 000000000..f2bbdd968 --- /dev/null +++ b/test/python/matmul_l1_acc_multinode.py @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +# REQUIRES: tt-device +# RUN: env TTLANG_INITIAL_MLIR=%t.initial.mlir %python %s > %t.output 2>&1 +# RUN: FileCheck %s --check-prefix=CHECK-CPP < %t.output +# RUN: FileCheck %s --check-prefix=CHECK-RESULT < %t.output + +""" +Multinode matmul with L1 packer accumulation. 
Mirrors the benchmark kernel +(make_matmul_l1_acc / v4_l1_acc): auto grid, split DMA (reader=A, +writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 1024x1024x1024. + +The compute thread uses the "reserve once, store K times, push once" pattern. +The compiler detects the K reduction loop and inserts pack_reconfig_l1_acc +guards so each K iteration packs additively to L1. + +Verifies the L1 packer accumulation pattern in generated C++: disable before +K loop, conditional enable after first iteration, disable after cb_push_back. +""" + +import ttl + +try: + import ttnn +except ImportError: + print("TTNN not available - exiting") + exit(0) + +import torch + +TILE = 32 +M_BLOCK = 8 +K_BLOCK = 8 +N_BLOCK = 8 + + +@ttl.operation(grid="auto") +def matmul_l1_acc(a, b, out): + Mt = a.shape[0] // TILE + Kt = a.shape[1] // TILE + Nt = b.shape[1] // TILE + + K_num_blocks = Kt // K_BLOCK + M_num_blocks = Mt // M_BLOCK + N_num_blocks = Nt // N_BLOCK + + grid_n, grid_m = ttl.grid_size(dims=2) + m_blocks_per_node = -(-M_num_blocks // grid_m) + n_blocks_per_node = -(-N_num_blocks // grid_n) + + a_dfb = ttl.make_dataflow_buffer_like(a, shape=(M_BLOCK, K_BLOCK), block_count=2) + b_dfb = ttl.make_dataflow_buffer_like(b, shape=(K_BLOCK, N_BLOCK), block_count=2) + out_dfb = ttl.make_dataflow_buffer_like( + out, shape=(M_BLOCK, N_BLOCK), block_count=2 + ) + + @ttl.compute() + def compute(): + node_n, node_m = ttl.node(dims=2) + for local_m in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m + if m_block < M_num_blocks: + for local_n in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n + if n_block < N_num_blocks: + out_blk = out_dfb.reserve() + for _ in range(K_num_blocks): + a_blk = a_dfb.wait() + b_blk = b_dfb.wait() + out_blk.store(a_blk @ b_blk) + a_blk.pop() + b_blk.pop() + out_blk.push() + + @ttl.datamovement() + def reader(): + node_n, node_m = ttl.node(dims=2) + for local_m in range(m_blocks_per_node): + m_block = node_m * 
m_blocks_per_node + local_m + if m_block < M_num_blocks: + m_off = m_block * M_BLOCK + for local_n in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n + if n_block < N_num_blocks: + for kb in range(K_num_blocks): + k_off = kb * K_BLOCK + with a_dfb.reserve() as a_blk: + ttl.copy( + a[ + m_off : m_off + M_BLOCK, + k_off : k_off + K_BLOCK, + ], + a_blk, + ).wait() + + @ttl.datamovement() + def writer(): + node_n, node_m = ttl.node(dims=2) + for local_m in range(m_blocks_per_node): + m_block = node_m * m_blocks_per_node + local_m + if m_block < M_num_blocks: + m_off = m_block * M_BLOCK + for local_n in range(n_blocks_per_node): + n_block = node_n * n_blocks_per_node + local_n + if n_block < N_num_blocks: + n_off = n_block * N_BLOCK + for kb in range(K_num_blocks): + k_off = kb * K_BLOCK + with b_dfb.reserve() as b_blk: + ttl.copy( + b[ + k_off : k_off + K_BLOCK, + n_off : n_off + N_BLOCK, + ], + b_blk, + ).wait() + with out_dfb.wait() as out_blk: + ttl.copy( + out_blk, + out[ + m_off : m_off + M_BLOCK, + n_off : n_off + N_BLOCK, + ], + ).wait() + + +# ============================================================================= +# C++ output: L1 packer accumulation pattern +# 1. Disable before the K loop +# 2. Conditional enable after the first iteration (iv == lb) +# 3. 
Disable after cb_push_back following the loop +# ============================================================================= + +# CHECK-CPP: PACK((llk_pack_reconfig_l1_acc( +# CHECK-CPP-NEXT: for +# CHECK-CPP: matmul_block( +# CHECK-CPP: pack_tile +# CHECK-CPP: if ( +# CHECK-CPP-NEXT: PACK((llk_pack_reconfig_l1_acc( +# CHECK-CPP: cb_push_back( +# CHECK-CPP-NEXT: PACK((llk_pack_reconfig_l1_acc( + +# CHECK-RESULT: PASS + +if __name__ == "__main__": + device = ttnn.open_device(device_id=0) + + try: + # 32x32x32 tiles = 1024x1024x1024, 8x8x8 blocks -> K_num_blocks=4 + Mt, Kt, Nt = 32, 32, 32 + M, K, N = Mt * TILE, Kt * TILE, Nt * TILE + + a_torch = torch.randn(M, K, dtype=torch.bfloat16) + b_torch = torch.randn(K, N, dtype=torch.bfloat16) + golden = (a_torch.float() @ b_torch.float()).float() + + a_dev = ttnn.from_torch( + a_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device + ) + b_dev = ttnn.from_torch( + b_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device + ) + out_dev = ttnn.from_torch( + torch.zeros(M, N, dtype=torch.bfloat16), + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + device=device, + ) + + matmul_l1_acc(a_dev, b_dev, out_dev) + + result = ttnn.to_torch(out_dev).float() + pcc = torch.corrcoef(torch.stack([result.flatten(), golden.flatten()]))[ + 0, 1 + ].item() + if pcc > 0.999: + print("PASS") + else: + print(f"FAIL: PCC {pcc:.6f} < 0.999") + + finally: + ttnn.close_device(device) diff --git a/test/python/test_matmul_l1_acc_multinode.py b/test/python/test_matmul_l1_acc_multinode.py new file mode 100644 index 000000000..0f1d603a4 --- /dev/null +++ b/test/python/test_matmul_l1_acc_multinode.py @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Multinode matmul with L1 packer accumulation, L1-only (no DRAM reads during +compute). All input blocks are pre-loaded into L1 DFBs before the K reduction +loop begins. 
The compiler inserts pack_reconfig_l1_acc guards so each K +iteration packs additively to L1. + +Tests multicore configurations with a 2D grid and multiple K blocks. +""" + +import pytest +import torch +import ttl + +ttnn = pytest.importorskip("ttnn", exc_type=ImportError) + +from ttlang_test_utils import to_dram +from utils.correctness import assert_pcc + +TILE = 32 + + +def _make_l1_acc_multinode_kernel(block_m, block_n, grid="auto"): + """Multinode matmul with L1 accumulation. + + All K blocks are pre-loaded into L1 before compute begins (no DRAM + streaming during the K loop). The compute thread reserves the output + DFB once, stores K times (triggering L1 accumulation), then pushes. + """ + + @ttl.operation(grid=grid) + def kernel(a, b, out): + Mt = a.shape[0] // TILE + Kt = a.shape[1] // TILE + Nt = b.shape[1] // TILE + + M_num = Mt // block_m + N_num = Nt // block_n + + grid_n, grid_m = ttl.grid_size(dims=2) + m_per = -(-M_num // grid_m) + n_per = -(-N_num // grid_n) + + a_dfb = ttl.make_dataflow_buffer_like(a, shape=(block_m, 1), block_count=2) + b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, block_n), block_count=2) + out_dfb = ttl.make_dataflow_buffer_like( + out, shape=(block_m, block_n), block_count=2 + ) + + @ttl.compute() + def compute(): + node_n, node_m = ttl.node(dims=2) + for lm in range(m_per): + mb = node_m * m_per + lm + if mb < M_num: + for ln in range(n_per): + nb = node_n * n_per + ln + if nb < N_num: + out_blk = out_dfb.reserve() + for _ in range(Kt): + a_blk = a_dfb.wait() + b_blk = b_dfb.wait() + out_blk.store(a_blk @ b_blk) + a_blk.pop() + b_blk.pop() + out_blk.push() + + @ttl.datamovement() + def reader(): + node_n, node_m = ttl.node(dims=2) + for lm in range(m_per): + mb = node_m * m_per + lm + if mb < M_num: + m_off = mb * block_m + for ln in range(n_per): + nb = node_n * n_per + ln + if nb < N_num: + for kt in range(Kt): + with a_dfb.reserve() as blk: + ttl.copy( + a[ + m_off : m_off + block_m, + kt : kt + 1, + ], + blk, + 
).wait() + + @ttl.datamovement() + def writer(): + node_n, node_m = ttl.node(dims=2) + for lm in range(m_per): + mb = node_m * m_per + lm + if mb < M_num: + m_off = mb * block_m + for ln in range(n_per): + nb = node_n * n_per + ln + if nb < N_num: + n_off = nb * block_n + for kt in range(Kt): + with b_dfb.reserve() as blk: + ttl.copy( + b[ + kt : kt + 1, + n_off : n_off + block_n, + ], + blk, + ).wait() + with out_dfb.wait() as blk: + ttl.copy( + blk, + out[ + m_off : m_off + block_m, + n_off : n_off + block_n, + ], + ).wait() + + return kernel + + +PARAMS = [ + # (Mt, Kt, Nt, block_m, block_n, grid) + (4, 2, 4, 2, 2, (2, 2)), + (8, 4, 8, 4, 4, (2, 2)), + (8, 4, 8, 4, 4, "auto"), + (16, 8, 16, 8, 8, "auto"), +] + + +@pytest.mark.parametrize( + "Mt,Kt,Nt,block_m,block_n,grid", + PARAMS, + ids=[ + f"tiles{mt}x{kt}x{nt}_blk{bm}x{bn}_grid{g}" for mt, kt, nt, bm, bn, g in PARAMS + ], +) +@pytest.mark.requires_device +def test_l1_acc_multinode(Mt, Kt, Nt, block_m, block_n, grid, device): + """Multinode matmul with L1 packer accumulation across K iterations.""" + M, K, N = Mt * TILE, Kt * TILE, Nt * TILE + a_torch = torch.randn(M, K, dtype=torch.bfloat16) + b_torch = torch.randn(K, N, dtype=torch.bfloat16) + golden = (a_torch.float() @ b_torch.float()).float() + + a_dev = to_dram(a_torch, device) + b_dev = to_dram(b_torch, device) + out_dev = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device) + + kernel = _make_l1_acc_multinode_kernel(block_m, block_n, grid=grid) + kernel(a_dev, b_dev, out_dev) + + result = ttnn.to_torch(out_dev).float() + assert_pcc(golden, result, threshold=0.999) diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir new file mode 100644 index 000000000..8c3375e06 --- /dev/null +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -0,0 +1,150 @@ +// Verifies ttkernel-insert-l1-accumulation: pack_reconfig_l1_acc guards are +// 
inserted around reduction loops. The enable call happens once after the +// first iteration's last pack (iv == lb), and disable guards bracket the +// outermost loop. + +// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s + +// Basic L1 acc loop: enable after first iteration, disable before/after loop. + +// CHECK-LABEL: func.func @basic_l1_acc_loop +// CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32) +// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]] +// CHECK: scf.if %[[CMP]] +// CHECK: %[[ENABLE:.*]] = arith.constant 1 : i32 +// CHECK: ttkernel.pack_reconfig_l1_acc(%[[ENABLE]]) : (i32) +// CHECK: } +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32) +func.func @basic_l1_acc_loop() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + return +} + +// ----- + +// Reduction loop fallback (ttl.reduction_loop attribute) with sum reduce. 
+ +// CHECK-LABEL: func.func @reduction_loop_fallback +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: arith.cmpi eq +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: ttkernel.pack_reconfig_l1_acc +func.func @reduction_loop_fallback() attributes {ttkernel.thread = #ttkernel.thread} { + %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, , ) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.reduction_loop} + return +} + +// ----- + +// Max reduce loops should NOT get L1 accumulation guards. 
+ +// CHECK-LABEL: func.func @max_reduce_no_l1_acc +// CHECK-NOT: pack_reconfig_l1_acc +func.func @max_reduce_no_l1_acc() attributes {ttkernel.thread = #ttkernel.thread} { + %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, , ) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.reduction_loop} + return +} + +// ----- + +// No reduction loop attribute: no transformation. + +// CHECK-LABEL: func.func @no_reduction_loop +// CHECK-NOT: pack_reconfig_l1_acc +func.func @no_reduction_loop() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } + return +} + +// ----- + +// Subblocked loop: multiple acquire/release pairs per iteration inside nested +// loops. 
The enable guard should appear once after the outermost subblock loop +// (containing the last release), not after each individual release. + +// CHECK-LABEL: func.func @subblocked_loop +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to +// CHECK: scf.for +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.tile_regs_release +// CHECK: } +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]] +// CHECK: scf.if %[[CMP]] +// CHECK: %[[ENABLE:.*]] = arith.constant 1 : i32 +// CHECK: ttkernel.pack_reconfig_l1_acc(%[[ENABLE]]) : (i32) +// CHECK: } +// CHECK: ttkernel.pack_reconfig_l1_acc +func.func @subblocked_loop() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + scf.for %iv = %c0 to %c4 step %c1 { + scf.for %sb = %c0 to %c2 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.matmul_block(%cb, %cb, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } + } {ttl.l1_acc_loop} + return +} From 2d11d0457e3f8ee61ed90702cf32e5d0bded2caa Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Fri, 10 Apr 2026 13:32:26 -0700 Subject: [PATCH 12/31] update tests --- .../TTLToTTKernel/reduce_lowering.mlir | 10 ++-- .../TTL/Transforms/subblock_matmul.mlir | 47 +++++++++---------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git 
a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir index 9ddb49905..b7457dda8 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir @@ -87,16 +87,16 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c // FP32: %[[CB2:.*]] = ttkernel.get_compile_time_arg_val(2) // FP32: scf.for %[[IV:.*]] = %[[C0]] to %[[C2]] step %[[C1]] // FP32-NEXT: ttkernel.tile_regs_acquire -// L1 accumulation guard: enable from second iteration. -// FP32: %[[NOT_FIRST:.*]] = arith.cmpi ne, %[[IV]], %[[C0]] -// FP32-NEXT: scf.if %[[NOT_FIRST]] -// FP32-NEXT: ttkernel.pack_reconfig_l1_acc(%[[C1I]]) -// FP32: } // FP32: ttkernel.reduce_init({{.*}}, ) {full_fp32} // FP32: ttkernel.reduce_tile({{.*}}, ) {full_fp32 // FP32: ttkernel.reduce_uninit // FP32: ttkernel.pack_tile(%[[C0]], %[[CB2]], %[[C0]], true) // FP32: ttkernel.tile_regs_release +// L1 accumulation guard: enable once after the first iteration's pack. +// FP32: %[[FIRST:.*]] = arith.cmpi eq, %[[IV]], %[[C0]] +// FP32-NEXT: scf.if %[[FIRST]] +// FP32-NEXT: ttkernel.pack_reconfig_l1_acc(%[[C1I]]) +// FP32: } // FP32: } {ttl.reduction_loop // Disable L1 accumulation after reduction loop. // FP32: ttkernel.pack_reconfig_l1_acc({{.*}}0{{.*}}) diff --git a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir index aa4a891f8..9fb0212c9 100644 --- a/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir +++ b/test/ttlang/Dialect/TTL/Transforms/subblock_matmul.mlir @@ -1,40 +1,35 @@ // Tests for ttl-subblock-compute-for-dst with matmul computes. // Matmul K (reduction) accumulates in-place in DST, so only M*N parallel -// tiles count toward the DST budget. When the parallel output exceeds DST, -// subblocking partitions M*N AND tiles K to 1 for L1 accumulation. +// tiles count toward the DST budget. 
Subblocking partitions the M*N output +// space while keeping K whole in each subblock. // RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(convert-ttl-to-compute, ttl-set-compute-kernel-config, ttl-assign-dst{enable-fpu-binary-ops=0}, ttl-subblock-compute-for-dst))' --split-input-file | FileCheck %s // ----- -// Purpose: M*N=16 exceeds f32 DST capacity (4). Subblocking partitions the -// 4x4 output into 1x4 strips AND tiles K from 3 to 1. The K loop is -// annotated with ttl.reduction_loop for L1 accumulation. -// Loops: M (dim 0) 0..4 step 1, K (dim 2) 0..3 step 1. +// Purpose: M*N=16 exceeds f32 DST capacity (4). K=3 is excluded from the +// budget, so subblocking partitions the 4x4 output into 1x4 strips. +// Loop on M (dim 0): 0 to 4 step 1. K (dim 2) stays at 3 in each subblock. -// CHECK-LABEL: func.func @matmul_subblock_k_tiled +// CHECK-LABEL: func.func @matmul_subblock_k_excluded // CHECK-SAME: fp32_dest_acc_en = true // Outer subblock loop over M dimension. -// CHECK: scf.for %[[MIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} { -// Inner K reduction loop. -// CHECK: scf.for %[[KIV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} { -// A sliced on M and K: [miv, kiv] [1, 1]. -// CHECK: tensor.extract_slice {{.*}}[%[[MIV]], %[[KIV]]] [1, 1] [1, 1] -// B sliced on K: [kiv, 0] [1, 4]. -// CHECK: tensor.extract_slice {{.*}}[%[[KIV]], 0] [1, 4] [1, 1] -// Output sliced on M: [miv, 0] [1, 4]. -// CHECK: tensor.extract_slice {{.*}}[%[[MIV]], 0] [1, 4] [1, 1] -// Inner compute on subblock [1, 4, 1] (M=1, N=4, K=1). -// CHECK: ttl.compute -// CHECK-SAME: tensor<1x1x!ttcore.tile<32x32, bf16>> -// CHECK-SAME: tensor<1x4x!ttcore.tile<32x32, bf16>> -// CHECK-SAME: tensor<1x4x!ttcore.tile<32x32, bf16>> -// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"] -// CHECK: ttl.tile_matmul_block -// K loop annotated for L1 accumulation. 
-// CHECK: } {{{.*}}ttl.reduction_loop{{.*}}} +// CHECK: scf.for %[[IV:.*]] = %{{.*}} to %{{.*}} step %{{.*}} { +// A sliced on M, K kept whole: [iv, 0] [1, 3]. +// CHECK: tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 3] [1, 1] +// B not sliced (full [3, 4]). +// CHECK: tensor.extract_slice {{.*}}[0, 0] [3, 4] [1, 1] +// Output sliced on M: [iv, 0] [1, 4]. +// CHECK: tensor.extract_slice {{.*}}[%[[IV]], 0] [1, 4] [1, 1] +// Inner compute on subblock [1, 4, 3] (M=1, N=4, K=3). +// CHECK: ttl.compute +// CHECK-SAME: tensor<1x3x!ttcore.tile<32x32, bf16>> +// CHECK-SAME: tensor<3x4x!ttcore.tile<32x32, bf16>> +// CHECK-SAME: tensor<1x4x!ttcore.tile<32x32, bf16>> +// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"] +// CHECK: ttl.tile_matmul_block // CHECK: } -func.func @matmul_subblock_k_tiled( +func.func @matmul_subblock_k_excluded( %arg0: tensor<4x3x!ttcore.tile<32x32, bf16>>, %arg1: tensor<3x4x!ttcore.tile<32x32, bf16>>) -> tensor<4x4x!ttcore.tile<32x32, bf16>> { %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[4, 3], !ttcore.tile<32x32, bf16>, 2> From 1da0c9f16b1f44ec9fbdb2ad2f08711ca4bfccf6 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Sun, 12 Apr 2026 17:11:13 -0700 Subject: [PATCH 13/31] comments --- include/ttlang/Dialect/TTL/Passes.td | 28 ++++++++++++----- .../TTKernelInsertL1Accumulation.cpp | 30 ++++++------------- .../Transforms/TTLAnnotateReductionLoops.cpp | 7 +++-- .../Transforms/TTLSubblockComputeForDST.cpp | 2 ++ .../TTLToTTKernel/reduce_lowering.mlir | 3 +- .../Transforms/insert_l1_accumulation.mlir | 18 +++++++++++ 6 files changed, 56 insertions(+), 32 deletions(-) diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index 8e8765181..b76a29138 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -31,12 +31,26 @@ def TTKernelInsertL1Accumulation let summary = "Insert L1 accumulation guards for reduction loops"; let description = [{ Inserts 
`pack_reconfig_l1_acc` guards inside reduction loops so that - pack operations accumulate into L1 instead of overwriting. On the first - iteration of a reduction loop, pack writes normally. On subsequent - iterations, the packer is reconfigured to add to the existing L1 value. - - Reduction loops are identified by the `ttl.reduction_loop` attribute - on `scf.for` ops. + pack operations accumulate into L1 instead of overwriting. The enable + call happens once after the first iteration's last pack; the L1 acc + packer state persists across `tile_regs` boundaries. Disable guards + bracket the outermost reduction loop (parallel loops are not + considered). Max-reduce loops are excluded (max is not additive). + + The pattern is: + pack_reconfig_l1_acc(0) // disable before loop + for (iv = lb; ...) { + [subblock 0: acquire...pack...release] + [subblock N: acquire...pack...release] + if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first + // iteration's last pack + } + [cb_push_back if present] + pack_reconfig_l1_acc(0) // disable after loop + + Reduction loops are identified by the `ttl.l1_acc_loop` (user-written) + or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for` + ops, with `ttl.l1_acc_loop` taking precedence. }]; let dependentDialects = [ @@ -52,7 +66,7 @@ def TTLAnnotateReductionLoops let description = [{ Detects user-written `scf.for` loops where all iterations store to the same CB slot (reserved before the loop, pushed after) and annotates them - with `ttl.reduction_loop`. This enables `TTKernelInsertL1Accumulation` + with `ttl.l1_acc_loop`. This enables `TTKernelInsertL1Accumulation` to insert `pack_reconfig_l1_acc` guards so that packs accumulate across iterations instead of overwriting. 
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 11a2985ea..2ec189092 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -5,16 +5,6 @@ //===----------------------------------------------------------------------===// // TTKernel Insert L1 Accumulation //===----------------------------------------------------------------------===// -// -// Inserts pack_reconfig_l1_acc guards inside reduction loops. When a -// tile_regs_release is inside a reduction loop, the packer is switched -// to L1 accumulation mode once after the first iteration's pack so that -// subsequent iterations add to the existing L1 value instead of -// overwriting. The L1 acc state persists across tile_regs boundaries. -// -// See docs/development/AccumulatingComputeLowering.md for design details. -// -//===----------------------------------------------------------------------===// #include "ttlang/Dialect/TTL/IR/TTL.h" #include "ttlang/Dialect/TTL/Passes.h" @@ -109,14 +99,13 @@ struct TTKernelInsertL1AccumulationPass // [cb_push_back if present] // pack_reconfig_l1_acc(0) // disable after loop // - // The L1 acc state persists across tile_regs boundaries, so the enable + // The L1 acc state persists across multiple dst sections, so the enable // call only needs to happen once (after the first iteration completes - // all subblock packs). Disable guards are inserted once per outermost - // loop. + // all its packs). Disable guards are inserted once per outermost + // reduction loop (parallel loops are not considered). - // Find the top-level operation in each L1 acc loop body that contains - // the last tile_regs_release. The release may be nested inside subblock - // loops, so we find the enclosing top-level op to insert after. 
+ // Find the insertion point for the enable guard: the top-level op in + // the loop body that contains the last tile_regs_release. auto findTopLevelAncestor = [](Operation *op, Block *loopBody) -> Operation * { while (op && op->getBlock() != loopBody) { @@ -149,10 +138,9 @@ struct TTKernelInsertL1AccumulationPass OpBuilder builder(loop->getContext()); Location loc = enablePoint->getLoc(); - // Conditional enable after the last subblock/release on the first - // iteration. Placed after the top-level op containing the last - // release so all subblock packs in iteration 0 write without - // accumulation. + // Enable L1 acc once, at the end of the first iteration of the + // reduction loop. All packs in iteration 0 write without + // accumulation; subsequent iterations add to the existing L1 value. builder.setInsertionPointAfter(enablePoint); Value loopIV = loop.getInductionVar(); Value loopLB = loop.getLowerBound(); @@ -164,7 +152,7 @@ struct TTKernelInsertL1AccumulationPass builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1)); ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag); - // Disable before and after the outermost L1 acc loop (once per loop). + // Bracket the outermost reduction loop with disable guards. auto outermostLoop = findOutermostL1AccLoop(loop); if (!outermostLoop) { outermostLoop = loop; diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp index 7545aba6d..91d0f3c66 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp @@ -8,7 +8,7 @@ // // Detects user-written scf.for loops that accumulate into the same CB slot // (reserve before loop, store inside, push after) and annotates them with -// kReductionLoopAttrName for L1 accumulation. +// kL1AccLoopAttrName for L1 accumulation. 
// //===----------------------------------------------------------------------===// @@ -34,8 +34,9 @@ struct TTLAnnotateReductionLoopsPass func::FuncOp func = getOperation(); func.walk([&](scf::ForOp forOp) { - // Skip loops already annotated (from compiler-generated tile loops). - if (forOp->hasAttr(kReductionLoopAttrName) || + // Skip loops already annotated (compiler-generated or prior run). + if (forOp->hasAttr(kL1AccLoopAttrName) || + forOp->hasAttr(kReductionLoopAttrName) || forOp->hasAttr(kTileLoopStrideAttrName) || forOp->hasAttr(kSubblockLoopStrideAttrName)) { return; diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp index f6c082780..da6c65065 100644 --- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp +++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp @@ -122,6 +122,8 @@ struct TTLSubblockComputeForDSTPass if (isa(op)) { hasMatmulBlock = true; } + return (hasAccumulating && hasMatmulBlock) ? 
WalkResult::interrupt() + : WalkResult::advance(); }); if (hasAccumulating && !hasMatmulBlock) { return; diff --git a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir index b7457dda8..d6fcd1453 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir @@ -81,6 +81,7 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c // FP32-DAG: %[[C0:.*]] = arith.constant 0 : index // FP32-DAG: %[[C1:.*]] = arith.constant 1 : index // FP32-DAG: %[[C2:.*]] = arith.constant 2 : index +// FP32-DAG: %[[C0I:.*]] = arith.constant 0 : i32 // FP32-DAG: %[[C1I:.*]] = arith.constant 1 : i32 // FP32: %[[CB0:.*]] = ttkernel.get_compile_time_arg_val(0) // FP32: %[[CB1:.*]] = ttkernel.get_compile_time_arg_val(1) @@ -99,7 +100,7 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c // FP32: } // FP32: } {ttl.reduction_loop // Disable L1 accumulation after reduction loop. 
-// FP32: ttkernel.pack_reconfig_l1_acc({{.*}}0{{.*}}) +// FP32: ttkernel.pack_reconfig_l1_acc(%[[C0I]]) func.func @reduce_2x1_l1_acc() attributes {ttl.base_cta_index = 3 : i32, ttl.crta_indices = [], ttl.kernel_thread = #ttkernel.thread} { %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir index 8c3375e06..ef1c7542a 100644 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -148,3 +148,21 @@ func.func @subblocked_loop() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + } {ttl.l1_acc_loop} + return +} From c1bf820db707c0a0ab6e2aa8ad384b68d49ea6a8 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Sun, 12 Apr 2026 21:21:24 -0700 Subject: [PATCH 14/31] rename ttl-annotate-reduction-loops to ttl-annotate-l1-acc-loops; other cleanup; add tests --- include/ttlang/Dialect/TTL/Passes.td | 6 +- .../Transforms/TTKernelInsertInits.cpp | 11 +- .../TTKernelInsertL1Accumulation.cpp | 19 ++- lib/Dialect/TTL/Pipelines/TTLPipelines.cpp | 2 +- lib/Dialect/TTL/Transforms/CMakeLists.txt | 2 +- ...ionLoops.cpp => TTLAnnotateL1AccLoops.cpp} | 24 +-- python/ttl/ttl_api.py | 2 +- .../TTLToTTKernel/reduce_lowering.mlir | 2 + .../Transforms/insert_l1_accumulation.mlir | 145 ++++++++++++++++++ .../TTL/Transforms/annotate_l1_acc_loops.mlir | 119 ++++++++++++++ 10 files changed, 297 insertions(+), 35 deletions(-) rename 
lib/Dialect/TTL/Transforms/{TTLAnnotateReductionLoops.cpp => TTLAnnotateL1AccLoops.cpp} (70%) create mode 100644 test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index b76a29138..360a1f4e2 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -60,9 +60,9 @@ def TTKernelInsertL1Accumulation ]; } -def TTLAnnotateReductionLoops - : Pass<"ttl-annotate-reduction-loops", "::mlir::func::FuncOp"> { - let summary = "Annotate user-written scf.for loops as reduction loops"; +def TTLAnnotateL1AccLoops + : Pass<"ttl-annotate-l1-acc-loops", "::mlir::func::FuncOp"> { + let summary = "Annotate user-written scf.for loops for L1 accumulation"; let description = [{ Detects user-written `scf.for` loops where all iterations store to the same CB slot (reserved before the loop, pushed after) and annotates them diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp index b28ccded4..f6ddc93fa 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp @@ -348,12 +348,11 @@ analyzeSyncRegion(ttk::TileRegsAcquireOp acquireOp, Value &inputCB, } /// Find the outermost enclosing insertion point by walking up through -/// compiler-generated loops (marked with ttl.tile_loop_stride or -/// ttl.subblock_loop_stride). By construction, these loops iterate over tiles -/// within a single ttl.compute whose input/output CBs are fixed, so the -/// CB configuration is invariant across iterations and hoisting is safe. -/// Stops at unmarked loops to avoid hoisting past user loops that could -/// contain multiple sync regions with different CB configurations. 
+/// loops with invariant CB configurations: compiler-generated tile/subblock +/// loops (ttl.tile_loop_stride, ttl.subblock_loop_stride) and L1 +/// accumulation loops (ttl.l1_acc_loop). All use fixed CBs across +/// iterations, so init hoisting is safe. Stops at unmarked loops to avoid +/// hoisting past user loops with varying CB configurations. static Operation *hoistAboveCompilerLoops(Operation *op) { Operation *insertBefore = op; while (auto *parentOp = insertBefore->getParentOp()) { diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 2ec189092..41cfeb5a6 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -152,7 +152,10 @@ struct TTKernelInsertL1AccumulationPass builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1)); ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag); - // Bracket the outermost reduction loop with disable guards. + // Bracket the outermost accumulation loop with disable guards. + // Both kL1AccLoopAttrName and kReductionLoopAttrName mean "all + // iterations write to the same CB slot," so the outermost such + // loop is the correct accumulation boundary. auto outermostLoop = findOutermostL1AccLoop(loop); if (!outermostLoop) { outermostLoop = loop; @@ -164,13 +167,15 @@ struct TTKernelInsertL1AccumulationPass builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); ttk::PackReconfigL1AccOp::create(builder, loc, disablePre); - // Disable after cb_push_back following the loop, or after the loop. - Operation *insertPoint = outermostLoop->getNextNode(); - while (insertPoint && !isa(insertPoint)) { - insertPoint = insertPoint->getNextNode(); + // Disable after any consecutive cb_push_back ops that follow the + // loop. Multi-output computes produce one push per output CB. 
+ Operation *lastPush = nullptr; + for (Operation *op = outermostLoop->getNextNode(); + op && isa(op); op = op->getNextNode()) { + lastPush = op; } - if (insertPoint) { - builder.setInsertionPointAfter(insertPoint); + if (lastPush) { + builder.setInsertionPointAfter(lastPush); } else { builder.setInsertionPointAfter(outermostLoop); } diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp index d059fc090..85cc4e47c 100644 --- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp +++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp @@ -18,7 +18,7 @@ namespace mlir::tt::ttl { void createTTLToTTKernelPipeline(OpPassManager &pm, const TTLToTTKernelPipelineOptions &options) { - pm.addPass(createTTLAnnotateReductionLoops()); + pm.addPass(createTTLAnnotateL1AccLoops()); pm.addPass(createTTLConvertTTLToCompute()); { TTLSetComputeKernelConfigOptions configOpts; diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt index 40d01413d..061f72b0b 100644 --- a/lib/Dialect/TTL/Transforms/CMakeLists.txt +++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt @@ -6,7 +6,7 @@ add_mlir_dialect_library(TTLangTTLTransforms LowerDPrintToEmitC.cpp LowerSignpostToEmitC.cpp TTLAnnotateCBAssociations.cpp - TTLAnnotateReductionLoops.cpp + TTLAnnotateL1AccLoops.cpp TTLDumpCBFlowGraph.cpp TTLLowerMatmulBlock.cpp TTLAssignDST.cpp diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp similarity index 70% rename from lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp rename to lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp index 91d0f3c66..a3246395d 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateReductionLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 //===----------------------------------------------------------------------===// -// TTL Annotate Reduction Loops +// TTL 
Annotate L1 Acc Loops //===----------------------------------------------------------------------===// // // Detects user-written scf.for loops that accumulate into the same CB slot @@ -18,18 +18,17 @@ #include "mlir/Dialect/SCF/IR/SCF.h" -#define DEBUG_TYPE "ttl-annotate-reduction-loops" +#define DEBUG_TYPE "ttl-annotate-l1-acc-loops" namespace mlir::tt::ttl { -#define GEN_PASS_DEF_TTLANNOTATEREDUCTIONLOOPS +#define GEN_PASS_DEF_TTLANNOTATEL1ACCLOOPS #include "ttlang/Dialect/TTL/Passes.h.inc" namespace { -struct TTLAnnotateReductionLoopsPass - : public impl::TTLAnnotateReductionLoopsBase< - TTLAnnotateReductionLoopsPass> { +struct TTLAnnotateL1AccLoopsPass + : public impl::TTLAnnotateL1AccLoopsBase { void runOnOperation() override { func::FuncOp func = getOperation(); @@ -46,16 +45,9 @@ struct TTLAnnotateReductionLoopsPass // CB that was reserved (ttl.cb_reserve) before the loop. bool hasReductionStore = false; forOp.getBody()->walk([&](StoreOp store) { - Value view = store.getView(); - // Trace through attach_cb to find the cb_reserve. - if (auto attachCB = view.getDefiningOp()) { - view = attachCB.getTensor(); - } - if (auto reserve = view.getDefiningOp()) { - // The cb_reserve must be OUTSIDE the for loop (before it). 
- if (!forOp->isAncestor(reserve)) { - hasReductionStore = true; - } + auto reserve = store.getView().getDefiningOp(); + if (reserve && !forOp->isAncestor(reserve)) { + hasReductionStore = true; } }); diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py index b8f0d208a..4360b627c 100644 --- a/python/ttl/ttl_api.py +++ b/python/ttl/ttl_api.py @@ -1224,7 +1224,7 @@ def _compile_kernel( assign_dst_pass = f"ttl-assign-dst{{enable-fpu-binary-ops={fpu_flag}}}" pipeline_passes = [ - "func.func(ttl-annotate-reduction-loops)", + "func.func(ttl-annotate-l1-acc-loops)", "func.func(convert-ttl-to-compute)", set_compute_config_pass, f"func.func({assign_dst_pass})", diff --git a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir index d6fcd1453..ab37a5760 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir @@ -86,6 +86,8 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c // FP32: %[[CB0:.*]] = ttkernel.get_compile_time_arg_val(0) // FP32: %[[CB1:.*]] = ttkernel.get_compile_time_arg_val(1) // FP32: %[[CB2:.*]] = ttkernel.get_compile_time_arg_val(2) +// Disable L1 accumulation before the reduction loop. 
+// FP32: ttkernel.pack_reconfig_l1_acc(%[[C0I]]) // FP32: scf.for %[[IV:.*]] = %[[C0]] to %[[C2]] step %[[C1]] // FP32-NEXT: ttkernel.tile_regs_acquire // FP32: ttkernel.reduce_init({{.*}}, ) {full_fp32} diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir index ef1c7542a..9d329427a 100644 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -166,3 +166,148 @@ func.func @l1_acc_loop_no_sync() attributes {ttkernel.thread = #ttkernel.thread< } {ttl.l1_acc_loop} return } + +// ----- + +// L1 acc loop inside an unannotated outer loop (the realistic pattern: +// outer M/N iteration loop wraps the inner K reduction loop). The disable +// guards bracket the inner K loop, not the outer loop. Each outer +// iteration gets a fresh disable-before -> K loop -> disable-after cycle. + +// CHECK-LABEL: func.func @l1_acc_inside_outer_loop +// CHECK: scf.for +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]] +// CHECK: scf.if %[[CMP]] +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +func.func @l1_acc_inside_outer_loop() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + scf.for %outer = %c0 to %c2 step %c1 { + scf.for %inner = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + 
ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + } + return +} + +// ----- + +// Multiple consecutive cb_push_back ops after the loop (multi-output compute). +// The disable guard should go after the last push. + +// CHECK-LABEL: func.func @multi_push_after_loop +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: arith.cmpi eq +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +func.func @multi_push_after_loop() attributes {ttkernel.thread = #ttkernel.thread} { + %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + return +} + +// ----- + +// Nested l1_acc loops: reserve is outside both loops, so both are annotated +// and all iterations accumulate into the same CB slot. 
Disable guards +// bracket the outermost loop; enable fires once after the first inner +// iteration of the first outer iteration. + +// CHECK-LABEL: func.func @nested_l1_acc_loops +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.tile_regs_release +// CHECK: arith.cmpi eq, %[[IV]], %[[LB]] +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: ttkernel.pack_reconfig_l1_acc +func.func @nested_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + scf.for %outer = %c0 to %c2 step %c1 { + scf.for %inner = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + } {ttl.l1_acc_loop} + return +} + +// ----- + +// Nested reduction loops (multi-dim reduce): all iterations contribute to +// a single accumulated result. Same structure as nested l1_acc loops. 
+ +// CHECK-LABEL: func.func @nested_reduction_loops +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.tile_regs_release +// CHECK: arith.cmpi eq, %[[IV]], %[[LB]] +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: ttkernel.pack_reconfig_l1_acc +func.func @nested_reduction_loops() attributes {ttkernel.thread = #ttkernel.thread} { + %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + scf.for %row = %c0 to %c2 step %c1 { + scf.for %col = %c0 to %c2 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, , ) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.reduction_loop} + } {ttl.reduction_loop} + return +} diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir new file mode 100644 index 000000000..aba630a66 --- /dev/null +++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir @@ -0,0 +1,119 @@ +// Verifies ttl-annotate-l1-acc-loops: user-written scf.for loops that store +// to a CB reserved outside the loop are annotated with ttl.l1_acc_loop. 
+ +// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --split-input-file | FileCheck %s + +// Loop storing to an externally reserved CB should be annotated. + +// CHECK-LABEL: func.func @external_reserve +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +func.func @external_reserve( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// Loop where cb_reserve is INSIDE the loop should NOT be annotated. 
+ +// CHECK-LABEL: func.func @internal_reserve +// CHECK: scf.for +// CHECK-NOT: ttl.l1_acc_loop +// CHECK: } +func.func @internal_reserve( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %result = scf.for %iv = %c0 to %c4 step %c1 iter_args(%acc = %arg0) -> (tensor<1x1x!ttcore.tile<32x32, bf16>>) { + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.yield %mm : tensor<1x1x!ttcore.tile<32x32, bf16>> + } + func.return %result : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// Loops already annotated with compiler-generated attributes should be skipped. 
+ +// CHECK-LABEL: func.func @skip_tile_loop +// CHECK: scf.for +// CHECK: } {ttl.tile_loop_stride +// CHECK-NOT: ttl.l1_acc_loop +func.func @skip_tile_loop( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } {ttl.tile_loop_stride = array} + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// Loops already annotated with ttl.reduction_loop should be skipped. + +// CHECK-LABEL: func.func @skip_reduction_loop +// CHECK: scf.for +// CHECK: } {ttl.reduction_loop +// CHECK-NOT: ttl.l1_acc_loop +func.func @skip_reduction_loop( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } {ttl.reduction_loop} + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// Loop without any store should NOT be annotated. 
+ +// CHECK-LABEL: func.func @no_store +// CHECK: scf.for +// CHECK-NOT: ttl.l1_acc_loop +// CHECK: } +func.func @no_store( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + // No ttl.store in the loop body. + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} From 3109b916b8bc62959f03e59d4785717e326bd8c4 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Sun, 12 Apr 2026 22:16:31 -0700 Subject: [PATCH 15/31] implement += on blocks --- include/ttlang/Dialect/TTL/IR/TTLOps.td | 14 ++++-- include/ttlang/Dialect/TTL/Passes.td | 9 ++-- .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp | 19 ++++--- python/ttl/_src/ttl_ast.py | 19 +++++++ python/ttl/operators.py | 23 ++++++++- test/python/matmul_l1_acc_multinode.py | 8 +-- test/python/test_matmul_l1_acc.py | 12 ++--- test/python/test_matmul_l1_acc_multinode.py | 9 ++-- .../TTL/Transforms/annotate_l1_acc_loops.mlir | 49 +++++-------------- 9 files changed, 91 insertions(+), 71 deletions(-) diff --git a/include/ttlang/Dialect/TTL/IR/TTLOps.td b/include/ttlang/Dialect/TTL/IR/TTLOps.td index 93dfa2da8..5ad4c87a3 100644 --- a/include/ttlang/Dialect/TTL/IR/TTLOps.td +++ b/include/ttlang/Dialect/TTL/IR/TTLOps.td @@ -1039,8 +1039,13 @@ def TTL_StoreOp : TTL_Op<"store", [MemoryEffects<[MemWrite]>]> { `ttl.store` represents the user's intent to store a computed tensor into an output circular buffer. - Emitted by Python's `o.store(result)`. During `convert-ttl-to-compute`, - this op is transformed into a `ttl.tile_store` inside the compute body. + Emitted by Python's `o.store(result)` (overwrite) or `o += result` + (accumulate). 
During `convert-ttl-to-compute`, this op is transformed + into a `ttl.tile_store` inside the compute body. + + When `accumulate` is set, the enclosing loop is annotated for L1 + packer accumulation so that each iteration adds to the existing L1 + value instead of overwriting. Example: ```mlir @@ -1048,11 +1053,14 @@ def TTL_StoreOp : TTL_Op<"store", [MemoryEffects<[MemWrite]>]> { %result = ttl.add %a, %b : ... ttl.store %result, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + // Accumulating store (from +=): + ttl.store %result, %reserve {accumulate} : ... ``` }]; let arguments = (ins AnyRankedTensor:$tensor, - AnyRankedTensor:$view + AnyRankedTensor:$view, + OptionalAttr:$accumulate ); let assemblyFormat = "$tensor `,` $view attr-dict `:` type($tensor) `,` type($view)"; let hasVerifier = 1; diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index 360a1f4e2..43540f6d1 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -64,14 +64,11 @@ def TTLAnnotateL1AccLoops : Pass<"ttl-annotate-l1-acc-loops", "::mlir::func::FuncOp"> { let summary = "Annotate user-written scf.for loops for L1 accumulation"; let description = [{ - Detects user-written `scf.for` loops where all iterations store to the - same CB slot (reserved before the loop, pushed after) and annotates them - with `ttl.l1_acc_loop`. This enables `TTKernelInsertL1Accumulation` + Detects user-written `scf.for` loops containing `ttl.store` ops with + the `accumulate` attribute (emitted by the `+=` operator) and annotates + them with `ttl.l1_acc_loop`. This enables `TTKernelInsertL1Accumulation` to insert `pack_reconfig_l1_acc` guards so that packs accumulate across iterations instead of overwriting. 
- - Targets the pattern: - cb_reserve -> scf.for { compute(store to reserved CB) } -> cb_push }]; let dependentDialects = [ "::mlir::scf::SCFDialect" diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp index a3246395d..767317fdf 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp @@ -6,9 +6,9 @@ // TTL Annotate L1 Acc Loops //===----------------------------------------------------------------------===// // -// Detects user-written scf.for loops that accumulate into the same CB slot -// (reserve before loop, store inside, push after) and annotates them with -// kL1AccLoopAttrName for L1 accumulation. +// Detects user-written scf.for loops containing accumulating stores +// (ttl.store with the {accumulate} attribute, emitted by +=) and annotates +// them with kL1AccLoopAttrName for L1 packer accumulation. // //===----------------------------------------------------------------------===// @@ -41,17 +41,16 @@ struct TTLAnnotateL1AccLoopsPass return; } - // Check if the loop body contains a store (ttl.store) targeting a - // CB that was reserved (ttl.cb_reserve) before the loop. - bool hasReductionStore = false; + // Check if the loop body contains an accumulating store (ttl.store + // with the {accumulate} attribute, emitted by the ``+=`` operator). 
+ bool hasAccumulatingStore = false; forOp.getBody()->walk([&](StoreOp store) { - auto reserve = store.getView().getDefiningOp(); - if (reserve && !forOp->isAncestor(reserve)) { - hasReductionStore = true; + if (store.getAccumulate()) { + hasAccumulatingStore = true; } }); - if (hasReductionStore) { + if (hasAccumulatingStore) { forOp->setAttr(kL1AccLoopAttrName, OpBuilder(forOp).getUnitAttr()); } }); diff --git a/python/ttl/_src/ttl_ast.py b/python/ttl/_src/ttl_ast.py index 3ccfc5e37..5e4073309 100644 --- a/python/ttl/_src/ttl_ast.py +++ b/python/ttl/_src/ttl_ast.py @@ -8,6 +8,7 @@ from typing import List, Optional, Set from pykernel._src.kernel_ast import TTCompilerBase +from pykernel._src.utils import _get_type_str from ttl.dialects import arith, func, ttcore, ttkernel from ttl.ir import * @@ -296,6 +297,24 @@ def visit_Call(self, node): raise self._raise_error(node, str(e)) + def visit_AugAssign(self, node): + """Handle += on tensor blocks via the registered __iadd__ method.""" + with self._loc_for_node(node): + target = self.visit(node.target) + if ( + isinstance(node.op, ast.Add) + and hasattr(target, "type") + and isinstance(target.type, RankedTensorType) + ): + rhs = self.visit(node.value) + mlir_type = _get_type_str(target.type) + iadd_fn = self._fn_map.get(f"{mlir_type}.__iadd__") + if iadd_fn: + result = iadd_fn(target, rhs) + self.symbol_tables[-1][node.target.id] = result + return + return super().visit_AugAssign(node) + def visit_BinOp(self, node): """Override to inject auto-profiling and provide better error messages.""" with self._loc_for_node(node): diff --git a/python/ttl/operators.py b/python/ttl/operators.py index de4c4ba03..5df40e193 100644 --- a/python/ttl/operators.py +++ b/python/ttl/operators.py @@ -111,9 +111,10 @@ def __matmul__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock: return ttl.matmul(result_type, ast_self, rhs) def store(ast_self: TensorBlock, rhs: TensorBlock) -> None: - """Store result tensor to the output CB reserve 
view. + """Store result tensor to the output CB reserve view (overwrite). Emits ttl.store with the result tensor and reserve view. + Always overwrites the CB slot. For accumulation, use ``+=``. """ if not _is_block(ast_self): raise ValueError( @@ -122,6 +123,26 @@ def store(ast_self: TensorBlock, rhs: TensorBlock) -> None: reserve = _get_reserve_from_block(ast_self) ttl.store(rhs, reserve) + def __iadd__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock: + """Accumulate into a reserved block via L1 packer accumulation. + + Emits ttl.store with the ``accumulate`` attribute. When used + inside a loop, the compiler inserts ``pack_reconfig_l1_acc`` + guards so that each iteration adds to the existing L1 value + instead of overwriting. + + This is an interim mechanism; the spec's full pattern + (``fill`` + lazy ``BlockExpr`` ``+=`` + ``store``) is deferred + to the BlockExpr PR (#446). + """ + if not _is_block(ast_self): + raise ValueError( + "+= must be called on a block acquired from reserve(), not a regular tensor" + ) + reserve = _get_reserve_from_block(ast_self) + ttl.store(rhs, reserve, accumulate=True) + return ast_self + def push(ast_self: TensorBlock) -> None: """ Signal that data is ready in the circular buffer (producer release). diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py index f2bbdd968..e086d09b5 100644 --- a/test/python/matmul_l1_acc_multinode.py +++ b/test/python/matmul_l1_acc_multinode.py @@ -12,9 +12,9 @@ (make_matmul_l1_acc / v4_l1_acc): auto grid, split DMA (reader=A, writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 1024x1024x1024. -The compute thread uses the "reserve once, store K times, push once" pattern. -The compiler detects the K reduction loop and inserts pack_reconfig_l1_acc -guards so each K iteration packs additively to L1. +The compute thread uses += for accumulation across K iterations. The +compiler inserts pack_reconfig_l1_acc guards so each K iteration packs +additively to L1. 
Verifies the L1 packer accumulation pattern in generated C++: disable before K loop, conditional enable after first iteration, disable after cb_push_back. @@ -69,7 +69,7 @@ def compute(): for _ in range(K_num_blocks): a_blk = a_dfb.wait() b_blk = b_dfb.wait() - out_blk.store(a_blk @ b_blk) + out_blk += a_blk @ b_blk a_blk.pop() b_blk.pop() out_blk.push() diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py index 599c33ea7..7f0b8bec3 100644 --- a/test/python/test_matmul_l1_acc.py +++ b/test/python/test_matmul_l1_acc.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 """ -Matmul L1 accumulation: reserve once, store K times, push once. +Matmul L1 accumulation via += across K iterations. -The compiler detects the scf.for loop storing to the same reserved CB -and annotates it as a reduction loop. TTKernelInsertL1Accumulation inserts -pack_reconfig_l1_acc guards so each K iteration packs additively to L1. +The += operator emits ttl.store with {accumulate}, which the compiler +detects and annotates for L1 packer accumulation. Each K iteration packs +additively to L1. Tests single-core and multicore configurations with various block sizes. 
""" @@ -29,7 +29,7 @@ def _make_l1_acc_kernel(block_m, block_n, grid="auto"): - """Matmul with L1 accumulation: reserve once, store K times, push once.""" + """Matmul with L1 accumulation via += across K iterations.""" @ttl.operation(grid=grid) def kernel(a, b, out): @@ -63,7 +63,7 @@ def compute(): for _ in range(Kt): a_blk = a_dfb.wait() b_blk = b_dfb.wait() - out_blk.store(a_blk @ b_blk) + out_blk += a_blk @ b_blk a_blk.pop() b_blk.pop() out_blk.push() diff --git a/test/python/test_matmul_l1_acc_multinode.py b/test/python/test_matmul_l1_acc_multinode.py index 0f1d603a4..c0d456838 100644 --- a/test/python/test_matmul_l1_acc_multinode.py +++ b/test/python/test_matmul_l1_acc_multinode.py @@ -3,10 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 """ -Multinode matmul with L1 packer accumulation, L1-only (no DRAM reads during -compute). All input blocks are pre-loaded into L1 DFBs before the K reduction -loop begins. The compiler inserts pack_reconfig_l1_acc guards so each K -iteration packs additively to L1. +Multinode matmul with L1 packer accumulation via += across K iterations. +L1-only (no DRAM reads during compute). All input blocks are pre-loaded +into L1 DFBs before the K reduction loop begins. Tests multicore configurations with a 2D grid and multiple K blocks. """ @@ -63,7 +62,7 @@ def compute(): for _ in range(Kt): a_blk = a_dfb.wait() b_blk = b_dfb.wait() - out_blk.store(a_blk @ b_blk) + out_blk += a_blk @ b_blk a_blk.pop() b_blk.pop() out_blk.push() diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir index aba630a66..41f5bb935 100644 --- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir +++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir @@ -1,14 +1,14 @@ -// Verifies ttl-annotate-l1-acc-loops: user-written scf.for loops that store -// to a CB reserved outside the loop are annotated with ttl.l1_acc_loop. 
+// Verifies ttl-annotate-l1-acc-loops: scf.for loops containing +// ttl.store with {accumulate} are annotated with ttl.l1_acc_loop. // RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --split-input-file | FileCheck %s -// Loop storing to an externally reserved CB should be annotated. +// Accumulating store inside a loop should annotate the loop. -// CHECK-LABEL: func.func @external_reserve +// CHECK-LABEL: func.func @accumulating_store // CHECK: scf.for // CHECK: } {ttl.l1_acc_loop} -func.func @external_reserve( +func.func @accumulating_store( %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { %c0 = arith.constant 0 : index @@ -22,20 +22,20 @@ func.func @external_reserve( %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> scf.for %iv = %c0 to %c4 step %c1 { %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> - ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> } func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> } // ----- -// Loop where cb_reserve is INSIDE the loop should NOT be annotated. +// Plain store (no {accumulate}) should NOT annotate, even with external reserve. 
-// CHECK-LABEL: func.func @internal_reserve +// CHECK-LABEL: func.func @plain_store_no_annotation // CHECK: scf.for // CHECK-NOT: ttl.l1_acc_loop // CHECK: } -func.func @internal_reserve( +func.func @plain_store_no_annotation( %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { %c0 = arith.constant 0 : index @@ -46,39 +46,17 @@ func.func @internal_reserve( %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> - %result = scf.for %iv = %c0 to %c4 step %c1 iter_args(%acc = %arg0) -> (tensor<1x1x!ttcore.tile<32x32, bf16>>) { - %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> - scf.yield %mm : tensor<1x1x!ttcore.tile<32x32, bf16>> } - func.return %result : tensor<1x1x!ttcore.tile<32x32, bf16>> -} - -// ----- - -// Loops already annotated with compiler-generated attributes should be skipped. 
- -// CHECK-LABEL: func.func @skip_tile_loop -// CHECK: scf.for -// CHECK: } {ttl.tile_loop_stride -// CHECK-NOT: ttl.l1_acc_loop -func.func @skip_tile_loop( - %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> - %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> - scf.for %iv = %c0 to %c4 step %c1 { - ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> - } {ttl.tile_loop_stride = array} func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> } // ----- -// Loops already annotated with ttl.reduction_loop should be skipped. +// Already-annotated loops should be skipped. // CHECK-LABEL: func.func @skip_reduction_loop // CHECK: scf.for @@ -92,7 +70,7 @@ func.func @skip_reduction_loop( %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> scf.for %iv = %c0 to %c4 step %c1 { - ttl.store %arg0, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %arg0, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> } {ttl.reduction_loop} func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> } @@ -113,7 +91,6 @@ func.func @no_store( %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> scf.for %iv = %c0 to %c4 step %c1 { - // No ttl.store in the loop body. 
} func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> } From 760d376ea5fbd9b9ca91f30b282c8789c43f12a2 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Sun, 12 Apr 2026 22:50:57 -0700 Subject: [PATCH 16/31] use _set_var helper --- python/pykernel/_src/base_ast.py | 4 ++++ python/pykernel/_src/kernel_ast.py | 17 +++++++---------- python/ttl/_src/ttl_ast.py | 28 +++++++++++----------------- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/python/pykernel/_src/base_ast.py b/python/pykernel/_src/base_ast.py index afc6767d9..754d16590 100644 --- a/python/pykernel/_src/base_ast.py +++ b/python/pykernel/_src/base_ast.py @@ -83,6 +83,10 @@ def _var_exists(self, var_name): return sym_table return {} + def _set_var(self, var_name, value): + """Bind a variable in the current (innermost) scope.""" + self.symbol_tables[-1][var_name] = value + def visit_Module(self, node): # Set default basic block with InsertionPoint(self.insert_point), Location.unknown(): diff --git a/python/pykernel/_src/kernel_ast.py b/python/pykernel/_src/kernel_ast.py index 60bf4fbe3..715b5a938 100644 --- a/python/pykernel/_src/kernel_ast.py +++ b/python/pykernel/_src/kernel_ast.py @@ -211,8 +211,8 @@ def visit_For(self, node): with InsertionPoint(for_op.body), Location.unknown(): self.symbol_tables.append({}) - # Add the iterator into the symbol_table - self.symbol_tables[-1][node.target.id] = for_op.induction_variable + # Add the iterator into the symbol table. + self._set_var(node.target.id, for_op.induction_variable) for stmt in node.body: self.visit(stmt) @@ -259,17 +259,15 @@ def visit_Assign(self, node): raise ValueError( f"Not enough values to unpack from rt_args slice (expected {len(_vars)}, got {len(values)})" ) - # Since we are unpacking a tuple, types can't be assigned here: - sym_table = self.symbol_tables[-1] + # Since we are unpacking a tuple, types can't be assigned here. 
for i in range(len(_vars)): - sym_table[_tuple.elts[i].id] = values[i] + self._set_var(_tuple.elts[i].id, values[i]) # Exit out of function now return var = self.visit(node.targets[0]) value = self.visit(node.value) - sym_table = self.symbol_tables[-1] # Handle Subscript Assignment here if isinstance(node.targets[0], ast.Subscript): @@ -282,13 +280,12 @@ def visit_Assign(self, node): if hasattr(var, "type") and isinstance(var.type, MemRefType): memref.StoreOp(value, var, [arith.ConstantOp(IndexType.get(self.ctx), 0)]) else: - sym_table[var_name] = value + self._set_var(var_name, value) def visit_AnnAssign(self, node): # NOTE: TTKernel types can not be used with memrefs var = self.visit(node.target) value = self.visit(node.value) - sym_table = self.symbol_tables[-1] var_name = node.target.id # Check the annotation for array creation @@ -308,7 +305,7 @@ def visit_AnnAssign(self, node): memref_type = MemRefType.get( [elt.value for elt in node.annotation.elts[1:]], var_type ) - sym_table[var_name] = memref.alloca(memref_type, [], []) + self._set_var(var_name, memref.alloca(memref_type, [], [])) return else: raise NotImplementedError( @@ -324,7 +321,7 @@ def visit_AnnAssign(self, node): var_type = value.type memref_type = MemRefType.get([1], var_type) var = memref.alloca(memref_type, [], []) - sym_table[var_name] = var + self._set_var(var_name, var) else: assert isinstance(var, MemRefType), "Can not AnnAssign to non-memref types" diff --git a/python/ttl/_src/ttl_ast.py b/python/ttl/_src/ttl_ast.py index 5e4073309..898ec4d6e 100644 --- a/python/ttl/_src/ttl_ast.py +++ b/python/ttl/_src/ttl_ast.py @@ -182,11 +182,10 @@ def visit_Assign(self, node): f"Cannot unpack {len(value)} values into {len(targets)} variables" ) - sym_table = self.symbol_tables[-1] for elt, val in zip(targets, value): if not isinstance(elt, ast.Name): raise ValueError("Tuple unpacking requires simple variable names") - sym_table[elt.id] = val + self._set_var(elt.id, val) def _loc_for_node(self, node): 
"""Return file location for node if debug_locations enabled, else name location.""" @@ -311,7 +310,7 @@ def visit_AugAssign(self, node): iadd_fn = self._fn_map.get(f"{mlir_type}.__iadd__") if iadd_fn: result = iadd_fn(target, rhs) - self.symbol_tables[-1][node.target.id] = result + self._set_var(node.target.id, result) return return super().visit_AugAssign(node) @@ -545,8 +544,8 @@ def _emit_entry(self, node): self.symbol_tables.append({}) func_bb = self.func_entry.add_entry_block() - # Add ttl module to symbol table - self.symbol_tables[-1]["ttl"] = ttl + # Add ttl module to symbol table. + self._set_var("ttl", ttl) # Ensure TTL dialect is registered for type parsing ttl.ensure_dialects_registered(self.ctx) @@ -555,12 +554,12 @@ def _emit_entry(self, node): # Emit function body with InsertionPoint(func_bb): - # Map TensorAccessor function arguments to symbol table + # Map TensorAccessor function arguments to symbol table. for i, name in enumerate(self._tensor_accessor_names): - self.symbol_tables[-1][name] = func_bb.arguments[i] + self._set_var(name, func_bb.arguments[i]) self.streams.add(name) - # Prepopulate other captures (non-tensor) + # Prepopulate other captures (non-tensor). 
from ..circular_buffer import CircularBuffer for name, val in self.captures.items(): @@ -568,16 +567,11 @@ def _emit_entry(self, node): continue # Already handled via function arguments assert isinstance(name, str) if isinstance(val, int): - self.symbol_tables[-1][name] = arith.ConstantOp( - IndexType.get(self.ctx), val - ) + self._set_var(name, arith.ConstantOp(IndexType.get(self.ctx), val)) elif isinstance(val, float): - self.symbol_tables[-1][name] = arith.ConstantOp( - F32Type.get(self.ctx), val - ) + self._set_var(name, arith.ConstantOp(F32Type.get(self.ctx), val)) elif isinstance(val, CircularBuffer): - cb_val = self._emit_cb_from_capture(val) - self.symbol_tables[-1][name] = cb_val + self._set_var(name, self._emit_cb_from_capture(val)) else: self._raise_error( node, f"Invalid capture type for var {name}: {type(val)}" @@ -963,7 +957,7 @@ def visit_With(self, node): optional_vars, "'with ... as var' requires a simple variable name", ) - self.symbol_tables[-1][optional_vars.id] = acquire_result + self._set_var(optional_vars.id, acquire_result) for stmt in node.body: self.visit(stmt) From 903af079cb4fb187fb8a531a8f32d7e7fcfd5799 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Sun, 12 Apr 2026 23:14:02 -0700 Subject: [PATCH 17/31] fix test --- test/python/matmul_l1_acc_multinode.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py index e086d09b5..293a472b2 100644 --- a/test/python/matmul_l1_acc_multinode.py +++ b/test/python/matmul_l1_acc_multinode.py @@ -133,14 +133,16 @@ def writer(): # 3. 
Disable after cb_push_back following the loop # ============================================================================= -# CHECK-CPP: PACK((llk_pack_reconfig_l1_acc( -# CHECK-CPP-NEXT: for +# CHECK-CPP-DAG: int32_t [[ENABLE:v[0-9]+]] = 1; +# CHECK-CPP-DAG: int32_t [[DISABLE:v[0-9]+]] = 0; +# CHECK-CPP: PACK((llk_pack_reconfig_l1_acc([[DISABLE]]))); +# CHECK-CPP: for # CHECK-CPP: matmul_block( # CHECK-CPP: pack_tile # CHECK-CPP: if ( -# CHECK-CPP-NEXT: PACK((llk_pack_reconfig_l1_acc( +# CHECK-CPP-NEXT: PACK((llk_pack_reconfig_l1_acc([[ENABLE]]))); # CHECK-CPP: cb_push_back( -# CHECK-CPP-NEXT: PACK((llk_pack_reconfig_l1_acc( +# CHECK-CPP: PACK((llk_pack_reconfig_l1_acc([[DISABLE]]))); # CHECK-RESULT: PASS From 61d0a1523fc020016885e4c8e4d4d7657222e005 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Mon, 13 Apr 2026 07:07:08 -0700 Subject: [PATCH 18/31] add subblocking test --- .../TTLToTTKernel/matmul_subblock_l1_acc.mlir | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir diff --git a/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir new file mode 100644 index 000000000..ebf2b5aeb --- /dev/null +++ b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir @@ -0,0 +1,107 @@ +// Matmul with subblocking AND L1 accumulation. Output 3x3 bf16 = 9 tiles +// exceeds bf16 DST capacity (8), triggering subblocking. The user K loop +// with {accumulate} triggers L1 acc annotation and pack_reconfig_l1_acc +// guard insertion. 
+ +// RUN: ttlang-opt %s \ +// RUN: -pass-pipeline='builtin.module( \ +// RUN: func.func(ttl-annotate-l1-acc-loops, convert-ttl-to-compute, \ +// RUN: ttl-assign-dst{enable-fpu-binary-ops=0}, \ +// RUN: ttl-subblock-compute-for-dst, ttl-lower-matmul-block, \ +// RUN: ttl-lower-to-loops{dst-accumulation=1}, ttl-schedule-operations, \ +// RUN: ttl-annotate-cb-associations), \ +// RUN: convert-ttl-to-ttkernel, ttkernel-insert-inits, \ +// RUN: ttkernel-insert-l1-accumulation, canonicalize, cse)' \ +// RUN: --split-input-file | FileCheck %s + +// CHECK-LABEL: func.func @matmul_3x3_k_loop +// Disable before the K loop. +// CHECK-DAG: %[[C0_I32:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[C1_I32:.*]] = arith.constant 1 : i32 +// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]]) +// K loop with subblock loops inside. +// CHECK: scf.for %[[K_IV:.*]] = %[[K_LB:.*]] to +// Subblock loop: acquire, matmul, 3 pack_tiles (3x1 subblock), release. +// CHECK: scf.for +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.matmul_block +// CHECK-COUNT-3: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// CHECK: } +// Enable after first K iteration. +// CHECK: arith.cmpi eq, %[[K_IV]], %[[K_LB]] +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C1_I32]]) +// CHECK: } +// Disable after push. 
+// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]]) +func.func @matmul_3x3_k_loop( + %arg0: tensor<3x2x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<2x3x!ttcore.tile<32x32, bf16>>) -> tensor<3x3x!ttcore.tile<32x32, bf16>> + attributes {ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<3x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<3x2x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x3x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x3x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<3x3x!ttcore.tile<32x32, bf16>> + scf.for %k = %c0 to %c2 step %c1 { + %mm = ttl.matmul %a, %b : tensor<3x2x!ttcore.tile<32x32, bf16>>, tensor<2x3x!ttcore.tile<32x32, bf16>> -> tensor<3x3x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<3x3x!ttcore.tile<32x32, bf16>>, tensor<3x3x!ttcore.tile<32x32, bf16>> + } + ttl.cb_push %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> + func.return %reserve : tensor<3x3x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// 8x8 output (64 tiles >> DST capacity 8) with K=4: heavily subblocked. +// Verifies that multiple levels of subblock loops all sit inside the +// K loop's L1 acc guards. + +// CHECK-LABEL: func.func @matmul_8x8_k4 +// CHECK-DAG: %[[C0_I32:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[C1_I32:.*]] = arith.constant 1 : i32 +// Disable before K loop. 
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]]) +// K loop -> subblock row loop -> acquire, matmul K loop, 8x pack, release. +// CHECK: scf.for %[[K_IV:.*]] = %[[K_LB:.*]] to +// CHECK: scf.for +// CHECK: ttkernel.tile_regs_acquire +// CHECK: scf.for +// CHECK: ttkernel.matmul_block +// CHECK-COUNT-8: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// CHECK: } +// Enable after first K iteration. +// CHECK: arith.cmpi eq, %[[K_IV]], %[[K_LB]] +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C1_I32]]) +// CHECK: } +// Disable after push. +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]]) +func.func @matmul_8x8_k4( + %arg0: tensor<8x8x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<8x8x!ttcore.tile<32x32, bf16>>) -> tensor<8x8x!ttcore.tile<32x32, bf16>> + attributes {ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<8x8x!ttcore.tile<32x32, bf16>>, !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x8x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<8x8x!ttcore.tile<32x32, bf16>>, !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x8x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[8, 8], !ttcore.tile<32x32, bf16>, 2> -> tensor<8x8x!ttcore.tile<32x32, bf16>> + scf.for %k = %c0 to %c4 step %c1 { + %mm = ttl.matmul %a, %b : tensor<8x8x!ttcore.tile<32x32, bf16>>, tensor<8x8x!ttcore.tile<32x32, bf16>> -> tensor<8x8x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<8x8x!ttcore.tile<32x32, bf16>>, 
tensor<8x8x!ttcore.tile<32x32, bf16>> + } + ttl.cb_push %cb2 : <[8, 8], !ttcore.tile<32x32, bf16>, 2> + func.return %reserve : tensor<8x8x!ttcore.tile<32x32, bf16>> +} From 7f8f840d7fbeefa228cd5f5b340a582eeab6418d Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Mon, 13 Apr 2026 07:58:56 -0700 Subject: [PATCH 19/31] precommit --- .../ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir index ebf2b5aeb..3e37a056a 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir @@ -60,7 +60,7 @@ func.func @matmul_3x3_k_loop( // ----- // 8x8 output (64 tiles >> DST capacity 8) with K=4: heavily subblocked. -// Verifies that multiple levels of subblock loops all sit inside the +// Verifies that multiple levels of subblock loops all sit inside the // K loop's L1 acc guards. 
// CHECK-LABEL: func.func @matmul_8x8_k4 From 6b9a14135221073fb4ee75f1cdfced53304e067d Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Mon, 13 Apr 2026 16:57:56 -0700 Subject: [PATCH 20/31] address comments --- .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp | 9 +++-- test/python/matmul_l1_acc_multinode.py | 16 ++++++--- test/python/test_matmul_l1_acc_multinode.py | 3 ++ .../TTL/Transforms/annotate_l1_acc_loops.mlir | 34 +++++++++++++++++++ 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp index 767317fdf..7757c04ed 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp @@ -41,11 +41,14 @@ struct TTLAnnotateL1AccLoopsPass return; } - // Check if the loop body contains an accumulating store (ttl.store - // with the {accumulate} attribute, emitted by the ``+=`` operator). + // Check if this loop directly contains an accumulating store + // (ttl.store with the {accumulate} attribute, emitted by +=). + // Only count stores whose nearest enclosing scf.for is this forOp, + // so that nested inner loops are not attributed to outer loops. bool hasAccumulatingStore = false; forOp.getBody()->walk([&](StoreOp store) { - if (store.getAccumulate()) { + if (store.getAccumulate() && + store->getParentOfType() == forOp) { hasAccumulatingStore = true; } }); diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py index 293a472b2..822cc7692 100644 --- a/test/python/matmul_l1_acc_multinode.py +++ b/test/python/matmul_l1_acc_multinode.py @@ -8,9 +8,12 @@ # RUN: FileCheck %s --check-prefix=CHECK-RESULT < %t.output """ -Multinode matmul with L1 packer accumulation. Mirrors the benchmark kernel -(make_matmul_l1_acc / v4_l1_acc): auto grid, split DMA (reader=A, -writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 1024x1024x1024. 
+Multinode matmul with L1 packer accumulation. Auto grid, split DMA (reader=A, +writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 3072x1024x3072. + +The larger dimensions (96x32x96 tiles, 12x4x12 blocks) ensure each core +handles multiple output blocks (ceil(12/8)=2 per axis on an 8x8 grid), +exercising the per-block L1 acc disable/re-enable sequence. The compute thread uses += for accumulation across K iterations. The compiler inserts pack_reconfig_l1_acc guards so each K iteration packs @@ -150,8 +153,11 @@ def writer(): device = ttnn.open_device(device_id=0) try: - # 32x32x32 tiles = 1024x1024x1024, 8x8x8 blocks -> K_num_blocks=4 - Mt, Kt, Nt = 32, 32, 32 + # 96x32x96 tiles = 3072x1024x3072, 8x8x8 blocks -> 12x4x12 blocks. + # With an 8x8 grid each core handles ceil(12/8)=2 M-blocks and + # 2 N-blocks (4 output blocks), exercising the per-block L1 acc + # disable/re-enable sequence. + Mt, Kt, Nt = 96, 32, 96 M, K, N = Mt * TILE, Kt * TILE, Nt * TILE a_torch = torch.randn(M, K, dtype=torch.bfloat16) diff --git a/test/python/test_matmul_l1_acc_multinode.py b/test/python/test_matmul_l1_acc_multinode.py index c0d456838..70f3fdab6 100644 --- a/test/python/test_matmul_l1_acc_multinode.py +++ b/test/python/test_matmul_l1_acc_multinode.py @@ -125,6 +125,9 @@ def writer(): (8, 4, 8, 4, 4, (2, 2)), (8, 4, 8, 4, 4, "auto"), (16, 8, 16, 8, 8, "auto"), + # Multi-block per core: M_num=4, N_num=4 on 2x2 grid -> 2 blocks/core/axis. + # Exercises per-block L1 acc disable/re-enable across output blocks. 
+ (16, 4, 16, 4, 4, (2, 2)), ] diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir index 41f5bb935..9596fe1b2 100644 --- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir +++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir @@ -77,6 +77,40 @@ func.func @skip_reduction_loop( // ----- +// Nested loops: only the innermost loop containing the accumulating store +// should be annotated, not the outer loop. + +// CHECK-LABEL: func.func @nested_only_inner +// CHECK: scf.for +// CHECK-NOT: ttl.l1_acc_loop +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +// CHECK: } +func.func @nested_only_inner( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + // Outer loop (e.g., N-block loop) — should NOT be annotated. + scf.for %outer = %c0 to %c4 step %c1 { + // Inner loop (K-accumulation loop) — should be annotated. 
+ scf.for %iv = %c0 to %c4 step %c1 { + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + // Loop without any store should NOT be annotated. // CHECK-LABEL: func.func @no_store From 3a3a264e3e5a71b335c8b0fc2bbcaceb6db097e5 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Mon, 13 Apr 2026 17:41:13 -0700 Subject: [PATCH 21/31] add --ttl-strict-f32-acc compiler option --- docs/sphinx/reference/compiler-options.md | 21 +++++- include/ttlang/Dialect/TTL/Passes.td | 12 +++ .../Dialect/TTL/Pipelines/TTLPipelines.h | 4 + .../TTKernelInsertL1Accumulation.cpp | 28 +++++++ lib/Dialect/TTL/Pipelines/TTLPipelines.cpp | 6 +- python/ttl/compiler_options.py | 8 ++ python/ttl/ttl_api.py | 3 +- .../insert_l1_accumulation_invalid.mlir | 75 +++++++++++++++++++ 8 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md index 8f329e75d..189ddb0c0 100644 --- a/docs/sphinx/reference/compiler-options.md +++ b/docs/sphinx/reference/compiler-options.md @@ -17,6 +17,7 @@ python my_kernel.py --no-ttl-maximize-dst | `--ttl-block-matmul` / `--no-ttl-block-matmul` | enabled | Emit `matmul_block` (processes the full tile block atomically) instead of per-tile matmul loops. Disabling this option is not yet supported. | | `--ttl-auto-sync` / `--no-ttl-auto-sync` | disabled | Let the compiler insert and move DFB synchronization ops. When enabled, reserve/push may be refined to per-subblock granularity. When disabled, user-placed reserve/push is preserved as written. 
| | `--ttl-combine-pack-tiles` / `--no-ttl-combine-pack-tiles` | enabled | Combine consecutive `pack_tile` ops on the same CB with contiguous DST and CB indices into a single `pack_tile_block` call. | +| `--ttl-strict-f32-acc` / `--no-ttl-strict-f32-acc` | disabled | Error at compile time if a `+=` accumulation loop's output block exceeds f32 DST capacity (4 tiles with double-buffering). When enabled, guarantees each accumulation step fits in a single DST section without subblocking. | ### Other Ways to Set These @@ -110,6 +111,7 @@ ttlang-opt input.mlir -p 'ttl-to-ttkernel-pipeline{maximize-dst=true lower-to-em | `use-block-matmul` | bool | `true` | Lower matmul to block-level hardware calls (`experimental::matmul_block`). | | `auto-sync` | bool | `false` | Let the compiler insert and move DFB synchronization ops. | | `combine-pack-tiles` | bool | `true` | Combine consecutive `pack_tile` ops into `pack_tile_block`. | +| `strict-f32-acc` | bool | `false` | Error if a `+=` accumulation loop's output block exceeds f32 DST capacity. | | `lower-to-emitc` | bool | `false` | Run the TTKernel-to-EmitC backend (produces C++ source). | The pipeline runs these passes in order: @@ -125,9 +127,10 @@ The pipeline runs these passes in order: 9. `ttl-annotate-cb-associations` — annotate block args with CB indices 10. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel 11. `ttkernel-insert-inits` — insert hardware init ops before compute ops -12. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)* -13. Canonicalization and CSE cleanup -14. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions` +12. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity +13. 
`ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)* +14. Canonicalization and CSE cleanup +15. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions` ### Individual Pass Options @@ -185,3 +188,15 @@ Analyze circular buffer producer/consumer relationships and dump the flow graph. ```bash ttlang-opt input.mlir -p 'ttl-dump-cb-flow-graph{output="/tmp/cb_graph.json"}' ``` + +#### `ttkernel-insert-l1-accumulation` + +Insert `pack_reconfig_l1_acc` guards around reduction and accumulation loops. + +| Option | Type | Default | Description | +|---|---|---|---| +| `strict-f32-acc` | bool | `false` | Error if a user-written `+=` accumulation loop requires subblocking because the output block exceeds f32 DST capacity (4 tiles with double-buffering). | + +```bash +ttlang-opt input.mlir -p 'builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})' +``` diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index 43540f6d1..bcc284405 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -51,8 +51,20 @@ def TTKernelInsertL1Accumulation Reduction loops are identified by the `ttl.l1_acc_loop` (user-written) or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for` ops, with `ttl.l1_acc_loop` taking precedence. + + When `strict-f32-acc` is enabled, the pass errors if a user-written + accumulation loop (`+=`, marked `ttl.l1_acc_loop`) contains subblock + loops, indicating the output block exceeds f32 DST capacity. This + guarantees each K iteration fits in a single DST section. }]; + let options = [ + Option<"strictF32Acc", "strict-f32-acc", "bool", "false", + "Error if a user-written accumulation loop requires subblocking " + "(output exceeds f32 DST capacity). 
Guarantees full f32 " + "precision per accumulation step."> + ]; + let dependentDialects = [ "::mlir::arith::ArithDialect", "::mlir::scf::SCFDialect", diff --git a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h index 18b4c8a4d..0014affd7 100644 --- a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h +++ b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h @@ -44,6 +44,10 @@ struct TTLToTTKernelPipelineOptions *this, "reduce-full-fp32", llvm::cl::desc("Enable FP32 accumulation for reduce operations."), llvm::cl::init(true)}; + Option strictF32Acc{ + *this, "strict-f32-acc", + llvm::cl::desc("Error if accumulation output exceeds f32 DST capacity."), + llvm::cl::init(false)}; }; void createTTLToTTKernelPipeline(mlir::OpPassManager &pm, diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 41cfeb5a6..ae98876b7 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -63,6 +63,10 @@ static scf::ForOp findOutermostL1AccLoop(Operation *op) { struct TTKernelInsertL1AccumulationPass : public impl::TTKernelInsertL1AccumulationBase< TTKernelInsertL1AccumulationPass> { + using Base = + impl::TTKernelInsertL1AccumulationBase; + using Base::Base; + void runOnOperation() override { auto moduleOp = getOperation(); @@ -86,6 +90,30 @@ struct TTKernelInsertL1AccumulationPass } }); + // When --strict-f32-acc is set, error if any user-written accumulation + // loop (kL1AccLoopAttrName, from +=) contains subblock loops, which + // indicates the output block exceeds f32 DST capacity. + // TODO(ttl): Instead of erroring, allocate an f32 L1 temporary and + // emit a cast to bf16 after the loop. This would give full f32 + // precision regardless of block size, at the cost of 2x L1 per tile. 
+ if (strictF32Acc) { + for (auto loop : l1AccLoops) { + if (!loop->hasAttr(kL1AccLoopAttrName)) + continue; + bool hasSubblockLoop = false; + loop->walk([&](scf::ForOp inner) { + if (inner->hasAttr(kSubblockLoopStrideAttrName)) + hasSubblockLoop = true; + }); + if (hasSubblockLoop) { + loop->emitError( + "output block exceeds f32 DST capacity; reduce block " + "dimensions or compile without --ttl-strict-f32-acc"); + return signalPassFailure(); + } + } + } + // L1 accumulation guard placement. For any loop that // accumulates in L1 (matmul K loop or reduce loop), the pattern is: // diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp index 85cc4e47c..cc64fe3f5 100644 --- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp +++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp @@ -53,7 +53,11 @@ void createTTLToTTKernelPipeline(OpPassManager &pm, pm.addPass(createTTLConvertTTLToTTKernel(ttkOpts)); } pm.addPass(createTTKernelInsertInits()); - pm.addPass(createTTKernelInsertL1Accumulation()); + { + TTKernelInsertL1AccumulationOptions l1AccOpts; + l1AccOpts.strictF32Acc = options.strictF32Acc; + pm.addPass(createTTKernelInsertL1Accumulation(l1AccOpts)); + } if (options.combinePackTiles) { pm.addNestedPass(createTTKernelCombinePackTiles()); } diff --git a/python/ttl/compiler_options.py b/python/ttl/compiler_options.py index 6871cc7fe..bccf088ae 100644 --- a/python/ttl/compiler_options.py +++ b/python/ttl/compiler_options.py @@ -76,6 +76,13 @@ def _make_parser() -> argparse.ArgumentParser: action=argparse.BooleanOptionalAction, help="Enable FP32 accumulation for matmul operations (default: enabled).", ) + p.add_argument( + "--ttl-strict-f32-acc", + default=None, + dest="strict_f32_acc", + action=argparse.BooleanOptionalAction, + help="Error if accumulation (+=) output block exceeds f32 DST capacity (default: disabled).", + ) return p @@ -121,6 +128,7 @@ class CompilerOptions: combine_pack_tiles: bool = True reduce_full_fp32: bool = True 
matmul_full_fp32: bool = True + strict_f32_acc: bool = False # Fields that were explicitly provided (not defaulted). Excluded from # equality and hashing so two instances with the same bool values are diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py index 4360b627c..f2c02cc71 100644 --- a/python/ttl/ttl_api.py +++ b/python/ttl/ttl_api.py @@ -1267,11 +1267,12 @@ def _compile_kernel( pipeline_passes.append(f'ttl-dump-cb-flow-graph{{output="{cb_flow_json}"}}') reduce_fp32_flag = int(compiler_options.reduce_full_fp32) + strict_f32_flag = int(compiler_options.strict_f32_acc) pipeline_passes += [ "ttl-lower-dprint-to-emitc", f"convert-ttl-to-ttkernel{{reduce-full-fp32={reduce_fp32_flag}}}", "ttkernel-insert-inits", - "ttkernel-insert-l1-accumulation", + f"ttkernel-insert-l1-accumulation{{strict-f32-acc={strict_f32_flag}}}", ] if compiler_options.combine_pack_tiles: pipeline_passes.append("func.func(ttkernel-combine-pack-tiles)") diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir new file mode 100644 index 000000000..e79698877 --- /dev/null +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir @@ -0,0 +1,75 @@ +// Negative tests for ttkernel-insert-l1-accumulation with --strict-f32-acc. + +// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})' --verify-diagnostics --split-input-file + +// L1 acc loop with subblock loop inside: strict-f32-acc should error. 
+ +func.func @strict_f32_subblock_error() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + // expected-error @below {{output block exceeds f32 DST capacity}} + scf.for %iv = %c0 to %c4 step %c1 { + scf.for %sb = %c0 to %c2 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.matmul_block(%cb, %cb, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.subblock_loop_stride = 1 : index} + } {ttl.l1_acc_loop} + return +} + +// ----- + +// L1 acc loop WITHOUT subblock loops: strict-f32-acc should pass. 
+ +// expected-no-diagnostics +func.func @strict_f32_no_subblock_ok() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + return +} + +// ----- + +// Reduction loop (compiler-generated) with subblock: strict-f32-acc should +// NOT error (only user-written l1_acc_loop triggers the check). + +// expected-no-diagnostics +func.func @strict_f32_reduction_loop_ok() attributes {ttkernel.thread = #ttkernel.thread} { + %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + scf.for %iv = %c0 to %c2 step %c1 { + scf.for %sb = %c0 to %c2 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, , ) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () 
-> () + } {ttl.subblock_loop_stride = 1 : index} + } {ttl.reduction_loop} + return +} From 7295718a0ca11e3a44ee052d1120f888911108d5 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Mon, 13 Apr 2026 18:05:37 -0700 Subject: [PATCH 22/31] add TODO --- .../Transforms/TTKernelInsertL1Accumulation.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index ae98876b7..a6bfffcc1 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -98,17 +98,18 @@ struct TTKernelInsertL1AccumulationPass // precision regardless of block size, at the cost of 2x L1 per tile. if (strictF32Acc) { for (auto loop : l1AccLoops) { - if (!loop->hasAttr(kL1AccLoopAttrName)) + if (!loop->hasAttr(kL1AccLoopAttrName)) { continue; + } bool hasSubblockLoop = false; loop->walk([&](scf::ForOp inner) { - if (inner->hasAttr(kSubblockLoopStrideAttrName)) + if (inner->hasAttr(kSubblockLoopStrideAttrName)) { hasSubblockLoop = true; + } }); if (hasSubblockLoop) { - loop->emitError( - "output block exceeds f32 DST capacity; reduce block " - "dimensions or compile without --ttl-strict-f32-acc"); + loop->emitError("output block exceeds f32 DST capacity; reduce block " + "dimensions or compile without --ttl-strict-f32-acc"); return signalPassFailure(); } } From 7c5ad7571bbc4154ae880f9e2a97dabfa1011ce4 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 07:01:21 -0700 Subject: [PATCH 23/31] update doc --- docs/sphinx/reference/compiler-options.md | 31 ++++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md index 189ddb0c0..b18157c0a 100644 --- a/docs/sphinx/reference/compiler-options.md +++ 
b/docs/sphinx/reference/compiler-options.md @@ -116,21 +116,22 @@ ttlang-opt input.mlir -p 'ttl-to-ttkernel-pipeline{maximize-dst=true lower-to-em The pipeline runs these passes in order: -1. `convert-ttl-to-compute` — lower TTL elementwise tensor ops to `ttl.compute` with tile ops -2. `ttl-set-compute-kernel-config` — set `fp32_dest_acc_en` / `dst_full_sync_en` defaults -3. `ttl-assign-dst` — DST register allocation (linear scan with copy insertion) -4. `ttl-subblock-compute-for-dst` — tile `ttl.compute` into DST-sized subblocks *(only if `maximize-dst=true`)*; optionally refine reserve/push to per-subblock granularity *(only if `auto-sync=true`)* -5. `ttl-insert-tile-regs-sync` — insert math/pack thread synchronization -6. `ttl-lower-matmul-block` — mark block-matmul computes and expand stores *(only if `use-block-matmul=true`)* -7. `ttl-lower-to-loops` — lower `ttl.compute` to `scf.for` loops -8. `ttl-schedule-operations` — reorder tile ops by dependency depth and kind *(only if `maximize-dst=true`)* -9. `ttl-annotate-cb-associations` — annotate block args with CB indices -10. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel -11. `ttkernel-insert-inits` — insert hardware init ops before compute ops -12. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity -13. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)* -14. Canonicalization and CSE cleanup -15. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions` +1. `ttl-annotate-l1-acc-loops` — detect `+=` accumulation loops and annotate for L1 packer accumulation +2. `convert-ttl-to-compute` — lower TTL elementwise tensor ops to `ttl.compute` with tile ops +3. `ttl-set-compute-kernel-config` — set `fp32_dest_acc_en` / `dst_full_sync_en` defaults +4. 
`ttl-assign-dst` — DST register allocation (linear scan with copy insertion) +5. `ttl-subblock-compute-for-dst` — tile `ttl.compute` into DST-sized subblocks *(only if `maximize-dst=true`)*; optionally refine reserve/push to per-subblock granularity *(only if `auto-sync=true`)* +6. `ttl-insert-tile-regs-sync` — insert math/pack thread synchronization +7. `ttl-lower-matmul-block` — mark block-matmul computes and expand stores *(only if `use-block-matmul=true`)* +8. `ttl-lower-to-loops` — lower `ttl.compute` to `scf.for` loops +9. `ttl-schedule-operations` — reorder tile ops by dependency depth and kind *(only if `maximize-dst=true`)* +10. `ttl-annotate-cb-associations` — annotate block args with CB indices +11. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel +12. `ttkernel-insert-inits` — insert hardware init ops before compute ops +13. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity +14. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)* +15. Canonicalization and CSE cleanup +16. 
*(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions` ### Individual Pass Options From f4112403ba6cf912d7875af9bb2b0dca7e32bb88 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 07:05:24 -0700 Subject: [PATCH 24/31] move the strict-f32-acc option to the subblocking pass; other cleanup --- docs/sphinx/reference/compiler-options.md | 14 +--- include/ttlang/Dialect/TTL/Passes.td | 17 ++--- .../TTKernelInsertL1Accumulation.cpp | 44 +++-------- lib/Dialect/TTL/Pipelines/TTLPipelines.cpp | 7 +- .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp | 6 +- .../Transforms/TTLSubblockComputeForDST.cpp | 34 +++++++++ python/ttl/ttl_api.py | 6 +- .../insert_l1_accumulation_invalid.mlir | 75 ------------------- .../subblock_strict_f32_acc_invalid.mlir | 61 +++++++++++++++ 9 files changed, 120 insertions(+), 144 deletions(-) delete mode 100644 test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir create mode 100644 test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md index b18157c0a..ff3596620 100644 --- a/docs/sphinx/reference/compiler-options.md +++ b/docs/sphinx/reference/compiler-options.md @@ -128,7 +128,7 @@ The pipeline runs these passes in order: 10. `ttl-annotate-cb-associations` — annotate block args with CB indices 11. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel 12. `ttkernel-insert-inits` — insert hardware init ops before compute ops -13. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops; errors if `strict-f32-acc=true` and output block exceeds f32 DST capacity +13. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops 14. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)* 15. 
Canonicalization and CSE cleanup 16. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions` @@ -173,6 +173,7 @@ Partition `ttl.compute` into DST-sized subblocks. | Option | Type | Default | Description | |---|---|---|---| | `subblock-sync` | bool | `false` | Refine DFB reserve/push to per-subblock granularity, enabling `pack_tile_block` for contiguous subblocks. When disabled, user-placed reserve/push is preserved. | +| `strict-f32-acc` | bool | `false` | Error if a `+=` accumulation loop with non-f32 output requires subblocking. Subblocking reduces accumulation precision because bf16 L1 intermediates truncate f32 DST values. | ```bash ttlang-opt input.mlir -p 'func.func(ttl-subblock-compute-for-dst{subblock-sync=true})' @@ -190,14 +191,3 @@ Analyze circular buffer producer/consumer relationships and dump the flow graph. ttlang-opt input.mlir -p 'ttl-dump-cb-flow-graph{output="/tmp/cb_graph.json"}' ``` -#### `ttkernel-insert-l1-accumulation` - -Insert `pack_reconfig_l1_acc` guards around reduction and accumulation loops. - -| Option | Type | Default | Description | -|---|---|---|---| -| `strict-f32-acc` | bool | `false` | Error if a user-written `+=` accumulation loop requires subblocking because the output block exceeds f32 DST capacity (4 tiles with double-buffering). | - -```bash -ttlang-opt input.mlir -p 'builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})' -``` diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index bcc284405..f4a49c4fc 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -52,19 +52,8 @@ def TTKernelInsertL1Accumulation or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for` ops, with `ttl.l1_acc_loop` taking precedence. 
- When `strict-f32-acc` is enabled, the pass errors if a user-written - accumulation loop (`+=`, marked `ttl.l1_acc_loop`) contains subblock - loops, indicating the output block exceeds f32 DST capacity. This - guarantees each K iteration fits in a single DST section. }]; - let options = [ - Option<"strictF32Acc", "strict-f32-acc", "bool", "false", - "Error if a user-written accumulation loop requires subblocking " - "(output exceeds f32 DST capacity). Guarantees full f32 " - "precision per accumulation step."> - ]; - let dependentDialects = [ "::mlir::arith::ArithDialect", "::mlir::scf::SCFDialect", @@ -225,7 +214,11 @@ def TTLSubblockComputeForDST Option<"subblockSync", "subblock-sync", "bool", "false", "Refine DFB reserve/push to per-subblock granularity, enabling " "pack_tile_block for contiguous subblocks. When disabled (default), " - "user-placed reserve/push is preserved."> + "user-placed reserve/push is preserved.">, + Option<"strictF32Acc", "strict-f32-acc", "bool", "false", + "Error if a user-written accumulation loop (+=) with non-f32 " + "output requires subblocking. 
Subblocking reduces accumulation " + "precision because bf16 L1 intermediates are narrower than f32 DST."> ]; let dependentDialects = [ diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index a6bfffcc1..14409d95b 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -63,10 +63,6 @@ static scf::ForOp findOutermostL1AccLoop(Operation *op) { struct TTKernelInsertL1AccumulationPass : public impl::TTKernelInsertL1AccumulationBase< TTKernelInsertL1AccumulationPass> { - using Base = - impl::TTKernelInsertL1AccumulationBase; - using Base::Base; - void runOnOperation() override { auto moduleOp = getOperation(); @@ -90,31 +86,6 @@ struct TTKernelInsertL1AccumulationPass } }); - // When --strict-f32-acc is set, error if any user-written accumulation - // loop (kL1AccLoopAttrName, from +=) contains subblock loops, which - // indicates the output block exceeds f32 DST capacity. - // TODO(ttl): Instead of erroring, allocate an f32 L1 temporary and - // emit a cast to bf16 after the loop. This would give full f32 - // precision regardless of block size, at the cost of 2x L1 per tile. - if (strictF32Acc) { - for (auto loop : l1AccLoops) { - if (!loop->hasAttr(kL1AccLoopAttrName)) { - continue; - } - bool hasSubblockLoop = false; - loop->walk([&](scf::ForOp inner) { - if (inner->hasAttr(kSubblockLoopStrideAttrName)) { - hasSubblockLoop = true; - } - }); - if (hasSubblockLoop) { - loop->emitError("output block exceeds f32 DST capacity; reduce block " - "dimensions or compile without --ttl-strict-f32-acc"); - return signalPassFailure(); - } - } - } - // L1 accumulation guard placement. 
For any loop that // accumulates in L1 (matmul K loop or reduce loop), the pattern is: // @@ -190,11 +161,13 @@ struct TTKernelInsertL1AccumulationPass outermostLoop = loop; } if (disabledLoops.insert(outermostLoop.getOperation()).second) { + Location disableLoc = outermostLoop->getLoc(); // Disable before the loop. builder.setInsertionPoint(outermostLoop); - Value disablePre = arith::ConstantOp::create( - builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); - ttk::PackReconfigL1AccOp::create(builder, loc, disablePre); + Value disablePre = + arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(), + builder.getI32IntegerAttr(0)); + ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePre); // Disable after any consecutive cb_push_back ops that follow the // loop. Multi-output computes produce one push per output CB. @@ -208,9 +181,10 @@ struct TTKernelInsertL1AccumulationPass } else { builder.setInsertionPointAfter(outermostLoop); } - Value disablePost = arith::ConstantOp::create( - builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0)); - ttk::PackReconfigL1AccOp::create(builder, loc, disablePost); + Value disablePost = + arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(), + builder.getI32IntegerAttr(0)); + ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePost); } } } diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp index cc64fe3f5..805a3c951 100644 --- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp +++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp @@ -33,6 +33,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm, if (options.maximizeDST) { TTLSubblockComputeForDSTOptions subblockOpts; subblockOpts.subblockSync = options.autoSync; + subblockOpts.strictF32Acc = options.strictF32Acc; pm.addPass(createTTLSubblockComputeForDST(subblockOpts)); } if (options.useBlockMatmul) { @@ -53,11 +54,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm, 
pm.addPass(createTTLConvertTTLToTTKernel(ttkOpts)); } pm.addPass(createTTKernelInsertInits()); - { - TTKernelInsertL1AccumulationOptions l1AccOpts; - l1AccOpts.strictF32Acc = options.strictF32Acc; - pm.addPass(createTTKernelInsertL1Accumulation(l1AccOpts)); - } + pm.addPass(createTTKernelInsertL1Accumulation()); if (options.combinePackTiles) { pm.addNestedPass(createTTKernelCombinePackTiles()); } diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp index 7757c04ed..0f1f65ede 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp @@ -46,15 +46,17 @@ struct TTLAnnotateL1AccLoopsPass // Only count stores whose nearest enclosing scf.for is this forOp, // so that nested inner loops are not attributed to outer loops. bool hasAccumulatingStore = false; - forOp.getBody()->walk([&](StoreOp store) { + forOp.getBody()->walk([&](StoreOp store) -> WalkResult { if (store.getAccumulate() && store->getParentOfType() == forOp) { hasAccumulatingStore = true; + return WalkResult::interrupt(); } + return WalkResult::advance(); }); if (hasAccumulatingStore) { - forOp->setAttr(kL1AccLoopAttrName, OpBuilder(forOp).getUnitAttr()); + forOp->setAttr(kL1AccLoopAttrName, UnitAttr::get(forOp->getContext())); } }); } diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp index da6c65065..0d5c21add 100644 --- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp +++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp @@ -17,6 +17,8 @@ #include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h" #include "ttlang/Dialect/TTL/Passes.h" +#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h" + #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -189,6 +191,38 @@ struct TTLSubblockComputeForDSTPass return success(); } + // When 
--strict-f32-acc is set and this compute is inside a user-written + // accumulation loop (+=), error if the output type is not f32. Subblocking + // a non-f32 accumulating compute reduces precision: the f32 DST partial + // sums are truncated to bf16 when packed to L1 per K step. + // TODO(ttl): Instead of erroring, allocate an f32 L1 temporary and + // emit a cast to bf16 after the loop, giving full f32 precision at + // the cost of 2x L1 per tile. + if (strictF32Acc) { + bool insideL1AccLoop = false; + for (Operation *parent = computeOp->getParentOp(); parent; + parent = parent->getParentOp()) { + if (auto forOp = dyn_cast(parent)) { + if (forOp->hasAttr(kL1AccLoopAttrName)) { + insideL1AccLoop = true; + break; + } + } + } + if (insideL1AccLoop) { + auto outType = + cast(computeOp.getDpsInits()[0].getType()); + auto tileType = cast(outType.getElementType()); + if (tileType.getDataType() != ttcore::DataType::Float32) { + return computeOp.emitError( + "subblocking accumulation loop reduces precision: bf16 L1 " + "intermediates truncate f32 DST partial sums per K step; " + "reduce block dimensions to fit in f32 DST, use f32 output " + "type, or compile without --ttl-strict-f32-acc"); + } + } + } + // Only parallel dimensions are candidates for subblocking; reduction // dimensions must be fully included in each subblock. Matmul K is // excluded because it accumulates in-place (see hasMatmulBlock above). 
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py index f2c02cc71..f2f161dde 100644 --- a/python/ttl/ttl_api.py +++ b/python/ttl/ttl_api.py @@ -1231,8 +1231,9 @@ def _compile_kernel( ] if compiler_options.maximize_dst: subblock_sync = "true" if compiler_options.auto_sync else "false" + strict_f32 = "true" if compiler_options.strict_f32_acc else "false" pipeline_passes.append( - f"func.func(ttl-subblock-compute-for-dst{{subblock-sync={subblock_sync}}})" + f"func.func(ttl-subblock-compute-for-dst{{subblock-sync={subblock_sync} strict-f32-acc={strict_f32}}})" ) if compiler_options.use_block_matmul: pipeline_passes.append("func.func(ttl-lower-matmul-block)") @@ -1267,12 +1268,11 @@ def _compile_kernel( pipeline_passes.append(f'ttl-dump-cb-flow-graph{{output="{cb_flow_json}"}}') reduce_fp32_flag = int(compiler_options.reduce_full_fp32) - strict_f32_flag = int(compiler_options.strict_f32_acc) pipeline_passes += [ "ttl-lower-dprint-to-emitc", f"convert-ttl-to-ttkernel{{reduce-full-fp32={reduce_fp32_flag}}}", "ttkernel-insert-inits", - f"ttkernel-insert-l1-accumulation{{strict-f32-acc={strict_f32_flag}}}", + "ttkernel-insert-l1-accumulation", ] if compiler_options.combine_pack_tiles: pipeline_passes.append("func.func(ttkernel-combine-pack-tiles)") diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir deleted file mode 100644 index e79698877..000000000 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation_invalid.mlir +++ /dev/null @@ -1,75 +0,0 @@ -// Negative tests for ttkernel-insert-l1-accumulation with --strict-f32-acc. - -// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation{strict-f32-acc=true})' --verify-diagnostics --split-input-file - -// L1 acc loop with subblock loop inside: strict-f32-acc should error. 
- -func.func @strict_f32_subblock_error() attributes {ttkernel.thread = #ttkernel.thread} { - %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - // expected-error @below {{output block exceeds f32 DST capacity}} - scf.for %iv = %c0 to %c4 step %c1 { - scf.for %sb = %c0 to %c2 step %c1 { - ttkernel.tile_regs_acquire() : () -> () - ttkernel.matmul_block(%cb, %cb, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> () - ttkernel.tile_regs_commit() : () -> () - ttkernel.tile_regs_wait() : () -> () - ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () - ttkernel.tile_regs_release() : () -> () - } {ttl.subblock_loop_stride = 1 : index} - } {ttl.l1_acc_loop} - return -} - -// ----- - -// L1 acc loop WITHOUT subblock loops: strict-f32-acc should pass. 
- -// expected-no-diagnostics -func.func @strict_f32_no_subblock_ok() attributes {ttkernel.thread = #ttkernel.thread} { - %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c4_i32 = arith.constant 4 : i32 - scf.for %iv = %c0 to %c4 step %c1 { - ttkernel.tile_regs_acquire() : () -> () - ttkernel.tile_regs_commit() : () -> () - ttkernel.tile_regs_wait() : () -> () - ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () - ttkernel.tile_regs_release() : () -> () - } {ttl.l1_acc_loop} - ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () - return -} - -// ----- - -// Reduction loop (compiler-generated) with subblock: strict-f32-acc should -// NOT error (only user-written l1_acc_loop triggers the check). - -// expected-no-diagnostics -func.func @strict_f32_reduction_loop_ok() attributes {ttkernel.thread = #ttkernel.thread} { - %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> - %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> - %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - scf.for %iv = %c0 to %c2 step %c1 { - scf.for %sb = %c0 to %c2 step %c1 { - ttkernel.tile_regs_acquire() : () -> () - ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, , ) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> () - ttkernel.tile_regs_commit() : () -> () - ttkernel.tile_regs_wait() : () -> () - ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> () - ttkernel.tile_regs_release() : () 
-> () - } {ttl.subblock_loop_stride = 1 : index} - } {ttl.reduction_loop} - return -} diff --git a/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir b/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir new file mode 100644 index 000000000..8c4318acd --- /dev/null +++ b/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir @@ -0,0 +1,61 @@ +// Negative tests for ttl-subblock-compute-for-dst with --strict-f32-acc. +// The check fires when a user-written accumulation loop (+=) with non-f32 +// output requires subblocking, because bf16 L1 intermediates truncate f32 +// DST partial sums per K step. + +// RUN: ttlang-opt %s \ +// RUN: --pass-pipeline='builtin.module(func.func( \ +// RUN: ttl-annotate-l1-acc-loops, convert-ttl-to-compute, \ +// RUN: ttl-assign-dst{enable-fpu-binary-ops=0}, \ +// RUN: ttl-subblock-compute-for-dst{strict-f32-acc=true}))' \ +// RUN: --verify-diagnostics --split-input-file + +// bf16 output 3x3 = 9 tiles exceeds f32 DST capacity (4): should error. 
+ +func.func @strict_f32_subblock_bf16_error( + %arg0: tensor<3x2x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<2x3x!ttcore.tile<32x32, bf16>>) -> tensor<3x3x!ttcore.tile<32x32, bf16>> + attributes {ttl.kernel_thread = #ttkernel.thread, fp32_dest_acc_en} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<3x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<3x2x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x3x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x3x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<3x3x!ttcore.tile<32x32, bf16>> + scf.for %k = %c0 to %c2 step %c1 { + // expected-error @below {{subblocking accumulation loop reduces precision}} + %mm = ttl.matmul %a, %b : tensor<3x2x!ttcore.tile<32x32, bf16>>, tensor<2x3x!ttcore.tile<32x32, bf16>> -> tensor<3x3x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<3x3x!ttcore.tile<32x32, bf16>>, tensor<3x3x!ttcore.tile<32x32, bf16>> + } + ttl.cb_push %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> + func.return %reserve : tensor<3x3x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// bf16 output 2x2 = 4 tiles fits in f32 DST (4): no subblocking, no error. 
+ +// expected-no-diagnostics +func.func @strict_f32_fits_in_dst_ok( + %arg0: tensor<2x2x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<2x2x!ttcore.tile<32x32, bf16>>) -> tensor<2x2x!ttcore.tile<32x32, bf16>> + attributes {ttl.kernel_thread = #ttkernel.thread, fp32_dest_acc_en} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<2x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x2x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x2x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[2, 2], !ttcore.tile<32x32, bf16>, 2> -> tensor<2x2x!ttcore.tile<32x32, bf16>> + scf.for %k = %c0 to %c2 step %c1 { + %mm = ttl.matmul %a, %b : tensor<2x2x!ttcore.tile<32x32, bf16>>, tensor<2x2x!ttcore.tile<32x32, bf16>> -> tensor<2x2x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<2x2x!ttcore.tile<32x32, bf16>>, tensor<2x2x!ttcore.tile<32x32, bf16>> + } + ttl.cb_push %cb2 : <[2, 2], !ttcore.tile<32x32, bf16>, 2> + func.return %reserve : tensor<2x2x!ttcore.tile<32x32, bf16>> +} From c40a42e5a7385b6c38aae3600f772f3589096bbc Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 07:27:03 -0700 Subject: [PATCH 25/31] precommit --- docs/sphinx/reference/compiler-options.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md index ff3596620..78662db94 100644 --- a/docs/sphinx/reference/compiler-options.md +++ 
b/docs/sphinx/reference/compiler-options.md @@ -190,4 +190,3 @@ Analyze circular buffer producer/consumer relationships and dump the flow graph. ```bash ttlang-opt input.mlir -p 'ttl-dump-cb-flow-graph{output="/tmp/cb_graph.json"}' ``` - From 3fc7ca9c96774ee5dd85276ad94dd0ada41579c8 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 08:45:04 -0700 Subject: [PATCH 26/31] rewrite L1 acc annotation with dominance, fix consecutive += loops. add tests 1. Two consecutive += loops to same reserve MLIR: annotate_l1_acc_loops.mlir::consecutive_loops_same_reserve insert_l1_accumulation.mlir::consecutive_l1_acc_loops Device: test_matmul_l1_acc.py::test_l1_acc_consecutive_loops 2. Mixing += and .store() in same loop MLIR: annotate_l1_acc_loops.mlir::mixed_acc_and_plain_store Device: test_matmul_l1_acc.py::test_l1_acc_mixed_store 3. += with non-matmul RHS (sum reduction) MLIR: annotate_l1_acc_loops.mlir::non_matmul_accumulate Device: test_matmul_l1_acc.py::test_l1_acc_sum_reduction 4. Multiple += to different outputs in same loop MLIR: insert_l1_accumulation.mlir::two_outputs_one_loop Device: test_matmul_l1_acc.py::test_l1_acc_multi_output 5. Output fits in f32 DST (block_n <= 4) MLIR: matmul_subblock_l1_acc.mlir::matmul_3x3_k_loop Device: test_matmul_l1_acc.py::test_l1_acc_single_core[blk2x2_K2..K8] 6. K=1 single iteration MLIR: annotate_l1_acc_loops.mlir::single_iteration Device: test_matmul_l1_acc.py::test_l1_acc_single_iteration 7. 
+= inside conditional MLIR: annotate_l1_acc_loops.mlir::acc_inside_conditional Device: not tested (DSL does not generate runtime conditionals in compute) --- include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h | 22 ++ include/ttlang/Dialect/TTL/Passes.td | 36 +-- .../TTKernelInsertL1Accumulation.cpp | 241 ++++++++------ .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp | 62 ++-- test/python/test_matmul_l1_acc.py | 306 ++++++++++++++++++ .../Transforms/insert_l1_accumulation.mlir | 157 ++++++++- .../TTL/Transforms/annotate_l1_acc_loops.mlir | 143 ++++++++ 7 files changed, 825 insertions(+), 142 deletions(-) diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h index 94ab10d59..fbe4f5323 100644 --- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h +++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h @@ -8,10 +8,12 @@ #include "ttlang/Dialect/TTL/IR/TTL.h" #include "ttlang/Dialect/TTL/IR/TTLOps.h" #include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h" +#include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/ADT/SetVector.h" @@ -427,6 +429,26 @@ inline TileOp createTileOpWithPlaceholderDstIndex(OpBuilder &builder, return tileOp; } +/// Collect the CB values targeted by pack_tile ops inside a loop. +inline llvm::SmallDenseSet getPackTileCBs(scf::ForOp loop) { + namespace ttk = mlir::tt::ttkernel; + llvm::SmallDenseSet cbs; + loop->walk([&](ttk::PackTileOp packOp) { cbs.insert(packOp.getOutCb()); }); + return cbs; +} + +/// Returns true if two loops share any pack_tile CB target. 
+inline bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB) { + auto cbsA = getPackTileCBs(loopA); + auto cbsB = getPackTileCBs(loopB); + for (auto cb : cbsA) { + if (cbsB.contains(cb)) { + return true; + } + } + return false; +} + } // namespace mlir::tt::ttl #endif // TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index f4a49c4fc..6a92a0ebe 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -30,26 +30,22 @@ def TTKernelInsertL1Accumulation : Pass<"ttkernel-insert-l1-accumulation", "::mlir::ModuleOp"> { let summary = "Insert L1 accumulation guards for reduction loops"; let description = [{ - Inserts `pack_reconfig_l1_acc` guards inside reduction loops so that - pack operations accumulate into L1 instead of overwriting. The enable - call happens once after the first iteration's last pack; the L1 acc - packer state persists across `tile_regs` boundaries. Disable guards - bracket the outermost reduction loop (parallel loops are not - considered). Max-reduce loops are excluded (max is not additive). - - The pattern is: - pack_reconfig_l1_acc(0) // disable before loop - for (iv = lb; ...) { - [subblock 0: acquire...pack...release] - [subblock N: acquire...pack...release] - if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first - // iteration's last pack - } - [cb_push_back if present] - pack_reconfig_l1_acc(0) // disable after loop - - Reduction loops are identified by the `ttl.l1_acc_loop` (user-written) - or `ttl.reduction_loop` (compiler-generated) attributes on `scf.for` + Inserts `pack_reconfig_l1_acc` guards so that pack operations inside + annotated loops accumulate into L1 instead of overwriting. Max-reduce + loops are excluded (max is not additive). + + Loops are grouped into accumulation scopes: consecutive sibling loops + that pack to the same CB share a single disable pair. 
Nested annotated + loops are folded into the outermost ancestor. Each group gets: + - `pack_reconfig_l1_acc(0)` before the first loop. + - `if (iv == lb) pack_reconfig_l1_acc(1)` after each loop's first + iteration (the L1 acc state persists across `tile_regs` boundaries). + - For the second+ loop in a group, an enable before the loop to re-enable + L1 acc after init ops that may reset packer state. + - `pack_reconfig_l1_acc(0)` after the last `cb_push_back`. + + Loops are identified by the `ttl.l1_acc_loop` (user-written) or + `ttl.reduction_loop` (compiler-generated) attributes on `scf.for` ops, with `ttl.l1_acc_loop` taking precedence. }]; diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 14409d95b..9a4e2a394 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "ttlang/Dialect/TTL/IR/TTL.h" +#include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h" #include "ttlang/Dialect/TTL/Passes.h" #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h" @@ -26,9 +27,10 @@ namespace ttk = mlir::tt::ttkernel; namespace { -/// Find the enclosing loop that controls L1 accumulation. -/// Prefers kL1AccLoopAttrName (user-annotated). Falls back to innermost -/// kReductionLoopAttrName (compiler-generated, for reduce ops). +/// Find the innermost enclosing L1 acc or reduction loop. +/// User-written += loops (kL1AccLoopAttrName) take precedence over +/// compiler-generated reduction loops because the user-specified loop +/// structure determines the accumulation granularity. 
static scf::ForOp findL1AccLoop(Operation *op) { scf::ForOp reductionFallback; for (Operation *parent = op->getParentOp(); parent; @@ -45,15 +47,16 @@ static scf::ForOp findL1AccLoop(Operation *op) { return reductionFallback; } -/// Find the outermost enclosing L1 acc or reduction loop for the disable guard. -static scf::ForOp findOutermostL1AccLoop(Operation *op) { - scf::ForOp outermost; - for (Operation *parent = op->getParentOp(); parent; +/// Walk from loop up through parent ops, returning the outermost +/// annotated ancestor. Returns loop itself if no annotated ancestor exists. +static scf::ForOp findOutermostAnnotatedAncestor(scf::ForOp loop) { + scf::ForOp outermost = loop; + for (Operation *parent = loop->getParentOp(); parent; parent = parent->getParentOp()) { - if (auto forOp = dyn_cast(parent)) { - if (forOp->hasAttr(kL1AccLoopAttrName) || - forOp->hasAttr(kReductionLoopAttrName)) { - outermost = forOp; + if (auto parentFor = dyn_cast(parent)) { + if (parentFor->hasAttr(kL1AccLoopAttrName) || + parentFor->hasAttr(kReductionLoopAttrName)) { + outermost = parentFor; } } } @@ -66,15 +69,25 @@ struct TTKernelInsertL1AccumulationPass void runOnOperation() override { auto moduleOp = getOperation(); - // Collect L1 acc loops (kL1AccLoopAttrName or kReductionLoopAttrName) - // that contain pack_tile activity. + // Walk from TileRegsAcquireOp upward to find annotated loops — + // only loops with actual pack activity need L1 acc guards. SmallVector l1AccLoops; - llvm::SmallDenseSet seenLoops; + llvm::SmallDenseSet visitedLoops; moduleOp->walk([&](ttk::TileRegsAcquireOp acquireOp) { auto loop = findL1AccLoop(acquireOp); - if (!loop || !seenLoops.insert(loop).second) { + if (!loop || !visitedLoops.insert(loop).second) { return; } + // Skip if this pass already ran (idempotency). 
+ bool alreadyProcessed = false; + loop->walk([&](ttk::PackReconfigL1AccOp) { + alreadyProcessed = true; + return WalkResult::interrupt(); + }); + if (alreadyProcessed) { + return; + } + // Max reduce is not additive — L1 acc would corrupt the running max. bool hasMaxReduce = false; loop->walk([&](ttk::ReduceTileOp reduceOp) { if (reduceOp.getReduceType() == ttk::ReduceType::Max) { @@ -86,106 +99,134 @@ struct TTKernelInsertL1AccumulationPass } }); - // L1 accumulation guard placement. For any loop that - // accumulates in L1 (matmul K loop or reduce loop), the pattern is: - // - // pack_reconfig_l1_acc(0) // disable before loop - // for (iv = lb; ...) { - // [subblock 0: acquire...pack...release] - // [subblock N: acquire...pack...release] - // if (iv == lb) pack_reconfig_l1_acc(1) // enable once after first - // // iteration's last pack - // } - // [cb_push_back if present] - // pack_reconfig_l1_acc(0) // disable after loop - // - // The L1 acc state persists across multiple dst sections, so the enable - // call only needs to happen once (after the first iteration completes - // all its packs). Disable guards are inserted once per outermost - // reduction loop (parallel loops are not considered). - - // Find the insertion point for the enable guard: the top-level op in - // the loop body that contains the last tile_regs_release. - auto findTopLevelAncestor = [](Operation *op, - Block *loopBody) -> Operation * { - while (op && op->getBlock() != loopBody) { - op = op->getParentOp(); - } - return op; - }; - - llvm::SmallDenseMap enablePointPerLoop; + // The enable guard goes after the last pack in the first iteration. + // Packs live inside tile_regs_acquire/release sections, which may be + // nested in subblock loops. The top-level ancestor of the last release + // in the loop body is the correct insertion point. 
+ llvm::SmallDenseMap l1AccEnablePoint; for (auto loop : l1AccLoops) { - Operation *lastTopLevel = nullptr; + Operation *lastReleaseAncestor = nullptr; loop->walk([&](ttk::TileRegsReleaseOp releaseOp) { - Operation *topLevel = findTopLevelAncestor(releaseOp, loop.getBody()); - if (topLevel) { - lastTopLevel = topLevel; + if (auto *ancestor = + loop.getBody()->findAncestorOpInBlock(*releaseOp)) { + lastReleaseAncestor = ancestor; } }); - if (lastTopLevel) { - enablePointPerLoop[loop.getOperation()] = lastTopLevel; + if (lastReleaseAncestor) { + l1AccEnablePoint[loop.getOperation()] = lastReleaseAncestor; } } - llvm::SmallDenseSet disabledLoops; + // Step 1: Group loops into accumulation scopes. Consecutive sibling + // loops that pack to the same CB share a single disable pair. Nested + // annotated loops are folded into the outermost ancestor. + struct AccGroup { + scf::ForOp rootLoop; + SmallVector loops; + Operation *scopeEnd = nullptr; + }; + SmallVector groups; + llvm::SmallDenseSet assignedToGroup; + for (auto loop : l1AccLoops) { - auto iter = enablePointPerLoop.find(loop.getOperation()); - if (iter == enablePointPerLoop.end()) { + if (!l1AccEnablePoint.count(loop.getOperation())) { continue; } - Operation *enablePoint = iter->second; - OpBuilder builder(loop->getContext()); - Location loc = enablePoint->getLoc(); - - // Enable L1 acc once, at the end of the first iteration of the - // reduction loop. All packs in iteration 0 write without - // accumulation; subsequent iterations add to the existing L1 value. 
- builder.setInsertionPointAfter(enablePoint); - Value loopIV = loop.getInductionVar(); - Value loopLB = loop.getLowerBound(); - Value firstIter = arith::CmpIOp::create( - builder, loc, arith::CmpIPredicate::eq, loopIV, loopLB); - auto ifOp = scf::IfOp::create(builder, loc, firstIter); - builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); - Value enableFlag = arith::ConstantOp::create( - builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1)); - ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag); - - // Bracket the outermost accumulation loop with disable guards. - // Both kL1AccLoopAttrName and kReductionLoopAttrName mean "all - // iterations write to the same CB slot," so the outermost such - // loop is the correct accumulation boundary. - auto outermostLoop = findOutermostL1AccLoop(loop); - if (!outermostLoop) { - outermostLoop = loop; + if (assignedToGroup.contains(loop.getOperation())) { + continue; } - if (disabledLoops.insert(outermostLoop.getOperation()).second) { - Location disableLoc = outermostLoop->getLoc(); - // Disable before the loop. - builder.setInsertionPoint(outermostLoop); - Value disablePre = - arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(), - builder.getI32IntegerAttr(0)); - ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePre); - - // Disable after any consecutive cb_push_back ops that follow the - // loop. Multi-output computes produce one push per output CB. - Operation *lastPush = nullptr; - for (Operation *op = outermostLoop->getNextNode(); - op && isa(op); op = op->getNextNode()) { - lastPush = op; + + scf::ForOp rootLoop = findOutermostAnnotatedAncestor(loop); + + AccGroup group; + group.rootLoop = rootLoop; + group.loops.push_back(loop); + assignedToGroup.insert(loop.getOperation()); + + // Collect sibling annotated loops that share a pack CB target. 
+ for (Operation *op = rootLoop->getNextNode(); op; + op = op->getNextNode()) { + if (isa(op)) { + break; } - if (lastPush) { - builder.setInsertionPointAfter(lastPush); + auto sibling = dyn_cast(op); + if (!sibling) { + continue; + } + if (!sibling->hasAttr(kL1AccLoopAttrName) && + !sibling->hasAttr(kReductionLoopAttrName)) { + break; + } + if (!sharePackCB(rootLoop, sibling)) { + break; + } + group.loops.push_back(sibling); + assignedToGroup.insert(sibling.getOperation()); + } + + // Scope ends at the last trailing cb_push_back. + Operation *lastInGroup = group.loops.size() > 1 + ? group.loops.back().getOperation() + : rootLoop.getOperation(); + group.scopeEnd = lastInGroup; + for (Operation *op = lastInGroup->getNextNode(); op; + op = op->getNextNode()) { + if (isa(op)) { + group.scopeEnd = op; } else { - builder.setInsertionPointAfter(outermostLoop); + break; + } + } + + groups.push_back(std::move(group)); + } + + // Step 2: Emit guards per group. + for (auto &group : groups) { + OpBuilder builder(group.rootLoop->getContext()); + Location disableLoc = group.rootLoop->getLoc(); + + // Disable before the group. + builder.setInsertionPoint(group.rootLoop); + Value disableFlag = + arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(), + builder.getI32IntegerAttr(0)); + ttk::PackReconfigL1AccOp::create(builder, disableLoc, disableFlag); + + for (size_t idx = 0; idx < group.loops.size(); ++idx) { + scf::ForOp loop = group.loops[idx]; + auto iter = l1AccEnablePoint.find(loop.getOperation()); + if (iter == l1AccEnablePoint.end()) { + continue; } - Value disablePost = - arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(), - builder.getI32IntegerAttr(0)); - ttk::PackReconfigL1AccOp::create(builder, disableLoc, disablePost); + + // For the 2nd+ loop in a group, re-enable L1 acc before + // the loop because init ops between loops reset packer state. 
+ if (idx > 0) { + builder.setInsertionPoint(loop); + Value enableFlag = arith::ConstantOp::create( + builder, loop->getLoc(), builder.getI32Type(), + builder.getI32IntegerAttr(1)); + ttk::PackReconfigL1AccOp::create(builder, loop->getLoc(), enableFlag); + } + + // Conditional enable after the first iteration's last pack. + Operation *afterOp = iter->second; + Location loc = afterOp->getLoc(); + builder.setInsertionPointAfter(afterOp); + Value firstIter = + arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::eq, + loop.getInductionVar(), loop.getLowerBound()); + auto ifOp = scf::IfOp::create(builder, loc, firstIter); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + Value enableFlag = arith::ConstantOp::create( + builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1)); + ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag); } + + // Disable after the scope end. + builder.setInsertionPointAfter(group.scopeEnd); + ttk::PackReconfigL1AccOp::create(builder, disableLoc, disableFlag); } } }; diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp index 0f1f65ede..ab025919a 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp @@ -10,6 +10,14 @@ // (ttl.store with the {accumulate} attribute, emitted by +=) and annotates // them with kL1AccLoopAttrName for L1 packer accumulation. // +// Uses dominance: for each accumulating store, verifies the destination +// cb_reserve properly dominates the enclosing loop (the reserve is outside +// the loop, so the same L1 slot persists across iterations). +// +// TTKernelInsertL1Accumulation uses the annotated loops to find enable +// points, and groups consecutive sibling loops that pack to the same CB +// (up to the trailing cb_push_back) into one scope for disable guards.
+// //===----------------------------------------------------------------------===// #include "ttlang/Dialect/TTL/IR/TTL.h" @@ -17,6 +25,7 @@ #include "ttlang/Dialect/TTL/Passes.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Dominance.h" #define DEBUG_TYPE "ttl-annotate-l1-acc-loops" @@ -27,37 +36,48 @@ namespace mlir::tt::ttl { namespace { +/// Returns true if the loop carries any ttl.* annotation, indicating it +/// was generated or already processed by a compiler pass. +static bool hasCompilerAnnotation(scf::ForOp loop) { + for (auto attr : loop->getAttrs()) { + if (attr.getName().getValue().starts_with("ttl.")) { + return true; + } + } + return false; +} + struct TTLAnnotateL1AccLoopsPass : public impl::TTLAnnotateL1AccLoopsBase { void runOnOperation() override { func::FuncOp func = getOperation(); + DominanceInfo domInfo(func); - func.walk([&](scf::ForOp forOp) { - // Skip loops already annotated (compiler-generated or prior run). - if (forOp->hasAttr(kL1AccLoopAttrName) || - forOp->hasAttr(kReductionLoopAttrName) || - forOp->hasAttr(kTileLoopStrideAttrName) || - forOp->hasAttr(kSubblockLoopStrideAttrName)) { + func.walk([&](StoreOp store) { + if (!store.getAccumulate()) { return; } - // Check if this loop directly contains an accumulating store - // (ttl.store with the {accumulate} attribute, emitted by +=). - // Only count stores whose nearest enclosing scf.for is this forOp, - // so that nested inner loops are not attributed to outer loops. 
- bool hasAccumulatingStore = false; - forOp.getBody()->walk([&](StoreOp store) -> WalkResult { - if (store.getAccumulate() && - store->getParentOfType() == forOp) { - hasAccumulatingStore = true; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); + auto enclosingLoop = store->getParentOfType(); + if (!enclosingLoop) { + return; + } + if (hasCompilerAnnotation(enclosingLoop)) { + return; + } - if (hasAccumulatingStore) { - forOp->setAttr(kL1AccLoopAttrName, UnitAttr::get(forOp->getContext())); + // The reserve must properly dominate the enclosing loop: the + // reserve is outside the loop so the same L1 slot persists across + // iterations. If the reserve is inside the loop, each iteration + // gets a fresh slot and accumulation is meaningless. + Value reserve = store.getView(); + Operation *reserveOp = reserve.getDefiningOp(); + if (reserveOp && !domInfo.properlyDominates(reserveOp, enclosingLoop)) { + return; } + + enclosingLoop->setAttr(kL1AccLoopAttrName, + UnitAttr::get(enclosingLoop->getContext())); }); } }; diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py index 7f0b8bec3..074a72637 100644 --- a/test/python/test_matmul_l1_acc.py +++ b/test/python/test_matmul_l1_acc.py @@ -189,3 +189,309 @@ def test_l1_acc_multicore(Mt, Kt, Nt, block_m, block_n, device): result = ttnn.to_torch(out).float() assert_pcc(golden, result, threshold=0.999) + + +# --------------------------------------------------------------------------- +# Non-matmul accumulation: += with a passthrough copy (sum reduction). 
+# --------------------------------------------------------------------------- + + +def _make_sum_reduction_kernel(): + """Sum K input blocks via += (no matmul).""" + + @ttl.operation(grid=(1, 1)) + def kernel(inp, out): + Kt = inp.shape[0] // TILE + inp_dfb = ttl.make_dataflow_buffer_like(inp, shape=(1, 1), block_count=2) + out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2) + + @ttl.compute() + def compute(): + out_blk = out_dfb.reserve() + for _ in range(Kt): + inp_blk = inp_dfb.wait() + out_blk += inp_blk + inp_blk.pop() + out_blk.push() + + @ttl.datamovement() + def dm_read(): + for kt in range(Kt): + with inp_dfb.reserve() as blk: + ttl.copy(inp[kt : kt + 1, 0:1], blk).wait() + + @ttl.datamovement() + def dm_write(): + with out_dfb.wait() as blk: + ttl.copy(blk, out[0:1, 0:1]).wait() + + return kernel + + +@pytest.mark.parametrize("Kt", [2, 4, 8], ids=[f"K{k}" for k in [2, 4, 8]]) +@pytest.mark.requires_device +def test_l1_acc_sum_reduction(Kt, device): + """Sum K tiles via += without matmul (passthrough accumulation).""" + inp_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16) + golden = inp_torch.float().reshape(Kt, TILE, TILE).sum(dim=0) + + inp_dev = to_dram(inp_torch, device) + out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device) + + kernel = _make_sum_reduction_kernel() + kernel(inp_dev, out_dev) + + result = ttnn.to_torch(out_dev).float() + assert_pcc(golden, result, threshold=0.999) + + +# --------------------------------------------------------------------------- +# K=1 single iteration: accumulation with one loop iteration. +# --------------------------------------------------------------------------- + + +@pytest.mark.requires_device +def test_l1_acc_single_iteration(device): + """K=1: single-iteration += loop. 
Semantically equivalent to plain store.""" + M, K, N = TILE, TILE, 2 * TILE + a_torch = torch.randn(M, K, dtype=torch.bfloat16) + b_torch = torch.randn(K, N, dtype=torch.bfloat16) + golden = (a_torch.float() @ b_torch.float()).float() + + a_dev = to_dram(a_torch, device) + b_dev = to_dram(b_torch, device) + out_dev = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device) + + kernel = _make_l1_acc_kernel(1, 2, grid=(1, 1)) + kernel(a_dev, b_dev, out_dev) + + result = ttnn.to_torch(out_dev).float() + assert_pcc(golden, result, threshold=0.999) + + +# --------------------------------------------------------------------------- +# Consecutive += loops to the same reserve (two input streams). +# --------------------------------------------------------------------------- + + +def _make_consecutive_acc_kernel(K1, K2): + """Two consecutive += loops to one output: out = (a@b summed K1) + (c@d summed K2).""" + + @ttl.operation(grid=(1, 1)) + def kernel(a, b, c, d, out): + a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2) + b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2) + c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2) + d_dfb = ttl.make_dataflow_buffer_like(d, shape=(1, 1), block_count=2) + out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2) + + @ttl.compute() + def compute(): + out_blk = out_dfb.reserve() + for _ in range(K1): + a_blk = a_dfb.wait() + b_blk = b_dfb.wait() + out_blk += a_blk @ b_blk + a_blk.pop() + b_blk.pop() + for _ in range(K2): + c_blk = c_dfb.wait() + d_blk = d_dfb.wait() + out_blk += c_blk @ d_blk + c_blk.pop() + d_blk.pop() + out_blk.push() + + @ttl.datamovement() + def reader(): + for kt in range(K1): + with a_dfb.reserve() as blk: + ttl.copy(a[0:1, kt : kt + 1], blk).wait() + with b_dfb.reserve() as blk: + ttl.copy(b[kt : kt + 1, 0:1], blk).wait() + for kt in range(K2): + with c_dfb.reserve() as blk: + ttl.copy(c[0:1, kt : kt + 1], blk).wait() + with 
d_dfb.reserve() as blk: + ttl.copy(d[kt : kt + 1, 0:1], blk).wait() + + @ttl.datamovement() + def writer(): + with out_dfb.wait() as blk: + ttl.copy(blk, out[0:1, 0:1]).wait() + + return kernel + + +@pytest.mark.requires_device +def test_l1_acc_consecutive_loops(device): + """Two consecutive += loops to the same reserve block.""" + K1, K2 = 2, 3 + a_torch = torch.randn(TILE, K1 * TILE, dtype=torch.bfloat16) + b_torch = torch.randn(K1 * TILE, TILE, dtype=torch.bfloat16) + c_torch = torch.randn(TILE, K2 * TILE, dtype=torch.bfloat16) + d_torch = torch.randn(K2 * TILE, TILE, dtype=torch.bfloat16) + golden = ( + (a_torch.float() @ b_torch.float()) + (c_torch.float() @ d_torch.float()) + ).float() + + a_dev = to_dram(a_torch, device) + b_dev = to_dram(b_torch, device) + c_dev = to_dram(c_torch, device) + d_dev = to_dram(d_torch, device) + out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device) + + kernel = _make_consecutive_acc_kernel(K1, K2) + kernel(a_dev, b_dev, c_dev, d_dev, out_dev) + + result = ttnn.to_torch(out_dev).float() + assert_pcc(golden, result, threshold=0.999) + + +# --------------------------------------------------------------------------- +# Mixed .store() then += (overwrite first, accumulate rest). 
+# --------------------------------------------------------------------------- + + +def _make_mixed_store_acc_kernel(total_k): + """First iteration overwrites via .store(), rest accumulate via +=.""" + + @ttl.operation(grid=(1, 1)) + def kernel(a, b, out): + a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2) + b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2) + out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2) + + @ttl.compute() + def compute(): + out_blk = out_dfb.reserve() + a_blk = a_dfb.wait() + b_blk = b_dfb.wait() + out_blk.store(a_blk @ b_blk) + a_blk.pop() + b_blk.pop() + for _ in range(total_k - 1): + a_blk = a_dfb.wait() + b_blk = b_dfb.wait() + out_blk += a_blk @ b_blk + a_blk.pop() + b_blk.pop() + out_blk.push() + + @ttl.datamovement() + def reader(): + for _ in range(total_k): + with a_dfb.reserve() as blk: + ttl.copy(a[0:1, 0:1], blk).wait() + with b_dfb.reserve() as blk: + ttl.copy(b[0:1, 0:1], blk).wait() + + @ttl.datamovement() + def writer(): + with out_dfb.wait() as blk: + ttl.copy(blk, out[0:1, 0:1]).wait() + + return kernel + + +@pytest.mark.parametrize("total_k", [2, 4], ids=[f"K{k}" for k in [2, 4]]) +@pytest.mark.requires_device +def test_l1_acc_mixed_store(total_k, device): + """.store() first iteration, += for rest. Result = K * (a @ b).""" + a_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16) + b_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16) + golden = (total_k * (a_torch.float() @ b_torch.float())).float() + + a_dev = to_dram(a_torch, device) + b_dev = to_dram(b_torch, device) + out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device) + + kernel = _make_mixed_store_acc_kernel(total_k) + kernel(a_dev, b_dev, out_dev) + + result = ttnn.to_torch(out_dev).float() + assert_pcc(golden, result, threshold=0.999) + + +# --------------------------------------------------------------------------- +# Multiple += to different outputs in the same loop. 
+# --------------------------------------------------------------------------- + + +def _make_multi_output_kernel(Kt): + """One loop with += to two independent outputs.""" + + @ttl.operation(grid=(1, 1)) + def kernel(a, b, c, d, out_a, out_b): + a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2) + b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2) + c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2) + d_dfb = ttl.make_dataflow_buffer_like(d, shape=(1, 1), block_count=2) + out_a_dfb = ttl.make_dataflow_buffer_like(out_a, shape=(1, 1), block_count=2) + out_b_dfb = ttl.make_dataflow_buffer_like(out_b, shape=(1, 1), block_count=2) + + @ttl.compute() + def compute(): + blk_a = out_a_dfb.reserve() + blk_b = out_b_dfb.reserve() + for _ in range(Kt): + a_blk = a_dfb.wait() + b_blk = b_dfb.wait() + blk_a += a_blk @ b_blk + a_blk.pop() + b_blk.pop() + c_blk = c_dfb.wait() + d_blk = d_dfb.wait() + blk_b += c_blk @ d_blk + c_blk.pop() + d_blk.pop() + blk_a.push() + blk_b.push() + + @ttl.datamovement() + def reader(): + for kt in range(Kt): + with a_dfb.reserve() as blk: + ttl.copy(a[0:1, kt : kt + 1], blk).wait() + with b_dfb.reserve() as blk: + ttl.copy(b[kt : kt + 1, 0:1], blk).wait() + with c_dfb.reserve() as blk: + ttl.copy(c[0:1, kt : kt + 1], blk).wait() + with d_dfb.reserve() as blk: + ttl.copy(d[kt : kt + 1, 0:1], blk).wait() + + @ttl.datamovement() + def writer(): + with out_a_dfb.wait() as blk: + ttl.copy(blk, out_a[0:1, 0:1]).wait() + with out_b_dfb.wait() as blk: + ttl.copy(blk, out_b[0:1, 0:1]).wait() + + return kernel + + +@pytest.mark.requires_device +def test_l1_acc_multi_output(device): + """Two independent += outputs in the same K loop.""" + Kt = 4 + a_torch = torch.randn(TILE, Kt * TILE, dtype=torch.bfloat16) + b_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16) + c_torch = torch.randn(TILE, Kt * TILE, dtype=torch.bfloat16) + d_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16) + 
golden_a = (a_torch.float() @ b_torch.float()).float() + golden_b = (c_torch.float() @ d_torch.float()).float() + + a_dev = to_dram(a_torch, device) + b_dev = to_dram(b_torch, device) + c_dev = to_dram(c_torch, device) + d_dev = to_dram(d_torch, device) + out_a_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device) + out_b_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device) + + kernel = _make_multi_output_kernel(Kt) + kernel(a_dev, b_dev, c_dev, d_dev, out_a_dev, out_b_dev) + + result_a = ttnn.to_torch(out_a_dev).float() + result_b = ttnn.to_torch(out_b_dev).float() + assert_pcc(golden_a, result_a, threshold=0.999) + assert_pcc(golden_b, result_b, threshold=0.999) diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir index 9d329427a..07750a148 100644 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -1,9 +1,11 @@ // Verifies ttkernel-insert-l1-accumulation: pack_reconfig_l1_acc guards are // inserted around reduction loops. The enable call happens once after the // first iteration's last pack (iv == lb), and disable guards bracket the -// outermost loop. +// accumulation scope. // RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s +// Idempotency: running twice produces the same output. +// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation, ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s // Basic L1 acc loop: enable after first iteration, disable before/after loop. 
@@ -20,6 +22,7 @@ // CHECK: } // CHECK: ttkernel.cb_push_back // CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32) +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc func.func @basic_l1_acc_loop() attributes {ttkernel.thread = #ttkernel.thread} { %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> %c0 = arith.constant 0 : index @@ -48,6 +51,7 @@ func.func @basic_l1_acc_loop() attributes {ttkernel.thread = #ttkernel.thread} { %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> @@ -128,6 +132,7 @@ func.func @no_reduction_loop() attributes {ttkernel.thread = #ttkernel.thread} { %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> %c0 = arith.constant 0 : index @@ -187,6 +192,7 @@ func.func @l1_acc_loop_no_sync() attributes {ttkernel.thread = #ttkernel.thread< // CHECK: } // CHECK: ttkernel.cb_push_back // CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc func.func @l1_acc_inside_outer_loop() attributes {ttkernel.thread = #ttkernel.thread} { %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> %c0 = arith.constant 0 : index @@ -222,6 +228,7 @@ func.func @l1_acc_inside_outer_loop() attributes {ttkernel.thread = #ttkernel.th // CHECK: ttkernel.cb_push_back // CHECK: ttkernel.cb_push_back // CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc func.func @multi_push_after_loop() attributes {ttkernel.thread = #ttkernel.thread} { %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> @@ -259,6 +266,7 @@ func.func @multi_push_after_loop() attributes {ttkernel.thread = #ttkernel.threa // CHECK: scf.if // CHECK: 
ttkernel.pack_reconfig_l1_acc // CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc func.func @nested_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread} { %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> %c0 = arith.constant 0 : index @@ -292,6 +300,7 @@ func.func @nested_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread< // CHECK: scf.if // CHECK: ttkernel.pack_reconfig_l1_acc // CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc func.func @nested_reduction_loops() attributes {ttkernel.thread = #ttkernel.thread} { %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>> @@ -311,3 +320,149 @@ func.func @nested_reduction_loops() attributes {ttkernel.thread = #ttkernel.thre } {ttl.reduction_loop} return } + +// ----- + +// Two consecutive L1 acc loops writing to the same CB. +// The reserve/push scope spans both loops. One disable pair brackets the +// entire scope; each loop still gets its own first-iteration enable guard. + +// CHECK-LABEL: func.func @consecutive_l1_acc_loops +// CHECK: ttkernel.cb_reserve_back +// Disable before first loop. +// CHECK: ttkernel.pack_reconfig_l1_acc +// First loop with enable guard. +// CHECK: scf.for %[[IV1:.*]] = %[[LB1:.*]] to +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// CHECK: arith.cmpi eq, %[[IV1]], %[[LB1]] +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// No disable between the loops. Unconditional enable re-arms L1 acc +// after any init ops that may reset packer state.
+// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0 +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for %[[IV2:.*]] = %[[LB2:.*]] to +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// CHECK: arith.cmpi eq, %[[IV2]], %[[LB2]] +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// Push then disable. +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc +func.func @consecutive_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + ttkernel.cb_reserve_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + scf.for %iv1 = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + scf.for %iv2 = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + return +} + +// ----- + +// Single loop with two independent accumulating outputs. +// Both pack to different CBs but share one L1 acc enable/disable scope. 
+ +// CHECK-LABEL: func.func @two_outputs_one_loop +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// CHECK: ttkernel.tile_regs_acquire +// CHECK: ttkernel.pack_tile +// CHECK: ttkernel.tile_regs_release +// Enable after the last release (second output). +// CHECK: arith.cmpi eq, %[[IV]], %[[LB]] +// CHECK: scf.if +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// Two pushes then disable. +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc +func.func @two_outputs_one_loop() attributes {ttkernel.thread = #ttkernel.thread} { + %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + return +} + +// ----- + +// Idempotency: input already has pack_reconfig_l1_acc 
guards. Running +// the pass again should not insert duplicates. + +// CHECK-LABEL: func.func @already_guarded +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc +func.func @already_guarded() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c4_i32 = arith.constant 4 : i32 + ttkernel.pack_reconfig_l1_acc(%c0_i32) : (i32) -> () + scf.for %iv = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + %cmp = arith.cmpi eq, %iv, %c0 : index + scf.if %cmp { + ttkernel.pack_reconfig_l1_acc(%c1_i32) : (i32) -> () + } + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + ttkernel.pack_reconfig_l1_acc(%c0_i32) : (i32) -> () + return +} diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir index 9596fe1b2..4f6fc6861 100644 --- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir +++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir @@ -111,6 +111,149 @@ func.func @nested_only_inner( // ----- +// Two consecutive += loops to the same reserve. Both should be annotated. 
+ +// CHECK-LABEL: func.func @consecutive_loops_same_reserve +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +func.func @consecutive_loops_same_reserve( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg2: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg3: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb3 = ttl.bind_cb {cb_index = 3, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb4 = ttl.bind_cb {cb_index = 4, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %c = ttl.attach_cb %arg2, %cb2 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %d = ttl.attach_cb %arg3, %cb3 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb4 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + 
ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + scf.for %iv = %c0 to %c4 step %c1 { + %mm = ttl.matmul %c, %d : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// Loop with both += and plain .store() to same reserve. The loop +// contains an accumulating store, so it should be annotated. + +// CHECK-LABEL: func.func @mixed_acc_and_plain_store +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +func.func @mixed_acc_and_plain_store( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve : 
tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// += with non-matmul RHS (passthrough accumulation). + +// CHECK-LABEL: func.func @non_matmul_accumulate +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +func.func @non_matmul_accumulate( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb1 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + ttl.store %a, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// K=1 (single iteration loop). Should still be annotated. 
+ +// CHECK-LABEL: func.func @single_iteration +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +func.func @single_iteration( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c1 step %c1 { + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + +// += inside scf.if inside scf.for. The store's nearest enclosing +// ForOp is the outer loop, so it should be annotated. 
+ +// CHECK-LABEL: func.func @acc_inside_conditional +// CHECK: scf.for +// CHECK: } {ttl.l1_acc_loop} +func.func @acc_inside_conditional( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %cond: i1) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + scf.if %cond { + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} + +// ----- + // Loop without any store should NOT be annotated. // CHECK-LABEL: func.func @no_store From d5bcba8c1a6b8d664e4b191de25e790a61c16dff Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 09:25:06 -0700 Subject: [PATCH 27/31] update comments. 
add another test for consecutive loops targeting different CBs --- .../TTKernelInsertL1Accumulation.cpp | 16 ++++--- .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp | 4 +- .../Transforms/insert_l1_accumulation.mlir | 46 +++++++++++++++++++ 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 9a4e2a394..bbafe955e 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -144,6 +144,8 @@ struct TTKernelInsertL1AccumulationPass assignedToGroup.insert(loop.getOperation()); // Collect sibling annotated loops that share a pack CB target. + // sharePackCB walks recursively, so for nested loops (rootLoop + // wrapping loop), it finds pack_tile ops inside the inner loop. for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) { if (isa(op)) { @@ -164,16 +166,16 @@ struct TTKernelInsertL1AccumulationPass assignedToGroup.insert(sibling.getOperation()); } - // Scope ends at the last trailing cb_push_back. - Operation *lastInGroup = group.loops.size() > 1 - ? group.loops.back().getOperation() - : rootLoop.getOperation(); - group.scopeEnd = lastInGroup; - for (Operation *op = lastInGroup->getNextNode(); op; + // Find scope end: scan forward from rootLoop past sibling loops + // and trailing cb_push_back ops. Starts from rootLoop (not the + // last inner loop) because push_back ops are siblings of rootLoop, + // not of nested inner loops. 
+ group.scopeEnd = rootLoop; + for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) { if (isa(op)) { group.scopeEnd = op; - } else { + } else if (!assignedToGroup.contains(op)) { break; } } diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp index ab025919a..b2b247c6d 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp @@ -15,8 +15,8 @@ // the loop, so the same L1 slot persists across iterations). // // TTKernelInsertL1Accumulation uses the annotated loops to find enable -// points, and the enclosing cb_reserve_back/cb_push_back pair to determine -// the accumulation scope for disable guards. +// points, and groups consecutive sibling loops by shared pack CB targets +// to determine the accumulation scope for disable guards. // //===----------------------------------------------------------------------===// diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir index 07750a148..2caaee3e3 100644 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -466,3 +466,49 @@ func.func @already_guarded() attributes {ttkernel.thread = #ttkernel.thread () return } + +// ----- + +// Two consecutive annotated loops packing to DIFFERENT CBs. +// Each loop gets its own independent disable pair. 
+ +// CHECK-LABEL: func.func @different_cb_siblings +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc +func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.thread} { + %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + scf.for %iv1 = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + scf.for %iv2 = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + return +} From dea4d75a5b881f517a6f7609c9acc3e55903efba Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 09:45:35 -0700 Subject: [PATCH 28/31] final cleanup --- 
.../TTKernelInsertL1Accumulation.cpp | 16 ++++-- .../TTL/Transforms/TTLAnnotateL1AccLoops.cpp | 18 +++++++ test/python/test_matmul_l1_acc.py | 12 ++--- .../Transforms/insert_l1_accumulation.mlir | 52 +++++++++++++++++++ .../TTL/Transforms/annotate_l1_acc_loops.mlir | 34 +----------- .../annotate_l1_acc_loops_invalid.mlir | 28 ++++++++++ 6 files changed, 117 insertions(+), 43 deletions(-) create mode 100644 test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index bbafe955e..f1b8c3e7c 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -166,17 +166,23 @@ struct TTKernelInsertL1AccumulationPass assignedToGroup.insert(sibling.getOperation()); } - // Find scope end: scan forward from rootLoop past sibling loops - // and trailing cb_push_back ops. Starts from rootLoop (not the - // last inner loop) because push_back ops are siblings of rootLoop, - // not of nested inner loops. + // Find scope end: scan forward from rootLoop past grouped siblings, + // init ops between them, and trailing cb_push_back ops. Only stop + // at a non-grouped ForOp (a different accumulation scope) or a + // cb_reserve_back (start of a new reserve region). + // TODO: Consider adding structural accumulation_region ops to make this + // more robust and composable. 
group.scopeEnd = rootLoop; for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) { if (isa(op)) { group.scopeEnd = op; - } else if (!assignedToGroup.contains(op)) { + } else if (isa(op)) { break; + } else if (auto forOp = dyn_cast(op)) { + if (!assignedToGroup.contains(forOp)) { + break; + } } } diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp index b2b247c6d..20972e5ce 100644 --- a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp +++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp @@ -52,6 +52,7 @@ struct TTLAnnotateL1AccLoopsPass void runOnOperation() override { func::FuncOp func = getOperation(); DominanceInfo domInfo(func); + bool failed = false; func.walk([&](StoreOp store) { if (!store.getAccumulate()) { @@ -66,6 +67,19 @@ struct TTLAnnotateL1AccLoopsPass return; } + // Conditional += is not supported: the L1 acc enable guard is conditional + // based on the loop induction variable, not on whether a pack actually + // executed. If the condition is false on iteration 0, subsequent + // iterations accumulate into uninitialized L1. + if (store->getParentOp() != enclosingLoop.getOperation()) { + store->emitError( + "+= inside a conditional is not supported (#504); move " + "the condition outside the accumulation loop or use a " + "separate loop for the conditional path"); + failed = true; + return; + } + // The reserve must properly dominate the enclosing loop: the // reserve is outside the loop so the same L1 slot persists across // iterations. 
If the reserve is inside the loop, each iteration @@ -79,6 +93,10 @@ struct TTLAnnotateL1AccLoopsPass enclosingLoop->setAttr(kL1AccLoopAttrName, UnitAttr::get(enclosingLoop->getContext())); }); + + if (failed) { + signalPassFailure(); + } } }; diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py index 074a72637..8a8ed4d9a 100644 --- a/test/python/test_matmul_l1_acc.py +++ b/test/python/test_matmul_l1_acc.py @@ -349,12 +349,12 @@ def test_l1_acc_consecutive_loops(device): # --------------------------------------------------------------------------- -# Mixed .store() then += (overwrite first, accumulate rest). +# .store() before loop, += inside loop (overwrite then accumulate). # --------------------------------------------------------------------------- -def _make_mixed_store_acc_kernel(total_k): - """First iteration overwrites via .store(), rest accumulate via +=.""" +def _make_store_then_acc_kernel(total_k): + """.store() before the += loop, then K-1 iterations accumulate via +=.""" @ttl.operation(grid=(1, 1)) def kernel(a, b, out): @@ -396,8 +396,8 @@ def writer(): @pytest.mark.parametrize("total_k", [2, 4], ids=[f"K{k}" for k in [2, 4]]) @pytest.mark.requires_device -def test_l1_acc_mixed_store(total_k, device): - """.store() first iteration, += for rest. Result = K * (a @ b).""" +def test_l1_acc_store_then_acc(total_k, device): + """.store() before loop, += inside loop. 
Result = K * (a @ b).""" a_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16) b_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16) golden = (total_k * (a_torch.float() @ b_torch.float())).float() @@ -406,7 +406,7 @@ def test_l1_acc_mixed_store(total_k, device): b_dev = to_dram(b_torch, device) out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device) - kernel = _make_mixed_store_acc_kernel(total_k) + kernel = _make_store_then_acc_kernel(total_k) kernel(a_dev, b_dev, out_dev) result = ttnn.to_torch(out_dev).float() diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir index 2caaee3e3..b4648f589 100644 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -512,3 +512,55 @@ func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.threa ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () return } + +// ----- + +// Consecutive annotated loops with init ops between them (the real-world +// pattern from the full pipeline). The scope must span past the init ops +// to include the push after the second loop. 
+ +// CHECK-LABEL: func.func @consecutive_with_init_between +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0 +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: scf.for +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK: } +// CHECK: ttkernel.cb_push_back +// CHECK: ttkernel.pack_reconfig_l1_acc +// CHECK-NOT: ttkernel.pack_reconfig_l1_acc +func.func @consecutive_with_init_between() attributes {ttkernel.thread = #ttkernel.thread} { + %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %cb_in0 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %cb_in1 = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c4 = arith.constant 4 : index + %c4_i32 = arith.constant 4 : i32 + ttkernel.cb_reserve_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + scf.for %iv1 = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.matmul_block(%cb_in0, %cb_in0, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + // Init op between the two loops (as generated by TTKernelInsertInits). 
+ %dummy_init = arith.constant 42 : i32 + scf.for %iv2 = %c0 to %c4 step %c1 { + ttkernel.tile_regs_acquire() : () -> () + ttkernel.matmul_block(%cb_in1, %cb_in1, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> () + ttkernel.tile_regs_commit() : () -> () + ttkernel.tile_regs_wait() : () -> () + ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () + ttkernel.tile_regs_release() : () -> () + } {ttl.l1_acc_loop} + ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> () + return +} diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir index 4f6fc6861..5042dae62 100644 --- a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir +++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir @@ -152,10 +152,10 @@ func.func @consecutive_loops_same_reserve( // Loop with both += and plain .store() to same reserve. The loop // contains an accumulating store, so it should be annotated. -// CHECK-LABEL: func.func @mixed_acc_and_plain_store +// CHECK-LABEL: func.func @store_and_acc_in_same_loop // CHECK: scf.for // CHECK: } {ttl.l1_acc_loop} -func.func @mixed_acc_and_plain_store( +func.func @store_and_acc_in_same_loop( %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { %c0 = arith.constant 0 : index @@ -224,36 +224,6 @@ func.func @single_iteration( // ----- -// += inside scf.if inside scf.for. The store's nearest enclosing -// ForOp is the outer loop, so it should be annotated. 
- -// CHECK-LABEL: func.func @acc_inside_conditional -// CHECK: scf.for -// CHECK: } {ttl.l1_acc_loop} -func.func @acc_inside_conditional( - %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, - %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>, - %cond: i1) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> - %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> - %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> - %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> - %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> - %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> - scf.for %iv = %c0 to %c4 step %c1 { - scf.if %cond { - %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> - ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> - } - } - func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> -} - -// ----- - // Loop without any store should NOT be annotated. // CHECK-LABEL: func.func @no_store diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir new file mode 100644 index 000000000..494c1ba1d --- /dev/null +++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir @@ -0,0 +1,28 @@ +// += inside a conditional is not supported. 
+ +// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --verify-diagnostics --split-input-file + +// += inside scf.if inside scf.for is rejected. + +func.func @acc_inside_conditional( + %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>, + %cond: i1) -> tensor<1x1x!ttcore.tile<32x32, bf16>> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2> + %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> + %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + scf.for %iv = %c0 to %c4 step %c1 { + scf.if %cond { + %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>> + // expected-error @below {{+= inside a conditional is not supported (#504)}} + ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> + } + } + func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>> +} From 6003a7d0dfa060301c3f5e94e39810836d8e82b3 Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 10:44:59 -0700 Subject: [PATCH 29/31] Downgrade from `mm_block_init` to `mm_block_init_short` for the 2nd+ loop in an accumulation group to avoid clobbering the Pack_L1_Acc register on 
Wormhole, while keeping the UNPACK+MATH reconfiguration that's needed for the different input CBs. --- .../TTKernelInsertL1Accumulation.cpp | 29 ++++++++++++++++++- .../Transforms/insert_l1_accumulation.mlir | 10 +++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index f1b8c3e7c..2fe14aaeb 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -189,7 +189,34 @@ struct TTKernelInsertL1AccumulationPass groups.push_back(std::move(group)); } - // Step 2: Emit guards per group. + // Step 2: For the 2nd+ loop in each group, downgrade full + // MatmulBlockInitOp to MatmulBlockInitShortOp. The full init + // writes config.val[3]=0 which clobbers the Pack_L1_Acc register + // bits on Wormhole. init_short only reconfigures UNPACK+MATH, + // leaving the PACK configuration (including L1 acc) intact. + for (auto &group : groups) { + for (size_t idx = 1; idx < group.loops.size(); ++idx) { + scf::ForOp loop = group.loops[idx]; + // The init was hoisted before the loop by InsertInits. + for (Operation *op = loop->getPrevNode(); op; op = op->getPrevNode()) { + if (auto fullInit = dyn_cast(op)) { + OpBuilder builder(fullInit); + ttk::MatmulBlockInitShortOp::create( + builder, fullInit->getLoc(), fullInit.getIn0Cb(), + fullInit.getIn1Cb(), fullInit.getTranspose(), + fullInit.getCtDim(), fullInit.getRtDim(), fullInit.getKtDim()); + fullInit->erase(); + break; + } + // Stop at a loop or other boundary. + if (isa(op)) { + break; + } + } + } + } + + // Step 3: Emit guards per group. 
for (auto &group : groups) { OpBuilder builder(group.rootLoop->getContext()); Location disableLoc = group.rootLoop->getLoc(); diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir index b4648f589..c92fa4e2b 100644 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -524,7 +524,10 @@ func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.threa // CHECK: scf.for // CHECK: ttkernel.pack_reconfig_l1_acc // CHECK: } -// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0 +// The full mm_block_init between loops is downgraded to init_short +// to avoid clobbering Pack_L1_Acc register bits. +// CHECK-NOT: "ttkernel.mm_block_init"( +// CHECK: "ttkernel.mm_block_init_short"( // CHECK: ttkernel.pack_reconfig_l1_acc // CHECK: scf.for // CHECK: ttkernel.pack_reconfig_l1_acc @@ -551,8 +554,9 @@ func.func @consecutive_with_init_between() attributes {ttkernel.thread = #ttkern ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () ttkernel.tile_regs_release() : () -> () } {ttl.l1_acc_loop} - // Init op between the two loops (as generated by TTKernelInsertInits). - %dummy_init = arith.constant 42 : i32 + // Full init between the two loops (as generated by TTKernelInsertInits). + // Should be downgraded to init_short by InsertL1Accumulation. 
+ "ttkernel.mm_block_init"(%cb_in1, %cb_in1, %cb, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> () scf.for %iv2 = %c0 to %c4 step %c1 { ttkernel.tile_regs_acquire() : () -> () ttkernel.matmul_block(%cb_in1, %cb_in1, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> () From fd5b0bf6c945fb51b676ebb8daa4f4eb7464e3bf Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 11:57:57 -0700 Subject: [PATCH 30/31] a bit more refactoring cleanup --- include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h | 38 +++--- .../Transforms/TTKernelInsertInits.cpp | 33 ++++- .../TTKernelInsertL1Accumulation.cpp | 118 +----------------- lib/Dialect/TTL/IR/TTLOpsUtils.cpp | 105 ++++++++++++++++ .../Transforms/insert_l1_accumulation.mlir | 11 +- 5 files changed, 160 insertions(+), 145 deletions(-) diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h index fbe4f5323..9db998b35 100644 --- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h +++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h @@ -5,17 +5,15 @@ #ifndef TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H #define TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H -#include "ttlang/Dialect/TTL/IR/TTL.h" -#include "ttlang/Dialect/TTL/IR/TTLOps.h" -#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h" -#include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h" - #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Interfaces/ViewLikeInterface.h" +#include "ttlang/Dialect/TTL/IR/TTL.h" +#include "ttlang/Dialect/TTL/IR/TTLOps.h" +#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h" 
#include "llvm/ADT/SetVector.h" #include #include @@ -430,24 +428,22 @@ inline TileOp createTileOpWithPlaceholderDstIndex(OpBuilder &builder, } /// Collect the CB values targeted by pack_tile ops inside a loop. -inline llvm::SmallDenseSet getPackTileCBs(scf::ForOp loop) { - namespace ttk = mlir::tt::ttkernel; - llvm::SmallDenseSet cbs; - loop->walk([&](ttk::PackTileOp packOp) { cbs.insert(packOp.getOutCb()); }); - return cbs; -} +llvm::SmallDenseSet getPackTileCBs(scf::ForOp loop); /// Returns true if two loops share any pack_tile CB target. -inline bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB) { - auto cbsA = getPackTileCBs(loopA); - auto cbsB = getPackTileCBs(loopB); - for (auto cb : cbsA) { - if (cbsB.contains(cb)) { - return true; - } - } - return false; -} +bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB); + +/// A group of consecutive sibling loops that pack to the same output CB. +struct LoopGroup { + scf::ForOp rootLoop; + SmallVector loops; + Operation *scopeEnd = nullptr; +}; + +/// Collect groups of annotated sibling loops that share a pack CB target. 
+SmallVector collectLoopGroups( + ArrayRef l1AccLoops, + const llvm::SmallDenseMap &enablePointPerLoop); } // namespace mlir::tt::ttl diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp index f6ddc93fa..075bc1184 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp @@ -23,6 +23,7 @@ //===----------------------------------------------------------------------===// #include "ttlang/Dialect/TTL/IR/TTL.h" +#include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h" #include "ttlang/Dialect/TTL/Passes.h" #include "ttmlir/Dialect/TTKernel/IR/TTKernel.h" @@ -403,8 +404,36 @@ static LogicalResult insertCommonInits(ModuleOp moduleOp) { inputCB = outputCB; } - if (analysis.hasMatmul && in0CB && in1CB) { - // mm_block_init configures UNPACK + MATH + PACK for matmul_block. + // When a matmul init is hoisted before a loop that shares an + // output CB with a preceding sibling annotated loop, use + // init_short. The full init reconfigures the PACK pipeline + // which clobbers packer state (including L1 acc on Wormhole). + // init_short only reconfigures UNPACK+MATH. 
+ bool useInitShort = false; + if (analysis.hasMatmul) { + if (auto forOp = dyn_cast(insertBefore)) { + if (forOp->hasAttr(kL1AccLoopAttrName) || + forOp->hasAttr(kReductionLoopAttrName)) { + for (Operation *prev = forOp->getPrevNode(); prev; + prev = prev->getPrevNode()) { + if (auto prevFor = dyn_cast(prev)) { + if ((prevFor->hasAttr(kL1AccLoopAttrName) || + prevFor->hasAttr(kReductionLoopAttrName)) && + sharePackCB(prevFor, forOp)) { + useInitShort = true; + } + break; + } + } + } + } + } + + if (analysis.hasMatmul && in0CB && in1CB && useInitShort) { + ttk::MatmulBlockInitShortOp::create( + builder, loc, in0CB, in1CB, analysis.matmulTranspose, + analysis.matmulCt, analysis.matmulRt, analysis.matmulKt); + } else if (analysis.hasMatmul && in0CB && in1CB) { ttk::MatmulBlockInitOp::create( builder, loc, in0CB, in1CB, outputCB, analysis.matmulTranspose, analysis.matmulCt, analysis.matmulRt, analysis.matmulKt); diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp index 2fe14aaeb..21a029bce 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp @@ -47,22 +47,6 @@ static scf::ForOp findL1AccLoop(Operation *op) { return reductionFallback; } -/// Walk from loop up through parent ops, returning the outermost -/// annotated ancestor. Returns loop itself if no annotated ancestor exists. 
-static scf::ForOp findOutermostAnnotatedAncestor(scf::ForOp loop) { - scf::ForOp outermost = loop; - for (Operation *parent = loop->getParentOp(); parent; - parent = parent->getParentOp()) { - if (auto parentFor = dyn_cast(parent)) { - if (parentFor->hasAttr(kL1AccLoopAttrName) || - parentFor->hasAttr(kReductionLoopAttrName)) { - outermost = parentFor; - } - } - } - return outermost; -} - struct TTKernelInsertL1AccumulationPass : public impl::TTKernelInsertL1AccumulationBase< TTKernelInsertL1AccumulationPass> { @@ -117,106 +101,10 @@ struct TTKernelInsertL1AccumulationPass } } - // Step 1: Group loops into accumulation scopes. Consecutive sibling - // loops that pack to the same CB share a single disable pair. Nested - // annotated loops are folded into the outermost ancestor. - struct AccGroup { - scf::ForOp rootLoop; - SmallVector loops; - Operation *scopeEnd = nullptr; - }; - SmallVector groups; - llvm::SmallDenseSet assignedToGroup; - - for (auto loop : l1AccLoops) { - if (!l1AccEnablePoint.count(loop.getOperation())) { - continue; - } - if (assignedToGroup.contains(loop.getOperation())) { - continue; - } - - scf::ForOp rootLoop = findOutermostAnnotatedAncestor(loop); - - AccGroup group; - group.rootLoop = rootLoop; - group.loops.push_back(loop); - assignedToGroup.insert(loop.getOperation()); - - // Collect sibling annotated loops that share a pack CB target. - // sharePackCB walks recursively, so for nested loops (rootLoop - // wrapping loop), it finds pack_tile ops inside the inner loop. 
- for (Operation *op = rootLoop->getNextNode(); op; - op = op->getNextNode()) { - if (isa(op)) { - break; - } - auto sibling = dyn_cast(op); - if (!sibling) { - continue; - } - if (!sibling->hasAttr(kL1AccLoopAttrName) && - !sibling->hasAttr(kReductionLoopAttrName)) { - break; - } - if (!sharePackCB(rootLoop, sibling)) { - break; - } - group.loops.push_back(sibling); - assignedToGroup.insert(sibling.getOperation()); - } - - // Find scope end: scan forward from rootLoop past grouped siblings, - // init ops between them, and trailing cb_push_back ops. Only stop - // at a non-grouped ForOp (a different accumulation scope) or a - // cb_reserve_back (start of a new reserve region). - // TODO: Consider adding structural accumulation_region ops to make this - // more robust and composable. - group.scopeEnd = rootLoop; - for (Operation *op = rootLoop->getNextNode(); op; - op = op->getNextNode()) { - if (isa(op)) { - group.scopeEnd = op; - } else if (isa(op)) { - break; - } else if (auto forOp = dyn_cast(op)) { - if (!assignedToGroup.contains(forOp)) { - break; - } - } - } - - groups.push_back(std::move(group)); - } - - // Step 2: For the 2nd+ loop in each group, downgrade full - // MatmulBlockInitOp to MatmulBlockInitShortOp. The full init - // writes config.val[3]=0 which clobbers the Pack_L1_Acc register - // bits on Wormhole. init_short only reconfigures UNPACK+MATH, - // leaving the PACK configuration (including L1 acc) intact. - for (auto &group : groups) { - for (size_t idx = 1; idx < group.loops.size(); ++idx) { - scf::ForOp loop = group.loops[idx]; - // The init was hoisted before the loop by InsertInits. 
- for (Operation *op = loop->getPrevNode(); op; op = op->getPrevNode()) { - if (auto fullInit = dyn_cast(op)) { - OpBuilder builder(fullInit); - ttk::MatmulBlockInitShortOp::create( - builder, fullInit->getLoc(), fullInit.getIn0Cb(), - fullInit.getIn1Cb(), fullInit.getTranspose(), - fullInit.getCtDim(), fullInit.getRtDim(), fullInit.getKtDim()); - fullInit->erase(); - break; - } - // Stop at a loop or other boundary. - if (isa(op)) { - break; - } - } - } - } + // Group consecutive sibling loops that pack to the same CB. + auto groups = collectLoopGroups(l1AccLoops, l1AccEnablePoint); - // Step 3: Emit guards per group. + // Emit guards per group. for (auto &group : groups) { OpBuilder builder(group.rootLoop->getContext()); Location disableLoc = group.rootLoop->getLoc(); diff --git a/lib/Dialect/TTL/IR/TTLOpsUtils.cpp b/lib/Dialect/TTL/IR/TTLOpsUtils.cpp index de5bd5a2c..3e4993885 100644 --- a/lib/Dialect/TTL/IR/TTLOpsUtils.cpp +++ b/lib/Dialect/TTL/IR/TTLOpsUtils.cpp @@ -4,6 +4,8 @@ #include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h" +#include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h" + namespace mlir::tt::ttl { //===----------------------------------------------------------------------===// @@ -132,4 +134,107 @@ llvm::StringRef describeTraceFailure(TraceFailureReason reason) { llvm_unreachable("unhandled TraceFailureReason"); } +//===----------------------------------------------------------------------===// +// Loop grouping for L1 accumulation and init selection +//===----------------------------------------------------------------------===// + +namespace ttk = mlir::tt::ttkernel; + +llvm::SmallDenseSet getPackTileCBs(scf::ForOp loop) { + llvm::SmallDenseSet cbs; + loop->walk([&](ttk::PackTileOp packOp) { cbs.insert(packOp.getOutCb()); }); + return cbs; +} + +bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB) { + auto cbsA = getPackTileCBs(loopA); + auto cbsB = getPackTileCBs(loopB); + for (auto cb : cbsA) { + if (cbsB.contains(cb)) { + return true; + } + } + 
return false; +} + +SmallVector collectLoopGroups( + ArrayRef l1AccLoops, + const llvm::SmallDenseMap &enablePointPerLoop) { + // Find the outermost annotated ancestor of a loop. + auto findRoot = [](scf::ForOp loop) -> scf::ForOp { + scf::ForOp outermost = loop; + for (Operation *parent = loop->getParentOp(); parent; + parent = parent->getParentOp()) { + if (auto parentFor = dyn_cast(parent)) { + if (parentFor->hasAttr(kL1AccLoopAttrName) || + parentFor->hasAttr(kReductionLoopAttrName)) { + outermost = parentFor; + } + } + } + return outermost; + }; + + SmallVector groups; + llvm::SmallDenseSet assigned; + + for (auto loop : l1AccLoops) { + if (!enablePointPerLoop.count(loop.getOperation())) { + continue; + } + if (assigned.contains(loop.getOperation())) { + continue; + } + + scf::ForOp rootLoop = findRoot(loop); + + LoopGroup group; + group.rootLoop = rootLoop; + group.loops.push_back(loop); + assigned.insert(loop.getOperation()); + + // Collect sibling annotated loops that share a pack CB target. + // sharePackCB walks recursively, so for nested loops (rootLoop + // wrapping loop), it finds pack_tile ops inside the inner loop. + for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) { + if (isa(op)) { + break; + } + auto sibling = dyn_cast(op); + if (!sibling) { + continue; + } + if (!sibling->hasAttr(kL1AccLoopAttrName) && + !sibling->hasAttr(kReductionLoopAttrName)) { + break; + } + if (!sharePackCB(rootLoop, sibling)) { + break; + } + group.loops.push_back(sibling); + assigned.insert(sibling.getOperation()); + } + + // Find scope end: scan forward from rootLoop past grouped siblings, + // init ops between them, and trailing cb_push_back ops. Only stop + // at a non-grouped ForOp or a cb_reserve_back. 
+ group.scopeEnd = rootLoop; + for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) { + if (isa(op)) { + group.scopeEnd = op; + } else if (isa(op)) { + break; + } else if (auto forOp = dyn_cast(op)) { + if (!assigned.contains(forOp)) { + break; + } + } + } + + groups.push_back(std::move(group)); + } + + return groups; +} + } // namespace mlir::tt::ttl diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir index c92fa4e2b..ca2ba92d4 100644 --- a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir +++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir @@ -524,10 +524,7 @@ func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.threa // CHECK: scf.for // CHECK: ttkernel.pack_reconfig_l1_acc // CHECK: } -// The full mm_block_init between loops is downgraded to init_short -// to avoid clobbering Pack_L1_Acc register bits. -// CHECK-NOT: "ttkernel.mm_block_init"( -// CHECK: "ttkernel.mm_block_init_short"( +// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0 // CHECK: ttkernel.pack_reconfig_l1_acc // CHECK: scf.for // CHECK: ttkernel.pack_reconfig_l1_acc @@ -554,9 +551,9 @@ func.func @consecutive_with_init_between() attributes {ttkernel.thread = #ttkern ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> () ttkernel.tile_regs_release() : () -> () } {ttl.l1_acc_loop} - // Full init between the two loops (as generated by TTKernelInsertInits). - // Should be downgraded to init_short by InsertL1Accumulation. 
- "ttkernel.mm_block_init"(%cb_in1, %cb_in1, %cb, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> () + // init_short between the two loops (InsertInits emits init_short when + // sibling loops share an output CB, to avoid clobbering PACK config). + "ttkernel.mm_block_init_short"(%cb_in1, %cb_in1, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> () scf.for %iv2 = %c0 to %c4 step %c1 { ttkernel.tile_regs_acquire() : () -> () ttkernel.matmul_block(%cb_in1, %cb_in1, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> () From 3394dfc1d84d9ad27e57f9cb588137f5db23a3db Mon Sep 17 00:00:00 2001 From: Boyana Norris Date: Tue, 14 Apr 2026 12:02:33 -0700 Subject: [PATCH 31/31] reorg includes --- include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h index 9db998b35..2f2b6763a 100644 --- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h +++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h @@ -5,16 +5,18 @@ #ifndef TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H #define TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H +#include "ttlang/Dialect/TTL/IR/TTL.h" +#include "ttlang/Dialect/TTL/IR/TTLOps.h" +#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h" + #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Interfaces/ViewLikeInterface.h" -#include "ttlang/Dialect/TTL/IR/TTL.h" -#include "ttlang/Dialect/TTL/IR/TTLOps.h" 
-#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h" #include "llvm/ADT/SetVector.h" + #include #include