From 3416b302782a42e5146d573c8c3f5e116be6d269 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 7 May 2026 22:15:50 -0700
Subject: [PATCH 01/11] fix lastOwnedUse computation

---
 .../TTL/Transforms/TTLInsertCBSync.cpp        | 131 ++++-
 test/python/test_auto_pop_push.py             | 544 ++++++++++++++++++
 .../TTL/Transforms/insert_cb_sync.mlir        | 101 ++++
 third-party/tt-mlir                           |   2 +-
 4 files changed, 764 insertions(+), 14 deletions(-)
 create mode 100644 test/python/test_auto_pop_push.py
diff --git a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
index 64708a847..03d4518a4 100644
--- a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
@@ -106,7 +106,8 @@ static bool directDFBUseMatchesAcquire(AcquireInterval interval,
 }
 
 static bool projectToAcquireBlock(AcquireInterval interval, Operation *op,
-                                  Operation *&projected) {
+                                  Operation *&projected,
+                                  bool ignoreBoundary = false) {
   Block *block = interval.acquire->getBlock();
   projected = op->getBlock() == block ? op : block->findAncestorOpInBlock(*op);
   if (!projected) {
@@ -115,7 +116,7 @@ static bool projectToAcquireBlock(AcquireInterval interval, Operation *op,
   if (!isBefore(interval.acquire, projected)) {
     return false;
   }
-  if (interval.syncClassBoundary &&
+  if (!ignoreBoundary && interval.syncClassBoundary &&
       !isBefore(projected, interval.syncClassBoundary)) {
     return false;
   }
@@ -129,12 +130,26 @@ static void updateLatestUse(Operation *candidate, Operation *&latest) {
 }
 
 /// Find releases owned by this acquire interval.
+///
+/// `lastOwnedUse` extends the release-search upper bound past the
+/// next-acquire boundary when the interval's tensor SSA uses live past it
+/// (the deferred-use case). Without this extension the pass would not be
+/// idempotent: the cb_pop inserted after the deferred use would lie past
+/// the next-acquire boundary, and a subsequent run would re-insert it.
 static ReleaseSearch findOwnedReleases(AcquireInterval interval,
+                                       Operation *lastOwnedUse,
                                        ArrayRef<Operation *> allReleases,
                                        const DenseSet<Operation *> &erased) {
   ReleaseSearch result;
   Block *block = interval.acquire->getBlock();
 
+  // Accept same-block releases at or after `lastOwnedUse` even past the
+  // next-acquire boundary, but only when the use itself sits past it.
+  bool useExtendsPastBoundary =
+      lastOwnedUse && lastOwnedUse != interval.acquire &&
+      interval.syncClassBoundary &&
+      !isBefore(lastOwnedUse, interval.syncClassBoundary);
+
   for (Operation *release : allReleases) {
     if (erased.contains(release)) {
       continue;
@@ -145,10 +160,20 @@ static ReleaseSearch findOwnedReleases(AcquireInterval interval,
 
     if (release->getBlock() == block) {
       Operation *projected = nullptr;
-      if (!projectToAcquireBlock(interval, release, projected)) {
+      if (projectToAcquireBlock(interval, release, projected)) {
+        result.hasSameLevelRelease = true;
         continue;
       }
-      result.hasSameLevelRelease = true;
+      // Boundary failed. Re-check with the extended upper bound to keep
+      // the pass idempotent in the deferred-use shape: a release at or
+      // after the acquire's last owned use is the one this acquire would
+      // have inserted, so treat it as same-level.
+      if (useExtendsPastBoundary &&
+          projectToAcquireBlock(interval, release, projected,
+                                /*ignoreBoundary=*/true) &&
+          !isBefore(projected, lastOwnedUse)) {
+        result.hasSameLevelRelease = true;
+      }
       continue;
     }
 
@@ -197,17 +222,77 @@ static Operation *findNextSyncClassAcquire(Value cb, Operation *acquire,
 }
 
 /// Return the last op in `acquire`'s block that consumes the acquired slot.
-/// Tensor uses follow the acquire result; direct DFB copies use direction.
-/// `boundary` stops the scan at the next `cb_reserve` for reserve intervals or
-/// the next `cb_wait` for wait intervals.
+///
+/// ## Ownership
+///
+/// A use `U` is *owned by* `acquire` if `U` accesses the slot `acquire`
+/// acquired. Two disjoint criteria establish ownership:
+///
+/// **(a) SSA criterion** -- `U` is reachable from `acquire`'s result
+/// through identity-shaped tensor ops (`attach_cb`, `tensor.extract`,
+/// `tensor.extract_slice`, compute ops, `ttl.store`). Per-tile SSA values
+/// uniquely identify their source acquire, so this criterion has no
+/// positional bound: a use of `cb_wait t1`'s tile is owned by `t1`
+/// regardless of where it appears, even past later acquires on the same
+/// DFB.
+///
+/// **(b) Op-order criterion** -- `U` references the CB directly as a
+/// `ttl.copy` operand on the side matching the acquire's sync class (the
+/// DM-thread case, e.g. `ttl.copy %cb, %slice` for a writer). With no SSA
+/// tile handle, ownership is positional: `U` belongs to the latest
+/// acquire on `(cb, sync class)` that precedes it in op order.
+/// Equivalently, `U` is bounded between `acquire` and
+/// `interval.syncClassBoundary`.
+///
+/// The criteria are disjoint because DM-thread `ttl.copy` does not flow
+/// through `attach_cb` (it takes the CB directly), and compute-thread
+/// uses always go through `attach_cb` and never reference the CB as a
+/// direct operand of a tile op.
+///
+/// ### Why two criteria
+///
+/// Compute threads work through SSA tile handles
+/// (`cb_wait` result -> `attach_cb` -> `ttl.store` / compute ops), so (a)
+/// applies and the next-acquire boundary is irrelevant -- SSA already
+/// distinguishes which slot the use refers to. DM threads use direct CB
+/// references (`ttl.copy %cb, %slice`) where no tile handle exists, so
+/// (b) is the fallback and the boundary is essential to disambiguate
+/// between consecutive direct uses on the same CB. Unifying would require
+/// changing `ttl.copy` to take the attached tensor instead of the CB -- a
+/// dialect change deferred as future work.
+///
+/// ## Invariants on the inserted release
+///
+/// For each acquire `A`, the inserted release `R_A` must satisfy:
+///
+/// 1. **Causal dominance** -- every owned use of `A` precedes `R_A` in op
+///    order (after projecting nested uses to `A`'s block). This pass
+///    enforces it directly: the release is positioned after the last
+///    owned use returned by this function.
+///
+/// 2. **FIFO monotonicity** -- for `A_0 < A_1 < ...` on the same
+///    `(cb, sync class)`, the inserted releases satisfy
+///    `R_0 < R_1 < ...` in op order. The CB front pointer advances
+///    monotonically; out-of-order pops would advance it past slots whose
+///    data is still needed.
+///
+/// (1) is enforced explicitly here. (2) is enforced *implicitly* when
+/// consumers under criterion (a) appear in declaration order
+/// (`use(t1); use(t2); use(t3)`), because the resulting `lastUse(A_i)`
+/// values are then themselves in op order. Reordered consumes
+/// (`use(t2); use(t1)`) silently violate (2): the pass places `R_0` after
+/// `R_1` and the front pointer advances past `t1`'s slot before `t1` is
+/// read. Lifting that restriction is future work that requires a
+/// multi-tile `cb_wait_front(N)` with per-acquire `src_idx` so each
+/// consumer reads its tile by index, decoupled from pop ordering.
 static Operation *findLastOwnedUse(AcquireInterval interval) {
   Operation *last = interval.acquire;
   DenseSet<Operation *> visited;
   SmallVector<Value, 8> worklist;
 
-  auto extend = [&](Operation *user) {
+  auto extend = [&](Operation *user, bool ignoreBoundary) {
     Operation *projected = nullptr;
-    if (!projectToAcquireBlock(interval, user, projected)) {
+    if (!projectToAcquireBlock(interval, user, projected, ignoreBoundary)) {
       return false;
     }
     if (!visited.insert(user).second) {
@@ -220,6 +305,10 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
     return true;
   };
 
+  // Direct DFB uses: start from the CB value's users and recurse through
+  // their SSA results (e.g. ttl.copy returns a transfer_handle whose ttl.wait
+  // marks the actual end of the transfer). Boundary applies because a direct
+  // DFB use past the next same-class acquire belongs to that later interval.
   for (OpOperand &use : interval.cb.getUses()) {
     Operation *user = use.getOwner();
     if (user == interval.acquire) {
@@ -231,9 +320,24 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
     if (!directDFBUseMatchesAcquire(interval, user)) {
       continue;
     }
-    extend(user);
+    extend(user, /*ignoreBoundary=*/false);
+  }
+  while (!worklist.empty()) {
+    Value value = worklist.pop_back_val();
+    for (OpOperand &use : value.getUses()) {
+      Operation *user = use.getOwner();
+      if (isa<CBPushOp, CBPopOp>(user)) {
+        continue;
+      }
+      extend(user, /*ignoreBoundary=*/false);
+    }
   }
 
+  // Tensor SSA uses: start from the acquire's result and recurse through
+  // attach_cb / store / compute users. The next-acquire boundary does NOT
+  // apply: a tile produced by `cb_wait t1` may legitimately be consumed
+  // after `cb_wait t2`. Bounding this walk caused the issue #536 follow-up
+  // bug.
   if (interval.acquire->getNumResults() > 0) {
     worklist.push_back(interval.acquire->getResult(0));
   }
@@ -244,7 +348,7 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
       if (isa<CBPushOp, CBPopOp>(user)) {
         continue;
       }
-      extend(user);
+      extend(user, /*ignoreBoundary=*/true);
     }
   }
 
@@ -266,7 +370,9 @@ static void insertMissingReleases(ArrayRef<Operation *> acquires,
                                   CreateReleaseFn createRelease) {
   for (Operation *acquire : acquires) {
     AcquireInterval interval = makeAcquireInterval(acquire, acquires);
-    ReleaseSearch releaseSearch = findOwnedReleases(interval, releases, erased);
+    Operation *last = findLastOwnedUse(interval);
+    ReleaseSearch releaseSearch =
+        findOwnedReleases(interval, last, releases, erased);
     if (releaseSearch.hasSameLevelRelease) {
       continue;
     }
@@ -276,7 +382,6 @@ static void insertMissingReleases(ArrayRef<Operation *> acquires,
       nestedRelease->erase();
     }
 
-    Operation *last = findLastOwnedUse(interval);
     builder.setInsertionPointAfter(last);
     createRelease(builder, acquire->getLoc(), interval.cb);
   }
diff --git a/test/python/test_auto_pop_push.py b/test/python/test_auto_pop_push.py
new file mode 100644
index 000000000..dd58dc132
--- /dev/null
+++ b/test/python/test_auto_pop_push.py
@@ -0,0 +1,544 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Coverage for ttl-insert-cb-sync auto-injection edge cases.
+
+Each test exercises a distinct shape that the auto pop/push placement must
+handle, including the issue #536 follow-up case_a and case_b reproducers
+(deferred consumer uses across multiple consecutive cb.wait() calls on the
+same DFB).
+"""
+
+import pytest
+import torch
+
+ttnn = pytest.importorskip("ttnn", exc_type=ImportError)
+
+import ttl  # noqa: E402
+
+from ttlang_test_utils import to_dram  # noqa: E402
+
+TILE = 32
+
+
+# ---------------------------------------------------------------------------
+# Deferred consumer uses across multiple consecutive cb.wait() calls.
+#
+# Before this fix, the auto-pop pass clamped each wait's owned-use search at
+# the next wait on the same DFB. When an earlier wait's tile was consumed
+# past the later waits (e.g., 4 waits followed by 4 stores), the pass missed
+# the use, placed the pop right after the wait, and the read pointer advanced
+# before the data was consumed. See issue #536 follow-up comment.
+# ---------------------------------------------------------------------------
+
+
+def _run(device, kernel, num_out_tiles, expected):
+    out_t = to_dram(
+        torch.full((TILE, num_out_tiles * TILE), -42.0, dtype=torch.bfloat16),
+        device,
+    )
+    kernel(out_t)
+    ttnn.synchronize_device(device)
+    out_h = ttnn.to_torch(out_t)
+    actual = [out_h[0, i * TILE].item() for i in range(num_out_tiles)]
+    assert actual == expected, f"actual={actual} expected={expected}"
+
+
+@pytest.mark.requires_device
+def test_issue_536_followup_case_a_three_waits_no_loop(device):
+    """case_a from issue #536 follow-up: 3 consecutive cb.wait() calls in
+    compute() with no enclosing loop, all consumer stores after the last
+    wait."""
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 11.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 22.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 33.0))
+
+            t1 = cb.wait()
+            t2 = cb.wait()
+            t3 = cb.wait()
+
+            with out_cb.reserve() as o:
+                o.store(t1)
+            with out_cb.reserve() as o:
+                o.store(t2)
+            with out_cb.reserve() as o:
+                o.store(t3)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+
+    _run(device, repro, 3, [11.0, 22.0, 33.0])
+
+
+@pytest.mark.requires_device
+def test_issue_536_followup_case_b_four_waits_in_loop(device):
+    """case_b from issue #536 follow-up: 4 consecutive cb.wait() calls
+    inside a for-loop in compute(), 3 iterations, all consumer stores
+    after the four waits in each iteration."""
+
+    N_ITERS = 3
+    N_PER_ITER = 4
+    TOTAL = N_ITERS * N_PER_ITER
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=TOTAL)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 2.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 3.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 4.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 5.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 6.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 7.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 8.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 9.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 10.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 11.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 12.0))
+
+            for _ in range(N_ITERS):
+                t1 = cb.wait()
+                t2 = cb.wait()
+                t3 = cb.wait()
+                t4 = cb.wait()
+                with out_cb.reserve() as o:
+                    o.store(t1)
+                with out_cb.reserve() as o:
+                    o.store(t2)
+                with out_cb.reserve() as o:
+                    o.store(t3)
+                with out_cb.reserve() as o:
+                    o.store(t4)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            for col in range(TOTAL):
+                blk = out_cb.wait()
+                ttl.copy(blk, out[0, col]).wait()
+
+    _run(device, repro, TOTAL, [float(i + 1) for i in range(TOTAL)])
+
+
+@pytest.mark.requires_device
+def test_interleaved_wait_consume_pop_baseline(device):
+    """Sanity check: the safe shape (consume each wait before the next wait)
+    works after the #536 fix. This is the form the auto-pop pass currently
+    reasons about correctly."""
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 2.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 3.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 4.0))
+
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 3]).wait()
+
+    _run(device, repro, 4, [1.0, 2.0, 3.0, 4.0])
+
+
+# ---------------------------------------------------------------------------
+# Reused Python variable name ("tx-name collision"): the second assignment
+# rebinds the local but the first acquire's SSA value still has uses. The
+# auto-pop pass operates on SSA values, so this should be unaffected.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_python_name_reuse_does_not_alias_ssa(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            with out_cb.reserve() as v:
+                v.store(ttl.math.fill(v, 5.0))
+            with out_cb.reserve() as v:
+                v.store(ttl.math.fill(v, 6.0))
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            tx = out_cb.wait()
+            tx = ttl.copy(tx, out[0, 0])
+            tx.wait()
+            tx = out_cb.wait()
+            tx = ttl.copy(tx, out[0, 1])
+            tx.wait()
+
+    _run(device, repro, 2, [5.0, 6.0])
+
+
+# ---------------------------------------------------------------------------
+# Nested scf.for with independent acquires in the inner and outer bodies.
+# updateBoundary() only treats acquires that share a common ancestor block
+# as boundaries; an inner-loop acquire never bounds an outer-loop acquire.
+# Verify that auto-pop placement remains correct across the loop boundary.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_nested_for_independent_acquires_per_loop(device):
+    OUTER = 2
+    INNER = 3
+    TOTAL = OUTER * INNER
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=TOTAL)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 2.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 3.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 4.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 5.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 6.0))
+
+            for _outer in range(OUTER):
+                for _inner in range(INNER):
+                    with cb.wait() as src, out_cb.reserve() as dst:
+                        dst.store(src)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            for col in range(TOTAL):
+                blk = out_cb.wait()
+                ttl.copy(blk, out[0, col]).wait()
+
+    _run(device, repro, TOTAL, [float(i + 1) for i in range(TOTAL)])
+
+
+# ---------------------------------------------------------------------------
+# Mixed immediate + deferred consumer uses. Some cb.wait results are consumed
+# before the next wait; others are consumed after multiple subsequent waits.
+# Boundary handling must be correct for both shapes simultaneously.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_mixed_immediate_and_deferred_consumes(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 100.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 200.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 300.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 400.0))
+
+            # First wait + immediate consume.
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            # Three more waits with deferred consumes after all of them.
+            t2 = cb.wait()
+            t3 = cb.wait()
+            t4 = cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t2)
+            with out_cb.reserve() as o:
+                o.store(t3)
+            with out_cb.reserve() as o:
+                o.store(t4)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 3]).wait()
+
+    _run(device, repro, 4, [100.0, 200.0, 300.0, 400.0])
+
+
+# ---------------------------------------------------------------------------
+# Long chain of consecutive cb.wait acquires with deferred consumes. Stresses
+# the boundary-relaxed walk on a wider chain than case_a / case_b.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_eight_consecutive_waits_deferred_consumes(device):
+    N = 8
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=N)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=N)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 2.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 3.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 4.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 5.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 6.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 7.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 8.0))
+
+            t1 = cb.wait()
+            t2 = cb.wait()
+            t3 = cb.wait()
+            t4 = cb.wait()
+            t5 = cb.wait()
+            t6 = cb.wait()
+            t7 = cb.wait()
+            t8 = cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t1)
+            with out_cb.reserve() as o:
+                o.store(t2)
+            with out_cb.reserve() as o:
+                o.store(t3)
+            with out_cb.reserve() as o:
+                o.store(t4)
+            with out_cb.reserve() as o:
+                o.store(t5)
+            with out_cb.reserve() as o:
+                o.store(t6)
+            with out_cb.reserve() as o:
+                o.store(t7)
+            with out_cb.reserve() as o:
+                o.store(t8)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 3]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 4]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 5]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 6]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 7]).wait()
+
+    _run(device, repro, N, [float(i + 1) for i in range(N)])
+
+
+# ---------------------------------------------------------------------------
+# Two distinct CBs interleaved: each wait pair has deferred consumes. The
+# next-acquire boundary is per-CB; this test verifies independence.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_two_cbs_interleaved_deferred_consumes(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb_a = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+        cb_b = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+
+        @ttl.compute()
+        def compute():
+            with cb_a.reserve() as v:
+                v.store(ttl.math.fill(v, 10.0))
+            with cb_a.reserve() as v:
+                v.store(ttl.math.fill(v, 20.0))
+            with cb_b.reserve() as v:
+                v.store(ttl.math.fill(v, 30.0))
+            with cb_b.reserve() as v:
+                v.store(ttl.math.fill(v, 40.0))
+
+            # Interleave waits across two CBs; defer consumes for all four.
+            a1 = cb_a.wait()
+            b1 = cb_b.wait()
+            a2 = cb_a.wait()
+            b2 = cb_b.wait()
+            with out_cb.reserve() as o:
+                o.store(a1)
+            with out_cb.reserve() as o:
+                o.store(b1)
+            with out_cb.reserve() as o:
+                o.store(a2)
+            with out_cb.reserve() as o:
+                o.store(b2)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 3]).wait()
+
+    _run(device, repro, 4, [10.0, 30.0, 20.0, 40.0])
+
+
+# ---------------------------------------------------------------------------
+# Producer-side deferred reserves: 3 cb.reserve handles acquired, then 3
+# stores fired after all reserves. Mirror of case_a for the producer side.
+# Pattern is the explicit reserve-handle form used in test_layernorm.py and
+# simple_bcast.py rather than the `with cb.reserve() as v` form.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_three_consecutive_reserves_deferred_stores(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            r1 = cb.reserve()
+            r2 = cb.reserve()
+            r3 = cb.reserve()
+            r1.store(ttl.math.fill(r1, 7.0))
+            r2.store(ttl.math.fill(r2, 8.0))
+            r3.store(ttl.math.fill(r3, 9.0))
+
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+
+    _run(device, repro, 3, [7.0, 8.0, 9.0])
diff --git a/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir b/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
index 2db9ff751..7eb402a70 100644
--- a/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
@@ -778,3 +778,104 @@ func.func @dm_wait_before_reserve_same_dfb(
   ttl.wait %tx1 : !ttl.transfer_handle<read>
   func.return
 }
+
+// -----
+
+// Test 27: Three consecutive cb_wait acquires on the same DFB whose tensor
+// SSA uses are deferred until after every wait has been issued. The
+// next-acquire boundary must not clamp tensor-use discovery; each pop must
+// land after its own attach_cb's consumer use, naturally interleaving the
+// pops between the per-tile uses so the read pointer advances in FIFO
+// order. Regression for the issue #536 follow-up case_a reproducer.
+
+// CHECK-LABEL: func.func @three_consecutive_waits_deferred_consumers
+// CHECK: %[[CBIN:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: %[[CBOUT:.+]] = ttl.bind_cb{cb_index = 1
+// CHECK: ttl.cb_wait %[[CBIN]]
+// CHECK-NEXT: ttl.attach_cb
+// CHECK-NEXT: ttl.cb_wait %[[CBIN]]
+// CHECK-NEXT: ttl.attach_cb
+// CHECK-NEXT: ttl.cb_wait %[[CBIN]]
+// CHECK-NEXT: ttl.attach_cb
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_pop %[[CBIN]]
+// CHECK-NEXT: ttl.cb_push %[[CBOUT]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_pop %[[CBIN]]
+// CHECK-NEXT: ttl.cb_push %[[CBOUT]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_pop %[[CBIN]]
+// CHECK-NEXT: ttl.cb_push %[[CBOUT]]
+// CHECK-NOT: ttl.cb_pop
+// CHECK: return
+func.func @three_consecutive_waits_deferred_consumers()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 3} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 3} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %w0 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a0 = ttl.attach_cb %w0, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %w1 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a1 = ttl.attach_cb %w1, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %w2 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a2 = ttl.attach_cb %w2, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %r1 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a1, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %r2 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a2, %r2 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  func.return
+}
+
+// -----
+
+// Test 28: Four consecutive cb_wait acquires inside an scf.for, with their
+// consumer stores deferred to after every wait per iteration. Verifies the
+// boundary drop applies inside loop bodies and that pops are placed inside
+// the loop body (not hoisted past the loop). Regression for the issue #536
+// follow-up case_b reproducer.
+
+// CHECK-LABEL: func.func @four_consecutive_waits_in_loop
+// CHECK: scf.for
+// CHECK: ttl.cb_wait
+// CHECK: ttl.cb_wait
+// CHECK: ttl.cb_wait
+// CHECK: ttl.cb_wait
+// CHECK-COUNT-4: ttl.cb_pop
+// CHECK: }
+// CHECK-NOT: ttl.cb_pop
+// CHECK: return
+func.func @four_consecutive_waits_in_loop()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 12} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 4} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  scf.for %i = %c0 to %c3 step %c1 {
+    %w0 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a0 = ttl.attach_cb %w0, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %w1 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a1 = ttl.attach_cb %w1, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %w2 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a2 = ttl.attach_cb %w2, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %w3 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a3 = ttl.attach_cb %w3, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+    %r1 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a1, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+    %r2 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a2, %r2 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+    %r3 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a3, %r3 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  }
+  func.return
+}
diff --git a/third-party/tt-mlir b/third-party/tt-mlir
index 80d7805ff..05aff66f4 160000
--- a/third-party/tt-mlir
+++ b/third-party/tt-mlir
@@ -1 +1 @@
-Subproject commit 80d7805ff5b04778fb1ca4c08fc4197d0014be8b
+Subproject commit 05aff66f459aafa3afafc46132956cfb14ae12d0

From 21ca0f31e371fe41e1c3364d70f3cac8a6946a67 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 7 May 2026 22:52:37 -0700
Subject: [PATCH 02/11] update doc

---
 docs/development/DFBManagement.md             | 82 ++++++++++++++++---
 .../TTL/Transforms/TTLInsertCBSync.cpp        | 67 ++-------------
 2 files changed, 74 insertions(+), 75 deletions(-)

diff --git a/docs/development/DFBManagement.md b/docs/development/DFBManagement.md
index 699b61b36..b3c262108 100644
--- a/docs/development/DFBManagement.md
+++ b/docs/development/DFBManagement.md
@@ -63,29 +63,85 @@ correct DFB interval boundary.
 
 The pass treats every acquire as opening a DFB live interval. The interval
 starts at `cb_reserve` or `cb_wait` and ends after the last operation that can
-use the acquired slot. A later acquire in the same DFB sync class bounds
-release matching and use discovery, because its release belongs to a different
-live interval.
+use the acquired slot.
 
 DFB sync classes separate the producer side from the consumer side:
 `cb_reserve`/`cb_push` form producer intervals, and `cb_wait`/`cb_pop` form
 consumer intervals. Producer acquires bound other producer intervals; consumer
 acquires bound other consumer intervals.
 
-The pass finds owned uses from two sources:
-
-- Tensor-form uses follow the result of `cb_reserve` or `cb_wait` through
-  `ttl.attach_cb`, `ttl.store`, and compute operations.
-- Direct DFB uses follow `ttl.copy` operations where the DFB operand direction
-  matches the interval's DFB sync class. Producer intervals include copies into
-  the DFB; consumer intervals include copies from the DFB. This is required for
-  data movement kernels, where copies do not use the tensor value returned by
-  the acquire op.
-
 Uses inside descendant regions are projected to their ancestor operation in the
 acquire's block. This conservatively places the release after the enclosing
 structured op when the exact use is nested in an `scf.for` or `scf.if` body.
 
+### Ownership
+
+A use `U` is *owned by* `acquire` if `U` accesses the slot `acquire` acquired.
+Two disjoint criteria establish ownership:
+
+- **(a) SSA criterion** -- `U` is reachable from `acquire`'s result through
+  identity-shaped tensor ops (`attach_cb`, `tensor.extract`,
+  `tensor.extract_slice`, compute ops, `ttl.store`). Per-tile SSA values
+  uniquely identify their source acquire, so this criterion has no positional
+  bound: a use of `cb_wait t1`'s tile is owned by `t1` regardless of where it
+  appears, even past later acquires on the same DFB.
+
+- **(b) Op-order criterion** -- `U` references the CB directly as a `ttl.copy`
+  operand on the side matching the acquire's sync class (the DM-thread case,
+  e.g. `ttl.copy %cb, %slice` for a writer). With no SSA tile handle,
+  ownership is positional: `U` belongs to the latest acquire on
+  `(cb, sync class)` that precedes it in op order. Equivalently, `U` is
+  bounded between `acquire` and the next acquire on the same sync class
+  (`interval.syncClassBoundary` in the pass).
+
+The criteria are disjoint: DM-thread `ttl.copy` takes the CB directly and never
+flows through `attach_cb`, whereas compute-thread uses always go through
+`attach_cb` and never reference the CB as a direct operand of a tile op.
+
+#### Why two criteria
+
+Compute threads work through SSA tile handles
+(`cb_wait` result -> `attach_cb` -> `ttl.store` / compute ops), so (a) applies
+and the next-acquire boundary is irrelevant -- SSA already distinguishes which
+slot the use refers to. DM threads use direct CB references
+(`ttl.copy %cb, %slice`) where no tile handle exists, so (b) is the fallback
+and the boundary is essential to disambiguate consecutive direct uses on the
+same CB. Unifying would require changing `ttl.copy` to take the attached
+tensor instead of the CB, a dialect change tracked as future work.
+
+### Invariants on the inserted release
+
+For each acquire `A`, the inserted release `R_A` must satisfy:
+
+1. **Causal dominance** -- every owned use of `A` precedes `R_A` in op order
+   (after projecting nested uses to `A`'s block). The pass enforces this
+   directly: the release is positioned after the last owned use returned by
+   `findLastOwnedUse`.
+
+2. **FIFO monotonicity** -- for `A_0 < A_1 < ...` on the same `(cb, sync
+   class)`, the inserted releases satisfy `R_0 < R_1 < ...` in op order. The
+   CB front (or back) pointer advances monotonically; out-of-order pops would
+   advance it past slots whose data is still needed.
+
+(1) is enforced explicitly by the pass. (2) is enforced *implicitly* when
+consumers under criterion (a) appear in declaration order
+(`use(t1); use(t2); use(t3)`), because the resulting `lastUse(A_i)` values are
+then themselves in op order. Reordered consumes (`use(t2); use(t1)`) silently
+violate (2): the pass places `R_0` after `R_1` and the front pointer advances
+past `t1`'s slot before `t1` is read. Lifting that restriction is future work
+that requires multi-tile `cb_wait_front(N)` with per-acquire `src_idx` so each
+consumer reads its tile by index, decoupled from pop ordering.
+
+### Idempotency
+
+When the pass runs twice on the same IR, the second run must observe the
+releases inserted by the first as already-present and skip re-injection.
+Because criterion (a) places releases past the next-acquire boundary in the
+deferred-use case, `findOwnedReleases` extends its release-search upper bound
+to the acquire's last owned use. Without this extension, the second run sees
+the inserted release as past the boundary and treats the acquire as needing
+another release.
+
 ### Slot State Model
 
 The pass models producer and consumer acquires as separate slot lifetimes:
diff --git a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
index 03d4518a4..fdf225cc7 100644
--- a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
@@ -223,68 +223,11 @@ static Operation *findNextSyncClassAcquire(Value cb, Operation *acquire,
 
 /// Return the last op in `acquire`'s block that consumes the acquired slot.
 ///
-/// ## Ownership
-///
-/// A use `U` is *owned by* `acquire` if `U` accesses the slot `acquire`
-/// acquired. Two disjoint criteria establish ownership:
-///
-/// **(a) SSA criterion** -- `U` is reachable from `acquire`'s result
-/// through identity-shaped tensor ops (`attach_cb`, `tensor.extract`,
-/// `tensor.extract_slice`, compute ops, `ttl.store`). Per-tile SSA values
-/// uniquely identify their source acquire, so this criterion has no
-/// positional bound: a use of `cb_wait t1`'s tile is owned by `t1`
-/// regardless of where it appears, even past later acquires on the same
-/// DFB.
-///
-/// **(b) Op-order criterion** -- `U` references the CB directly as a
-/// `ttl.copy` operand on the side matching the acquire's sync class (the
-/// DM-thread case, e.g. `ttl.copy %cb, %slice` for a writer). With no SSA
-/// tile handle, ownership is positional: `U` belongs to the latest
-/// acquire on `(cb, sync class)` that precedes it in op order.
-/// Equivalently, `U` is bounded between `acquire` and
-/// `interval.syncClassBoundary`.
-///
-/// The criteria are disjoint because DM-thread `ttl.copy` does not flow
-/// through `attach_cb` (it takes the CB directly), and compute-thread
-/// uses always go through `attach_cb` and never reference the CB as a
-/// direct operand of a tile op.
-///
-/// ### Why two criteria
-///
-/// Compute threads work through SSA tile handles
-/// (`cb_wait` result -> `attach_cb` -> `ttl.store` / compute ops), so (a)
-/// applies and the next-acquire boundary is irrelevant -- SSA already
-/// distinguishes which slot the use refers to. DM threads use direct CB
-/// references (`ttl.copy %cb, %slice`) where no tile handle exists, so
-/// (b) is the fallback and the boundary is essential to disambiguate
-/// between consecutive direct uses on the same CB. Unifying would require
-/// changing `ttl.copy` to take the attached tensor instead of the CB -- a
-/// dialect change deferred as future work.
-///
-/// ## Invariants on the inserted release
-///
-/// For each acquire `A`, the inserted release `R_A` must satisfy:
-///
-/// 1. **Causal dominance** -- every owned use of `A` precedes `R_A` in op
-///    order (after projecting nested uses to `A`'s block). This pass
-///    enforces it directly: the release is positioned after the last
-///    owned use returned by this function.
-///
-/// 2. **FIFO monotonicity** -- for `A_0 < A_1 < ...` on the same
-///    `(cb, sync class)`, the inserted releases satisfy
-///    `R_0 < R_1 < ...` in op order. The CB front pointer advances
-///    monotonically; out-of-order pops would advance it past slots whose
-///    data is still needed.
-///
-/// (1) is enforced explicitly here. (2) is enforced *implicitly* when
-/// consumers under criterion (a) appear in declaration order
-/// (`use(t1); use(t2); use(t3)`), because the resulting `lastUse(A_i)`
-/// values are then themselves in op order. Reordered consumes
-/// (`use(t2); use(t1)`) silently violate (2): the pass places `R_0` after
-/// `R_1` and the front pointer advances past `t1`'s slot before `t1` is
-/// read. Lifting that restriction is future work that requires a
-/// multi-tile `cb_wait_front(N)` with per-acquire `src_idx` so each
-/// consumer reads its tile by index, decoupled from pop ordering.
+/// Use discovery walks two sources with different boundary policies: direct
+/// CB uses (bounded by the next same-class acquire) and tensor SSA uses
+/// (unbounded). See `docs/development/DFBManagement.md` "DFB Sync Insertion"
+/// for the full ownership model, why the criteria differ, and the causal /
+/// FIFO invariants the inserted release must satisfy.
 static Operation *findLastOwnedUse(AcquireInterval interval) {
   Operation *last = interval.acquire;
   DenseSet<Operation *> visited;

From b283893f1ef6f6718725dd1a7f5e16c9eaf6cfbf Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Thu, 7 May 2026 23:08:23 -0700
Subject: [PATCH 03/11] add tests

---
 test/python/test_auto_pop_push.py | 527 ++++++++++++++++++++++++++++++
 1 file changed, 527 insertions(+)

diff --git a/test/python/test_auto_pop_push.py b/test/python/test_auto_pop_push.py
index dd58dc132..9f9d19159 100644
--- a/test/python/test_auto_pop_push.py
+++ b/test/python/test_auto_pop_push.py
@@ -542,3 +542,530 @@ def dm_write():
             ttl.copy(blk, out[0, 2]).wait()
 
     _run(device, repro, 3, [7.0, 8.0, 9.0])
+
+
+# ---------------------------------------------------------------------------
+# Wait-result fanout. A single cb.wait() result is consumed by multiple
+# downstream stores; the SSA walk must discover every transitive use, not
+# just the first one. If it stops early, a later store reads from a slot
+# that has already been popped.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_wait_result_fanout_multiple_consumers(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 42.0))
+
+            t = cb.wait()
+            with out_cb.reserve() as o1:
+                o1.store(t)
+            with out_cb.reserve() as o2:
+                o2.store(t)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+
+    _run(device, repro, 2, [42.0, 42.0])
+
+
+# ---------------------------------------------------------------------------
+# DM-thread producer with three consecutive reserves whose ttl.copy
+# completions are deferred. Stresses criterion (b) (direct CB use) at a
+# depth beyond the existing #536-fix coverage of two consecutive reserves.
+#
+# This pattern silently miscompiles today: ttl.copy takes the CB directly
+# (not the reserve result), so SSA cannot associate each copy with its
+# specific reserve. With three reserves before any copy, all copies sit
+# past r1's next-acquire boundary and get attributed to the last reserve.
+# r1's push is inserted before any data is written. The dialect fix tracked
+# in plans/UnifyTTLCopyAcquireOwnership.md (encoding ownership in SSA via
+# the attach_cb chain) lifts this restriction; the test flips to PASS
+# then.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+@pytest.mark.xfail(
+    strict=True,
+    reason="Batched DM-thread reserve/copy/wait/push pattern needs "
+    "ttl.copy to thread the reserve result through SSA. Lifted by "
+    "the dialect change in plans/UnifyTTLCopyAcquireOwnership.md.",
+)
+def test_dm_read_three_consecutive_reserves_deferred_copies(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(inp, out):
+        inp_cb = ttl.make_dataflow_buffer_like(inp, shape=(1, 1), block_count=3)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            with inp_cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with inp_cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with inp_cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+
+        @ttl.datamovement()
+        def dm_read():
+            r1 = inp_cb.reserve()
+            r2 = inp_cb.reserve()
+            r3 = inp_cb.reserve()
+            tx1 = ttl.copy(inp[0, 0], r1)
+            tx2 = ttl.copy(inp[1, 0], r2)
+            tx3 = ttl.copy(inp[2, 0], r3)
+            tx1.wait()
+            tx2.wait()
+            tx3.wait()
+            r1.push()
+            r2.push()
+            r3.push()
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[1, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[2, 0]).wait()
+
+    torch.manual_seed(11536)
+    inp_t = to_dram(torch.randn((3 * TILE, TILE), dtype=torch.bfloat16), device)
+    out_t = to_dram(torch.full((3 * TILE, TILE), -42.0, dtype=torch.bfloat16), device)
+    repro(inp_t, out_t)
+    ttnn.synchronize_device(device)
+    inp_h = ttnn.to_torch(inp_t)
+    out_h = ttnn.to_torch(out_t)
+    assert torch.equal(out_h, inp_h)
+
+
+# ---------------------------------------------------------------------------
+# DM-thread consumer with three consecutive cb.wait() acquires whose
+# ttl.copy completions are deferred. Mirror of the dm_read producer case
+# above on the consumer side. Same dialect-level root cause.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+@pytest.mark.xfail(
+    strict=True,
+    reason="Batched DM-thread wait/copy/wait/pop pattern needs ttl.copy "
+    "to thread the wait result through SSA. Lifted by the dialect "
+    "change in plans/UnifyTTLCopyAcquireOwnership.md.",
+)
+def test_dm_write_three_consecutive_waits_deferred_copies(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            with out_cb.reserve() as v:
+                v.store(ttl.math.fill(v, 50.0))
+            with out_cb.reserve() as v:
+                v.store(ttl.math.fill(v, 60.0))
+            with out_cb.reserve() as v:
+                v.store(ttl.math.fill(v, 70.0))
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            b1 = out_cb.wait()
+            b2 = out_cb.wait()
+            b3 = out_cb.wait()
+            tx1 = ttl.copy(b1, out[0, 0])
+            tx2 = ttl.copy(b2, out[0, 1])
+            tx3 = ttl.copy(b3, out[0, 2])
+            tx1.wait()
+            tx2.wait()
+            tx3.wait()
+            b1.pop()
+            b2.pop()
+            b3.pop()
+
+    _run(device, repro, 3, [50.0, 60.0, 70.0])
+
+
+# ---------------------------------------------------------------------------
+# Cross-thread deferred chain. dm_read produces 4 tiles into inp_cb with
+# deferred pushes, compute consumes 4 from inp_cb with deferred uses,
+# dm_write writes 4 out with deferred pops. Exercises auto-injection
+# across all three threads simultaneously. Inherits the batched DM-thread
+# miscompile in both DM threads; the compute side already works.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+@pytest.mark.xfail(
+    strict=True,
+    reason="Inherits the batched DM-thread reserve/wait miscompile in "
+    "the dm_read and dm_write halves. "
+    "Lifted by the dialect change in "
+    "plans/UnifyTTLCopyAcquireOwnership.md.",
+)
+def test_cross_thread_deferred_chain(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(inp, out):
+        inp_cb = ttl.make_dataflow_buffer_like(inp, shape=(1, 1), block_count=4)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+
+        @ttl.compute()
+        def compute():
+            t1 = inp_cb.wait()
+            t2 = inp_cb.wait()
+            t3 = inp_cb.wait()
+            t4 = inp_cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t1)
+            with out_cb.reserve() as o:
+                o.store(t2)
+            with out_cb.reserve() as o:
+                o.store(t3)
+            with out_cb.reserve() as o:
+                o.store(t4)
+
+        @ttl.datamovement()
+        def dm_read():
+            r1 = inp_cb.reserve()
+            r2 = inp_cb.reserve()
+            r3 = inp_cb.reserve()
+            r4 = inp_cb.reserve()
+            tx1 = ttl.copy(inp[0, 0], r1)
+            tx2 = ttl.copy(inp[1, 0], r2)
+            tx3 = ttl.copy(inp[2, 0], r3)
+            tx4 = ttl.copy(inp[3, 0], r4)
+            tx1.wait()
+            tx2.wait()
+            tx3.wait()
+            tx4.wait()
+            r1.push()
+            r2.push()
+            r3.push()
+            r4.push()
+
+        @ttl.datamovement()
+        def dm_write():
+            b1 = out_cb.wait()
+            b2 = out_cb.wait()
+            b3 = out_cb.wait()
+            b4 = out_cb.wait()
+            tx1 = ttl.copy(b1, out[0, 0])
+            tx2 = ttl.copy(b2, out[1, 0])
+            tx3 = ttl.copy(b3, out[2, 0])
+            tx4 = ttl.copy(b4, out[3, 0])
+            tx1.wait()
+            tx2.wait()
+            tx3.wait()
+            tx4.wait()
+            b1.pop()
+            b2.pop()
+            b3.pop()
+            b4.pop()
+
+    torch.manual_seed(536)
+    inp_t = to_dram(torch.randn((4 * TILE, TILE), dtype=torch.bfloat16), device)
+    out_t = to_dram(torch.full((4 * TILE, TILE), -42.0, dtype=torch.bfloat16), device)
+    repro(inp_t, out_t)
+    ttnn.synchronize_device(device)
+    inp_h = ttnn.to_torch(inp_t)
+    out_h = ttnn.to_torch(out_t)
+    assert torch.equal(out_h, inp_h)
+
+
+# ---------------------------------------------------------------------------
+# Reordered consumes -- consumer reads tile values out of declaration
+# order. Without multi-tile coalescing, the pass places pop ops in op
+# order matching the consume sites, which violates FIFO monotonicity.
+# Documented as xfail(strict=True); flips to PASS the day the multi-tile
+# coalescing follow-on lands and provides per-acquire src_idx so
+# consumers can read by index.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+@pytest.mark.xfail(
+    strict=True,
+    reason="Reordered consumes (use(t2) before use(t1)) violate CB FIFO "
+    "monotonicity. Lifted by future multi-tile cb_wait_front(N) "
+    "coalescing with per-acquire src_idx.",
+)
+def test_reordered_consumes_violate_fifo_xfail(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 2.0))
+
+            t1 = cb.wait()
+            t2 = cb.wait()
+            # Consume t2 BEFORE t1 -- requires per-tile src_idx to be correct.
+            with out_cb.reserve() as o:
+                o.store(t2)
+            with out_cb.reserve() as o:
+                o.store(t1)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+
+    _run(device, repro, 2, [2.0, 1.0])
+
+
+# ---------------------------------------------------------------------------
+# Multi-tile block shape. shape=(1,2) means each CB slot holds two tiles.
+# Consecutive cb.wait()s with deferred consumes verify the boundary
+# handling does not assume single-tile geometry.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_multi_tile_block_shape_deferred_consumes(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(inp, out):
+        cb = ttl.make_dataflow_buffer_like(inp, shape=(1, 2), block_count=2)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 2), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            t1 = cb.wait()
+            t2 = cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t1)
+            with out_cb.reserve() as o:
+                o.store(t2)
+
+        @ttl.datamovement()
+        def dm_read():
+            r1 = cb.reserve()
+            tx1 = ttl.copy(inp[0:1, 0:2], r1)
+            tx1.wait()
+            r1.push()
+            r2 = cb.reserve()
+            tx2 = ttl.copy(inp[0:1, 2:4], r2)
+            tx2.wait()
+            r2.push()
+
+        @ttl.datamovement()
+        def dm_write():
+            b1 = out_cb.wait()
+            ttl.copy(b1, out[0:1, 0:2]).wait()
+            b1.pop()
+            b2 = out_cb.wait()
+            ttl.copy(b2, out[0:1, 2:4]).wait()
+            b2.pop()
+
+    torch.manual_seed(909)
+    inp_t = to_dram(torch.randn((TILE, 4 * TILE), dtype=torch.bfloat16), device)
+    out_t = to_dram(torch.full((TILE, 4 * TILE), -42.0, dtype=torch.bfloat16), device)
+    repro(inp_t, out_t)
+    ttnn.synchronize_device(device)
+    inp_h = ttnn.to_torch(inp_t)
+    out_h = ttnn.to_torch(out_t)
+    assert torch.equal(out_h, inp_h)
+
+
+# ---------------------------------------------------------------------------
+# Tight block_count -- block_count exactly equal to the
+# consecutive-acquire count, no slack. Producer must push all 4 before the
+# consumer can read; ordering bugs that block_count slack would mask are
+# exposed here.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_tight_block_count_four_consecutive_waits(device):
+    N = 4
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=N)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=N)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 2.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 3.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 4.0))
+
+            t1 = cb.wait()
+            t2 = cb.wait()
+            t3 = cb.wait()
+            t4 = cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t1)
+            with out_cb.reserve() as o:
+                o.store(t2)
+            with out_cb.reserve() as o:
+                o.store(t3)
+            with out_cb.reserve() as o:
+                o.store(t4)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            for col in range(N):
+                blk = out_cb.wait()
+                ttl.copy(blk, out[0, col]).wait()
+
+    _run(device, repro, N, [1.0, 2.0, 3.0, 4.0])
+
+
+# ---------------------------------------------------------------------------
+# Producer-side mixed -- one reserve stored immediately (with-block form),
+# followed by two reserves with deferred stores. Mirror of the
+# mixed-immediate-deferred consumer test above on the producer side.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_producer_mixed_immediate_and_deferred_stores(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            # Immediate reserve + store (the with-block form).
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 11.0))
+            # Two deferred reserves with stores after both reserves.
+            r2 = cb.reserve()
+            r3 = cb.reserve()
+            r2.store(ttl.math.fill(r2, 22.0))
+            r3.store(ttl.math.fill(r3, 33.0))
+
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+
+    _run(device, repro, 3, [11.0, 22.0, 33.0])
+
+
+# ---------------------------------------------------------------------------
+# block_count=1 (single-slot CB). Degenerate but legal: every
+# producer-consumer pair must serialize through the single slot. Tests
+# the pass on the smallest legal CB topology.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_single_slot_cb_serialized(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=1)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=1)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 99.0))
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 88.0))
+            with cb.wait() as src, out_cb.reserve() as dst:
+                dst.store(src)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+
+    _run(device, repro, 2, [99.0, 88.0])
+
+
+# ---------------------------------------------------------------------------
+# Long DM-thread loop with many iterations. Exercises per-iteration pop
+# placement under wider iteration counts than case_b's 12.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_long_dm_thread_loop_64_iterations(device):
+    N = 64
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+
+        @ttl.compute()
+        def compute():
+            for _ in range(N):
+                with cb.reserve() as v:
+                    v.store(ttl.math.fill(v, 17.0))
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            for col in range(N):
+                blk = cb.wait()
+                ttl.copy(blk, out[0, col]).wait()
+
+    _run(device, repro, N, [17.0] * N)

From 974085b53f9cc9b7b0fad17debe363fc8d5102f3 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 8 May 2026 07:36:34 -0700
Subject: [PATCH 04/11] more tests and cleanup

---
 docs/development/DFBManagement.md             |  39 ++--
 .../TTL/Transforms/TTLInsertCBSync.cpp        |  86 ++++----
 test/python/test_auto_pop_push.py             | 208 ++++++++++++++----
 .../TTL/Transforms/insert_cb_sync.mlir        | 107 +++++++--
 4 files changed, 320 insertions(+), 120 deletions(-)

diff --git a/docs/development/DFBManagement.md b/docs/development/DFBManagement.md
index b3c262108..c1a1dfad6 100644
--- a/docs/development/DFBManagement.md
+++ b/docs/development/DFBManagement.md
@@ -79,14 +79,14 @@ structured op when the exact use is nested in an `scf.for` or `scf.if` body.
 A use `U` is *owned by* `acquire` if `U` accesses the slot `acquire` acquired.
 Two disjoint criteria establish ownership:
 
-- **(a) SSA criterion** -- `U` is reachable from `acquire`'s result through
+- **Tile-SSA ownership** -- `U` is reachable from `acquire`'s result through
   identity-shaped tensor ops (`attach_cb`, `tensor.extract`,
   `tensor.extract_slice`, compute ops, `ttl.store`). Per-tile SSA values
   uniquely identify their source acquire, so this criterion has no positional
   bound: a use of `cb_wait t1`'s tile is owned by `t1` regardless of where it
   appears, even past later acquires on the same DFB.
 
-- **(b) Op-order criterion** -- `U` references the CB directly as a `ttl.copy`
+- **Direct-CB ownership** -- `U` references the CB directly as a `ttl.copy`
   operand on the side matching the acquire's sync class (the DM-thread case,
   e.g. `ttl.copy %cb, %slice` for a writer). With no SSA tile handle,
   ownership is positional: `U` belongs to the latest acquire on
@@ -101,13 +101,14 @@ The criteria are disjoint. DM-thread `ttl.copy` does not flow through
 #### Why two criteria
 
 Compute threads work through SSA tile handles
-(`cb_wait` result -> `attach_cb` -> `ttl.store` / compute ops), so (a) applies
-and the next-acquire boundary is irrelevant -- SSA already distinguishes which
-slot the use refers to. DM threads use direct CB references
-(`ttl.copy %cb, %slice`) where no tile handle exists, so (b) is the fallback
-and the boundary is essential to disambiguate consecutive direct uses on the
-same CB. Unifying would require changing `ttl.copy` to take the attached
-tensor instead of the CB, a dialect change tracked as future work.
+(`cb_wait` result -> `attach_cb` -> `ttl.store` / compute ops), so tile-SSA
+ownership applies and the next-acquire boundary is irrelevant -- SSA already
+distinguishes which slot the use refers to. DM threads use direct CB
+references (`ttl.copy %cb, %slice`) where no tile handle exists, so direct-CB
+ownership is the fallback and the boundary is essential to disambiguate
+consecutive direct uses on the same CB. Unifying would require changing
+`ttl.copy` to take the attached tensor instead of the CB, a dialect change
+tracked as future work.
 
 ### Invariants on the inserted release
 
@@ -136,11 +137,12 @@ consumer reads its tile by index, decoupled from pop ordering.
 
 When the pass runs twice on the same IR, the second run must observe the
 releases inserted by the first as already-present and skip re-injection.
-Because criterion (a) places releases past the next-acquire boundary in the
-deferred-use case, `findOwnedReleases` extends its release-search upper bound
-to the acquire's last owned use. Without this extension, the second run sees
-the inserted release as past the boundary and treats the acquire as needing
-another release.
+Because tile-SSA ownership can place a release past the next-acquire boundary
+(when a tile is consumed later than the next acquire on the same DFB),
+`findOwnedReleases` extends its release-search upper bound to the acquire's
+last owned use. Without this extension, the second run sees the inserted
+release as past the boundary and treats the acquire as needing another
+release.
 
 ### Slot State Model
 
@@ -190,10 +192,11 @@ cb_wait A  ->  owned reads  ->  cb_pop A  ->  cb_wait B
                                   inserted release
 ```
 
-Once a later acquire in the same DFB sync class starts, subsequent releases are
-considered part of that later interval. They cannot release the slot acquired by
-the earlier operation because the earlier slot must already be released before
-the DFB read or write pointer is reused.
+Direct-CB ownership is positional: a release after the next acquire in the
+same sync class is owned by that next acquire, not the earlier one. Tile-SSA
+ownership is unbounded: a release placed after a tile's last use can sit past
+the next acquire and still belong to the earlier interval. The pass
+distinguishes these two cases by use criterion, not by a single bound.
 
 ### Algorithm
 
diff --git a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
index fdf225cc7..0fcd11159 100644
--- a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
@@ -7,14 +7,9 @@
 //===----------------------------------------------------------------------===//
 //
 // Inserts missing cb_push / cb_pop for unmatched cb_reserve / cb_wait ops.
-//
-// Each acquire opens a DFB live interval. The pass finds owned uses from two
-// sources: SSA users of the acquire result, and direction-matched direct DFB
-// copy operands. Uses in descendant regions project to their ancestor in the
-// acquire block.
-//
-// Nested releases are erased and reinserted at the acquire block scope.
-// Same-level releases make the pass idempotent.
+// Owned-use discovery is asymmetric: tensor SSA uses are unbounded, direct
+// CB uses are bounded by the next same-class acquire. See
+// `docs/development/DFBManagement.md` for the ownership model.
 //
 // Legality invariants:
 //   P1. cb_push follows reserve-side writes before write pointer reuse.
@@ -129,13 +124,9 @@ static void updateLatestUse(Operation *candidate, Operation *&latest) {
   }
 }
 
-/// Find releases owned by this acquire interval.
-///
-/// `lastOwnedUse` extends the release-search upper bound past the
-/// next-acquire boundary when the interval's tensor SSA uses live past it
-/// (the deferred-use case). Without this extension the pass would not be
-/// idempotent: the cb_pop inserted after the deferred use would lie past
-/// the next-acquire boundary, and a subsequent run would re-insert it.
+/// Find releases owned by this acquire interval. When `lastOwnedUse` is
+/// non-null and falls past the next-acquire boundary, also accept releases
+/// in that extended range so the pass is idempotent on re-run.
 static ReleaseSearch findOwnedReleases(AcquireInterval interval,
                                        Operation *lastOwnedUse,
                                        ArrayRef<Operation *> allReleases,
@@ -143,8 +134,6 @@ static ReleaseSearch findOwnedReleases(AcquireInterval interval,
   ReleaseSearch result;
   Block *block = interval.acquire->getBlock();
 
-  // Allow same-block releases between the acquire and `lastOwnedUse`,
-  // ignoring the next-acquire boundary when the use itself sits past it.
   bool useExtendsPastBoundary =
       lastOwnedUse && lastOwnedUse != interval.acquire &&
       interval.syncClassBoundary &&
@@ -164,10 +153,8 @@ static ReleaseSearch findOwnedReleases(AcquireInterval interval,
         result.hasSameLevelRelease = true;
         continue;
       }
-      // Boundary failed. Re-check with the extended upper bound to keep
-      // the pass idempotent in the deferred-use shape: a release at or
-      // after the acquire's last owned use is the one this acquire would
-      // have inserted, so treat it as same-level.
+      // Re-check past the boundary: a release at or after the acquire's
+      // last owned use is one this pass would have inserted on a prior run.
       if (useExtendsPastBoundary &&
           projectToAcquireBlock(interval, release, projected,
                                 /*ignoreBoundary=*/true) &&
@@ -222,12 +209,8 @@ static Operation *findNextSyncClassAcquire(Value cb, Operation *acquire,
 }
 
 /// Return the last op in `acquire`'s block that consumes the acquired slot.
-///
-/// Use discovery walks two sources with different boundary policies: direct
-/// CB uses (bounded by the next same-class acquire) and tensor SSA uses
-/// (unbounded). See `docs/development/DFBManagement.md` "DFB Sync Insertion"
-/// for the full ownership model, why the criteria differ, and the causal /
-/// FIFO invariants the inserted release must satisfy.
+/// Direct CB uses are bounded by the next same-class acquire; tensor SSA
+/// uses are not. See `docs/development/DFBManagement.md` for the model.
 static Operation *findLastOwnedUse(AcquireInterval interval) {
   Operation *last = interval.acquire;
   DenseSet<Operation *> visited;
@@ -248,6 +231,19 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
     return true;
   };
 
+  auto drainWorklist = [&](bool ignoreBoundary) {
+    while (!worklist.empty()) {
+      Value value = worklist.pop_back_val();
+      for (OpOperand &use : value.getUses()) {
+        Operation *user = use.getOwner();
+        if (isa<CBPushOp, CBPopOp>(user)) {
+          continue;
+        }
+        extend(user, ignoreBoundary);
+      }
+    }
+  };
+
   // Direct DFB uses: start from the CB value's users and recurse through
   // their SSA results (e.g. ttl.copy returns a transfer_handle whose ttl.wait
   // marks the actual end of the transfer). Boundary applies because two
@@ -265,16 +261,7 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
     }
     extend(user, /*ignoreBoundary=*/false);
   }
-  while (!worklist.empty()) {
-    Value value = worklist.pop_back_val();
-    for (OpOperand &use : value.getUses()) {
-      Operation *user = use.getOwner();
-      if (isa<CBPushOp, CBPopOp>(user)) {
-        continue;
-      }
-      extend(user, /*ignoreBoundary=*/false);
-    }
-  }
+  drainWorklist(/*ignoreBoundary=*/false);
 
   // Tensor SSA uses: start from the acquire's result and recurse through
   // attach_cb / store / compute users. The next-acquire boundary does NOT
@@ -284,16 +271,7 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
   if (interval.acquire->getNumResults() > 0) {
     worklist.push_back(interval.acquire->getResult(0));
   }
-  while (!worklist.empty()) {
-    Value value = worklist.pop_back_val();
-    for (OpOperand &use : value.getUses()) {
-      Operation *user = use.getOwner();
-      if (isa<CBPushOp, CBPopOp>(user)) {
-        continue;
-      }
-      extend(user, /*ignoreBoundary=*/true);
-    }
-  }
+  drainWorklist(/*ignoreBoundary=*/true);
 
   return last;
 }
@@ -313,13 +291,23 @@ static void insertMissingReleases(ArrayRef<Operation *> acquires,
                                   CreateReleaseFn createRelease) {
   for (Operation *acquire : acquires) {
     AcquireInterval interval = makeAcquireInterval(acquire, acquires);
-    Operation *last = findLastOwnedUse(interval);
+    // Cheap check first: any release inside the strict next-acquire range?
     ReleaseSearch releaseSearch =
-        findOwnedReleases(interval, last, releases, erased);
+        findOwnedReleases(interval, /*lastOwnedUse=*/nullptr, releases, erased);
     if (releaseSearch.hasSameLevelRelease) {
       continue;
     }
 
+    // Compute the last owned use; it both bounds the idempotency recheck
+    // and pinpoints the insertion point.
+    Operation *last = findLastOwnedUse(interval);
+    if (last != interval.acquire) {
+      releaseSearch = findOwnedReleases(interval, last, releases, erased);
+      if (releaseSearch.hasSameLevelRelease) {
+        continue;
+      }
+    }
+
     for (Operation *nestedRelease : releaseSearch.nestedReleases) {
       erased.insert(nestedRelease);
       nestedRelease->erase();
diff --git a/test/python/test_auto_pop_push.py b/test/python/test_auto_pop_push.py
index 9f9d19159..4e8b5bd88 100644
--- a/test/python/test_auto_pop_push.py
+++ b/test/python/test_auto_pop_push.py
@@ -8,6 +8,11 @@
 handle, including the issue #536 follow-up case_a and case_b reproducers
 (deferred consumer uses across multiple consecutive cb.wait() calls on the
 same DFB).
+
+Several tests are marked xfail(strict). Each describes a real pattern
+that currently produces wrong runtime output (or fails to compile) and
+will start passing once a tracked compiler follow-up lands. The
+explanation for each is at the test site.
 """
 
 import pytest
@@ -585,27 +590,22 @@ def dm_write():
 
 
 # ---------------------------------------------------------------------------
-# DM-thread producer with three consecutive reserves whose ttl.copy
-# completions are deferred. Stresses criterion (b) (direct CB use) at a
-# depth beyond the existing #536-fix coverage of two consecutive reserves.
-#
-# This pattern silently miscompiles today: ttl.copy takes the CB directly
-# (not the reserve result), so SSA cannot associate each copy with its
-# specific reserve. With three reserves before any copy, all copies sit
-# past r1's next-acquire boundary and get attributed to the last reserve.
-# r1's pop is inserted before any data is written. The dialect fix tracked
-# in plans/UnifyTTLCopyAcquireOwnership.md (encoding ownership in SSA via
-# the attach_cb chain) lifts this restriction; the test flips to PASS
-# then.
+# xfail (#555). DM-thread producer with three consecutive reserves
+# followed by three ttl.copy completions. ttl.copy takes a !ttl.cb operand
+# directly rather than a tensor SSA value derived from cb_reserve, so the
+# IR carries no def-use edge identifying which copy fills which reserve.
+# The pass falls back to op-order reasoning and attributes all three
+# copies to the last reserve. The push for the earlier reserves lands
+# before any data is written; the buffer's write pointer advances past
+# empty slots. Lifted by #555 (encode DFB ownership in SSA on ttl.copy).
 # ---------------------------------------------------------------------------
 
 
 @pytest.mark.requires_device
 @pytest.mark.xfail(
     strict=True,
-    reason="Batched DM-thread reserve/copy/wait/push pattern needs "
-    "ttl.copy to thread the reserve result through SSA. Lifted by "
-    "the dialect change in plans/UnifyTTLCopyAcquireOwnership.md.",
+    reason="Batched DM-thread reserve/copy/wait/push pattern. "
+    "Lifted by #555 (encode DFB ownership in SSA on ttl.copy).",
 )
 def test_dm_read_three_consecutive_reserves_deferred_copies(device):
     @ttl.operation(grid=(1, 1))
@@ -657,18 +657,20 @@ def dm_write():
 
 
 # ---------------------------------------------------------------------------
-# DM-thread consumer with three consecutive cb.wait() acquires whose
-# ttl.copy completions are deferred. Mirror of the dm_read producer case
-# above on the consumer side. Same dialect-level root cause.
+# xfail (#555). DM-thread consumer with three consecutive cb.wait()
+# acquires followed by three ttl.copy completions. Consumer-side mirror
+# of the dm_read case above; ttl.copy reads from the bare !ttl.cb operand
+# instead of the cb_wait result, so the pass cannot tell which copy
+# consumes which acquired slot and pops the earlier slots before the
+# corresponding copies read them. Lifted by #555.
 # ---------------------------------------------------------------------------
 
 
 @pytest.mark.requires_device
 @pytest.mark.xfail(
     strict=True,
-    reason="Batched DM-thread wait/copy/wait/pop pattern needs ttl.copy "
-    "to thread the wait result through SSA. Lifted by the dialect "
-    "change in plans/UnifyTTLCopyAcquireOwnership.md.",
+    reason="Batched DM-thread wait/copy/wait/pop pattern. "
+    "Lifted by #555 (encode DFB ownership in SSA on ttl.copy).",
 )
 def test_dm_write_three_consecutive_waits_deferred_copies(device):
     @ttl.operation(grid=(1, 1))
@@ -707,11 +709,12 @@ def dm_write():
 
 
 # ---------------------------------------------------------------------------
-# Cross-thread deferred chain. dm_read produces 4 tiles into inp_cb with
-# deferred pushes, compute consumes 4 from inp_cb with deferred uses,
-# dm_write writes 4 out with deferred pops. Exercises auto-injection
-# across all three threads simultaneously. Inherits the batched DM-thread
-# miscompile in both DM threads; the compute side already works.
+# xfail (#555). Cross-thread chain: dm_read reserves 4 slots up front then
+# writes them, compute waits 4 then consumes them, dm_write waits 4 then
+# writes them out. The compute-side auto-injection works (SSA def-use
+# anchors ownership), but both DM threads inherit the same batched
+# reserve/copy or wait/copy miscompile as the two tests above. Lifted by
+# #555.
 # ---------------------------------------------------------------------------
 
 
@@ -719,9 +722,7 @@ def dm_write():
 @pytest.mark.xfail(
     strict=True,
     reason="Inherits the batched DM-thread reserve/wait miscompile in "
-    "the dm_read and dm_write halves. "
-    "Lifted by the dialect change in "
-    "plans/UnifyTTLCopyAcquireOwnership.md.",
+    "both dm_read and dm_write halves. Lifted by #555.",
 )
 def test_cross_thread_deferred_chain(device):
     @ttl.operation(grid=(1, 1))
@@ -793,12 +794,13 @@ def dm_write():
 
 
 # ---------------------------------------------------------------------------
-# Reordered consumes -- consumer reads tile values out of declaration
-# order. Without multi-tile coalescing, the pass places pop ops in op
-# order matching the consume sites, which violates FIFO monotonicity.
-# Documented as xfail(strict=True); flips to PASS the day the multi-tile
-# coalescing follow-on lands and provides per-acquire src_idx so
-# consumers can read by index.
+# xfail (#556). Consumer reads tile 2 before tile 1 (out of declaration
+# order). The buffer exposes a single FIFO front pointer, so there is no
+# way to release the second slot before the first; the current pass
+# places releases in the order it observes the consumes, violating FIFO
+# monotonicity. Lifted by #556 (coalesce consecutive cb_wait into one
+# cb_wait_front(N) with per-acquire src_idx, decoupling consume order
+# from release order).
 # ---------------------------------------------------------------------------
 
 
@@ -806,8 +808,7 @@ def dm_write():
 @pytest.mark.xfail(
     strict=True,
     reason="Reordered consumes (use(t2) before use(t1)) violate CB FIFO "
-    "monotonicity. Lifted by future multi-tile cb_wait_front(N) "
-    "coalescing with per-acquire src_idx.",
+    "monotonicity. Lifted by #556 (multi-tile cb_wait_front coalescing).",
 )
 def test_reordered_consumes_violate_fifo_xfail(device):
     @ttl.operation(grid=(1, 1))
@@ -1069,3 +1070,136 @@ def dm_write():
                 ttl.copy(blk, out[0, col]).wait()
 
     _run(device, repro, N, [17.0] * N)
+
+
+# ---------------------------------------------------------------------------
+# Multiple direct CB uses on a single DM-thread acquire.
+#
+# A single cb.wait() followed by two ttl.copy() reads from the same slot to
+# different output positions. Both copies are direct CB operands on the same
+# acquire (direct-CB ownership). The pop must land after the last copy; if
+# findLastOwnedUse stopped at the first copy, the pop would advance the read
+# pointer before the second copy reads, producing stale data in out[0, 1].
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_dm_write_two_copies_same_acquire(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 5.0))
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            ttl.copy(blk, out[0, 1]).wait()
+
+    out_t = to_dram(torch.full((TILE, 2 * TILE), -42.0, dtype=torch.bfloat16), device)
+    repro(out_t)
+    ttnn.synchronize_device(device)
+    out_h = ttnn.to_torch(out_t)
+    assert out_h[0, 0].item() == 5.0
+    assert out_h[0, TILE].item() == 5.0
+
+
+# ---------------------------------------------------------------------------
+# Producer-side analog of case_b: 3 consecutive cb.reserve() per iteration
+# of an scf.for, with the matching stores deferred until after the third
+# reserve. Each push must land after its own slot's store, inside the loop
+# body. Symmetric coverage to test 28 in insert_cb_sync.mlir for producers.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_producer_three_reserves_deferred_stores_in_loop(device):
+    N_ITERS = 3
+    N_PER_ITER = 3
+    TOTAL = N_ITERS * N_PER_ITER
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=TOTAL)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+
+        @ttl.compute()
+        def compute():
+            for _ in range(N_ITERS):
+                r1 = cb.reserve()
+                r2 = cb.reserve()
+                r3 = cb.reserve()
+                r1.store(ttl.math.fill(r1, 1.0))
+                r2.store(ttl.math.fill(r2, 2.0))
+                r3.store(ttl.math.fill(r3, 3.0))
+
+            for _ in range(TOTAL):
+                with cb.wait() as src, out_cb.reserve() as dst:
+                    dst.store(src)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            for col in range(TOTAL):
+                blk = out_cb.wait()
+                ttl.copy(blk, out[0, col]).wait()
+
+    expected = [1.0, 2.0, 3.0] * N_ITERS
+    _run(device, repro, TOTAL, expected)
+
+
+# ---------------------------------------------------------------------------
+# xfail (#540). Tensor recurrence (acc = acc + ...) carrying an acquired
+# tile through scf.for iter_args. The DSL today does not lower this
+# shape consistently; PR #540 adds the missing materialization. Once
+# #540 lands, the auto-pop pass must follow uses through the iter_arg
+# block argument so the pop lands after the loop, not before. Mirrors
+# lit test 30 in insert_cb_sync.mlir.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+@pytest.mark.xfail(
+    strict=True,
+    reason="Tensor recurrence carrying an acquired tile through scf.for "
+    "iter_args. Lifted by #540 (materialize tensor loop state).",
+)
+def test_wait_result_through_for_iter_args(device):
+    N = 4
+
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=1)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=1)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            acc = cb.wait()
+            for _ in range(N):
+                acc = acc + acc
+            with out_cb.reserve() as o:
+                o.store(acc)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+
+    _run(device, repro, 1, [float(2**N)])
diff --git a/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir b/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
index 7eb402a70..a2828050a 100644
--- a/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
@@ -781,12 +781,9 @@ func.func @dm_wait_before_reserve_same_dfb(
 
 // -----
 
-// Test 27: Three consecutive cb_wait acquires on the same DFB whose tensor
-// SSA uses are deferred until after every wait has been issued. The
-// next-acquire boundary must not clamp tensor-use discovery; each pop must
-// land after its own attach_cb's consumer use, naturally interleaving the
-// pops between the per-tile uses so the read pointer advances in FIFO
-// order. Regression for the issue #536 follow-up case_a reproducer.
+// Test 27: Three consecutive cb_wait on the same DFB; consumer stores run
+// after the third wait. Each pop must land after its own store, not clamped
+// at the next wait. Regression for issue #536 case_a.
 
 // CHECK-LABEL: func.func @three_consecutive_waits_deferred_consumers
 // CHECK: %[[CBIN:.+]] = ttl.bind_cb{cb_index = 0
@@ -832,19 +829,30 @@ func.func @three_consecutive_waits_deferred_consumers()
 
 // -----
 
-// Test 28: Four consecutive cb_wait acquires inside an scf.for, with their
-// consumer stores deferred to after every wait per iteration. Verifies the
-// boundary drop applies inside loop bodies and that pops are placed inside
-// the loop body (not hoisted past the loop). Regression for the issue #536
-// follow-up case_b reproducer.
+// Test 28: Four consecutive cb_wait inside scf.for; per-iteration stores
+// run after the fourth wait. Pops stay inside the body and interleave with
+// the stores. Regression for issue #536 case_b.
 
 // CHECK-LABEL: func.func @four_consecutive_waits_in_loop
+// CHECK: %[[CBIN:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: %[[CBOUT:.+]] = ttl.bind_cb{cb_index = 1
 // CHECK: scf.for
-// CHECK: ttl.cb_wait
-// CHECK: ttl.cb_wait
-// CHECK: ttl.cb_wait
-// CHECK: ttl.cb_wait
-// CHECK-COUNT-4: ttl.cb_pop
+// CHECK: ttl.cb_wait %[[CBIN]]
+// CHECK: ttl.cb_wait %[[CBIN]]
+// CHECK: ttl.cb_wait %[[CBIN]]
+// CHECK: ttl.cb_wait %[[CBIN]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_pop %[[CBIN]]
+// CHECK-NEXT: ttl.cb_push %[[CBOUT]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_pop %[[CBIN]]
+// CHECK-NEXT: ttl.cb_push %[[CBOUT]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_pop %[[CBIN]]
+// CHECK-NEXT: ttl.cb_push %[[CBOUT]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_pop %[[CBIN]]
+// CHECK-NEXT: ttl.cb_push %[[CBOUT]]
 // CHECK: }
 // CHECK-NOT: ttl.cb_pop
 // CHECK: return
@@ -879,3 +887,70 @@ func.func @four_consecutive_waits_in_loop()
   }
   func.return
 }
+
+// -----
+
+// Test 29: Producer-side analog of test 27. Three consecutive cb_reserve;
+// stores run after the third reserve. Each push lands after its own store.
+
+// CHECK-LABEL: func.func @three_consecutive_reserves_deferred_stores
+// CHECK: %[[CB:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: ttl.cb_reserve %[[CB]]
+// CHECK-NEXT: ttl.cb_reserve %[[CB]]
+// CHECK-NEXT: ttl.cb_reserve %[[CB]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_push %[[CB]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_push %[[CB]]
+// CHECK: ttl.store
+// CHECK-NEXT: ttl.cb_push %[[CB]]
+// CHECK-NOT: ttl.cb_push
+// CHECK: return
+func.func @three_consecutive_reserves_deferred_stores(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>)
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb = ttl.bind_cb{cb_index = 0, block_count = 3} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %r0 = ttl.cb_reserve %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r1 = ttl.cb_reserve %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r2 = ttl.cb_reserve %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %arg0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %arg0, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %arg0, %r2 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  func.return
+}
+
+// -----
+
+// Test 30: cb_wait result carried through scf.for iter_args. The acquired
+// tile is yielded into the loop's iter_args; the consumer store reads
+// the iter_arg inside the body. findLastOwnedUse must follow uses through
+// the iter_arg block argument so the pop lands after the loop, not before.
+// PR #540 adds DSL support for emitting this shape; this test guards the
+// pass against regressions once that lands.
+
+// CHECK-LABEL: func.func @wait_result_through_for_iter_args
+// CHECK: %[[CB:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: %[[CBOUT:.+]] = ttl.bind_cb{cb_index = 1
+// CHECK: ttl.cb_wait %[[CB]]
+// CHECK: scf.for
+// CHECK: ttl.store
+// CHECK: scf.yield
+// CHECK: }
+// CHECK-NEXT: ttl.cb_pop %[[CB]]
+// CHECK: return
+func.func @wait_result_through_for_iter_args() attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 4} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  %w = ttl.cb_wait %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a = ttl.attach_cb %w, %cb : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %final = scf.for %i = %c0 to %c4 step %c1 iter_args(%carry = %a) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+    %r = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %carry, %r : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+    scf.yield %carry : tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return
+}

From 7e02bd5351034859ae995bb85b6609c46f847477 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 8 May 2026 08:32:16 -0700
Subject: [PATCH 05/11] replace weird "shape" usage in comments/doc

---
 docs/development/DFBManagement.md                    |  4 ++--
 test/python/test_auto_pop_push.py                    | 12 ++++++------
 .../Dialect/TTL/Transforms/insert_cb_sync.mlir       | 10 +++++-----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/development/DFBManagement.md b/docs/development/DFBManagement.md
index c1a1dfad6..f3a88bff6 100644
--- a/docs/development/DFBManagement.md
+++ b/docs/development/DFBManagement.md
@@ -80,8 +80,8 @@ A use `U` is *owned by* `acquire` if `U` accesses the slot `acquire` acquired.
 Two disjoint criteria establish ownership:
 
 - **Tile-SSA ownership** -- `U` is reachable from `acquire`'s result through
-  identity-shaped tensor ops (`attach_cb`, `tensor.extract`,
-  `tensor.extract_slice`, compute ops, `ttl.store`). Per-tile SSA values
+  the def-use chain over `attach_cb`, `tensor.extract`,
+  `tensor.extract_slice`, compute ops, and `ttl.store`. Per-tile SSA values
   uniquely identify their source acquire, so this criterion has no positional
   bound: a use of `cb_wait t1`'s tile is owned by `t1` regardless of where it
   appears, even past later acquires on the same DFB.
diff --git a/test/python/test_auto_pop_push.py b/test/python/test_auto_pop_push.py
index 4e8b5bd88..f25344bac 100644
--- a/test/python/test_auto_pop_push.py
+++ b/test/python/test_auto_pop_push.py
@@ -4,8 +4,8 @@
 
 """Coverage for ttl-insert-cb-sync auto-injection edge cases.
 
-Each test exercises a distinct shape that the auto pop/push placement must
-handle, including the issue #536 follow-up case_a and case_b reproducers
+Each test exercises a distinct pattern that the auto pop/push placement
+must handle, including the issue #536 follow-up case_a and case_b reproducers
 (deferred consumer uses across multiple consecutive cb.wait() calls on the
 same DFB).
 
@@ -168,9 +168,9 @@ def dm_write():
 
 @pytest.mark.requires_device
 def test_interleaved_wait_consume_pop_baseline(device):
-    """Sanity check: the safe shape (consume each wait before the next wait)
-    works after the #536 fix. This is the form the auto-pop pass currently
-    reasons about correctly."""
+    """Sanity check: the safe form (consume each wait before the next wait)
+    works after the #536 fix. This is the pattern the auto-pop pass
+    currently reasons about correctly."""
 
     @ttl.operation(grid=(1, 1))
     def repro(out):
@@ -1162,7 +1162,7 @@ def dm_write():
 # ---------------------------------------------------------------------------
 # xfail (#540). Tensor recurrence (acc = acc + ...) carrying an acquired
 # tile through scf.for iter_args. The DSL today does not lower this
-# shape consistently; PR #540 adds the missing materialization. Once
+# pattern consistently; PR #540 adds the missing materialization. Once
 # #540 lands, the auto-pop pass must follow uses through the iter_arg
 # block argument so the pop lands after the loop, not before. Mirrors
 # lit test 30 in insert_cb_sync.mlir.
diff --git a/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir b/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
index a2828050a..3c95a7237 100644
--- a/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir
@@ -922,11 +922,11 @@ func.func @three_consecutive_reserves_deferred_stores(
 // -----
 
 // Test 30: cb_wait result carried through scf.for iter_args. The acquired
-// tile is yielded into the loop's iter_args; the consumer store reads
-// the iter_arg inside the body. findLastOwnedUse must follow uses through
-// the iter_arg block argument so the pop lands after the loop, not before.
-// PR #540 adds DSL support for emitting this shape; this test guards the
-// pass against regressions once that lands.
+// tile flows into the loop as an iter_arg; findLastOwnedUse sees scf.for
+// as a user of the wait result via that operand edge and projects the
+// pop to after the loop. The test guards that projection -- not
+// body-internal iter-arg substitution -- so the pop lands after the
+// loop, not before. PR #540 makes this pattern reachable from the DSL.
 
 // CHECK-LABEL: func.func @wait_result_through_for_iter_args
 // CHECK: %[[CB:.+]] = ttl.bind_cb{cb_index = 0

From 72fbbd824a6229f1556f1c93ff5fe9978a232412 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 8 May 2026 20:21:56 -0700
Subject: [PATCH 06/11]  New pass `ttl-coalesce-dfb-acquires` rewrites N
 consecutive same-DFB acquires + N matching releases into the canonical
 tt-metal cumulative-wait shape (`cb_wait_front(N*k)` + per-block
 `tensor.extract_slice` views + `cb_pop_front(N*k)`). `addSliceOffset` already
 folds the slice offsets into the per-tile `src_idx` / `dst_idx`, so no
 lowering changes. Symmetric on the producer side. Fixes #556.

---
 include/ttlang/Dialect/TTL/IR/TTLOps.td       |  16 +-
 include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h   |  20 ++
 include/ttlang/Dialect/TTL/Passes.td          |  72 ++++++
 .../ttlang/Dialect/Utils/ConversionUtils.h    |   4 +
 lib/Dialect/TTL/IR/TTLOps.cpp                 |  83 ++++---
 lib/Dialect/TTL/Pipelines/TTLPipelines.cpp    |   1 +
 lib/Dialect/TTL/Transforms/CMakeLists.txt     |   1 +
 .../TTL/Transforms/ConvertTTLToCompute.cpp    |   4 +-
 .../TTL/Transforms/TTLCoalesceDFBAcquires.cpp | 223 +++++++++++++++++
 .../TTL/Transforms/TTLInsertCBSync.cpp        |   3 +-
 python/ttl/ttl_api.py                         |   1 +
 test/me2e/builder/pipeline.py                 |   1 +
 test/python/test_auto_pop_push.py             |  20 +-
 .../ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir |  17 +-
 .../TTL/Transforms/coalesce_dfb_acquires.mlir | 228 ++++++++++++++++++
 15 files changed, 638 insertions(+), 56 deletions(-)
 create mode 100644 lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
 create mode 100644 test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir

diff --git a/include/ttlang/Dialect/TTL/IR/TTLOps.td b/include/ttlang/Dialect/TTL/IR/TTLOps.td
index 1328f2e80..cc64e4752 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOps.td
+++ b/include/ttlang/Dialect/TTL/IR/TTLOps.td
@@ -1127,15 +1127,19 @@ def TTL_CBWaitOp : TTL_Op<"cb_wait",
     This operation is used by consumer threads (typically compute kernels or
     data movement kernels writing to DRAM) to wait for data from producers.
 
-    The number of pages is derived from the CB's shape (elements per block).
+    The number of pages defaults to the CB's elements per block, but can be
+    overridden via the optional `num_tiles` attribute when the wait spans
+    multiple coalesced blocks (see `ttl-coalesce-dfb-acquires`).
 
     Example:
     ```mlir
     %view = ttl.cb_wait %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %coalesced = ttl.cb_wait %cb {num_tiles = 3 : i64} : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x3x!ttcore.tile<32x32, bf16>>
     ```
   }];
   let arguments = (ins
-    TTL_CircularBuffer:$cb
+    TTL_CircularBuffer:$cb,
+    OptionalAttr<I64Attr>:$num_tiles
   );
   let results = (outs AnyRankedTensor:$result);
   let assemblyFormat = "$cb attr-dict `:` type($cb) `->` type($result)";
@@ -1151,15 +1155,19 @@ def TTL_CBPopOp : TTL_Op<"cb_pop", [MemoryEffects<[MemWrite]>]> {
     This operation must be called after reading data acquired via `ttl.cb_wait`.
     It increments the CB's consumer pointer.
 
-    The number of pages is derived from the CB's shape (elements per block).
+    The number of pages defaults to the CB's elements per block, but can be
+    overridden via the optional `num_tiles` attribute when the pop releases
+    multiple coalesced blocks (see `ttl-coalesce-dfb-acquires`).
 
     Example:
     ```mlir
     ttl.cb_pop %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+    ttl.cb_pop %cb {num_tiles = 3 : i64} : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
     ```
   }];
   let arguments = (ins
-    TTL_CircularBuffer:$cb
+    TTL_CircularBuffer:$cb,
+    OptionalAttr<I64Attr>:$num_tiles
   );
   let assemblyFormat = "$cb attr-dict `:` type($cb)";
   let hasVerifier = 1;
diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
index c9425ea57..19ecbaac7 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
+++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
@@ -45,6 +45,26 @@ inline mlir::Value traceUnrealizedCasts(mlir::Value value) {
   return value;
 }
 
+/// Walk through `tensor.extract_slice` ops and return the underlying
+/// `ttl.cb_reserve` op, or null if the chain doesn't end at one.
+inline mlir::tt::ttl::CBReserveOp findCBReserveForView(mlir::Value view) {
+  while (auto slice = view.getDefiningOp<mlir::tensor::ExtractSliceOp>()) {
+    view = slice.getSource();
+  }
+  return view.getDefiningOp<mlir::tt::ttl::CBReserveOp>();
+}
+
+/// Trace through any number of `ttl.attach_cb` ops and return the
+/// underlying tensor SSA value. `attach_cb` is an identity op that records
+/// a tensor->CB association; callers that want to inspect the upstream
+/// producer (e.g. a `tensor.extract_slice`) should call this first.
+inline mlir::Value traceAttachCBs(mlir::Value value) {
+  while (auto attach = value.getDefiningOp<mlir::tt::ttl::AttachCBOp>()) {
+    value = attach.getTensor();
+  }
+  return value;
+}
+
 /// Return the element type for a ttcore::TileType.
 inline std::optional<mlir::Type> getTileElementType(mlir::Type type) {
   if (auto tileType = mlir::dyn_cast<ttcore::TileType>(type)) {
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 9fc7b1968..aa68cc110 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -25,6 +25,78 @@ def TTLInsertCBSync
   let dependentDialects = [];
 }
 
+def TTLCoalesceDFBAcquires
+    : Pass<"ttl-coalesce-dfb-acquires", "::mlir::func::FuncOp"> {
+  let summary = "Coalesce consecutive same-DFB acquires into one multi-tile acquire";
+  let description = [{
+    Rewrites the source pattern
+
+    ```
+    t1 = cb.wait();  t2 = cb.wait();  t3 = cb.wait();
+    use(t1);         use(t2);         use(t3);
+    ```
+
+    into the canonical tt-metal "cumulative wait + indexed reads +
+    coalesced pop" shape
+
+    ```
+    cb_wait_front(cb, 3*k);
+    copy_tile(cb, /*src_idx=*/0, dst);  // t1
+    copy_tile(cb, /*src_idx=*/k, dst);  // t2
+    copy_tile(cb, /*src_idx=*/2*k, dst);// t3
+    cb_pop_front(cb, 3*k);
+    ```
+
+    used in tt-metal compute kernels (`tt-metal/tt_metal/kernels/compute/
+    eltwise_binary.cpp`, `bcast_h.cpp`, the matmul kernels, etc.) when a
+    consumer holds a fixed multi-tile window before processing it.
+
+    Pre-coalesce, each `ttl.cb_wait` lowers to its own `cb_wait_front(k)`
+    and each `ttl.cb_pop` to `cb_pop_front(k)`. Because metal's
+    `cb_wait_front`/`cb_pop_front` are non-cumulative, this races: the
+    first pop advances the front before the producer has pushed enough
+    tiles to satisfy the next read.
+
+    At the IR level the rewrite is
+
+    ```mlir
+    // Before (post `ttl-insert-cb-sync`):
+    %t1 = ttl.cb_wait %cb : <[1,k],...,bc> -> tensor<1xkx...>
+    %t2 = ttl.cb_wait %cb : <[1,k],...,bc> -> tensor<1xkx...>
+    %t3 = ttl.cb_wait %cb : <[1,k],...,bc> -> tensor<1xkx...>
+    ... use %t1 ... ttl.cb_pop %cb
+    ... use %t2 ... ttl.cb_pop %cb
+    ... use %t3 ... ttl.cb_pop %cb
+
+    // After:
+    %g  = ttl.cb_wait %cb {num_tiles = 3*k} : ... -> tensor<1x(3k)x...>
+    %t1 = tensor.extract_slice %g [0, 0]    [1, k] [1, 1]
+    %t2 = tensor.extract_slice %g [0, k]    [1, k] [1, 1]
+    %t3 = tensor.extract_slice %g [0, 2*k]  [1, k] [1, 1]
+    ... use %t1 ... use %t2 ... use %t3 ...
+    ttl.cb_pop %cb {num_tiles = 3*k}
+    ```
+
+    `addSliceOffset` already folds each `extract_slice` offset into the
+    per-tile `src_idx` at lowering time, so no lowering changes are
+    required.
+
+    Symmetric on the producer side (`cb_reserve` / `cb_push`), with each
+    per-block `extract_slice` becoming the view of a downstream
+    `ttl.tile_store` / `ttl.store`.
+
+    Detection: forward walk per block; an acquire group is a maximal run
+    of same-kind same-DFB acquires separated only by `ttl.attach_cb` or
+    `arith.constant` ops. Acquires already carrying a `num_tiles`
+    attribute (e.g. set by `ttl-subblock-compute-for-dst`) are not
+    coalesced and terminate the group. The pass is idempotent.
+  }];
+
+  let dependentDialects = [
+    "::mlir::tensor::TensorDialect"
+  ];
+}
+
 def TTLInsertCopyWait
     : Pass<"ttl-insert-copy-wait", "::mlir::func::FuncOp"> {
   let summary = "Insert missing ttl.wait for unmatched ttl.copy ops";
diff --git a/include/ttlang/Dialect/Utils/ConversionUtils.h b/include/ttlang/Dialect/Utils/ConversionUtils.h
index 3b1066e50..d3979e95b 100644
--- a/include/ttlang/Dialect/Utils/ConversionUtils.h
+++ b/include/ttlang/Dialect/Utils/ConversionUtils.h
@@ -15,6 +15,7 @@
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOps.h"
 #include "ttlang/Dialect/TTL/IR/TTLOpsTypes.h"
 #include "ttmlir/Dialect/TTKernel/IR/TTKernelOpsTypes.h"
 #include "llvm/ADT/Twine.h"
@@ -40,6 +41,9 @@ inline Value addSliceOffset(Value operand, Value localIndex, OpBuilder &builder,
   if (auto extract = tensor.getDefiningOp<mlir::tensor::ExtractOp>()) {
     tensor = extract.getTensor();
   }
+  // Trace through `ttl.attach_cb` so a slice upstream of an attach_cb is
+  // still discoverable.
+  tensor = mlir::tt::ttl::traceAttachCBs(tensor);
   auto slice = tensor.getDefiningOp<mlir::tensor::ExtractSliceOp>();
   if (!slice) {
     return localIndex;
diff --git a/lib/Dialect/TTL/IR/TTLOps.cpp b/lib/Dialect/TTL/IR/TTLOps.cpp
index 57c2eed47..061213619 100644
--- a/lib/Dialect/TTL/IR/TTLOps.cpp
+++ b/lib/Dialect/TTL/IR/TTLOps.cpp
@@ -939,35 +939,46 @@ mlir::LogicalResult mlir::tt::ttl::ComputeOp::verify() {
   return success();
 }
 
+// Verify a `num_tiles`-bearing acquire (cb_reserve / cb_wait): the result
+// element type must match the CB's, the result tile count must equal
+// `num_tiles`, and `num_tiles` must not exceed the CB's total tile capacity.
+// The bound is across blocks (elementsPerBlock * blockCount) so coalesced
+// acquires can span multiple CB blocks.
+static mlir::LogicalResult
+verifyCBAcquireWithNumTiles(mlir::Operation *op,
+                            mlir::tt::ttl::CircularBufferType cbTy,
+                            mlir::RankedTensorType resultTy, int64_t numTiles) {
+  auto cbElemTy = cbTy.getElementType();
+  if (cbElemTy != resultTy.getElementType()) {
+    return op->emitOpError()
+           << "result element type (" << resultTy.getElementType()
+           << ") must match DFB element type (" << cbElemTy << ")";
+  }
+  int64_t resultTiles = 1;
+  for (int64_t d : resultTy.getShape()) {
+    resultTiles *= d;
+  }
+  if (resultTiles != numTiles) {
+    return op->emitOpError()
+           << "result tensor has " << resultTiles
+           << " tiles but num_tiles attribute is " << numTiles;
+  }
+  int64_t cbCapacity = cbTy.getTotalElements();
+  if (numTiles > cbCapacity) {
+    return op->emitOpError() << "num_tiles (" << numTiles
+                             << ") exceeds DFB capacity (" << cbCapacity << ")";
+  }
+  return mlir::success();
+}
+
 mlir::LogicalResult mlir::tt::ttl::CBReserveOp::verify() {
   auto cbTy = mlir::cast<CircularBufferType>(getCb().getType());
   auto resultTy = mlir::cast<RankedTensorType>(getResult().getType());
 
-  // When `num_tiles` is present, the result shape is a subblock of the CB.
-  // Verify element type match and that tile count is consistent.
   if (getNumTiles()) {
-    auto cbElemTy = cbTy.getElementType();
-    if (cbElemTy != resultTy.getElementType()) {
-      return emitOpError() << "result element type ("
-                           << resultTy.getElementType()
-                           << ") must match DFB element type (" << cbElemTy
-                           << ")";
-    }
-    int64_t resultTiles = 1;
-    for (int64_t d : resultTy.getShape()) {
-      resultTiles *= d;
-    }
-    if (resultTiles != static_cast<int64_t>(getNumTiles().value())) {
-      return emitOpError() << "result tensor has " << resultTiles
-                           << " tiles but num_tiles attribute is "
-                           << getNumTiles().value();
-    }
-    int64_t cbCapacity = cbTy.getElementsPerBlock();
-    if (resultTiles > cbCapacity) {
-      return emitOpError() << "num_tiles (" << resultTiles
-                           << ") exceeds DFB capacity (" << cbCapacity << ")";
-    }
-    return mlir::success();
+    return verifyCBAcquireWithNumTiles(
+        getOperation(), cbTy, resultTy,
+        static_cast<int64_t>(getNumTiles().value()));
   }
 
   return verifyCBOpWithResult(getOperation(), cbTy, resultTy);
@@ -976,7 +987,7 @@ mlir::LogicalResult mlir::tt::ttl::CBReserveOp::verify() {
 mlir::LogicalResult mlir::tt::ttl::CBPushOp::verify() {
   if (getNumTiles()) {
     auto cbTy = mlir::cast<CircularBufferType>(getCb().getType());
-    int64_t cbCapacity = cbTy.getElementsPerBlock();
+    int64_t cbCapacity = cbTy.getTotalElements();
     int64_t numTiles = static_cast<int64_t>(getNumTiles().value());
     if (numTiles > cbCapacity) {
       return emitOpError() << "num_tiles (" << numTiles
@@ -989,6 +1000,13 @@ mlir::LogicalResult mlir::tt::ttl::CBPushOp::verify() {
 mlir::LogicalResult mlir::tt::ttl::CBWaitOp::verify() {
   auto cbTy = mlir::cast<CircularBufferType>(getCb().getType());
   auto resultTy = mlir::cast<RankedTensorType>(getResult().getType());
+
+  if (getNumTiles()) {
+    return verifyCBAcquireWithNumTiles(
+        getOperation(), cbTy, resultTy,
+        static_cast<int64_t>(getNumTiles().value()));
+  }
+
   return verifyCBOpWithResult(getOperation(), cbTy, resultTy);
 }
 
@@ -997,8 +1015,15 @@ mlir::Value mlir::tt::ttl::CBReserveOp::getViewSource() { return getCb(); }
 mlir::Value mlir::tt::ttl::CBWaitOp::getViewSource() { return getCb(); }
 
 mlir::LogicalResult mlir::tt::ttl::CBPopOp::verify() {
-  // cb_pop has no result to verify; the CB type is already enforced by
-  // tablegen constraints.
+  if (getNumTiles()) {
+    auto cbTy = mlir::cast<CircularBufferType>(getCb().getType());
+    int64_t cbCapacity = cbTy.getTotalElements();
+    int64_t numTiles = static_cast<int64_t>(getNumTiles().value());
+    if (numTiles > cbCapacity) {
+      return emitOpError() << "num_tiles (" << numTiles
+                           << ") exceeds DFB capacity (" << cbCapacity << ")";
+    }
+  }
   return success();
 }
 
@@ -1027,7 +1052,9 @@ mlir::LogicalResult mlir::tt::ttl::StoreOp::verify() {
     }
   }
 
-  if (!getView().getDefiningOp<CBReserveOp>()) {
+  // The view must ultimately come from a `ttl.cb_reserve`, possibly
+  // through intervening `tensor.extract_slice` ops.
+  if (!findCBReserveForView(getView())) {
     return emitOpError() << "view must come from ttl.cb_reserve";
   }
 
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index b3e298c0c..e8abe7f69 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -25,6 +25,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
   }
   pm.addNestedPass<func::FuncOp>(createTTLInsertCopyWait());
   pm.addNestedPass<func::FuncOp>(createTTLInsertCBSync());
+  pm.addNestedPass<func::FuncOp>(createTTLCoalesceDFBAcquires());
   pm.addPass(createTTLAnnotateL1AccLoops());
   pm.addPass(createTTLConvertTTLToCompute());
   {
diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt
index b9eeaef27..045fbdb71 100644
--- a/lib/Dialect/TTL/Transforms/CMakeLists.txt
+++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_dialect_library(TTLangTTLTransforms
   LowerSignpostToEmitC.cpp
   TTLAnnotateCBAssociations.cpp
   TTLAnnotateL1AccLoops.cpp
+  TTLCoalesceDFBAcquires.cpp
   TTLDumpCBFlowGraph.cpp
   TTLFinalizeDFBIndices.cpp
   TTLInsertCBSync.cpp
diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLToCompute.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLToCompute.cpp
index a286328a5..6898a3f6c 100644
--- a/lib/Dialect/TTL/Transforms/ConvertTTLToCompute.cpp
+++ b/lib/Dialect/TTL/Transforms/ConvertTTLToCompute.cpp
@@ -51,7 +51,7 @@ static SmallVector<Value> collectOutputCBs(Operation *op) {
   DenseSet<Value> seen;
   for (OpOperand &use : op->getResult(0).getUses()) {
     if (auto storeOp = dyn_cast<StoreOp>(use.getOwner())) {
-      auto reserve = storeOp.getView().getDefiningOp<CBReserveOp>();
+      auto reserve = findCBReserveForView(storeOp.getView());
       if (!reserve) {
         return {};
       }
@@ -1176,7 +1176,7 @@ struct LowerStoreToCompute : OpRewritePattern<StoreOp> {
                                 PatternRewriter &rewriter) const override {
     Value input = op.getTensor();
     Value reserveView = op.getView();
-    auto reserve = reserveView.getDefiningOp<CBReserveOp>();
+    auto reserve = findCBReserveForView(reserveView);
     if (!reserve) {
       return rewriter.notifyMatchFailure(op, "view not from ttl.cb_reserve");
     }
diff --git a/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
new file mode 100644
index 000000000..cee1fdff0
--- /dev/null
+++ b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
@@ -0,0 +1,223 @@
+// SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//===----------------------------------------------------------------------===//
+// TTL Coalesce DFB Acquires
+//===----------------------------------------------------------------------===//
+//
+// Rewrites N consecutive same-DFB acquires + N matching releases into the
+// canonical tt-metal cumulative-wait shape:
+//
+//     cb_wait_front(cb, N*k);
+//     copy_tile(cb, /*src_idx=*/0,    dst);
+//     copy_tile(cb, /*src_idx=*/k,    dst);
+//     ...
+//     cb_pop_front(cb, N*k);
+//
+// At the IR level:
+//
+//     %t1 = ttl.cb_wait %cb            %g  = ttl.cb_wait %cb {num_tiles=N*k}
+//     %t2 = ttl.cb_wait %cb            %t1 = extract_slice %g [0, 0]   [1,k]
+//     ...                              %t2 = extract_slice %g [0, k]   [1,k]
+//     ttl.cb_pop %cb                   ...
+//     ttl.cb_pop %cb                   ttl.cb_pop %cb {num_tiles=N*k}
+//
+// `addSliceOffset` already folds the `extract_slice` offsets into the
+// per-tile `src_idx` / `dst_idx` at lowering, so no lowering changes are
+// needed. Symmetric for `cb_reserve` / `cb_push`.
+//
+// See issue #556 and `docs/development/DFBManagement.md`.
+//===----------------------------------------------------------------------===//
+
+#include "ttlang/Dialect/TTL/IR/TTLOps.h"
+#include "ttlang/Dialect/TTL/Passes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define DEBUG_TYPE "ttl-coalesce-dfb-acquires"
+
+namespace mlir::tt::ttl {
+
+#define GEN_PASS_DEF_TTLCOALESCEDFBACQUIRES
+#include "ttlang/Dialect/TTL/Passes.h.inc"
+
+namespace {
+
+// Ops permitted to interleave between consecutive acquires without breaking
+// a coalescable group. Verified empirically against
+// `test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir:812-817`: the
+// frontend emits `cb_wait` immediately followed by `attach_cb`, so a
+// three-wait group spans six ops (three waits, each with its `attach_cb`).
+static bool isInterleaveOk(Operation *op) {
+  return isa<AttachCBOp, arith::ConstantOp>(op);
+}
+
+// Build the coalesced acquire's result type: given the per-acquire type
+// `tensor<1 x k x elem>` (the `num_tiles` shape convention from
+// `cb_ops_invalid.mlir` and `TTLSubblockComputeForDST`), produce
+// `tensor<1 x totalTiles x elem>`. Callers must pre-filter other shapes.
+static RankedTensorType buildCoalescedType(RankedTensorType unitTy,
+                                           int64_t totalTiles) {
+  // Query the shape inside the assert: no unused local in NDEBUG builds.
+  assert(unitTy.getRank() == 2 && unitTy.getShape()[0] == 1 &&
+         "coalesce expects rank-2 acquire with leading 1");
+  return RankedTensorType::get({1, totalTiles}, unitTy.getElementType());
+}
+
+// Create the `tensor.extract_slice` view of `coalescedResult` for group
+// member `blockIdx`: offsets [0, blockIdx*k], sizes [1, k], strides [1, 1].
+static tensor::ExtractSliceOp
+createPerBlockSlice(OpBuilder &builder, Location loc, Value coalescedResult,
+                    RankedTensorType unitTy, int64_t blockIdx, int64_t k) {
+  SmallVector<OpFoldResult, 2> offsets = {builder.getIndexAttr(0),
+                                          builder.getIndexAttr(blockIdx * k)};
+  SmallVector<OpFoldResult, 2> sizes = {builder.getIndexAttr(1),
+                                        builder.getIndexAttr(k)};
+  SmallVector<OpFoldResult, 2> strides = {builder.getIndexAttr(1),
+                                          builder.getIndexAttr(1)};
+  return tensor::ExtractSliceOp::create(builder, loc, unitTy, coalescedResult,
+                                        offsets, sizes, strides);
+}
+
+// Collect the maximal run of same-CB `AcquireOp`s beginning at `start`
+// (always included), skipping interleaved ops accepted by `isInterleaveOk`.
+// A same-kind acquire on another CB or one carrying `num_tiles` (already
+// coalesced or emitted by `TTLSubblockComputeForDST`) ends the run.
+template <typename AcquireOp>
+static SmallVector<AcquireOp> detectGroup(AcquireOp start) {
+  SmallVector<AcquireOp> group;
+  group.push_back(start);
+  Value cb = start.getCb();
+  for (Operation *cur = start->getNextNode(); cur; cur = cur->getNextNode()) {
+    if (auto next = dyn_cast<AcquireOp>(cur)) {
+      if (next.getCb() == cb && !next.getNumTiles().has_value()) {
+        group.push_back(next);
+        continue;
+      }
+      break; // Same-kind acquire on different CB or already coalesced.
+    }
+    if (!isInterleaveOk(cur)) {
+      break;
+    }
+  }
+  return group;
+}
+
+// Collect the first `count` releases of kind `ReleaseOp` on `cb`, walking
+// forward from `start` (may be null) over the ops of its block; nested
+// regions are not searched. Returns empty if fewer than `count` are found
+// before block end, or if a same-CB release already carries `num_tiles`
+// (a partial earlier coalesce we shouldn't extend).
+template <typename ReleaseOp>
+static SmallVector<ReleaseOp> collectReleases(Operation *start, Value cb,
+                                              size_t count) {
+  SmallVector<ReleaseOp> releases;
+  for (Operation *op = start; op != nullptr; op = op->getNextNode()) {
+    auto release = dyn_cast<ReleaseOp>(op);
+    if (!release || release.getCb() != cb) {
+      continue;
+    }
+    if (release.getNumTiles().has_value()) {
+      return {};
+    }
+    releases.push_back(release);
+    if (releases.size() == count) {
+      return releases;
+    }
+  }
+  return {};
+}
+
+template <typename AcquireOp, typename ReleaseOp>
+static bool tryCoalesceGroup(SmallVectorImpl<AcquireOp> &group,
+                             OpBuilder &builder) {
+  AcquireOp leader = group.front();
+  Value cb = leader.getCb();
+  auto unitTy = cast<RankedTensorType>(leader.getResult().getType());
+  // Conservative: only coalesce the rank-2 leading-1 shape that the
+  // existing `num_tiles` convention covers. Returning false here, before
+  // any mutation, leaves other shapes unchanged.
+  if (unitTy.getRank() != 2 || unitTy.getShape()[0] != 1) {
+    return false;
+  }
+  int64_t k = unitTy.getShape()[1];
+  int64_t N = static_cast<int64_t>(group.size());
+  int64_t totalTiles = N * k;
+  // One release per group member is required; bail out before mutating.
+  SmallVector<ReleaseOp> releases =
+      collectReleases<ReleaseOp>(group.back()->getNextNode(), cb, group.size());
+  if (releases.empty()) {
+    return false;
+  }
+  // Emit the single multi-tile acquire at the leader's position.
+  builder.setInsertionPoint(leader);
+  Location loc = leader.getLoc();
+  RankedTensorType coalescedTy = buildCoalescedType(unitTy, totalTiles);
+  IntegerAttr numTilesAttr = builder.getI64IntegerAttr(totalTiles);
+  AcquireOp coalesced =
+      AcquireOp::create(builder, loc, coalescedTy, cb, numTilesAttr);
+  // Swap each original acquire for a slice view created in its place.
+  for (size_t i = 0; i < group.size(); ++i) {
+    AcquireOp old = group[i];
+    builder.setInsertionPoint(old);
+    Location oldLoc = old.getLoc();
+    auto slice = createPerBlockSlice(builder, oldLoc, coalesced.getResult(),
+                                     unitTy, static_cast<int64_t>(i), k);
+    old.getResult().replaceAllUsesWith(slice.getResult());
+    old.erase();
+  }
+  // The last release becomes the coalesced release; the rest are erased.
+  releases.back()->setAttr("num_tiles", numTilesAttr);
+  for (size_t i = 0; i + 1 < releases.size(); ++i) {
+    releases[i].erase();
+  }
+  return true;
+}
+
+// Walk `block` once, coalescing each run of consecutive same-DFB acquires.
+template <typename AcquireOp, typename ReleaseOp>
+static void coalesceInBlock(Block &block, OpBuilder &builder) {
+  Operation *op = &block.front();
+  while (op) {
+    Operation *next = op->getNextNode();
+    if (auto acquire = dyn_cast<AcquireOp>(op)) {
+      if (!acquire.getNumTiles().has_value()) {
+        SmallVector<AcquireOp> group = detectGroup<AcquireOp>(acquire);
+        if (group.size() >= 2) {
+          // The op after the last member may be a release that the rewrite
+          // erases, so resume from the op before the leader, which survives.
+          Operation *anchor = op->getPrevNode();
+          if (tryCoalesceGroup<AcquireOp, ReleaseOp>(group, builder)) {
+            op = anchor ? anchor->getNextNode() : &block.front();
+            continue;
+          }
+        }
+      }
+    }
+    op = next;
+  }
+}
+
+struct TTLCoalesceDFBAcquiresPass
+    : public impl::TTLCoalesceDFBAcquiresBase<TTLCoalesceDFBAcquiresPass> {
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+    OpBuilder builder(func.getContext());
+    // Per block: coalesce wait/pop groups first, then reserve/push groups.
+    func.walk([&](Block *block) {
+      if (block->empty()) {
+        return;
+      }
+      coalesceInBlock<CBWaitOp, CBPopOp>(*block, builder);
+      coalesceInBlock<CBReserveOp, CBPushOp>(*block, builder);
+    });
+  }
+};
+
+} // namespace
+
+} // namespace mlir::tt::ttl
diff --git a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
index 0fcd11159..6073b28b0 100644
--- a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
@@ -355,7 +355,8 @@ struct TTLInsertCBSyncPass
 
     insertMissingReleases(waits, pops, erased, builder,
                           [](OpBuilder &b, Location loc, Value cb) {
-                            CBPopOp::create(b, loc, cb);
+                            CBPopOp::create(b, loc, cb,
+                                            /*num_tiles=*/IntegerAttr{});
                           });
   }
 };
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index f4c26c504..f2ee2ad71 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -1359,6 +1359,7 @@ def _compile_kernel(
             f"func.func(ttl-insert-intermediate-dfbs{{enable={compiler_dfbs_flag}}})",
             "func.func(ttl-insert-copy-wait)",
             "func.func(ttl-insert-cb-sync)",
+            "func.func(ttl-coalesce-dfb-acquires)",
             "func.func(ttl-annotate-l1-acc-loops)",
             "func.func(convert-ttl-to-compute)",
             set_compute_config_pass,
diff --git a/test/me2e/builder/pipeline.py b/test/me2e/builder/pipeline.py
index 432963f12..f9f57763c 100644
--- a/test/me2e/builder/pipeline.py
+++ b/test/me2e/builder/pipeline.py
@@ -43,6 +43,7 @@ def compile_ttl_to_ttkernel(
         "ttl-insert-intermediate-dfbs",
         "ttl-insert-copy-wait",
         "ttl-insert-cb-sync",
+        "ttl-coalesce-dfb-acquires",
         "convert-ttl-to-compute",
         assign_dst_pass,
     ]
diff --git a/test/python/test_auto_pop_push.py b/test/python/test_auto_pop_push.py
index f25344bac..fbf3cd8eb 100644
--- a/test/python/test_auto_pop_push.py
+++ b/test/python/test_auto_pop_push.py
@@ -794,23 +794,17 @@ def dm_write():
 
 
 # ---------------------------------------------------------------------------
-# xfail (#556). Consumer reads tile 2 before tile 1 (out of declaration
-# order). The buffer exposes a single FIFO front pointer, so there is no
-# way to release the second slot before the first; the current pass
-# places releases in the order it observes the consumes, violating FIFO
-# monotonicity. Lifted by #556 (coalesce consecutive cb_wait into one
-# cb_wait_front(N) with per-acquire src_idx, decoupling consume order
-# from release order).
+# Reordered consumes: consumer reads tile 2 before tile 1 (out of
+# declaration order). Consecutive cb_wait acquires coalesce into one
+# multi-tile `cb_wait_front(N*k)` plus per-block `tensor.extract_slice`
+# views, so consume order is decoupled from release order; both tiles
+# are present from the single coalesced wait, and the slice offsets
+# index each block at lowering.
 # ---------------------------------------------------------------------------
 
 
 @pytest.mark.requires_device
-@pytest.mark.xfail(
-    strict=True,
-    reason="Reordered consumes (use(t2) before use(t1)) violate CB FIFO "
-    "monotonicity. Lifted by #556 (multi-tile cb_wait_front coalescing).",
-)
-def test_reordered_consumes_violate_fifo_xfail(device):
+def test_reordered_consumes_decoupled_from_fifo(device):
     @ttl.operation(grid=(1, 1))
     def repro(out):
         cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
diff --git a/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir b/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir
index 10ea1ba6f..1cfaa3523 100644
--- a/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir
+++ b/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir
@@ -111,22 +111,23 @@ module {
 
 // -----
 
-// cb_reserve with num_tiles exceeding CB capacity.
+// cb_reserve with num_tiles exceeding CB capacity (across all blocks).
+// Capacity = elementsPerBlock * blockCount = 9 * 2 = 18.
 module {
-  func.func @cb_reserve_num_tiles_exceeds_capacity(%cb: !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<5x3x!ttcore.tile<32x32, bf16>> attributes {ttl.kernel_thread = #ttkernel.thread<noc>} {
-    // expected-error @below {{num_tiles (15) exceeds DFB capacity (9)}}
-    %view = ttl.cb_reserve %cb {num_tiles = 15 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<5x3x!ttcore.tile<32x32, bf16>>
-    func.return %view : tensor<5x3x!ttcore.tile<32x32, bf16>>
+  func.func @cb_reserve_num_tiles_exceeds_capacity(%cb: !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x3x!ttcore.tile<32x32, bf16>> attributes {ttl.kernel_thread = #ttkernel.thread<noc>} {
+    // expected-error @below {{num_tiles (24) exceeds DFB capacity (18)}}
+    %view = ttl.cb_reserve %cb {num_tiles = 24 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<8x3x!ttcore.tile<32x32, bf16>>
+    func.return %view : tensor<8x3x!ttcore.tile<32x32, bf16>>
   }
 }
 
 // -----
 
-// cb_push with num_tiles exceeding CB capacity.
+// cb_push with num_tiles exceeding CB capacity (across all blocks).
 module {
   func.func @cb_push_num_tiles_exceeds_capacity(%cb: !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>) attributes {ttl.kernel_thread = #ttkernel.thread<noc>} {
-    // expected-error @below {{'ttl.cb_push' op num_tiles (15) exceeds DFB capacity (9)}}
-    ttl.cb_push %cb {num_tiles = 15 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2>
+    // expected-error @below {{'ttl.cb_push' op num_tiles (24) exceeds DFB capacity (18)}}
+    ttl.cb_push %cb {num_tiles = 24 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2>
     func.return
   }
 }
diff --git a/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir b/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
new file mode 100644
index 000000000..0ea3c10b5
--- /dev/null
+++ b/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
@@ -0,0 +1,228 @@
+// Verifies ttl-coalesce-dfb-acquires: strictly-consecutive same-DFB
+// acquires collapse into a single multi-tile acquire plus per-block
+// extract_slice views, with N matching releases collapsing into one
+// carrying num_tiles=N*k. See issue #556.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-coalesce-dfb-acquires))' --split-input-file | FileCheck %s
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-coalesce-dfb-acquires,ttl-coalesce-dfb-acquires))' --split-input-file | FileCheck %s
+
+// Test 1: three consecutive cb_wait + three pops -> one cb_wait{num_tiles=3}
+// + three extract_slices + one cb_pop{num_tiles=3}.
+
+// CHECK-LABEL: func.func @three_waits_consumer
+// CHECK: %[[CBIN:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: %[[CBOUT:.+]] = ttl.bind_cb{cb_index = 1
+// CHECK: %[[GROUP:.+]] = ttl.cb_wait %[[CBIN]] {num_tiles = 3 : i64}
+// CHECK-SAME: tensor<1x3x!ttcore.tile<32x32, bf16>>
+// CHECK-NEXT: %[[S0:.+]] = tensor.extract_slice %[[GROUP]][0, 0] [1, 1] [1, 1]
+// CHECK-NEXT: ttl.attach_cb %[[S0]]
+// CHECK-NEXT: %[[S1:.+]] = tensor.extract_slice %[[GROUP]][0, 1] [1, 1] [1, 1]
+// CHECK-NEXT: ttl.attach_cb %[[S1]]
+// CHECK-NEXT: %[[S2:.+]] = tensor.extract_slice %[[GROUP]][0, 2] [1, 1] [1, 1]
+// CHECK-NEXT: ttl.attach_cb %[[S2]]
+// CHECK: ttl.cb_pop %[[CBIN]] {num_tiles = 3 : i64}
+// CHECK-NOT: ttl.cb_wait
+// CHECK-NOT: ttl.cb_pop
+// CHECK: return
+func.func @three_waits_consumer()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 3} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 3} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %w0 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a0 = ttl.attach_cb %w0, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %w1 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a1 = ttl.attach_cb %w1, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %w2 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a2 = ttl.attach_cb %w2, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %r1 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a1, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %r2 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a2, %r2 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  func.return
+}
+
+// -----
+
+// Test 2: producer-side analog. Three consecutive cb_reserve + three pushes
+// collapse to one cb_reserve{num_tiles=3} + three extract_slices routed to
+// stores + one cb_push{num_tiles=3}.
+
+// CHECK-LABEL: func.func @three_reserves_producer
+// CHECK: %[[CB:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: %[[GROUP:.+]] = ttl.cb_reserve %[[CB]] {num_tiles = 3 : i64}
+// CHECK-SAME: tensor<1x3x!ttcore.tile<32x32, bf16>>
+// CHECK-NEXT: tensor.extract_slice %[[GROUP]][0, 0] [1, 1] [1, 1]
+// CHECK-NEXT: tensor.extract_slice %[[GROUP]][0, 1] [1, 1] [1, 1]
+// CHECK-NEXT: tensor.extract_slice %[[GROUP]][0, 2] [1, 1] [1, 1]
+// CHECK: ttl.cb_push %[[CB]] {num_tiles = 3 : i64}
+// CHECK-NOT: ttl.cb_reserve
+// CHECK-NOT: ttl.cb_push
+// CHECK: return
+func.func @three_reserves_producer(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>)
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb = ttl.bind_cb{cb_index = 0, block_count = 3} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  %r0 = ttl.cb_reserve %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r1 = ttl.cb_reserve %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r2 = ttl.cb_reserve %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %arg0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_push %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  ttl.store %arg0, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_push %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  ttl.store %arg0, %r2 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_push %cb : <[1, 1], !ttcore.tile<32x32, bf16>, 3>
+  func.return
+}
+
+// -----
+
+// Test 3: four consecutive waits inside scf.for body coalesce per iteration.
+
+// CHECK-LABEL: func.func @four_waits_in_loop
+// CHECK: scf.for
+// CHECK: ttl.cb_wait {{.*}} {num_tiles = 4 : i64}
+// CHECK-SAME: tensor<1x4x!ttcore.tile<32x32, bf16>>
+// CHECK-COUNT-4: tensor.extract_slice
+// CHECK: ttl.cb_pop {{.*}} {num_tiles = 4 : i64}
+// CHECK: }
+// CHECK-NOT: ttl.cb_wait
+// CHECK-NOT: ttl.cb_pop
+// CHECK: return
+func.func @four_waits_in_loop()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 12} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 4} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  scf.for %i = %c0 to %c3 step %c1 {
+    %w0 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a0 = ttl.attach_cb %w0, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %w1 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a1 = ttl.attach_cb %w1, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %w2 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a2 = ttl.attach_cb %w2, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %w3 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %a3 = ttl.attach_cb %w3, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 12>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+    %r1 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a1, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+    %r2 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a2, %r2 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+    %r3 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %a3, %r3 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 12>
+    ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  }
+  func.return
+}
+
+// -----
+
+// Test 4 (negative): wait, use, wait, use — non-consecutive acquires.
+// The use of %w0 between the waits breaks the run; nothing coalesces.
+
+// CHECK-LABEL: func.func @interleaved_consume_not_coalesced
+// CHECK-NOT: num_tiles
+// CHECK-NOT: tensor.extract_slice
+func.func @interleaved_consume_not_coalesced()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %w0 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a0 = ttl.attach_cb %w0, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %w1 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a1 = ttl.attach_cb %w1, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r1 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a1, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  func.return
+}
+
+// -----
+
+// Test 5 (negative): waits on different CBs alternating — neither group
+// is "strictly consecutive on the same DFB". No coalescing.
+
+// CHECK-LABEL: func.func @alternating_cbs_not_coalesced
+// CHECK-NOT: num_tiles
+// CHECK-NOT: tensor.extract_slice
+func.func @alternating_cbs_not_coalesced()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_a = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb_b = ttl.bind_cb{cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb_out = ttl.bind_cb{cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %wa = ttl.cb_wait %cb_a : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %wb = ttl.cb_wait %cb_b : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %aa = ttl.attach_cb %wa, %cb_a : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %ab = ttl.attach_cb %wb, %cb_b : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %aa, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_a : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_pop %cb_b : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  func.return
+}
+
+// -----
+
+// Test 6 (negative): cb_reserve already carrying num_tiles (e.g. set by
+// ttl-subblock-compute-for-dst) is left untouched.
+
+// CHECK-LABEL: func.func @existing_num_tiles_untouched
+// CHECK: ttl.cb_reserve %{{.*}} {num_tiles = 2 : i64}
+// CHECK-NOT: tensor.extract_slice
+// CHECK: return
+func.func @existing_num_tiles_untouched(
+    %arg0: tensor<1x2x!ttcore.tile<32x32, bf16>>)
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %r = ttl.cb_reserve %cb {num_tiles = 2 : i64} : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x2x!ttcore.tile<32x32, bf16>>
+  ttl.store %arg0, %r : tensor<1x2x!ttcore.tile<32x32, bf16>>, tensor<1x2x!ttcore.tile<32x32, bf16>>
+  ttl.cb_push %cb {num_tiles = 2 : i64} : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  func.return
+}
+
+// -----
+
+// Test 7: single cb_wait without a sibling on the same DFB stays
+// unchanged (no group of >= 2 to coalesce).
+
+// CHECK-LABEL: func.func @single_wait_unchanged
+// CHECK: ttl.cb_wait
+// CHECK-NOT: num_tiles
+// CHECK: ttl.cb_pop
+// CHECK-NOT: num_tiles
+// CHECK: return
+func.func @single_wait_unchanged()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %w = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a = ttl.attach_cb %w, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a, %r : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  func.return
+}

From eca337db5d0cb18a8035bb38018879bfd389ca35 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 8 May 2026 21:09:21 -0700
Subject: [PATCH 07/11] Generalize ttl-coalesce-dfb-acquires termination rule
 (mayReleaseDFB); extend correctness argument and idempotency notes in
 DFBManagement.md; add adversarial pytests (matmul-style two-DFB interleave,
 t1-fanout, interposed third acquire) and matching lit coverage; rename
 CB->DFB in new content.

---
 docs/development/DFBManagement.md             | 156 ++++++++++++-
 include/ttlang/Dialect/TTL/Passes.td          |   9 +-
 .../TTL/Transforms/TTLCoalesceDFBAcquires.cpp | 110 ++++++---
 test/python/test_auto_pop_push.py             | 213 +++++++++++++++++-
 .../TTL/Transforms/coalesce_dfb_acquires.mlir |  42 ++++
 5 files changed, 483 insertions(+), 47 deletions(-)

diff --git a/docs/development/DFBManagement.md b/docs/development/DFBManagement.md
index f3a88bff6..ea44c5739 100644
--- a/docs/development/DFBManagement.md
+++ b/docs/development/DFBManagement.md
@@ -126,12 +126,13 @@ For each acquire `A`, the inserted release `R_A` must satisfy:
 
 (1) is enforced explicitly by the pass. (2) is enforced *implicitly* when
 consumers under criterion (a) appear in declaration order
-(`use(t1); use(t2); use(t3)`), because the resulting `lastUse(A_i)` values are
-then themselves in op order. Reordered consumes (`use(t2); use(t1)`) silently
-violate (2): the pass places `R_0` after `R_1` and the front pointer advances
-past `t1`'s slot before `t1` is read. Lifting that restriction is future work
-that requires multi-tile `cb_wait_front(N)` with per-acquire `src_idx` so each
-consumer reads its tile by index, decoupled from pop ordering.
+(`use(t1); use(t2); use(t3)`). Reordered consumes (`use(t2); use(t1)`) would
+violate FIFO monotonicity on their own, but in the current pipeline `TTLCoalesceDFBAcquires`
+runs immediately after `TTLInsertCBSync` and rewrites N consecutive same-DFB
+acquires into one multi-tile acquire plus per-block `tensor.extract_slice`
+views and a single coalesced release with `num_tiles = N*k`. Per-tile
+`src_idx` values fall out of `extract_slice` offsets, so consume order is
+decoupled from release order and (2) is preserved by construction.
 
 ### Idempotency
 
@@ -230,6 +231,149 @@ The same-block release check makes the pass idempotent. A release after the
 next acquire in the same DFB sync class belongs to that later interval and does
 not satisfy the earlier acquire.
 
+## DFB Acquire Coalescing
+
+`TTLCoalesceDFBAcquires` runs immediately after `TTLInsertCBSync` and
+rewrites a maximal run of consecutive same-DFB acquires (and their matched
+releases) into a single multi-tile acquire plus per-block
+`tensor.extract_slice` views, with the matched releases collapsed into one
+release carrying `num_tiles = N*k`.
+
+```
+%t1 = ttl.cb_wait %cb            %g  = ttl.cb_wait %cb {num_tiles=N*k}
+%t2 = ttl.cb_wait %cb            %t1 = extract_slice %g [0, 0]   [1,k]
+...                              %t2 = extract_slice %g [0, k]   [1,k]
+ttl.cb_pop %cb                   ...
+ttl.cb_pop %cb                   ttl.cb_pop %cb {num_tiles=N*k}
+```
+
+This matches the canonical tt-metal "cumulative wait + indexed reads +
+coalesced pop" pattern (eltwise_binary.cpp, bcast_h.cpp, the matmul
+kernels). Without coalescing each acquire lowers to its own
+non-cumulative `cb_wait_front(k)` / `cb_pop_front(k)`, which races
+whenever consumes are deferred: the first pop advances the front before
+the producer has pushed enough tiles to satisfy the next read.
+
+`addSliceOffset` (`include/ttlang/Dialect/Utils/ConversionUtils.h`) folds
+each `extract_slice` offset into the per-tile `src_idx` / `dst_idx` at
+lowering, so no lowering changes are required. The producer side
+(`cb_reserve` / `cb_push`) uses the same templated helpers — per-block
+`extract_slice`s become the views of downstream `ttl.tile_store` /
+`ttl.store` ops, and `addSliceOffset` handles store-side dst indices the
+same way.
+
+### Correctness criterion
+
+For a candidate group of acquires `G = {a_1, ..., a_N}` on DFB `c`, the
+rewrite is correct iff every op `O` between consecutive group members
+preserves the synchronization invariant of `c` under the coalesced
+schedule. The coalesced acquire blocks until `N*k` tiles are present
+*before* anything between original `a_i` and `a_{i+1}` runs; the
+coalesced release runs only after the last group member's last use.
+
+This holds iff no op between members causes a release on `c` (directly or
+transitively): the original IR may have allowed the producer to recycle
+slots between `a_i` and `a_{i+1}`, and the coalesced version forbids that
+until the very end. Forbidding inter-member releases is therefore
+necessary for correctness at low `block_count`, and sufficient when paired
+with the coalesced release placement.
+
+A locally-checkable (sound but conservative) version of that criterion: an
+op `O` between members is treated as safe to skip past only when none of the following holds:
+
+1. `O` operates on `c` directly (`c` appears as an operand). Covers
+   `cb_pop` / `cb_push` on `c` and any other op that reads or writes `c`.
+2. `O` consumes the SSA result of any current group member. A consume can
+   flow into a release on `c` somewhere downstream, and we don't perform
+   transitive analysis.
+3. `O` carries a region. Region bodies might contain a release on `c`;
+   conservative cutoff.
+
+Anything else — an acquire or release on a different DFB, `arith.constant`,
+pure compute on other DFBs — cannot affect `c` and is safe. `ttl.attach_cb`
+is explicitly excluded from rules (1)–(2): it is an SSA-only identity op
+(the metal lowering erases it) that always references the group's results
+and `cb` as operands, so the generic check would otherwise wrongly break
+the group at every `attach_cb`.
+
+#### Why this is sufficient
+
+Suppose `O` between `a_i` and `a_{i+1}` triggers none of the three
+conditions above. Then:
+
+- `O` does not directly call any release on `c` (rule 1).
+- `O` does not consume any tile from `G` (rule 2 applies to its operands),
+  so its outputs cannot carry a data dependence on `G`'s tiles.
+- `O` has no inner region that could hide an indirect release on `c`
+  (rule 3).
+
+So the only way a release on `c` could appear before the coalesced
+release is via a transitive use of some non-`G` value. Because rule 2
+forbids `G`'s outputs from being inputs to `O`, no fresh dataflow path is
+created from `G` into a `c` release. Any release on `c` reachable from
+some unrelated value would have run in the original IR too, at exactly
+the same op-order position, so the coalesced version is no worse.
+
+#### Why this is necessary
+
+If `O` is itself a release on `c` (e.g., a user-written `cb_pop`), the
+original IR lets the producer recycle one slot at `O`, but the coalesced
+acquire holds all `N*k` slots from the start. With `block_count` only
+slightly larger than the working set, the producer cannot push the next
+batch and the consumer cannot release until all members are consumed —
+deadlock. Same argument for transitive releases via group results.
+
+### Detection algorithm
+
+Per block, pre-collect all acquires of the kind under consideration
+(`cb_wait` for the consumer pass; `cb_reserve` for the producer pass).
+For each candidate leader (in op order):
+
+```
+if leader is already coalesced (num_tiles set) or already erased:
+  continue
+
+group = [leader]
+for op = leader.nextOp; op != nullptr; op = op.nextOp:
+  if op is a same-kind same-cb acquire with no num_tiles:
+    group.push_back(op); continue
+  if op is a same-kind acquire on a different DFB:
+    continue  # benign: cannot touch our DFB or our group's results
+  if mayReleaseDFB(op, cb=leader.cb, group):
+    break
+  # else: tolerate (different-DFB op, attach_cb, arith, ...)
+
+if group.size() < 2: continue
+match N releases on cb after the last group member, in op order
+apply rewrite, mark group members as erased
+```
+
+Because the candidate set is fixed before any rewrite, acquires on a
+different DFB that the inner loop skips past (e.g., the matmul-style
+`a1, b1, a2, b2` interleave) still get a chance to lead their own group
+on a later iteration of the outer loop.
+
+### Idempotency
+
+The coalesced acquire and release carry a `num_tiles` attribute, and
+`detectGroup` skips acquires that already have one. A second run of the
+pass therefore finds no candidate groups and is a no-op. The doubled-pass
+lit invocation
+(`--pass-pipeline='builtin.module(func.func(ttl-coalesce-dfb-acquires,
+ttl-coalesce-dfb-acquires))'`) verifies this.
+
+### Limitations
+
+- Non-rank-2 acquire shapes are not coalesced. The existing `num_tiles`
+  shape convention (matching `TTLSubblockComputeForDST`) produces
+  `tensor<1, num_tiles, elem>`; the pass conservatively bails on other
+  ranks rather than picking an axis to scale.
+- Acquires already carrying `num_tiles` (set by
+  `TTLSubblockComputeForDST`) are not extended.
+- Region-bearing ops between members terminate the group, so a group never
+  extends across an `scf.if` or `scf.for` op (coalescing inside a loop body
+  still works because the body is its own block).
+
 ## Index Reuse
 
 `TTLFinalizeDFBIndices` reduces the physical DFB count by assigning the same index to compiler-allocated DFBs whose lifetimes do not overlap. The algorithm runs per function. Compiler-allocated DFBs are intra-thread (both producer and consumer are in the same compute function), so their lifetimes are independent across functions.
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index aa68cc110..0394af6ea 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -81,9 +81,12 @@ def TTLCoalesceDFBAcquires
     per-tile `src_idx` at lowering time, so no lowering changes are
     required.
 
-    Symmetric on the producer side (`cb_reserve` / `cb_push`), with each
-    per-block `extract_slice` becoming the view of a downstream
-    `ttl.tile_store` / `ttl.store`.
+    The producer side (`cb_reserve` / `cb_push`) is rewritten by the same
+    templated routine: a run of `N` consecutive `cb_reserve` ops on one
+    DFB collapses into one `cb_reserve {num_tiles = N*k}` plus per-block
+    `tensor.extract_slice` views routed into the downstream
+    `ttl.tile_store` / `ttl.store` ops, and the `N` matching `cb_push`
+    ops collapse into one `cb_push {num_tiles = N*k}`.
 
     Detection: forward walk per block; an acquire group is a maximal run
     of same-kind same-DFB acquires separated only by `ttl.attach_cb` or
diff --git a/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
index cee1fdff0..6188659c0 100644
--- a/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
@@ -47,13 +47,44 @@ namespace mlir::tt::ttl {
 
 namespace {
 
-// Ops permitted to interleave between consecutive acquires without breaking
-// a coalescable group. Verified empirically against
-// `test/ttlang/Dialect/TTL/Transforms/insert_cb_sync.mlir:812-817`: the
-// frontend emits `cb_wait` immediately followed by `attach_cb`, so a
-// three-wait group has six interleaved ops.
-static bool isInterleaveOk(Operation *op) {
-  return isa<AttachCBOp, arith::ConstantOp>(op);
+// Return true if `op` (sitting between two same-DFB acquires on `cb`) might
+// directly or transitively cause a release on `cb` before our coalesced
+// release executes -- i.e., it must terminate the candidate group. See
+// "DFB Acquire Coalescing" in `docs/development/DFBManagement.md` for the
+// correctness argument. Two locally-checkable conditions cover the cases
+// that matter:
+//
+//   1. The op operates on `cb` itself (uses `cb` as an operand) -- includes
+//      same-DFB releases (cb_pop / cb_push) and any other op that touches
+//      `cb` directly.
+//   2. The op consumes the SSA result of an in-progress group member,
+//      since that consume can flow into a release on `cb` somewhere
+//      downstream.
+//
+// Region-bearing ops are treated as opaque (terminate the group) because
+// their bodies might contain a release on `cb`.
+//
+// `ttl.attach_cb` is an SSA-only identity (lowering erases it). An attach of
+// a group member uses both its result and `cb`, so exempt the op explicitly.
+static bool mayReleaseDFB(Operation *op, Value cb,
+                          ArrayRef<Operation *> group) {
+  if (isa<AttachCBOp>(op)) {
+    return false;
+  }
+  if (op->getNumRegions() > 0) {
+    return true;
+  }
+  for (Value operand : op->getOperands()) {
+    if (operand == cb) {
+      return true;
+    }
+    for (Operation *member : group) {
+      if (operand == member->getResult(0)) {
+        return true;
+      }
+    }
+  }
+  return false;
 }
 
 // Build the coalesced acquire's result type. For the common rank-2 case
@@ -83,24 +114,32 @@ createPerBlockSlice(OpBuilder &builder, Location loc, Value coalescedResult,
                                         offsets, sizes, strides);
 }
 
-// Detect a group of N >= 1 strictly-consecutive same-CB acquires of kind
-// `AcquireOp` starting at `start`. Returns the group; an acquire that
-// already carries a `num_tiles` attribute terminates the group (it has
-// already been coalesced or was emitted by `TTLSubblockComputeForDST`).
+// Detect a group of same-CB acquires of kind `AcquireOp` starting at
+// `start`. The group is maximal: walks forward in the block, adding each
+// same-kind same-cb acquire (with no pre-existing `num_tiles`) and skipping
+// any op that doesn't touch `cb` or the group's results (per
+// `mayReleaseDFB`). An acquire that already carries `num_tiles` (already
+// coalesced or set by `TTLSubblockComputeForDST`) terminates the group.
 template <typename AcquireOp>
 static SmallVector<AcquireOp> detectGroup(AcquireOp start) {
   SmallVector<AcquireOp> group;
   group.push_back(start);
   Value cb = start.getCb();
+  SmallVector<Operation *> groupOps = {start.getOperation()};
   for (Operation *cur = start->getNextNode(); cur; cur = cur->getNextNode()) {
     if (auto next = dyn_cast<AcquireOp>(cur)) {
-      if (next.getCb() == cb && !next.getNumTiles().has_value()) {
+      if (next.getCb() == cb) {
+        if (next.getNumTiles().has_value()) {
+          break;
+        }
         group.push_back(next);
+        groupOps.push_back(cur);
         continue;
       }
-      break; // Same-kind acquire on different CB or already coalesced.
+      // Different-CB acquire of the same kind -- doesn't touch our cb or
+      // our group's results; skip past.
     }
-    if (!isInterleaveOk(cur)) {
+    if (mayReleaseDFB(cur, cb, groupOps)) {
       break;
     }
   }
@@ -178,27 +217,36 @@ static bool tryCoalesceGroup(SmallVectorImpl<AcquireOp> &group,
   return true;
 }
 
-// Walk `block` once, applying coalescing to consecutive acquires.
+// Apply coalescing to acquires of kind `AcquireOp` in `block`. Pre-collects
+// the candidate set so that other-CB acquires which `detectGroup` skips
+// past still get a chance to lead their own group on a later iteration --
+// we don't rely on traversing erased ops via `getNextNode()`.
 template <typename AcquireOp, typename ReleaseOp>
 static void coalesceInBlock(Block &block, OpBuilder &builder) {
-  Operation *op = &block.front();
-  while (op) {
-    Operation *next = op->getNextNode();
-    if (auto acquire = dyn_cast<AcquireOp>(op)) {
-      if (!acquire.getNumTiles().has_value()) {
-        SmallVector<AcquireOp> group = detectGroup<AcquireOp>(acquire);
-        if (group.size() >= 2) {
-          // Capture the resume point before the rewrite; the last group
-          // member is erased but the op after it (if any) remains valid.
-          Operation *resume = group.back()->getNextNode();
-          if (tryCoalesceGroup<AcquireOp, ReleaseOp>(group, builder)) {
-            op = resume;
-            continue;
-          }
-        }
+  SmallVector<AcquireOp> candidates;
+  for (Operation &op : block) {
+    if (auto acquire = dyn_cast<AcquireOp>(&op)) {
+      candidates.push_back(acquire);
+    }
+  }
+  DenseSet<Operation *> erased;
+  for (AcquireOp leader : candidates) {
+    Operation *leaderOp = leader.getOperation();
+    if (erased.contains(leaderOp)) {
+      continue;
+    }
+    if (leader.getNumTiles().has_value()) {
+      continue;
+    }
+    SmallVector<AcquireOp> group = detectGroup<AcquireOp>(leader);
+    if (group.size() < 2) {
+      continue;
+    }
+    if (tryCoalesceGroup<AcquireOp, ReleaseOp>(group, builder)) {
+      for (AcquireOp member : group) {
+        erased.insert(member.getOperation());
       }
     }
-    op = next;
   }
 }
 
diff --git a/test/python/test_auto_pop_push.py b/test/python/test_auto_pop_push.py
index fbf3cd8eb..c0dbc74e8 100644
--- a/test/python/test_auto_pop_push.py
+++ b/test/python/test_auto_pop_push.py
@@ -11,7 +11,7 @@
 
 Several tests are marked xfail(strict). Each describes a real pattern
 that currently produces wrong runtime output (or fails to compile) and
-will start passing once a tracked compiler follow-up lands. The
+will start passing once a tracked compiler follow-up is merged. The
 explanation for each is at the test site.
 """
 
@@ -595,8 +595,8 @@ def dm_write():
 # directly rather than a tensor SSA value derived from cb_reserve, so the
 # IR carries no def-use edge identifying which copy fills which reserve.
 # The pass falls back to op-order reasoning and attributes all three
-# copies to the last reserve. The push for the earlier reserves lands
-# before any data is written; the buffer's write pointer advances past
+# copies to the last reserve. The push for the earlier reserves is
+# emitted before any data is written; the buffer's write pointer advances past
 # empty slots. Lifted by #555 (encode DFB ownership in SSA on ttl.copy).
 # ---------------------------------------------------------------------------
 
@@ -1071,7 +1071,7 @@ def dm_write():
 #
 # A single cb.wait() followed by two ttl.copy() reads from the same slot to
 # different output positions. Both copies are direct CB operands on the same
-# acquire (criterion-b ownership). The pop must land after the last copy; if
+# acquire (criterion-b ownership). The pop must be inserted after the last copy; if
 # findLastOwnedUse stopped at the first copy, the pop would advance the read
 # pointer before the second copy reads, producing stale data in row 1.
 # ---------------------------------------------------------------------------
@@ -1109,7 +1109,7 @@ def dm_write():
 # ---------------------------------------------------------------------------
 # Producer-side analog of case_b: 3 consecutive cb.reserve() per iteration
 # of an scf.for, with the matching stores deferred until after the third
-# reserve. Each push must land after its own slot's store, inside the loop
+# reserve. Each push must be inserted after its own slot's store, inside the loop
 # body. Symmetric coverage to test 28 in insert_cb_sync.mlir for producers.
 # ---------------------------------------------------------------------------
 
@@ -1157,8 +1157,8 @@ def dm_write():
 # xfail (#540). Tensor recurrence (acc = acc + ...) carrying an acquired
 # tile through scf.for iter_args. The DSL today does not lower this
 # pattern consistently; PR #540 adds the missing materialization. Once
-# #540 lands, the auto-pop pass must follow uses through the iter_arg
-# block argument so the pop lands after the loop, not before. Mirrors
+# #540 is merged, the auto-pop pass must follow uses through the iter_arg
+# block argument so the pop is placed after the loop, not before. Mirrors
 # lit test 30 in insert_cb_sync.mlir.
 # ---------------------------------------------------------------------------
 
@@ -1197,3 +1197,202 @@ def dm_write():
             ttl.copy(blk, out[0, 0]).wait()
 
     _run(device, repro, 1, [float(2**N)])
+
+
+# ---------------------------------------------------------------------------
+# A third same-DFB acquire is interposed between two coalescable waits
+# and their releases. Auto-pop places pop_t1 right after t1's last use,
+# t3's wait runs before t2's last use, then pop_t2 is emitted. The coalescing
+# rewrite collapses pop_t1 and pop_t2 into a single coalesced pop that
+# now sits past the interposed t3 wait; this verifies correctness of
+# that placement.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_third_acquire_interposed_between_coalesced_pops(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=4)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 1.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 2.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 3.0))
+
+            t1 = cb.wait()
+            t2 = cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t1)
+            t3 = cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t2)
+            with out_cb.reserve() as o:
+                o.store(t3)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+
+    _run(device, repro, 3, [1.0, 2.0, 3.0])
+
+
+# ---------------------------------------------------------------------------
+# Producer-side multi-tile block shape. Three consecutive cb.reserve()
+# handles, each shape=(1, 2), with deferred stores on the block-shaped
+# views. Verifies that the producer-side coalesce + per-block
+# extract_slice + dst_idx fold line up for k > 1.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_producer_three_reserves_multi_tile_block_shape(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(inp, out):
+        cb = ttl.make_dataflow_buffer_like(inp, shape=(1, 2), block_count=2)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 2), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            t = cb.wait()
+            r1 = out_cb.reserve()
+            r2 = out_cb.reserve()
+            r3 = out_cb.reserve()
+            r1.store(t)
+            r2.store(t)
+            r3.store(t)
+
+        @ttl.datamovement()
+        def dm_read():
+            r = cb.reserve()
+            tx = ttl.copy(inp[0:1, 0:2], r)
+            tx.wait()
+            r.push()
+
+        @ttl.datamovement()
+        def dm_write():
+            for col in range(3):
+                blk = out_cb.wait()
+                ttl.copy(blk, out[0:1, 2 * col : 2 * col + 2]).wait()
+                blk.pop()
+
+    torch.manual_seed(424)
+    inp_t = to_dram(torch.randn((TILE, 2 * TILE), dtype=torch.bfloat16), device)
+    out_t = to_dram(torch.full((TILE, 6 * TILE), -42.0, dtype=torch.bfloat16), device)
+    repro(inp_t, out_t)
+    ttnn.synchronize_device(device)
+    inp_h = ttnn.to_torch(inp_t)
+    out_h = ttnn.to_torch(out_t)
+    for col in range(3):
+        col_slice = out_h[:, 2 * TILE * col : 2 * TILE * (col + 1)]
+        assert torch.equal(col_slice, inp_h), f"output block {col} differs from input"
+
+
+# ---------------------------------------------------------------------------
+# Two deferred waits where t1 has fan-out (used twice) before the
+# auto-pop pop point. After coalescing, replaceAllUsesWith must update
+# every t1 use, not just one.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_deferred_waits_with_t1_fanout(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=3)
+
+        @ttl.compute()
+        def compute():
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 5.0))
+            with cb.reserve() as v:
+                v.store(ttl.math.fill(v, 7.0))
+
+            t1 = cb.wait()
+            t2 = cb.wait()
+            with out_cb.reserve() as o:
+                o.store(t1)
+            with out_cb.reserve() as o:
+                o.store(t1)
+            with out_cb.reserve() as o:
+                o.store(t2)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 2]).wait()
+
+    _run(device, repro, 3, [5.0, 5.0, 7.0])
+
+
+# ---------------------------------------------------------------------------
+# Matmul-style pattern: 2 waits on cb_a interleaved with 2 waits on cb_b
+# (a1, b1, a2, b2). Each CB has its own pair of deferred consumes, but
+# the source pairs them across CBs. cb_a's two waits coalesce
+# independently of cb_b's two waits.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_matmul_style_two_cb_interleaved_deferred_acquires(device):
+    @ttl.operation(grid=(1, 1))
+    def repro(out):
+        cb_a = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+        cb_b = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+        out_cb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            with cb_a.reserve() as v:
+                v.store(ttl.math.fill(v, 11.0))
+            with cb_a.reserve() as v:
+                v.store(ttl.math.fill(v, 22.0))
+            with cb_b.reserve() as v:
+                v.store(ttl.math.fill(v, 33.0))
+            with cb_b.reserve() as v:
+                v.store(ttl.math.fill(v, 44.0))
+
+            a1 = cb_a.wait()
+            b1 = cb_b.wait()
+            a2 = cb_a.wait()
+            b2 = cb_b.wait()
+            with out_cb.reserve() as o:
+                o.store(a1 + b1)
+            with out_cb.reserve() as o:
+                o.store(a2 + b2)
+
+        @ttl.datamovement()
+        def dm_read():
+            pass
+
+        @ttl.datamovement()
+        def dm_write():
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 0]).wait()
+            blk = out_cb.wait()
+            ttl.copy(blk, out[0, 1]).wait()
+
+    _run(device, repro, 2, [11.0 + 33.0, 22.0 + 44.0])
diff --git a/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir b/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
index 0ea3c10b5..b144ab251 100644
--- a/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
@@ -167,6 +167,10 @@ func.func @interleaved_consume_not_coalesced()
 // CHECK-LABEL: func.func @alternating_cbs_not_coalesced
 // CHECK-NOT: num_tiles
 // CHECK-NOT: tensor.extract_slice
+//
+// Note: this test verifies the SINGLE-acquire-per-CB pattern is left
+// alone. Multi-acquire-per-CB interleaved across CBs (matmul-style) IS
+// coalesced and is covered by Test 8 (@matmul_style_two_cb_interleaved).
 func.func @alternating_cbs_not_coalesced()
     attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
   %cb_a = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
@@ -226,3 +230,41 @@ func.func @single_wait_unchanged()
   ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
   func.return
 }
+
+// -----
+
+// Test 8: matmul-style pattern. Two waits on cb_a interleaved with two
+// waits on cb_b. Each CB independently has a coalescable group; the
+// other-CB acquire between same-CB acquires does not touch our CB or our
+// group's results, so it does not break the run.
+
+// CHECK-LABEL: func.func @matmul_style_two_cb_interleaved
+// CHECK: %[[CBA:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: %[[CBB:.+]] = ttl.bind_cb{cb_index = 1
+// CHECK: %[[GA:.+]] = ttl.cb_wait %[[CBA]] {num_tiles = 2 : i64}
+// CHECK-SAME: tensor<1x2x!ttcore.tile<32x32, bf16>>
+// CHECK: %[[GB:.+]] = ttl.cb_wait %[[CBB]] {num_tiles = 2 : i64}
+// CHECK-SAME: tensor<1x2x!ttcore.tile<32x32, bf16>>
+// CHECK-DAG: ttl.cb_pop %[[CBA]] {num_tiles = 2 : i64}
+// CHECK-DAG: ttl.cb_pop %[[CBB]] {num_tiles = 2 : i64}
+// CHECK-NOT: ttl.cb_wait
+// CHECK-NOT: ttl.cb_pop
+// CHECK: return
+func.func @matmul_style_two_cb_interleaved()
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_a = ttl.bind_cb{cb_index = 0, block_count = 4} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  %cb_b = ttl.bind_cb{cb_index = 1, block_count = 4} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  %a1 = ttl.cb_wait %cb_a : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %aa1 = ttl.attach_cb %a1, %cb_a : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b1 = ttl.cb_wait %cb_b : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %ab1 = ttl.attach_cb %b1, %cb_b : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a2 = ttl.cb_wait %cb_a : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %aa2 = ttl.attach_cb %a2, %cb_a : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b2 = ttl.cb_wait %cb_b : <[1, 1], !ttcore.tile<32x32, bf16>, 4> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %ab2 = ttl.attach_cb %b2, %cb_b : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 4>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_a : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  ttl.cb_pop %cb_b : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  ttl.cb_pop %cb_a : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  ttl.cb_pop %cb_b : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
+  func.return
+}

From 49b66ceda2406e73b8b3f34f64301100966e8cb1 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Fri, 8 May 2026 20:17:32 -0700
Subject: [PATCH 08/11] xfail the issue 541 (fixed in 547) tests

---
 test/python/pipe/test_mcast_matmul.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/python/pipe/test_mcast_matmul.py b/test/python/pipe/test_mcast_matmul.py
index 701748755..fe397f312 100644
--- a/test/python/pipe/test_mcast_matmul.py
+++ b/test/python/pipe/test_mcast_matmul.py
@@ -380,16 +380,19 @@ def _run_matmul(make_kernel, M, K, N, device, golden_fn=None):
     assert_pcc(expected, result, threshold=0.99)
 
 
+@pytest.mark.xfail(reason="Pending fix in PR #547", strict=False)
 def test_mcast_matmul(device):
     """2D mcast matmul (both A+B on dm_read)."""
     _run_matmul(make_mcast_kernel, 10240, 8192, 13312, device)
 
 
+@pytest.mark.xfail(reason="Pending fix in PR #547", strict=False)
 def test_balanced_matmul(device):
     """Balanced matmul (A on dm_read, B on dm_write)."""
     _run_matmul(make_balanced_kernel, 10240, 8192, 13312, device)
 
 
+@pytest.mark.xfail(reason="Pending fix in PR #547", strict=False)
 def test_balanced_matmul_relu(device):
     """Balanced matmul + fused relu."""
 

From b34b80a66b360d83bb853d542285e2842801a337 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sat, 9 May 2026 20:49:31 -0700
Subject: [PATCH 09/11] Bundle ttl-insert-cb-sync + ttl-coalesce-dfb-acquires
 into a registered ttl-auto-sync pipeline; update C++, Python, and me2e
 callers to use it.

Remove helper used just once.
---
 include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h         | 11 -----------
 include/ttlang/Dialect/TTL/Passes.td                |  6 +++---
 include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h |  2 ++
 include/ttlang/Dialect/Utils/ConversionUtils.h      |  9 ++++++---
 lib/CAPI/CMakeLists.txt                             |  1 +
 lib/CAPI/Dialects.cpp                               |  6 +++++-
 lib/Dialect/TTL/Pipelines/TTLPipelines.cpp          | 11 +++++++++--
 python/ttl/ttl_api.py                               |  3 +--
 test/me2e/builder/pipeline.py                       |  3 +--
 9 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
index 19ecbaac7..a47524f0a 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
+++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
@@ -54,17 +54,6 @@ inline mlir::tt::ttl::CBReserveOp findCBReserveForView(mlir::Value view) {
   return view.getDefiningOp<mlir::tt::ttl::CBReserveOp>();
 }
 
-/// Trace through any number of `ttl.attach_cb` ops and return the
-/// underlying tensor SSA value. `attach_cb` is an identity op that records
-/// a tensor->CB association; callers that want to inspect the upstream
-/// producer (e.g. a `tensor.extract_slice`) should call this first.
-inline mlir::Value traceAttachCBs(mlir::Value value) {
-  while (auto attach = value.getDefiningOp<mlir::tt::ttl::AttachCBOp>()) {
-    value = attach.getTensor();
-  }
-  return value;
-}
-
 /// Return the element type for a ttcore::TileType.
 inline std::optional<mlir::Type> getTileElementType(mlir::Type type) {
   if (auto tileType = mlir::dyn_cast<ttcore::TileType>(type)) {
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 0394af6ea..e38b52c65 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -47,9 +47,9 @@ def TTLCoalesceDFBAcquires
     cb_pop_front(cb, 3*k);
     ```
 
-    used in tt-metal compute kernels (`tt-metal/tt_metal/kernels/compute/
-    eltwise_binary.cpp`, `bcast_h.cpp`, the matmul kernels, etc.) when a
-    consumer holds a fixed multi-tile window before processing it.
+    used in tt-metal compute kernels (`eltwise_binary.cpp`, `bcast_h.cpp`,
+    matmul kernels, etc.) when a consumer holds a fixed multi-tile window
+    before processing it.
 
     Pre-coalesce, each `ttl.cb_wait` lowers to its own `cb_wait_front(k)`
     and each `ttl.cb_pop` to `cb_pop_front(k)`. Because metal's
diff --git a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
index ce3b5c650..611a10772 100644
--- a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
+++ b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
@@ -59,6 +59,8 @@ struct TTLToTTKernelPipelineOptions
 void createTTLToTTKernelPipeline(mlir::OpPassManager &pm,
                                  const TTLToTTKernelPipelineOptions &options);
 
+void buildTTLAutoSyncPipeline(mlir::OpPassManager &pm);
+
 void registerTTLPipelines();
 
 } // namespace mlir::tt::ttl
diff --git a/include/ttlang/Dialect/Utils/ConversionUtils.h b/include/ttlang/Dialect/Utils/ConversionUtils.h
index d3979e95b..5779a753f 100644
--- a/include/ttlang/Dialect/Utils/ConversionUtils.h
+++ b/include/ttlang/Dialect/Utils/ConversionUtils.h
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/AffineMap.h"
@@ -41,9 +42,11 @@ inline Value addSliceOffset(Value operand, Value localIndex, OpBuilder &builder,
   if (auto extract = tensor.getDefiningOp<mlir::tensor::ExtractOp>()) {
     tensor = extract.getTensor();
   }
-  // Trace through `ttl.attach_cb` so a slice upstream of an attach_cb is
-  // still discoverable.
-  tensor = mlir::tt::ttl::traceAttachCBs(tensor);
+  // Skip past any `ttl.attach_cb` (SSA identity) so the next
+  // `getDefiningOp` finds the extract_slice rather than the attach_cb.
+  while (auto attach = tensor.getDefiningOp<mlir::tt::ttl::AttachCBOp>()) {
+    tensor = attach.getTensor();
+  }
   auto slice = tensor.getDefiningOp<mlir::tensor::ExtractSliceOp>();
   if (!slice) {
     return localIndex;
diff --git a/lib/CAPI/CMakeLists.txt b/lib/CAPI/CMakeLists.txt
index bbb6b8d31..fe4003607 100644
--- a/lib/CAPI/CMakeLists.txt
+++ b/lib/CAPI/CMakeLists.txt
@@ -15,4 +15,5 @@ add_mlir_library(TTLangCAPI
   MLIRFuncDialect
   MLIRTTLDialect
   TTLangTTLTransforms
+  TTLangTTLPipelines
 )
diff --git a/lib/CAPI/Dialects.cpp b/lib/CAPI/Dialects.cpp
index 0ed42d30a..75a0e371a 100644
--- a/lib/CAPI/Dialects.cpp
+++ b/lib/CAPI/Dialects.cpp
@@ -5,6 +5,7 @@
 #include "ttlang-c/Dialects.h"
 #include "ttlang/Dialect/TTL/IR/TTL.h"
 #include "ttlang/Dialect/TTL/Passes.h"
+#include "ttlang/Dialect/TTL/Pipelines/TTLPipelines.h"
 
 #include "mlir/CAPI/IR.h"
 #include "mlir/CAPI/Registration.h"
@@ -30,4 +31,7 @@ void ttlangRegisterTTLDialect(MlirDialectRegistry registry) {
   unwrap(registry)->insert<TTLDialect>();
 }
 
-void ttlangRegisterPasses() { mlir::tt::ttl::registerTTLPasses(); }
+void ttlangRegisterPasses() {
+  mlir::tt::ttl::registerTTLPasses();
+  mlir::tt::ttl::registerTTLPipelines();
+}
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index e8abe7f69..ec7d9f6a0 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -24,8 +24,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
     pm.addNestedPass<func::FuncOp>(createTTLInsertIntermediateDFBs(dfbOpts));
   }
   pm.addNestedPass<func::FuncOp>(createTTLInsertCopyWait());
-  pm.addNestedPass<func::FuncOp>(createTTLInsertCBSync());
-  pm.addNestedPass<func::FuncOp>(createTTLCoalesceDFBAcquires());
+  buildTTLAutoSyncPipeline(pm.nest<func::FuncOp>());
   pm.addPass(createTTLAnnotateL1AccLoops());
   pm.addPass(createTTLConvertTTLToCompute());
   {
@@ -75,12 +74,20 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
   }
 }
 
+void buildTTLAutoSyncPipeline(OpPassManager &pm) {
+  pm.addPass(createTTLInsertCBSync());
+  pm.addPass(createTTLCoalesceDFBAcquires());
+}
+
 void registerTTLPipelines() {
   PassPipelineRegistration<TTLToTTKernelPipelineOptions>(
       "ttl-to-ttkernel-pipeline",
       "Lower TTL to TTKernel, run cleanup canonicalization/CSE, and optionally "
       "lower TTKernel to EmitC.",
       createTTLToTTKernelPipeline);
+  PassPipelineRegistration<>("ttl-auto-sync",
+                             "Insert auto pop/push and coalesce DFB acquires.",
+                             buildTTLAutoSyncPipeline);
 }
 
 } // namespace mlir::tt::ttl
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index f2ee2ad71..d165d60bf 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -1358,8 +1358,7 @@ def _compile_kernel(
         pipeline_passes = [
             f"func.func(ttl-insert-intermediate-dfbs{{enable={compiler_dfbs_flag}}})",
             "func.func(ttl-insert-copy-wait)",
-            "func.func(ttl-insert-cb-sync)",
-            "func.func(ttl-coalesce-dfb-acquires)",
+            "func.func(ttl-auto-sync)",
             "func.func(ttl-annotate-l1-acc-loops)",
             "func.func(convert-ttl-to-compute)",
             set_compute_config_pass,
diff --git a/test/me2e/builder/pipeline.py b/test/me2e/builder/pipeline.py
index f9f57763c..8c6748b51 100644
--- a/test/me2e/builder/pipeline.py
+++ b/test/me2e/builder/pipeline.py
@@ -42,8 +42,7 @@ def compile_ttl_to_ttkernel(
     func_passes = [
         "ttl-insert-intermediate-dfbs",
         "ttl-insert-copy-wait",
-        "ttl-insert-cb-sync",
-        "ttl-coalesce-dfb-acquires",
+        "ttl-auto-sync",
         "convert-ttl-to-compute",
         assign_dst_pass,
     ]

From bf22919607931513d481c73dabbc585a690c0e46 Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sat, 9 May 2026 23:14:42 -0700
Subject: [PATCH 10/11] edit comments

---
 docs/development/DFBManagement.md             |  4 +-
 include/ttlang/Dialect/TTL/Passes.td          |  7 ++-
 .../TTL/Transforms/TTLCoalesceDFBAcquires.cpp | 42 +++++++---------
 .../TTL/Transforms/TTLInsertCBSync.cpp        | 48 +++++++++----------
 test/python/test_auto_pop_push.py             |  2 +-
 5 files changed, 49 insertions(+), 54 deletions(-)

diff --git a/docs/development/DFBManagement.md b/docs/development/DFBManagement.md
index ea44c5739..7582ccb68 100644
--- a/docs/development/DFBManagement.md
+++ b/docs/development/DFBManagement.md
@@ -364,8 +364,8 @@ ttl-coalesce-dfb-acquires))'`) verifies this.
 
 ### Limitations
 
-- Non-rank-2 acquire shapes are not coalesced. The existing `num_tiles`
-  shape convention (matching `TTLSubblockComputeForDST`) produces
+- Non-rank-2 acquire result types are not coalesced. The existing
+  `num_tiles` convention (matching `TTLSubblockComputeForDST`) produces
   `tensor<1, num_tiles, elem>`; the pass conservatively bails on other
   ranks rather than picking an axis to scale.
 - Acquires already carrying `num_tiles` (set by
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index e38b52c65..455a6dbb5 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -37,7 +37,7 @@ def TTLCoalesceDFBAcquires
     ```
 
     into the canonical tt-metal "cumulative wait + indexed reads +
-    coalesced pop" shape
+    coalesced pop" pattern
 
     ```
     cb_wait_front(cb, 3*k);
@@ -93,6 +93,11 @@ def TTLCoalesceDFBAcquires
     `arith.constant` ops. Acquires already carrying a `num_tiles`
     attribute (e.g. set by `ttl-subblock-compute-for-dst`) are not
     coalesced and terminate the group. The pass is idempotent.
+
+    Result-type constraint: only acquires whose result tensor has shape
+    `<1, k>` are coalesced; the coalesced result is `<1, N*k>`. Other
+    ranks would require a convention for which axis to extend by `N`,
+    which is not specified, and are skipped.
   }];
 
   let dependentDialects = [
diff --git a/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
index 6188659c0..8ec002733 100644
--- a/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // Rewrites N consecutive same-DFB acquires + N matching releases into the
-// canonical tt-metal cumulative-wait shape:
+// canonical tt-metal cumulative-wait pattern:
 //
 //     cb_wait_front(cb, N*k);
 //     copy_tile(cb, /*src_idx=*/0,    dst);
@@ -27,7 +27,7 @@
 // per-tile `src_idx` / `dst_idx` at lowering, so no lowering changes are
 // needed. Symmetric for `cb_reserve` / `cb_push`.
 //
-// See issue #556 and `docs/development/DFBManagement.md`.
+// See `docs/development/DFBManagement.md`.
 //===----------------------------------------------------------------------===//
 
 #include "ttlang/Dialect/TTL/IR/TTLOps.h"
@@ -87,10 +87,6 @@ static bool mayReleaseDFB(Operation *op, Value cb,
   return false;
 }
 
-// Build the coalesced acquire's result type. For the common rank-2 case
-// `tensor<1 x k x elem>` (matching the `num_tiles` shape convention from
-// `cb_ops_invalid.mlir` and `TTLSubblockComputeForDST`), produce
-// `tensor<1 x (N*k) x elem>`. Higher-rank shapes are not coalesced.
 static RankedTensorType buildCoalescedType(RankedTensorType unitTy,
                                            int64_t totalTiles) {
   auto shape = unitTy.getShape();
@@ -99,8 +95,9 @@ static RankedTensorType buildCoalescedType(RankedTensorType unitTy,
   return RankedTensorType::get({1, totalTiles}, unitTy.getElementType());
 }
 
-// `tensor.extract_slice` for the i-th member of an N-block group:
-// offsets = [0, i*k], sizes = [1, k], strides = [1, 1].
+// Slice into the coalesced result that recovers the i-th member's
+// original `<1, k>` view, used as the replacement value for the i-th
+// erased acquire.
 static tensor::ExtractSliceOp
 createPerBlockSlice(OpBuilder &builder, Location loc, Value coalescedResult,
                     RankedTensorType unitTy, int64_t blockIdx, int64_t k) {
@@ -114,12 +111,9 @@ createPerBlockSlice(OpBuilder &builder, Location loc, Value coalescedResult,
                                         offsets, sizes, strides);
 }
 
-// Detect a group of same-CB acquires of kind `AcquireOp` starting at
-// `start`. The group is maximal: walks forward in the block, adding each
-// same-kind same-cb acquire (with no pre-existing `num_tiles`) and skipping
-// any op that doesn't touch `cb` or the group's results (per
-// `mayReleaseDFB`). An acquire that already carries `num_tiles` (already
-// coalesced or set by `TTLSubblockComputeForDST`) terminates the group.
+// Maximal run of coalescable same-DFB acquires anchored at `start`,
+// in op order within the enclosing block. An acquire that already
+// carries `num_tiles` is not a member and terminates the group.
 template <typename AcquireOp>
 static SmallVector<AcquireOp> detectGroup(AcquireOp start) {
   SmallVector<AcquireOp> group;
@@ -146,11 +140,9 @@ static SmallVector<AcquireOp> detectGroup(AcquireOp start) {
   return group;
 }
 
-// Collect the first `count` matching releases of kind `ReleaseOp` on `cb`
-// starting at `start`, walking forward in the same block. Returns empty if
-// fewer than `count` are found before block end, or if a same-CB release
-// already carries `num_tiles` (a partial earlier coalesce we shouldn't
-// extend).
+// The `count` releases on `cb` that the coalesced release will replace,
+// in op order. Empty result means the coalesce cannot proceed: either too
+// few releases are present, or one of them is already coalesced.
 template <typename ReleaseOp>
 static SmallVector<ReleaseOp> collectReleases(Operation *start, Value cb,
                                               size_t count) {
@@ -177,9 +169,6 @@ static bool tryCoalesceGroup(SmallVectorImpl<AcquireOp> &group,
   AcquireOp leader = group.front();
   Value cb = leader.getCb();
   auto unitTy = cast<RankedTensorType>(leader.getResult().getType());
-  // Conservative: only coalesce the rank-2 leading-1 shape that the
-  // existing `num_tiles` convention covers. Other shapes flow through
-  // unchanged.
   if (unitTy.getRank() != 2 || unitTy.getShape()[0] != 1) {
     return false;
   }
@@ -217,10 +206,11 @@ static bool tryCoalesceGroup(SmallVectorImpl<AcquireOp> &group,
   return true;
 }
 
-// Apply coalescing to acquires of kind `AcquireOp` in `block`. Pre-collects
-// the candidate set so that other-CB acquires which `detectGroup` skips
-// past still get a chance to lead their own group on a later iteration --
-// we don't rely on traversing erased ops via `getNextNode()`.
+// The candidate set is pre-collected for two reasons: an acquire on a
+// different DFB that `detectGroup` walked past as a non-member must still
+// be considered as the starting point of a separate group later; and the
+// outer iteration must not depend on `getNextNode()` after the rewrite
+// erases ops in place.
 template <typename AcquireOp, typename ReleaseOp>
 static void coalesceInBlock(Block &block, OpBuilder &builder) {
   SmallVector<AcquireOp> candidates;
diff --git a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
index 6073b28b0..774113c8f 100644
--- a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
@@ -6,14 +6,13 @@
 // TTL Insert CB Sync
 //===----------------------------------------------------------------------===//
 //
-// Inserts missing cb_push / cb_pop for unmatched cb_reserve / cb_wait ops.
-// Owned-use discovery is asymmetric: tensor SSA uses are unbounded, direct
-// CB uses are bounded by the next same-class acquire. See
-// `docs/development/DFBManagement.md` for the ownership model.
-//
-// Legality invariants:
-//   P1. cb_push follows reserve-side writes before write pointer reuse.
-//   P2. cb_pop follows wait-side reads before read pointer reuse.
+// Auto-inserts a cb_push / cb_pop after each cb_reserve / cb_wait whose
+// matching release is absent in the input IR, placing each release after
+// the last use of the acquired slot so the slot is not recycled before
+// the consumer is done with it. The classification of "last use" is
+// asymmetric between direct-CB uses and tensor-SSA uses; see
+// `docs/development/DFBManagement.md` for the rules and correctness
+// argument.
 //
 //===----------------------------------------------------------------------===//
 
@@ -47,7 +46,6 @@ struct AcquireInterval {
   Operation *syncClassBoundary;
 };
 
-/// Return true if `a` is before `b` in their common block.
 static bool isBefore(Operation *a, Operation *b) {
   return a->isBeforeInBlock(b);
 }
@@ -198,9 +196,8 @@ static void updateBoundary(Value cb, Operation *acquire,
   }
 }
 
-/// Return the closest later acquire in the same DFB sync class, projected into
-/// `acquire`'s block. Producer intervals use `cb_reserve` boundaries; consumer
-/// intervals use `cb_wait` boundaries.
+/// Return the closest later acquire on `cb` in the same DFB sync class,
+/// projected into `acquire`'s block.
 static Operation *findNextSyncClassAcquire(Value cb, Operation *acquire,
                                            ArrayRef<Operation *> acquires) {
   Operation *boundary = nullptr;
@@ -208,9 +205,10 @@ static Operation *findNextSyncClassAcquire(Value cb, Operation *acquire,
   return boundary;
 }
 
-/// Return the last op in `acquire`'s block that consumes the acquired slot.
-/// Direct CB uses are bounded by the next same-class acquire; tensor SSA
-/// uses are not. See `docs/development/DFBManagement.md` for the model.
+/// Return the last op in `acquire`'s block that consumes the acquired
+/// slot. See `docs/development/DFBManagement.md` for the asymmetric
+/// classification of direct-DFB vs. tensor-SSA uses that this walk
+/// implements.
 static Operation *findLastOwnedUse(AcquireInterval interval) {
   Operation *last = interval.acquire;
   DenseSet<Operation *> visited;
@@ -244,10 +242,12 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
     }
   };
 
-  // Direct DFB uses: start from the CB value's users and recurse through
-  // their SSA results (e.g. ttl.copy returns a transfer_handle whose ttl.wait
-  // marks the actual end of the transfer). Boundary applies because two
-  // direct DFB uses on the same CB belong to different intervals.
+  // Direct-DFB uses. The walk recurses through each user's SSA results
+  // because the *true* end of the use can be a downstream op (e.g.
+  // ttl.copy returns a transfer_handle whose ttl.wait marks the actual
+  // end of the transfer). The next-acquire boundary applies: two
+  // direct-DFB uses straddling that boundary belong to different
+  // intervals.
   for (OpOperand &use : interval.cb.getUses()) {
     Operation *user = use.getOwner();
     if (user == interval.acquire) {
@@ -263,11 +263,11 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
   }
   drainWorklist(/*ignoreBoundary=*/false);
 
-  // Tensor SSA uses: start from the acquire's result and recurse through
-  // attach_cb / store / compute users. The next-acquire boundary does NOT
-  // apply: a tile produced by `cb_wait t1` may legitimately be consumed
-  // after `cb_wait t2`. Bounding this walk caused the issue #536 follow-up
-  // bug.
+  // Tensor-SSA uses. The next-acquire boundary does NOT apply: a tile
+  // produced by `cb_wait t1` may legitimately be consumed after
+  // `cb_wait t2`, since the consumer reads through the SSA value, not
+  // the slot's identity. Applying the boundary here was the root cause
+  // of the issue #536 follow-up miscompile.
   if (interval.acquire->getNumResults() > 0) {
     worklist.push_back(interval.acquire->getResult(0));
   }
diff --git a/test/python/test_auto_pop_push.py b/test/python/test_auto_pop_push.py
index c0dbc74e8..8e4b7c429 100644
--- a/test/python/test_auto_pop_push.py
+++ b/test/python/test_auto_pop_push.py
@@ -306,7 +306,7 @@ def dm_write():
 # ---------------------------------------------------------------------------
 # Mixed immediate + deferred consumer uses. Some cb.wait results are consumed
 # before the next wait; others are consumed after multiple subsequent waits.
-# Boundary handling must be correct for both shapes simultaneously.
+# Boundary handling must be correct for both patterns simultaneously.
 # ---------------------------------------------------------------------------
 
 

From 5d02c983e879f56972e63bee084b8fcec3fd48de Mon Sep 17 00:00:00 2001
From: Boyana Norris <bnorris@tenstorrent.com>
Date: Sun, 10 May 2026 09:42:19 -0700
Subject: [PATCH 11/11] add missing tests

---
 .../TTL/Transforms/TTLCoalesceDFBAcquires.cpp |  2 +
 .../TTL/Transforms/TTLInsertCBSync.cpp        | 14 ++--
 .../ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir | 44 ++++++++++++
 .../TTL/Transforms/coalesce_dfb_acquires.mlir | 68 +++++++++++++++++++
 4 files changed, 121 insertions(+), 7 deletions(-)

diff --git a/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
index 8ec002733..5eb9cf6fb 100644
--- a/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLCoalesceDFBAcquires.cpp
@@ -79,6 +79,8 @@ static bool mayReleaseDFB(Operation *op, Value cb,
       return true;
     }
     for (Operation *member : group) {
+      assert(member->getNumResults() == 1 &&
+             "DFB acquire ops produce exactly one tensor result");
       if (operand == member->getResult(0)) {
         return true;
       }
diff --git a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
index 774113c8f..071ec1ce2 100644
--- a/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLInsertCBSync.cpp
@@ -9,10 +9,10 @@
 // Auto-inserts a cb_push / cb_pop after each cb_reserve / cb_wait whose
 // matching release is absent in the input IR, placing each release after
 // the last use of the acquired slot so the slot is not recycled before
-// the consumer is done with it. The classification of "last use" is
-// asymmetric between direct-CB uses and tensor-SSA uses; see
-// `docs/development/DFBManagement.md` for the rules and correctness
-// argument.
+// the consumer is done with it. "Last use" is classified under different
+// rules for the two valid kinds of use, direct-DFB uses and tensor-SSA
+// uses; see `docs/development/DFBManagement.md` for the rules and the
+// correctness argument.
 //
 //===----------------------------------------------------------------------===//
 
@@ -268,9 +268,9 @@ static Operation *findLastOwnedUse(AcquireInterval interval) {
   // `cb_wait t2`, since the consumer reads through the SSA value, not
   // the slot's identity. Applying the boundary here was the root cause
   // of the issue #536 follow-up miscompile.
-  if (interval.acquire->getNumResults() > 0) {
-    worklist.push_back(interval.acquire->getResult(0));
-  }
+  assert(interval.acquire->getNumResults() == 1 &&
+         "DFB acquire ops produce exactly one tensor result");
+  worklist.push_back(interval.acquire->getResult(0));
   drainWorklist(/*ignoreBoundary=*/true);
 
   return last;
diff --git a/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir b/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir
index 1cfaa3523..fb54f7c0c 100644
--- a/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir
+++ b/test/ttlang/Dialect/TTL/IR/cb_ops_invalid.mlir
@@ -132,4 +132,48 @@ module {
   }
 }
 
+// -----
+
+// cb_wait with num_tiles: element type mismatch.
+module {
+  func.func @cb_wait_num_tiles_element_mismatch(%cb: !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x3x!ttcore.tile<32x32, f32>> attributes {ttl.kernel_thread = #ttkernel.thread<noc>} {
+    // expected-error @below {{result element type ('!ttcore.tile<32x32, f32>') must match DFB element type ('!ttcore.tile<32x32, bf16>')}}
+    %view = ttl.cb_wait %cb {num_tiles = 3 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x3x!ttcore.tile<32x32, f32>>
+    func.return %view : tensor<1x3x!ttcore.tile<32x32, f32>>
+  }
+}
+
+// -----
+
+// cb_wait with num_tiles: tile count mismatch between result shape and attribute.
+module {
+  func.func @cb_wait_num_tiles_mismatch(%cb: !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x3x!ttcore.tile<32x32, bf16>> attributes {ttl.kernel_thread = #ttkernel.thread<noc>} {
+    // expected-error @below {{result tensor has 3 tiles but num_tiles attribute is 4}}
+    %view = ttl.cb_wait %cb {num_tiles = 4 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x3x!ttcore.tile<32x32, bf16>>
+    func.return %view : tensor<1x3x!ttcore.tile<32x32, bf16>>
+  }
+}
+
+// -----
+
+// cb_wait with num_tiles exceeding CB capacity (across all blocks).
+module {
+  func.func @cb_wait_num_tiles_exceeds_capacity(%cb: !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x3x!ttcore.tile<32x32, bf16>> attributes {ttl.kernel_thread = #ttkernel.thread<noc>} {
+    // expected-error @below {{num_tiles (24) exceeds DFB capacity (18)}}
+    %view = ttl.cb_wait %cb {num_tiles = 24 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<8x3x!ttcore.tile<32x32, bf16>>
+    func.return %view : tensor<8x3x!ttcore.tile<32x32, bf16>>
+  }
+}
+
+// -----
+
+// cb_pop with num_tiles exceeding CB capacity (across all blocks).
+module {
+  func.func @cb_pop_num_tiles_exceeds_capacity(%cb: !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>) attributes {ttl.kernel_thread = #ttkernel.thread<noc>} {
+    // expected-error @below {{'ttl.cb_pop' op num_tiles (24) exceeds DFB capacity (18)}}
+    ttl.cb_pop %cb {num_tiles = 24 : i64} : <[3, 3], !ttcore.tile<32x32, bf16>, 2>
+    func.return
+  }
+}
+
 // tile_store tests moved to tile_store_invalid.mlir
diff --git a/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir b/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
index b144ab251..6e9f57202 100644
--- a/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
+++ b/test/ttlang/Dialect/TTL/Transforms/coalesce_dfb_acquires.mlir
@@ -268,3 +268,71 @@ func.func @matmul_style_two_cb_interleaved()
   ttl.cb_pop %cb_b : <[1, 1], !ttcore.tile<32x32, bf16>, 4>
   func.return
 }
+
+// -----
+
+// Test 9: a region-bearing op (scf.if) between two same-DFB acquires
+// terminates the candidate group, even when the region's body is empty.
+// `mayReleaseDFB` treats any op with regions as opaque because the body
+// might contain a release on the DFB.
+
+// CHECK-LABEL: func.func @region_op_between_acquires_not_coalesced
+// CHECK-NOT: num_tiles
+// CHECK-NOT: tensor.extract_slice
+func.func @region_op_between_acquires_not_coalesced(%cond: i1)
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %w0 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a0 = ttl.attach_cb %w0, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.if %cond {
+  }
+  %w1 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a1 = ttl.attach_cb %w1, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %r1 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a1, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  func.return
+}
+
+// -----
+
+// Test 10: stray `attach_cb` on an unrelated tensor between two same-DFB
+// acquires does NOT terminate the group. `attach_cb` is an SSA identity
+// erased at lowering and cannot release the DFB; `mayReleaseDFB`
+// allow-lists it explicitly.
+
+// CHECK-LABEL: func.func @attach_cb_unrelated_tensor_between_waits
+// CHECK: %[[CBIN:.+]] = ttl.bind_cb{cb_index = 0
+// CHECK: ttl.cb_wait %[[CBIN]] {num_tiles = 2 : i64}
+// CHECK-SAME: tensor<1x2x!ttcore.tile<32x32, bf16>>
+// CHECK-COUNT-2: tensor.extract_slice
+// CHECK: ttl.cb_pop %[[CBIN]] {num_tiles = 2 : i64}
+// CHECK-NOT: ttl.cb_wait
+// CHECK-NOT: ttl.cb_pop
+// CHECK: return
+func.func @attach_cb_unrelated_tensor_between_waits(
+    %unrelated: tensor<1x1x!ttcore.tile<32x32, bf16>>)
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %cb_in = ttl.bind_cb{cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb_out = ttl.bind_cb{cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %w0 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a0 = ttl.attach_cb %w0, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %stray = ttl.attach_cb %unrelated, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %w1 = ttl.cb_wait %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %a1 = ttl.attach_cb %w1, %cb_in : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %r0 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a0, %r0 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %r1 = ttl.cb_reserve %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.store %a1, %r1 : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  ttl.cb_pop %cb_in : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  ttl.cb_push %cb_out : <[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  func.return
+}