diff --git a/docs/sphinx/reference/compiler-options.md b/docs/sphinx/reference/compiler-options.md
index 8f329e75d..78662db94 100644
--- a/docs/sphinx/reference/compiler-options.md
+++ b/docs/sphinx/reference/compiler-options.md
@@ -17,6 +17,7 @@ python my_kernel.py --no-ttl-maximize-dst
 | `--ttl-block-matmul` / `--no-ttl-block-matmul` | enabled | Emit `matmul_block` (processes the full tile block atomically) instead of per-tile matmul loops. Disabling this option is not yet supported. |
 | `--ttl-auto-sync` / `--no-ttl-auto-sync` | disabled | Let the compiler insert and move DFB synchronization ops. When enabled, reserve/push may be refined to per-subblock granularity. When disabled, user-placed reserve/push is preserved as written. |
 | `--ttl-combine-pack-tiles` / `--no-ttl-combine-pack-tiles` | enabled | Combine consecutive `pack_tile` ops on the same CB with contiguous DST and CB indices into a single `pack_tile_block` call. |
+| `--ttl-strict-f32-acc` / `--no-ttl-strict-f32-acc` | disabled | Error at compile time if a `+=` accumulation loop's output block exceeds f32 DST capacity (4 tiles with double-buffering). When enabled, guarantees each accumulation step fits in a single DST section without subblocking. |
 
 ### Other Ways to Set These
 
@@ -110,24 +111,27 @@ ttlang-opt input.mlir -p 'ttl-to-ttkernel-pipeline{maximize-dst=true lower-to-em
 | `use-block-matmul` | bool | `true` | Lower matmul to block-level hardware calls (`experimental::matmul_block`). |
 | `auto-sync` | bool | `false` | Let the compiler insert and move DFB synchronization ops. |
 | `combine-pack-tiles` | bool | `true` | Combine consecutive `pack_tile` ops into `pack_tile_block`. |
+| `strict-f32-acc` | bool | `false` | Error if a `+=` accumulation loop's output block exceeds f32 DST capacity. |
 | `lower-to-emitc` | bool | `false` | Run the TTKernel-to-EmitC backend (produces C++ source). |
 
 The pipeline runs these passes in order:
 
-1. `convert-ttl-to-compute` — lower TTL elementwise tensor ops to `ttl.compute` with tile ops
-2. `ttl-set-compute-kernel-config` — set `fp32_dest_acc_en` / `dst_full_sync_en` defaults
-3. `ttl-assign-dst` — DST register allocation (linear scan with copy insertion)
-4. `ttl-subblock-compute-for-dst` — tile `ttl.compute` into DST-sized subblocks *(only if `maximize-dst=true`)*; optionally refine reserve/push to per-subblock granularity *(only if `auto-sync=true`)*
-5. `ttl-insert-tile-regs-sync` — insert math/pack thread synchronization
-6. `ttl-lower-matmul-block` — mark block-matmul computes and expand stores *(only if `use-block-matmul=true`)*
-7. `ttl-lower-to-loops` — lower `ttl.compute` to `scf.for` loops
-8. `ttl-schedule-operations` — reorder tile ops by dependency depth and kind *(only if `maximize-dst=true`)*
-9. `ttl-annotate-cb-associations` — annotate block args with CB indices
-10. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel
-11. `ttkernel-insert-inits` — insert hardware init ops before compute ops
-12. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)*
-13. Canonicalization and CSE cleanup
-14. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions`
+1. `ttl-annotate-l1-acc-loops` — detect `+=` accumulation loops and annotate for L1 packer accumulation
+2. `convert-ttl-to-compute` — lower TTL elementwise tensor ops to `ttl.compute` with tile ops
+3. `ttl-set-compute-kernel-config` — set `fp32_dest_acc_en` / `dst_full_sync_en` defaults
+4. `ttl-assign-dst` — DST register allocation (linear scan with copy insertion)
+5. `ttl-subblock-compute-for-dst` — tile `ttl.compute` into DST-sized subblocks *(only if `maximize-dst=true`)*; optionally refine reserve/push to per-subblock granularity *(only if `auto-sync=true`)*
+6. `ttl-insert-tile-regs-sync` — insert math/pack thread synchronization
+7. `ttl-lower-matmul-block` — mark block-matmul computes and expand stores *(only if `use-block-matmul=true`)*
+8. `ttl-lower-to-loops` — lower `ttl.compute` to `scf.for` loops
+9. `ttl-schedule-operations` — reorder tile ops by dependency depth and kind *(only if `maximize-dst=true`)*
+10. `ttl-annotate-cb-associations` — annotate block args with CB indices
+11. `convert-ttl-to-ttkernel` — lower TTL DMA ops to TTKernel
+12. `ttkernel-insert-inits` — insert hardware init ops before compute ops
+13. `ttkernel-insert-l1-accumulation` — insert `pack_reconfig_l1_acc` guards for `+=` and reduction loops
+14. `ttkernel-combine-pack-tiles` — combine consecutive `pack_tile` into `pack_tile_block` *(only if `combine-pack-tiles=true`)*
+15. Canonicalization and CSE cleanup
+16. *(if `lower-to-emitc=true`)* `lower-affine`, `convert-ttkernel-to-emitc`, `emitc-form-expressions`
 
 ### Individual Pass Options
 
@@ -169,6 +173,7 @@ Partition `ttl.compute` into DST-sized subblocks.
 | Option | Type | Default | Description |
 |---|---|---|---|
 | `subblock-sync` | bool | `false` | Refine DFB reserve/push to per-subblock granularity, enabling `pack_tile_block` for contiguous subblocks. When disabled, user-placed reserve/push is preserved. |
+| `strict-f32-acc` | bool | `false` | Error if a `+=` accumulation loop with non-f32 output requires subblocking. Subblocking reduces accumulation precision because bf16 L1 intermediates truncate f32 DST values. |
 
 ```bash
 ttlang-opt input.mlir -p 'func.func(ttl-subblock-compute-for-dst{subblock-sync=true})'
diff --git a/include/ttlang/Dialect/TTL/IR/TTL.h b/include/ttlang/Dialect/TTL/IR/TTL.h
index 72ef23f5e..6743fbb1d 100644
--- a/include/ttlang/Dialect/TTL/IR/TTL.h
+++ b/include/ttlang/Dialect/TTL/IR/TTL.h
@@ -64,6 +64,12 @@ constexpr llvm::StringLiteral kTileLoopStrideAttrName("ttl.tile_loop_stride");
 /// ComputeOp is lowered to loops.
 constexpr llvm::StringLiteral kReductionLoopAttrName("ttl.reduction_loop");
 
+/// Marks a user-written scf.for as an L1 accumulation loop. Each iteration
+/// packs to the same CB slot; pack_reconfig_l1_acc makes subsequent
+/// iterations additive. Distinct from kReductionLoopAttrName which marks
+/// compiler-generated reduction loops.
+constexpr llvm::StringLiteral kL1AccLoopAttrName("ttl.l1_acc_loop");
+
 /// Output CB index on tile ops that need it for init insertion.
 constexpr llvm::StringLiteral
     kBcastOutputCBIndexAttrName("ttl.bcast_output_cb_index");
diff --git a/include/ttlang/Dialect/TTL/IR/TTLOps.td b/include/ttlang/Dialect/TTL/IR/TTLOps.td
index 93dfa2da8..5ad4c87a3 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOps.td
+++ b/include/ttlang/Dialect/TTL/IR/TTLOps.td
@@ -1039,8 +1039,13 @@ def TTL_StoreOp : TTL_Op<"store", [MemoryEffects<[MemWrite]>]> {
     `ttl.store` represents the user's intent to store a computed tensor into
     an output circular buffer.
 
-    Emitted by Python's `o.store(result)`. During `convert-ttl-to-compute`,
-    this op is transformed into a `ttl.tile_store` inside the compute body.
+    Emitted by Python's `o.store(result)` (overwrite) or `o += result`
+    (accumulate). During `convert-ttl-to-compute`, this op is transformed
+    into a `ttl.tile_store` inside the compute body.
+
+    When `accumulate` is set, the enclosing loop is annotated for L1
+    packer accumulation so that each iteration adds to the existing L1
+    value instead of overwriting.
 
     Example:
     ```mlir
@@ -1048,11 +1053,14 @@ def TTL_StoreOp : TTL_Op<"store", [MemoryEffects<[MemWrite]>]> {
     %result = ttl.add %a, %b : ...
     ttl.store %result, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>,
                                   tensor<1x1x!ttcore.tile<32x32, bf16>>
+    // Accumulating store (from +=):
+    ttl.store %result, %reserve {accumulate} : ...
     ```
   }];
   let arguments = (ins
     AnyRankedTensor:$tensor,
-    AnyRankedTensor:$view
+    AnyRankedTensor:$view,
+    OptionalAttr<UnitAttr>:$accumulate
   );
   let assemblyFormat = "$tensor `,` $view attr-dict `:` type($tensor) `,` type($view)";
   let hasVerifier = 1;
diff --git a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
index 94ab10d59..2f2b6763a 100644
--- a/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
+++ b/include/ttlang/Dialect/TTL/IR/TTLOpsUtils.h
@@ -12,9 +12,11 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 #include "llvm/ADT/SetVector.h"
+
 #include <cstdint>
 #include <optional>
 
@@ -427,6 +429,24 @@ inline TileOp createTileOpWithPlaceholderDstIndex(OpBuilder &builder,
   return tileOp;
 }
 
+/// Collect the CB values targeted by pack_tile ops inside a loop.
+llvm::SmallDenseSet<Value, 2> getPackTileCBs(scf::ForOp loop);
+
+/// Returns true if two loops share any pack_tile CB target.
+bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB);
+
+/// A group of consecutive sibling loops that pack to the same output CB.
+struct LoopGroup {
+  scf::ForOp rootLoop;
+  SmallVector<scf::ForOp> loops;
+  Operation *scopeEnd = nullptr;
+};
+
+/// Collect groups of annotated sibling loops that share a pack CB target.
+SmallVector<LoopGroup> collectLoopGroups(
+    ArrayRef<scf::ForOp> l1AccLoops,
+    const llvm::SmallDenseMap<Operation *, Operation *> &enablePointPerLoop);
+
 } // namespace mlir::tt::ttl
 
 #endif // TTLANG_DIALECT_TTL_IR_TTLOPSUTILS_H
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 71035366e..6a92a0ebe 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -30,13 +30,24 @@ def TTKernelInsertL1Accumulation
     : Pass<"ttkernel-insert-l1-accumulation", "::mlir::ModuleOp"> {
   let summary = "Insert L1 accumulation guards for reduction loops";
   let description = [{
-    Inserts `pack_reconfig_l1_acc` guards inside reduction loops so that
-    pack operations accumulate into L1 instead of overwriting. On the first
-    iteration of a reduction loop, pack writes normally. On subsequent
-    iterations, the packer is reconfigured to add to the existing L1 value.
+    Inserts `pack_reconfig_l1_acc` guards so that pack operations inside
+    annotated loops accumulate into L1 instead of overwriting. Max-reduce
+    loops are excluded (max is not additive).
+
+    Loops are grouped into accumulation scopes: consecutive sibling loops
+    that pack to the same CB share a single disable pair. Nested annotated
+    loops are folded into the outermost ancestor. Each group gets:
+    - `pack_reconfig_l1_acc(0)` before the first loop.
+    - `if (iv == lb) pack_reconfig_l1_acc(1)` after each loop's first
+      iteration (the L1 acc state persists across `tile_regs` boundaries).
+    - For the second+ loop in a group, an enable before the loop to re-enable
+      L1 acc after init ops that may reset packer state.
+    - `pack_reconfig_l1_acc(0)` after the last `cb_push_back`.
+
+    Loops are identified by the `ttl.l1_acc_loop` (user-written) or
+    `ttl.reduction_loop` (compiler-generated) attributes on `scf.for`
+    ops, with `ttl.l1_acc_loop` taking precedence.
 
-    Reduction loops are identified by the `ttl.reduction_loop` attribute
-    on `scf.for` ops.
   }];
 
   let dependentDialects = [
@@ -46,6 +57,21 @@ def TTKernelInsertL1Accumulation
   ];
 }
 
+def TTLAnnotateL1AccLoops
+    : Pass<"ttl-annotate-l1-acc-loops", "::mlir::func::FuncOp"> {
+  let summary = "Annotate user-written scf.for loops for L1 accumulation";
+  let description = [{
+    Detects user-written `scf.for` loops containing `ttl.store` ops with
+    the `accumulate` attribute (emitted by the `+=` operator) and annotates
+    them with `ttl.l1_acc_loop`. This enables `TTKernelInsertL1Accumulation`
+    to insert `pack_reconfig_l1_acc` guards so that packs accumulate across
+    iterations instead of overwriting.
+  }];
+  let dependentDialects = [
+    "::mlir::scf::SCFDialect"
+  ];
+}
+
 def TTLConvertTTLToCompute
     : Pass<"convert-ttl-to-compute", "::mlir::func::FuncOp"> {
   let summary = "Lower TTL elementwise tensor ops to ttl.compute with tile ops";
@@ -184,7 +210,11 @@ def TTLSubblockComputeForDST
     Option<"subblockSync", "subblock-sync", "bool", "false",
            "Refine DFB reserve/push to per-subblock granularity, enabling "
            "pack_tile_block for contiguous subblocks. When disabled (default), "
-           "user-placed reserve/push is preserved.">
+           "user-placed reserve/push is preserved.">,
+    Option<"strictF32Acc", "strict-f32-acc", "bool", "false",
+           "Error if a user-written accumulation loop (+=) with non-f32 "
+           "output requires subblocking. Subblocking reduces accumulation "
+           "precision because bf16 L1 intermediates are narrower than f32 DST.">
   ];
 
   let dependentDialects = [
diff --git a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
index 18b4c8a4d..0014affd7 100644
--- a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
+++ b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
@@ -44,6 +44,10 @@ struct TTLToTTKernelPipelineOptions
       *this, "reduce-full-fp32",
       llvm::cl::desc("Enable FP32 accumulation for reduce operations."),
       llvm::cl::init(true)};
+  Option<bool> strictF32Acc{
+      *this, "strict-f32-acc",
+      llvm::cl::desc("Error if accumulation output exceeds f32 DST capacity."),
+      llvm::cl::init(false)};
 };
 
 void createTTLToTTKernelPipeline(mlir::OpPassManager &pm,
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
index 1fa6a3503..22eb9f2cc 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelCombinePackTiles.cpp
@@ -11,12 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ttlang/Dialect/TTL/IR/TTL.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 #include "ttmlir/Dialect/TTKernel/IR/TTKernel.h"
 #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 
@@ -63,6 +65,19 @@ struct TTKernelCombinePackTilesPass
 
   void runOnOperation() override {
     getOperation().walk([](Block *block) {
+      // Skip blocks inside reduction loops: pack_tile_block is
+      // incompatible with L1 accumulation (pack_reconfig_l1_acc).
+      // L1 acc requires individual pack_tile calls so each K iteration
+      // can independently add to the existing L1 value.
+      for (Operation *parent = block->getParentOp(); parent;
+           parent = parent->getParentOp()) {
+        if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
+          if (forOp->hasAttr(kReductionLoopAttrName) ||
+              forOp->hasAttr(kL1AccLoopAttrName)) {
+            return;
+          }
+        }
+      }
       // Collect all combinable runs first, then replace them. Replacing
       // during iteration would invalidate the block's operation list.
       SmallVector<SmallVector<ttk::PackTileOp>> runs;
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
index fb63e3e4f..075bc1184 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertInits.cpp
@@ -23,6 +23,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 
 #include "ttmlir/Dialect/TTKernel/IR/TTKernel.h"
@@ -348,18 +349,18 @@ analyzeSyncRegion(ttk::TileRegsAcquireOp acquireOp, Value &inputCB,
 }
 
 /// Find the outermost enclosing insertion point by walking up through
-/// compiler-generated loops (marked with ttl.tile_loop_stride or
-/// ttl.subblock_loop_stride). By construction, these loops iterate over tiles
-/// within a single ttl.compute whose input/output CBs are fixed, so the
-/// CB configuration is invariant across iterations and hoisting is safe.
-/// Stops at unmarked loops to avoid hoisting past user loops that could
-/// contain multiple sync regions with different CB configurations.
+/// loops with invariant CB configurations: compiler-generated tile/subblock
+/// loops (ttl.tile_loop_stride, ttl.subblock_loop_stride) and L1
+/// accumulation loops (ttl.l1_acc_loop). All use fixed CBs across
+/// iterations, so init hoisting is safe. Stops at unmarked loops to avoid
+/// hoisting past user loops with varying CB configurations.
 static Operation *hoistAboveCompilerLoops(Operation *op) {
   Operation *insertBefore = op;
   while (auto *parentOp = insertBefore->getParentOp()) {
     if (isa<scf::ForOp>(parentOp) &&
         (parentOp->hasAttr(kTileLoopStrideAttrName) ||
-         parentOp->hasAttr(kSubblockLoopStrideAttrName))) {
+         parentOp->hasAttr(kSubblockLoopStrideAttrName) ||
+         parentOp->hasAttr(kL1AccLoopAttrName))) {
       insertBefore = parentOp;
     } else {
       break;
@@ -403,8 +404,36 @@ static LogicalResult insertCommonInits(ModuleOp moduleOp) {
       inputCB = outputCB;
     }
 
-    if (analysis.hasMatmul && in0CB && in1CB) {
-      // mm_block_init configures UNPACK + MATH + PACK for matmul_block.
+    // When a matmul init is hoisted before a loop that shares an
+    // output CB with a preceding sibling annotated loop, use
+    // init_short. The full init reconfigures the PACK pipeline
+    // which clobbers packer state (including L1 acc on Wormhole).
+    // init_short only reconfigures UNPACK+MATH.
+    bool useInitShort = false;
+    if (analysis.hasMatmul) {
+      if (auto forOp = dyn_cast<scf::ForOp>(insertBefore)) {
+        if (forOp->hasAttr(kL1AccLoopAttrName) ||
+            forOp->hasAttr(kReductionLoopAttrName)) {
+          for (Operation *prev = forOp->getPrevNode(); prev;
+               prev = prev->getPrevNode()) {
+            if (auto prevFor = dyn_cast<scf::ForOp>(prev)) {
+              if ((prevFor->hasAttr(kL1AccLoopAttrName) ||
+                   prevFor->hasAttr(kReductionLoopAttrName)) &&
+                  sharePackCB(prevFor, forOp)) {
+                useInitShort = true;
+              }
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    if (analysis.hasMatmul && in0CB && in1CB && useInitShort) {
+      ttk::MatmulBlockInitShortOp::create(
+          builder, loc, in0CB, in1CB, analysis.matmulTranspose,
+          analysis.matmulCt, analysis.matmulRt, analysis.matmulKt);
+    } else if (analysis.hasMatmul && in0CB && in1CB) {
       ttk::MatmulBlockInitOp::create(
           builder, loc, in0CB, in1CB, outputCB, analysis.matmulTranspose,
           analysis.matmulCt, analysis.matmulRt, analysis.matmulKt);
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
index 3c40a1872..21a029bce 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelInsertL1Accumulation.cpp
@@ -5,17 +5,9 @@
 //===----------------------------------------------------------------------===//
 // TTKernel Insert L1 Accumulation
 //===----------------------------------------------------------------------===//
-//
-// Inserts pack_reconfig_l1_acc guards inside reduction loops. When a
-// tile_regs_acquire is inside a reduction loop, the packer must switch
-// to L1 accumulation mode from the second iteration onwards so that
-// pack_tile adds to the existing L1 value instead of overwriting.
-//
-// See docs/development/AccumulatingComputeLowering.md for design details.
-//
-//===----------------------------------------------------------------------===//
 
 #include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 
 #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
@@ -35,31 +27,24 @@ namespace ttk = mlir::tt::ttkernel;
 
 namespace {
 
-/// Find the innermost enclosing reduction loop for an operation.
-static scf::ForOp findInnermostReductionLoop(Operation *op) {
+/// Find the innermost enclosing L1 acc or reduction loop.
+/// User-written += loops (kL1AccLoopAttrName) take precedence over
+/// compiler-generated reduction loops because the user-specified loop
+/// structure determines the accumulation granularity.
+static scf::ForOp findL1AccLoop(Operation *op) {
+  scf::ForOp reductionFallback;
   for (Operation *parent = op->getParentOp(); parent;
        parent = parent->getParentOp()) {
     if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
-      if (forOp->hasAttr(kReductionLoopAttrName)) {
+      if (forOp->hasAttr(kL1AccLoopAttrName)) {
         return forOp;
       }
-    }
-  }
-  return nullptr;
-}
-
-/// Find the outermost enclosing reduction loop for an operation.
-static scf::ForOp findOutermostReductionLoop(Operation *op) {
-  scf::ForOp outermost;
-  for (Operation *parent = op->getParentOp(); parent;
-       parent = parent->getParentOp()) {
-    if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
-      if (forOp->hasAttr(kReductionLoopAttrName)) {
-        outermost = forOp;
+      if (forOp->hasAttr(kReductionLoopAttrName) && !reductionFallback) {
+        reductionFallback = forOp;
       }
     }
   }
-  return outermost;
+  return reductionFallback;
 }
 
 struct TTKernelInsertL1AccumulationPass
@@ -68,53 +53,103 @@ struct TTKernelInsertL1AccumulationPass
   void runOnOperation() override {
     auto moduleOp = getOperation();
 
-    // Collect all acquire ops inside reduction loops. Collecting first
-    // avoids invalidation issues from modifying IR during iteration.
-    SmallVector<std::pair<ttk::TileRegsAcquireOp, scf::ForOp>> targets;
+    // Walk from TileRegsAcquireOp upward to find annotated loops —
+    // only loops with actual pack activity need L1 acc guards.
+    SmallVector<scf::ForOp> l1AccLoops;
+    llvm::SmallDenseSet<Operation *> visitedLoops;
     moduleOp->walk([&](ttk::TileRegsAcquireOp acquireOp) {
-      auto reductionLoop = findInnermostReductionLoop(acquireOp);
-      if (!reductionLoop) {
+      auto loop = findL1AccLoop(acquireOp);
+      if (!loop || !visitedLoops.insert(loop).second) {
+        return;
+      }
+      // Skip if this pass already ran (idempotency).
+      bool alreadyProcessed = false;
+      loop->walk([&](ttk::PackReconfigL1AccOp) {
+        alreadyProcessed = true;
+        return WalkResult::interrupt();
+      });
+      if (alreadyProcessed) {
         return;
       }
-      // L1 accumulation uses additive packing -- only valid for sum
-      // reductions. Max reductions require DST accumulation (Phase 2)
-      // where the hardware max operation accumulates across iterations.
+      // Max reduce is not additive — L1 acc would corrupt the running max.
       bool hasMaxReduce = false;
-      reductionLoop->walk([&](ttk::ReduceTileOp reduceOp) {
+      loop->walk([&](ttk::ReduceTileOp reduceOp) {
         if (reduceOp.getReduceType() == ttk::ReduceType::Max) {
           hasMaxReduce = true;
         }
       });
       if (!hasMaxReduce) {
-        targets.emplace_back(acquireOp, reductionLoop);
+        l1AccLoops.push_back(loop);
       }
     });
 
-    llvm::SmallDenseSet<Operation *> disabledLoops;
-    for (auto [acquireOp, reductionLoop] : targets) {
-      OpBuilder builder(acquireOp->getContext());
-      builder.setInsertionPointAfter(acquireOp);
-      Location loc = acquireOp.getLoc();
-
-      // Guard: if (loop_iv != lower_bound) pack_reconfig_l1_acc(1)
-      Value loopIV = reductionLoop.getInductionVar();
-      Value loopLB = reductionLoop.getLowerBound();
-      Value notFirstIter = arith::CmpIOp::create(
-          builder, loc, arith::CmpIPredicate::ne, loopIV, loopLB);
-      auto ifOp = scf::IfOp::create(builder, loc, notFirstIter);
-      builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-      Value enableFlag = arith::ConstantOp::create(
-          builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
-      ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
-
-      // Disable L1 accumulation after the outermost reduction loop.
-      auto outermostLoop = findOutermostReductionLoop(acquireOp);
-      if (disabledLoops.insert(outermostLoop).second) {
-        builder.setInsertionPointAfter(outermostLoop);
-        Value disableFlag = arith::ConstantOp::create(
-            builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(0));
-        ttk::PackReconfigL1AccOp::create(builder, loc, disableFlag);
+    // The enable guard goes after the last pack in the first iteration.
+    // Packs live inside tile_regs_acquire/release sections, which may be
+    // nested in subblock loops. The top-level ancestor of the last release
+    // in the loop body is the correct insertion point.
+    llvm::SmallDenseMap<Operation *, Operation *> l1AccEnablePoint;
+    for (auto loop : l1AccLoops) {
+      Operation *lastReleaseAncestor = nullptr;
+      loop->walk([&](ttk::TileRegsReleaseOp releaseOp) {
+        if (auto *ancestor =
+                loop.getBody()->findAncestorOpInBlock(*releaseOp)) {
+          lastReleaseAncestor = ancestor;
+        }
+      });
+      if (lastReleaseAncestor) {
+        l1AccEnablePoint[loop.getOperation()] = lastReleaseAncestor;
+      }
+    }
+
+    // Group consecutive sibling loops that pack to the same CB.
+    auto groups = collectLoopGroups(l1AccLoops, l1AccEnablePoint);
+
+    // Emit guards per group.
+    for (auto &group : groups) {
+      OpBuilder builder(group.rootLoop->getContext());
+      Location disableLoc = group.rootLoop->getLoc();
+
+      // Disable before the group.
+      builder.setInsertionPoint(group.rootLoop);
+      Value disableFlag =
+          arith::ConstantOp::create(builder, disableLoc, builder.getI32Type(),
+                                    builder.getI32IntegerAttr(0));
+      ttk::PackReconfigL1AccOp::create(builder, disableLoc, disableFlag);
+
+      for (size_t idx = 0; idx < group.loops.size(); ++idx) {
+        scf::ForOp loop = group.loops[idx];
+        auto iter = l1AccEnablePoint.find(loop.getOperation());
+        if (iter == l1AccEnablePoint.end()) {
+          continue;
+        }
+
+        // For the 2nd+ loop in a group, re-enable L1 acc before
+        // the loop because init ops between loops reset packer state.
+        if (idx > 0) {
+          builder.setInsertionPoint(loop);
+          Value enableFlag = arith::ConstantOp::create(
+              builder, loop->getLoc(), builder.getI32Type(),
+              builder.getI32IntegerAttr(1));
+          ttk::PackReconfigL1AccOp::create(builder, loop->getLoc(), enableFlag);
+        }
+
+        // Conditional enable after the first iteration's last pack.
+        Operation *afterOp = iter->second;
+        Location loc = afterOp->getLoc();
+        builder.setInsertionPointAfter(afterOp);
+        Value firstIter =
+            arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::eq,
+                                  loop.getInductionVar(), loop.getLowerBound());
+        auto ifOp = scf::IfOp::create(builder, loc, firstIter);
+        builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+        Value enableFlag = arith::ConstantOp::create(
+            builder, loc, builder.getI32Type(), builder.getI32IntegerAttr(1));
+        ttk::PackReconfigL1AccOp::create(builder, loc, enableFlag);
       }
+
+      // Disable after the scope end.
+      builder.setInsertionPointAfter(group.scopeEnd);
+      ttk::PackReconfigL1AccOp::create(builder, disableLoc, disableFlag);
     }
   }
 };
diff --git a/lib/Dialect/TTL/IR/TTLOpsUtils.cpp b/lib/Dialect/TTL/IR/TTLOpsUtils.cpp
index de5bd5a2c..3e4993885 100644
--- a/lib/Dialect/TTL/IR/TTLOpsUtils.cpp
+++ b/lib/Dialect/TTL/IR/TTLOpsUtils.cpp
@@ -4,6 +4,8 @@
 
 #include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 
+#include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
+
 namespace mlir::tt::ttl {
 
 //===----------------------------------------------------------------------===//
@@ -132,4 +134,107 @@ llvm::StringRef describeTraceFailure(TraceFailureReason reason) {
   llvm_unreachable("unhandled TraceFailureReason");
 }
 
+//===----------------------------------------------------------------------===//
+// Loop grouping for L1 accumulation and init selection
+//===----------------------------------------------------------------------===//
+
+namespace ttk = mlir::tt::ttkernel;
+
+llvm::SmallDenseSet<Value, 2> getPackTileCBs(scf::ForOp loop) {
+  llvm::SmallDenseSet<Value, 2> cbs;
+  loop->walk([&](ttk::PackTileOp packOp) { cbs.insert(packOp.getOutCb()); });
+  return cbs;
+}
+
+bool sharePackCB(scf::ForOp loopA, scf::ForOp loopB) {
+  auto cbsA = getPackTileCBs(loopA);
+  auto cbsB = getPackTileCBs(loopB);
+  for (auto cb : cbsA) {
+    if (cbsB.contains(cb)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+SmallVector<LoopGroup> collectLoopGroups(
+    ArrayRef<scf::ForOp> l1AccLoops,
+    const llvm::SmallDenseMap<Operation *, Operation *> &enablePointPerLoop) {
+  // Find the outermost annotated ancestor of a loop.
+  auto findRoot = [](scf::ForOp loop) -> scf::ForOp {
+    scf::ForOp outermost = loop;
+    for (Operation *parent = loop->getParentOp(); parent;
+         parent = parent->getParentOp()) {
+      if (auto parentFor = dyn_cast<scf::ForOp>(parent)) {
+        if (parentFor->hasAttr(kL1AccLoopAttrName) ||
+            parentFor->hasAttr(kReductionLoopAttrName)) {
+          outermost = parentFor;
+        }
+      }
+    }
+    return outermost;
+  };
+
+  SmallVector<LoopGroup> groups;
+  llvm::SmallDenseSet<Operation *> assigned;
+
+  for (auto loop : l1AccLoops) {
+    if (!enablePointPerLoop.count(loop.getOperation())) {
+      continue;
+    }
+    if (assigned.contains(loop.getOperation())) {
+      continue;
+    }
+
+    scf::ForOp rootLoop = findRoot(loop);
+
+    LoopGroup group;
+    group.rootLoop = rootLoop;
+    group.loops.push_back(loop);
+    assigned.insert(loop.getOperation());
+
+    // Collect sibling annotated loops that share a pack CB target.
+    // sharePackCB walks recursively, so for nested loops (rootLoop
+    // wrapping loop), it finds pack_tile ops inside the inner loop.
+    for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) {
+      if (isa<ttk::CBPushBackOp>(op)) {
+        break;
+      }
+      auto sibling = dyn_cast<scf::ForOp>(op);
+      if (!sibling) {
+        continue;
+      }
+      if (!sibling->hasAttr(kL1AccLoopAttrName) &&
+          !sibling->hasAttr(kReductionLoopAttrName)) {
+        break;
+      }
+      if (!sharePackCB(rootLoop, sibling)) {
+        break;
+      }
+      group.loops.push_back(sibling);
+      assigned.insert(sibling.getOperation());
+    }
+
+    // Find scope end: scan forward from rootLoop past grouped siblings,
+    // init ops between them, and trailing cb_push_back ops. Only stop
+    // at a non-grouped ForOp or a cb_reserve_back.
+    group.scopeEnd = rootLoop;
+    for (Operation *op = rootLoop->getNextNode(); op; op = op->getNextNode()) {
+      if (isa<ttk::CBPushBackOp>(op)) {
+        group.scopeEnd = op;
+      } else if (isa<ttk::CBReserveBackOp>(op)) {
+        break;
+      } else if (auto forOp = dyn_cast<scf::ForOp>(op)) {
+        if (!assigned.contains(forOp)) {
+          break;
+        }
+      }
+    }
+
+    groups.push_back(std::move(group));
+  }
+
+  return groups;
+}
+
 } // namespace mlir::tt::ttl
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index 81be3dad8..805a3c951 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -18,6 +18,7 @@ namespace mlir::tt::ttl {
 
 void createTTLToTTKernelPipeline(OpPassManager &pm,
                                  const TTLToTTKernelPipelineOptions &options) {
+  pm.addPass(createTTLAnnotateL1AccLoops());
   pm.addPass(createTTLConvertTTLToCompute());
   {
     TTLSetComputeKernelConfigOptions configOpts;
@@ -32,6 +33,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
   if (options.maximizeDST) {
     TTLSubblockComputeForDSTOptions subblockOpts;
     subblockOpts.subblockSync = options.autoSync;
+    subblockOpts.strictF32Acc = options.strictF32Acc;
     pm.addPass(createTTLSubblockComputeForDST(subblockOpts));
   }
   if (options.useBlockMatmul) {
diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt
index a5e2b3fb9..061f72b0b 100644
--- a/lib/Dialect/TTL/Transforms/CMakeLists.txt
+++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_dialect_library(TTLangTTLTransforms
   LowerDPrintToEmitC.cpp
   LowerSignpostToEmitC.cpp
   TTLAnnotateCBAssociations.cpp
+  TTLAnnotateL1AccLoops.cpp
   TTLDumpCBFlowGraph.cpp
   TTLLowerMatmulBlock.cpp
   TTLAssignDST.cpp
diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp
index 97aa10d3e..1625fecaa 100644
--- a/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp
+++ b/lib/Dialect/TTL/Transforms/ConvertTTLComputeToSCF.cpp
@@ -373,14 +373,19 @@ struct LowerComputeToLoops : OpRewritePattern<ComputeOp> {
                               })
                               .wasInterrupted();
 
-    assert(!(isSubblocked && isAccumulating) &&
-           "SubblockComputeForDST must skip accumulating computes");
-
     SmallVector<StringAttr> iterTypes;
     for (Attribute attr : op.getIteratorTypes()) {
       iterTypes.push_back(mlir::cast<StringAttr>(attr));
     }
 
+    // Subblocked accumulating computes (matmul K>1 with output > DST) are
+    // handled by LowerMatmulBlock, which generates the K reduction loop
+    // with per-K DstSections. By the time LowerToLoops runs, the ComputeOp
+    // has been replaced. This assert catches unexpected cases.
+    assert(!(isSubblocked && isAccumulating) &&
+           "subblocked accumulating computes should be handled by "
+           "LowerMatmulBlock before LowerToLoops");
+
     // Side-effect-only loops: no iter_args, no tensor.insert, no scf.yield
     // with tensor values. Stores are explicit side effects (tile_store).
     bool processingFailed = false;
diff --git a/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
new file mode 100644
index 000000000..20972e5ce
--- /dev/null
+++ b/lib/Dialect/TTL/Transforms/TTLAnnotateL1AccLoops.cpp
@@ -0,0 +1,105 @@
+// SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+//===----------------------------------------------------------------------===//
+// TTL Annotate L1 Acc Loops
+//===----------------------------------------------------------------------===//
+//
+// Detects user-written scf.for loops containing accumulating stores
+// (ttl.store with the {accumulate} attribute, emitted by +=) and annotates
+// them with kL1AccLoopAttrName for L1 packer accumulation.
+//
+// Uses dominance: for each accumulating store, verifies the destination
+// cb_reserve properly dominates the enclosing loop (the reserve is outside
+// the loop, so the same L1 slot persists across iterations).
+//
+// TTKernelInsertL1Accumulation uses the annotated loops to find enable
+// points, and groups consecutive sibling loops by shared pack CB targets
+// to determine the accumulation scope for disable guards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ttlang/Dialect/TTL/IR/TTL.h"
+#include "ttlang/Dialect/TTL/IR/TTLOps.h"
+#include "ttlang/Dialect/TTL/Passes.h"
+
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Dominance.h"
+
+#define DEBUG_TYPE "ttl-annotate-l1-acc-loops"
+
+namespace mlir::tt::ttl {
+
+#define GEN_PASS_DEF_TTLANNOTATEL1ACCLOOPS
+#include "ttlang/Dialect/TTL/Passes.h.inc"
+
+namespace {
+
+/// Returns true if the loop carries any ttl.* annotation, indicating it
+/// was generated or already processed by a compiler pass.
+static bool hasCompilerAnnotation(scf::ForOp loop) {
+  for (auto attr : loop->getAttrs()) {
+    if (attr.getName().getValue().starts_with("ttl.")) {
+      return true;
+    }
+  }
+  return false;
+}
+
+struct TTLAnnotateL1AccLoopsPass
+    : public impl::TTLAnnotateL1AccLoopsBase<TTLAnnotateL1AccLoopsPass> {
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+    DominanceInfo domInfo(func);
+    bool failed = false;
+
+    func.walk([&](StoreOp store) {
+      if (!store.getAccumulate()) {
+        return;
+      }
+
+      auto enclosingLoop = store->getParentOfType<scf::ForOp>();
+      if (!enclosingLoop) {
+        return;
+      }
+      if (hasCompilerAnnotation(enclosingLoop)) {
+        return;
+      }
+
+      // Conditional += is not supported: the L1 acc enable guard is conditional
+      // based on the loop induction variable, not on whether a pack actually
+      // executed. If the condition is false on iteration 0, subsequent
+      // iterations accumulate into uninitialized L1.
+      if (store->getParentOp() != enclosingLoop.getOperation()) {
+        store->emitError(
+            "+= inside a conditional is not supported (#504); move "
+            "the condition outside the accumulation loop or use a "
+            "separate loop for the conditional path");
+        failed = true;
+        return;
+      }
+
+      // The reserve must properly dominate the enclosing loop: the
+      // reserve is outside the loop so the same L1 slot persists across
+      // iterations. If the reserve is inside the loop, each iteration
+      // gets a fresh slot and accumulation is meaningless.
+      Value reserve = store.getView();
+      Operation *reserveOp = reserve.getDefiningOp();
+      if (reserveOp && !domInfo.properlyDominates(reserveOp, enclosingLoop)) {
+        return;
+      }
+
+      enclosingLoop->setAttr(kL1AccLoopAttrName,
+                             UnitAttr::get(enclosingLoop->getContext()));
+    });
+
+    if (failed) {
+      signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+
+} // namespace mlir::tt::ttl
diff --git a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
index c599257ec..0d5c21add 100644
--- a/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
+++ b/lib/Dialect/TTL/Transforms/TTLSubblockComputeForDST.cpp
@@ -17,6 +17,8 @@
 #include "ttlang/Dialect/TTL/IR/TTLOpsUtils.h"
 #include "ttlang/Dialect/TTL/Passes.h"
 
+#include "ttmlir/Dialect/TTCore/IR/TTCoreOpsTypes.h"
+
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -103,24 +105,32 @@ struct TTLSubblockComputeForDSTPass
     func::FuncOp funcOp = getOperation();
 
     // Collect compute ops to subblock (avoid modifying while walking).
-    // Skip accumulating computes -- subblocking would break reduction
-    // accumulation by splitting the reduction loop across subblocks.
+    // Skip non-matmul accumulating computes (e.g., reduce_tile) because
+    // subblocking would break their reduction accumulation semantics.
+    // Matmul accumulating computes are safe: K accumulates in-place in
+    // DST without consuming DST slots (effectiveTiles already excludes
+    // reduction dims for matmul -- see hasMatmulBlock logic below).
     SmallVector<ComputeOp> opsToSubblock;
     funcOp.walk([&](ComputeOp computeOp) {
       auto unrollAttr =
           computeOp->getAttrOfType<IntegerAttr>(kUnrollFactorAttrName);
       if (unrollAttr && unrollAttr.getInt() > 1) {
         bool hasAccumulating = false;
+        bool hasMatmulBlock = false;
         computeOp.getBody().walk([&](Operation *op) {
           if (op->hasTrait<TTLAccumulatingOpTrait>()) {
             hasAccumulating = true;
-            return WalkResult::interrupt();
           }
-          return WalkResult::advance();
+          if (isa<TileMatmulBlockOp>(op)) {
+            hasMatmulBlock = true;
+          }
+          return (hasAccumulating && hasMatmulBlock) ? WalkResult::interrupt()
+                                                     : WalkResult::advance();
         });
-        if (!hasAccumulating) {
-          opsToSubblock.push_back(computeOp);
+        if (hasAccumulating && !hasMatmulBlock) {
+          return;
         }
+        opsToSubblock.push_back(computeOp);
       }
     });
 
@@ -181,6 +191,38 @@ struct TTLSubblockComputeForDSTPass
       return success();
     }
 
+    // When --strict-f32-acc is set and this compute is inside a user-written
+    // accumulation loop (+=), error if the output type is not f32. Subblocking
+    // a non-f32 accumulating compute reduces precision: the f32 DST partial
+    // sums are truncated to bf16 when packed to L1 per K step.
+    // TODO(ttl): Instead of erroring, allocate an f32 L1 temporary and
+    // emit a cast to bf16 after the loop, giving full f32 precision at
+    // the cost of 2x L1 per tile.
+    if (strictF32Acc) {
+      bool insideL1AccLoop = false;
+      for (Operation *parent = computeOp->getParentOp(); parent;
+           parent = parent->getParentOp()) {
+        if (auto forOp = dyn_cast<scf::ForOp>(parent)) {
+          if (forOp->hasAttr(kL1AccLoopAttrName)) {
+            insideL1AccLoop = true;
+            break;
+          }
+        }
+      }
+      if (insideL1AccLoop) {
+        auto outType =
+            cast<RankedTensorType>(computeOp.getDpsInits()[0].getType());
+        auto tileType = cast<ttcore::TileType>(outType.getElementType());
+        if (tileType.getDataType() != ttcore::DataType::Float32) {
+          return computeOp.emitError(
+              "subblocking accumulation loop reduces precision: bf16 L1 "
+              "intermediates truncate f32 DST partial sums per K step; "
+              "reduce block dimensions to fit in f32 DST, use f32 output "
+              "type, or compile without --ttl-strict-f32-acc");
+        }
+      }
+    }
+
     // Only parallel dimensions are candidates for subblocking; reduction
     // dimensions must be fully included in each subblock. Matmul K is
     // excluded because it accumulates in-place (see hasMatmulBlock above).
@@ -210,8 +252,10 @@ struct TTLSubblockComputeForDSTPass
     SmallVector<int64_t> parallelSubblockSizes =
         computeMultiDimSubblockSizes(parallelDimSizes, parallelBudget);
 
-    // Expand back to full-rank subblock sizes: reduction dims get their full
-    // size, parallel dims get the computed subblock size.
+    // Reduction dims keep their full size. For matmul, K accumulates
+    // in-place in DST via matmul_block(kt=K_block). L1 accumulation
+    // across user-managed outer K iterations is handled separately by
+    // TTKernelInsertL1Accumulation (kL1AccLoopAttrName).
     SmallVector<int64_t> subblockSizes(rank);
     int64_t parallelIdx = 0;
     for (int64_t d = 0; d < rank; ++d) {
diff --git a/python/pykernel/_src/base_ast.py b/python/pykernel/_src/base_ast.py
index afc6767d9..754d16590 100644
--- a/python/pykernel/_src/base_ast.py
+++ b/python/pykernel/_src/base_ast.py
@@ -83,6 +83,10 @@ def _var_exists(self, var_name):
                 return sym_table
         return {}
 
+    def _set_var(self, var_name, value):
+        """Bind a variable in the current (innermost) scope."""
+        self.symbol_tables[-1][var_name] = value
+
     def visit_Module(self, node):
         # Set default basic block
         with InsertionPoint(self.insert_point), Location.unknown():
diff --git a/python/pykernel/_src/kernel_ast.py b/python/pykernel/_src/kernel_ast.py
index 60bf4fbe3..715b5a938 100644
--- a/python/pykernel/_src/kernel_ast.py
+++ b/python/pykernel/_src/kernel_ast.py
@@ -211,8 +211,8 @@ def visit_For(self, node):
         with InsertionPoint(for_op.body), Location.unknown():
             self.symbol_tables.append({})
 
-            # Add the iterator into the symbol_table
-            self.symbol_tables[-1][node.target.id] = for_op.induction_variable
+            # Add the iterator into the symbol table.
+            self._set_var(node.target.id, for_op.induction_variable)
 
             for stmt in node.body:
                 self.visit(stmt)
@@ -259,17 +259,15 @@ def visit_Assign(self, node):
                     raise ValueError(
                         f"Not enough values to unpack from rt_args slice (expected {len(_vars)}, got {len(values)})"
                     )
-                # Since we are unpacking a tuple, types can't be assigned here:
-                sym_table = self.symbol_tables[-1]
+                # Since we are unpacking a tuple, types can't be assigned here.
                 for i in range(len(_vars)):
-                    sym_table[_tuple.elts[i].id] = values[i]
+                    self._set_var(_tuple.elts[i].id, values[i])
 
                 # Exit out of function now
                 return
 
         var = self.visit(node.targets[0])
         value = self.visit(node.value)
-        sym_table = self.symbol_tables[-1]
 
         # Handle Subscript Assignment here
         if isinstance(node.targets[0], ast.Subscript):
@@ -282,13 +280,12 @@ def visit_Assign(self, node):
         if hasattr(var, "type") and isinstance(var.type, MemRefType):
             memref.StoreOp(value, var, [arith.ConstantOp(IndexType.get(self.ctx), 0)])
         else:
-            sym_table[var_name] = value
+            self._set_var(var_name, value)
 
     def visit_AnnAssign(self, node):
         # NOTE: TTKernel types can not be used with memrefs
         var = self.visit(node.target)
         value = self.visit(node.value)
-        sym_table = self.symbol_tables[-1]
         var_name = node.target.id
 
         # Check the annotation for array creation
@@ -308,7 +305,7 @@ def visit_AnnAssign(self, node):
                 memref_type = MemRefType.get(
                     [elt.value for elt in node.annotation.elts[1:]], var_type
                 )
-                sym_table[var_name] = memref.alloca(memref_type, [], [])
+                self._set_var(var_name, memref.alloca(memref_type, [], []))
                 return
             else:
                 raise NotImplementedError(
@@ -324,7 +321,7 @@ def visit_AnnAssign(self, node):
             var_type = value.type
             memref_type = MemRefType.get([1], var_type)
             var = memref.alloca(memref_type, [], [])
-            sym_table[var_name] = var
+            self._set_var(var_name, var)
         else:
             assert isinstance(var, MemRefType), "Can not AnnAssign to non-memref types"
 
diff --git a/python/ttl/_src/ttl_ast.py b/python/ttl/_src/ttl_ast.py
index 3ccfc5e37..898ec4d6e 100644
--- a/python/ttl/_src/ttl_ast.py
+++ b/python/ttl/_src/ttl_ast.py
@@ -8,6 +8,7 @@
 from typing import List, Optional, Set
 
 from pykernel._src.kernel_ast import TTCompilerBase
+from pykernel._src.utils import _get_type_str
 from ttl.dialects import arith, func, ttcore, ttkernel
 from ttl.ir import *
 
@@ -181,11 +182,10 @@ def visit_Assign(self, node):
                 f"Cannot unpack {len(value)} values into {len(targets)} variables"
             )
 
-        sym_table = self.symbol_tables[-1]
         for elt, val in zip(targets, value):
             if not isinstance(elt, ast.Name):
                 raise ValueError("Tuple unpacking requires simple variable names")
-            sym_table[elt.id] = val
+            self._set_var(elt.id, val)
 
     def _loc_for_node(self, node):
         """Return file location for node if debug_locations enabled, else name location."""
@@ -296,6 +296,24 @@ def visit_Call(self, node):
                     raise
                 self._raise_error(node, str(e))
 
+    def visit_AugAssign(self, node):
+        """Handle += on tensor blocks via the registered __iadd__ method."""
+        with self._loc_for_node(node):
+            target = self.visit(node.target)
+            if (
+                isinstance(node.op, ast.Add)
+                and hasattr(target, "type")
+                and isinstance(target.type, RankedTensorType)
+            ):
+                rhs = self.visit(node.value)
+                mlir_type = _get_type_str(target.type)
+                iadd_fn = self._fn_map.get(f"{mlir_type}.__iadd__")
+                if iadd_fn:
+                    result = iadd_fn(target, rhs)
+                    self._set_var(node.target.id, result)
+                    return
+            return super().visit_AugAssign(node)
+
     def visit_BinOp(self, node):
         """Override to inject auto-profiling and provide better error messages."""
         with self._loc_for_node(node):
@@ -526,8 +544,8 @@ def _emit_entry(self, node):
         self.symbol_tables.append({})
         func_bb = self.func_entry.add_entry_block()
 
-        # Add ttl module to symbol table
-        self.symbol_tables[-1]["ttl"] = ttl
+        # Add ttl module to symbol table.
+        self._set_var("ttl", ttl)
 
         # Ensure TTL dialect is registered for type parsing
         ttl.ensure_dialects_registered(self.ctx)
@@ -536,12 +554,12 @@ def _emit_entry(self, node):
 
         # Emit function body
         with InsertionPoint(func_bb):
-            # Map TensorAccessor function arguments to symbol table
+            # Map TensorAccessor function arguments to symbol table.
             for i, name in enumerate(self._tensor_accessor_names):
-                self.symbol_tables[-1][name] = func_bb.arguments[i]
+                self._set_var(name, func_bb.arguments[i])
                 self.streams.add(name)
 
-            # Prepopulate other captures (non-tensor)
+            # Prepopulate other captures (non-tensor).
             from ..circular_buffer import CircularBuffer
 
             for name, val in self.captures.items():
@@ -549,16 +567,11 @@ def _emit_entry(self, node):
                     continue  # Already handled via function arguments
                 assert isinstance(name, str)
                 if isinstance(val, int):
-                    self.symbol_tables[-1][name] = arith.ConstantOp(
-                        IndexType.get(self.ctx), val
-                    )
+                    self._set_var(name, arith.ConstantOp(IndexType.get(self.ctx), val))
                 elif isinstance(val, float):
-                    self.symbol_tables[-1][name] = arith.ConstantOp(
-                        F32Type.get(self.ctx), val
-                    )
+                    self._set_var(name, arith.ConstantOp(F32Type.get(self.ctx), val))
                 elif isinstance(val, CircularBuffer):
-                    cb_val = self._emit_cb_from_capture(val)
-                    self.symbol_tables[-1][name] = cb_val
+                    self._set_var(name, self._emit_cb_from_capture(val))
                 else:
                     self._raise_error(
                         node, f"Invalid capture type for var {name}: {type(val)}"
@@ -944,7 +957,7 @@ def visit_With(self, node):
                             optional_vars,
                             "'with ... as var' requires a simple variable name",
                         )
-                    self.symbol_tables[-1][optional_vars.id] = acquire_result
+                    self._set_var(optional_vars.id, acquire_result)
 
             for stmt in node.body:
                 self.visit(stmt)
diff --git a/python/ttl/compiler_options.py b/python/ttl/compiler_options.py
index 6871cc7fe..bccf088ae 100644
--- a/python/ttl/compiler_options.py
+++ b/python/ttl/compiler_options.py
@@ -76,6 +76,13 @@ def _make_parser() -> argparse.ArgumentParser:
         action=argparse.BooleanOptionalAction,
         help="Enable FP32 accumulation for matmul operations (default: enabled).",
     )
+    p.add_argument(
+        "--ttl-strict-f32-acc",
+        default=None,
+        dest="strict_f32_acc",
+        action=argparse.BooleanOptionalAction,
+        help="Error if accumulation (+=) output block exceeds f32 DST capacity (default: disabled).",
+    )
     return p
 
 
@@ -121,6 +128,7 @@ class CompilerOptions:
     combine_pack_tiles: bool = True
     reduce_full_fp32: bool = True
     matmul_full_fp32: bool = True
+    strict_f32_acc: bool = False
 
     # Fields that were explicitly provided (not defaulted). Excluded from
     # equality and hashing so two instances with the same bool values are
diff --git a/python/ttl/operators.py b/python/ttl/operators.py
index de4c4ba03..5df40e193 100644
--- a/python/ttl/operators.py
+++ b/python/ttl/operators.py
@@ -111,9 +111,10 @@ def __matmul__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock:
         return ttl.matmul(result_type, ast_self, rhs)
 
     def store(ast_self: TensorBlock, rhs: TensorBlock) -> None:
-        """Store result tensor to the output CB reserve view.
+        """Store result tensor to the output CB reserve view (overwrite).
 
         Emits ttl.store with the result tensor and reserve view.
+        Always overwrites the CB slot. For accumulation, use ``+=``.
         """
         if not _is_block(ast_self):
             raise ValueError(
@@ -122,6 +123,26 @@ def store(ast_self: TensorBlock, rhs: TensorBlock) -> None:
         reserve = _get_reserve_from_block(ast_self)
         ttl.store(rhs, reserve)
 
+    def __iadd__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock:
+        """Accumulate into a reserved block via L1 packer accumulation.
+
+        Emits ttl.store with the ``accumulate`` attribute. When used
+        inside a loop, the compiler inserts ``pack_reconfig_l1_acc``
+        guards so that each iteration adds to the existing L1 value
+        instead of overwriting.
+
+        This is an interim mechanism; the spec's full pattern
+        (``fill`` + lazy ``BlockExpr`` ``+=`` + ``store``) is deferred
+        to the BlockExpr PR (#446).
+        """
+        if not _is_block(ast_self):
+            raise ValueError(
+                "+= must be called on a block acquired from reserve(), not a regular tensor"
+            )
+        reserve = _get_reserve_from_block(ast_self)
+        ttl.store(rhs, reserve, accumulate=True)
+        return ast_self
+
     def push(ast_self: TensorBlock) -> None:
         """
         Signal that data is ready in the circular buffer (producer release).
diff --git a/python/ttl/ttl_api.py b/python/ttl/ttl_api.py
index f15be2952..f2f161dde 100644
--- a/python/ttl/ttl_api.py
+++ b/python/ttl/ttl_api.py
@@ -1224,14 +1224,16 @@ def _compile_kernel(
         assign_dst_pass = f"ttl-assign-dst{{enable-fpu-binary-ops={fpu_flag}}}"
 
         pipeline_passes = [
+            "func.func(ttl-annotate-l1-acc-loops)",
             "func.func(convert-ttl-to-compute)",
             set_compute_config_pass,
             f"func.func({assign_dst_pass})",
         ]
         if compiler_options.maximize_dst:
             subblock_sync = "true" if compiler_options.auto_sync else "false"
+            strict_f32 = "true" if compiler_options.strict_f32_acc else "false"
             pipeline_passes.append(
-                f"func.func(ttl-subblock-compute-for-dst{{subblock-sync={subblock_sync}}})"
+                f"func.func(ttl-subblock-compute-for-dst{{subblock-sync={subblock_sync} strict-f32-acc={strict_f32}}})"
             )
         if compiler_options.use_block_matmul:
             pipeline_passes.append("func.func(ttl-lower-matmul-block)")
diff --git a/test/python/matmul_l1_acc_multinode.py b/test/python/matmul_l1_acc_multinode.py
new file mode 100644
index 000000000..822cc7692
--- /dev/null
+++ b/test/python/matmul_l1_acc_multinode.py
@@ -0,0 +1,192 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# REQUIRES: tt-device
+# RUN: env TTLANG_INITIAL_MLIR=%t.initial.mlir %python %s > %t.output 2>&1
+# RUN: FileCheck %s --check-prefix=CHECK-CPP < %t.output
+# RUN: FileCheck %s --check-prefix=CHECK-RESULT < %t.output
+
+"""
+Multinode matmul with L1 packer accumulation. Auto grid, split DMA (reader=A,
+writer=B+output), 8x8x8 blocks, K_num_blocks=4 at 3072x1024x3072.
+
+The larger dimensions (96x32x96 tiles, 12x4x12 blocks) ensure each core
+handles multiple output blocks (ceil(12/8)=2 per axis on an 8x8 grid),
+exercising the per-block L1 acc disable/re-enable sequence.
+
+The compute thread uses += for accumulation across K iterations. The
+compiler inserts pack_reconfig_l1_acc guards so each K iteration packs
+additively to L1.
+
+Verifies the L1 packer accumulation pattern in generated C++: disable before
+K loop, conditional enable after first iteration, disable after cb_push_back.
+"""
+
+import ttl
+
+try:
+    import ttnn
+except ImportError:
+    print("TTNN not available - exiting")
+    exit(0)
+
+import torch
+
+TILE = 32
+M_BLOCK = 8
+K_BLOCK = 8
+N_BLOCK = 8
+
+
+@ttl.operation(grid="auto")
+def matmul_l1_acc(a, b, out):
+    Mt = a.shape[0] // TILE
+    Kt = a.shape[1] // TILE
+    Nt = b.shape[1] // TILE
+
+    K_num_blocks = Kt // K_BLOCK
+    M_num_blocks = Mt // M_BLOCK
+    N_num_blocks = Nt // N_BLOCK
+
+    grid_n, grid_m = ttl.grid_size(dims=2)
+    m_blocks_per_node = -(-M_num_blocks // grid_m)
+    n_blocks_per_node = -(-N_num_blocks // grid_n)
+
+    a_dfb = ttl.make_dataflow_buffer_like(a, shape=(M_BLOCK, K_BLOCK), block_count=2)
+    b_dfb = ttl.make_dataflow_buffer_like(b, shape=(K_BLOCK, N_BLOCK), block_count=2)
+    out_dfb = ttl.make_dataflow_buffer_like(
+        out, shape=(M_BLOCK, N_BLOCK), block_count=2
+    )
+
+    @ttl.compute()
+    def compute():
+        node_n, node_m = ttl.node(dims=2)
+        for local_m in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m
+            if m_block < M_num_blocks:
+                for local_n in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n
+                    if n_block < N_num_blocks:
+                        out_blk = out_dfb.reserve()
+                        for _ in range(K_num_blocks):
+                            a_blk = a_dfb.wait()
+                            b_blk = b_dfb.wait()
+                            out_blk += a_blk @ b_blk
+                            a_blk.pop()
+                            b_blk.pop()
+                        out_blk.push()
+
+    @ttl.datamovement()
+    def reader():
+        node_n, node_m = ttl.node(dims=2)
+        for local_m in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m
+            if m_block < M_num_blocks:
+                m_off = m_block * M_BLOCK
+                for local_n in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n
+                    if n_block < N_num_blocks:
+                        for kb in range(K_num_blocks):
+                            k_off = kb * K_BLOCK
+                            with a_dfb.reserve() as a_blk:
+                                ttl.copy(
+                                    a[
+                                        m_off : m_off + M_BLOCK,
+                                        k_off : k_off + K_BLOCK,
+                                    ],
+                                    a_blk,
+                                ).wait()
+
+    @ttl.datamovement()
+    def writer():
+        node_n, node_m = ttl.node(dims=2)
+        for local_m in range(m_blocks_per_node):
+            m_block = node_m * m_blocks_per_node + local_m
+            if m_block < M_num_blocks:
+                m_off = m_block * M_BLOCK
+                for local_n in range(n_blocks_per_node):
+                    n_block = node_n * n_blocks_per_node + local_n
+                    if n_block < N_num_blocks:
+                        n_off = n_block * N_BLOCK
+                        for kb in range(K_num_blocks):
+                            k_off = kb * K_BLOCK
+                            with b_dfb.reserve() as b_blk:
+                                ttl.copy(
+                                    b[
+                                        k_off : k_off + K_BLOCK,
+                                        n_off : n_off + N_BLOCK,
+                                    ],
+                                    b_blk,
+                                ).wait()
+                        with out_dfb.wait() as out_blk:
+                            ttl.copy(
+                                out_blk,
+                                out[
+                                    m_off : m_off + M_BLOCK,
+                                    n_off : n_off + N_BLOCK,
+                                ],
+                            ).wait()
+
+
+# =============================================================================
+# C++ output: L1 packer accumulation pattern
+#   1. Disable before the K loop
+#   2. Conditional enable after the first iteration (iv == lb)
+#   3. Disable after cb_push_back following the loop
+# =============================================================================
+
+# CHECK-CPP-DAG:  int32_t [[ENABLE:v[0-9]+]] = 1;
+# CHECK-CPP-DAG:  int32_t [[DISABLE:v[0-9]+]] = 0;
+# CHECK-CPP:      PACK((llk_pack_reconfig_l1_acc([[DISABLE]])));
+# CHECK-CPP:      for
+# CHECK-CPP:        matmul_block(
+# CHECK-CPP:        pack_tile
+# CHECK-CPP:        if (
+# CHECK-CPP-NEXT:   PACK((llk_pack_reconfig_l1_acc([[ENABLE]])));
+# CHECK-CPP:      cb_push_back(
+# CHECK-CPP:      PACK((llk_pack_reconfig_l1_acc([[DISABLE]])));
+
+# CHECK-RESULT: PASS
+
+if __name__ == "__main__":
+    device = ttnn.open_device(device_id=0)
+
+    try:
+        # 96x32x96 tiles = 3072x1024x3072, 8x8x8 blocks -> 12x4x12 blocks.
+        # With an 8x8 grid each core handles ceil(12/8)=2 M-blocks and
+        # 2 N-blocks (4 output blocks), exercising the per-block L1 acc
+        # disable/re-enable sequence.
+        Mt, Kt, Nt = 96, 32, 96
+        M, K, N = Mt * TILE, Kt * TILE, Nt * TILE
+
+        a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+        b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+        golden = (a_torch.float() @ b_torch.float()).float()
+
+        a_dev = ttnn.from_torch(
+            a_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device
+        )
+        b_dev = ttnn.from_torch(
+            b_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device
+        )
+        out_dev = ttnn.from_torch(
+            torch.zeros(M, N, dtype=torch.bfloat16),
+            dtype=ttnn.bfloat16,
+            layout=ttnn.TILE_LAYOUT,
+            device=device,
+        )
+
+        matmul_l1_acc(a_dev, b_dev, out_dev)
+
+        result = ttnn.to_torch(out_dev).float()
+        pcc = torch.corrcoef(torch.stack([result.flatten(), golden.flatten()]))[
+            0, 1
+        ].item()
+        if pcc > 0.999:
+            print("PASS")
+        else:
+            print(f"FAIL: PCC {pcc:.6f} < 0.999")
+
+    finally:
+        ttnn.close_device(device)
diff --git a/test/python/test_matmul_k_accumulation.py b/test/python/test_matmul_k_accumulation.py
index a5d79d9d3..4ed632865 100644
--- a/test/python/test_matmul_k_accumulation.py
+++ b/test/python/test_matmul_k_accumulation.py
@@ -175,13 +175,30 @@ def test_matmul_k_accumulation_streaming(k_tiles, block_n, device):
 @pytest.mark.parametrize("k_tiles", K_TILES, ids=[f"K{k}" for k in K_TILES])
 @pytest.mark.requires_device
 def test_matmul_k_accumulation_single_fill(k_tiles, block_n, device):
-    """Kt>1 single-fill accumulation: tighter bounds (f32 DST)."""
+    """Kt>1 single-fill accumulation.
+
+    When the output block fits in DST (block_n <= 4 for f32), matmul_block
+    accumulates all K tiles in f32 DST with one bf16 truncation at the end
+    (tighter bounds). When the output exceeds DST capacity (block_n > 4
+    for f32), the compiler tiles K to 1 for L1 accumulation, producing one
+    bf16 truncation per K step (same bounds as the streaming test).
+    """
     scale = math.sqrt(k_tiles)
+    # DST capacity with fp32_dest_acc_en=true is 4. Output block is
+    # 1 x block_n. When block_n > 4, L1 acc activates with per-K-step
+    # bf16 truncation, requiring relaxed error bounds.
+    uses_l1_acc = block_n > 4
+    if uses_l1_acc:
+        max_err = 0.5 * scale
+        mean_err = 0.05 * scale
+    else:
+        max_err = 0.1 * scale
+        mean_err = 0.01 * scale
     _run(
         _make_matmul_kn,
         k_tiles,
         block_n,
         device,
-        max_err_limit=0.1 * scale,
-        mean_err_limit=0.01 * scale,
+        max_err_limit=max_err,
+        mean_err_limit=mean_err,
     )
diff --git a/test/python/test_matmul_l1_acc.py b/test/python/test_matmul_l1_acc.py
new file mode 100644
index 000000000..8a8ed4d9a
--- /dev/null
+++ b/test/python/test_matmul_l1_acc.py
@@ -0,0 +1,497 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Matmul L1 accumulation via += across K iterations.
+
+The += operator emits ttl.store with {accumulate}, which the compiler
+detects and annotates for L1 packer accumulation. Each K iteration packs
+additively to L1.
+
+Tests single-core and multicore configurations with various block sizes.
+"""
+
+# REQUIRES: ttnn
+# UNSUPPORTED: system-darwin
+# RUN: %python -m pytest %s -v --tb=short
+
+import pytest
+import torch
+import ttl
+
+ttnn = pytest.importorskip("ttnn", exc_type=ImportError)
+
+from ttlang_test_utils import to_dram
+from utils.correctness import assert_pcc
+
+TILE = 32
+
+
+def _make_l1_acc_kernel(block_m, block_n, grid="auto"):
+    """Matmul with L1 accumulation via += across K iterations."""
+
+    @ttl.operation(grid=grid)
+    def kernel(a, b, out):
+        Mt = a.shape[0] // TILE
+        Kt = a.shape[1] // TILE
+        Nt = b.shape[1] // TILE
+
+        M_num = Mt // block_m
+        N_num = Nt // block_n
+
+        grid_n, grid_m = ttl.grid_size(dims=2)
+        m_per = -(-M_num // grid_m)
+        n_per = -(-N_num // grid_n)
+
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(block_m, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, block_n), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(
+            out, shape=(block_m, block_n), block_count=2
+        )
+
+        @ttl.compute()
+        def compute():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            out_blk = out_dfb.reserve()
+                            for _ in range(Kt):
+                                a_blk = a_dfb.wait()
+                                b_blk = b_dfb.wait()
+                                out_blk += a_blk @ b_blk
+                                a_blk.pop()
+                                b_blk.pop()
+                            out_blk.push()
+
+        @ttl.datamovement()
+        def reader():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            for kt in range(Kt):
+                                with a_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        a[
+                                            m_off : m_off + block_m,
+                                            kt : kt + 1,
+                                        ],
+                                        blk,
+                                    ).wait()
+
+        @ttl.datamovement()
+        def writer():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            n_off = nb * block_n
+                            for kt in range(Kt):
+                                with b_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        b[
+                                            kt : kt + 1,
+                                            n_off : n_off + block_n,
+                                        ],
+                                        blk,
+                                    ).wait()
+                            with out_dfb.wait() as blk:
+                                ttl.copy(
+                                    blk,
+                                    out[
+                                        m_off : m_off + block_m,
+                                        n_off : n_off + block_n,
+                                    ],
+                                ).wait()
+
+    return kernel
+
+
+# Single-core tests (grid=(1,1))
+SINGLE_CORE_PARAMS = [
+    # (block_m, block_n, Kt)
+    (2, 2, 2),  # Output 2x2=4 fits in f32 DST
+    (2, 2, 4),  # K=4
+    (3, 3, 2),  # Output 3x3=9 > f32 DST(4)
+    (4, 4, 4),  # Output 4x4=16 > f32 DST(4)
+    (8, 8, 2),  # Large output, small K
+    (8, 8, 8),  # Large output, large K
+]
+
+
+@pytest.mark.parametrize(
+    "block_m,block_n,Kt",
+    SINGLE_CORE_PARAMS,
+    ids=[f"blk{m}x{n}_K{k}" for m, n, k in SINGLE_CORE_PARAMS],
+)
+@pytest.mark.requires_device
+def test_l1_acc_single_core(block_m, block_n, Kt, device):
+    """L1 accumulation on single core with various block sizes."""
+    M, K, N = block_m * TILE, Kt * TILE, block_n * TILE
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()
+
+    a = to_dram(a_torch, device)
+    b = to_dram(b_torch, device)
+    out = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_kernel(block_m, block_n, grid=(1, 1))
+    kernel(a, b, out)
+
+    result = ttnn.to_torch(out).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# Multicore tests (grid="auto") with multiple output blocks
+MULTI_CORE_PARAMS = [
+    # (Mt, Kt, Nt, block_m, block_n)
+    (16, 4, 16, 8, 8),  # 2x2 output blocks, K=4
+    (32, 8, 32, 8, 8),  # 4x4 output blocks, K=8
+    (128, 128, 128, 8, 8),  # 16x16 output blocks, K=128 (4096^3 shape)
+]
+
+
+@pytest.mark.parametrize(
+    "Mt,Kt,Nt,block_m,block_n",
+    MULTI_CORE_PARAMS,
+    ids=[
+        f"tiles{mt}x{kt}x{nt}_blk{bm}x{bn}" for mt, kt, nt, bm, bn in MULTI_CORE_PARAMS
+    ],
+)
+@pytest.mark.requires_device
+def test_l1_acc_multicore(Mt, Kt, Nt, block_m, block_n, device):
+    """L1 accumulation with multicore and multiple output blocks."""
+    M, K, N = Mt * TILE, Kt * TILE, Nt * TILE
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()
+
+    a = to_dram(a_torch, device)
+    b = to_dram(b_torch, device)
+    out = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_kernel(block_m, block_n)
+    kernel(a, b, out)
+
+    result = ttnn.to_torch(out).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# Non-matmul accumulation: += with a passthrough copy (sum reduction).
+# ---------------------------------------------------------------------------
+
+
+def _make_sum_reduction_kernel():
+    """Sum K input blocks via += (no matmul)."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(inp, out):
+        Kt = inp.shape[0] // TILE
+        inp_dfb = ttl.make_dataflow_buffer_like(inp, shape=(1, 1), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            out_blk = out_dfb.reserve()
+            for _ in range(Kt):
+                inp_blk = inp_dfb.wait()
+                out_blk += inp_blk
+                inp_blk.pop()
+            out_blk.push()
+
+        @ttl.datamovement()
+        def dm_read():
+            for kt in range(Kt):
+                with inp_dfb.reserve() as blk:
+                    ttl.copy(inp[kt : kt + 1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def dm_write():
+            with out_dfb.wait() as blk:
+                ttl.copy(blk, out[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.parametrize("Kt", [2, 4, 8], ids=[f"K{k}" for k in [2, 4, 8]])
+@pytest.mark.requires_device
+def test_l1_acc_sum_reduction(Kt, device):
+    """Sum K tiles via += without matmul (passthrough accumulation)."""
+    inp_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16)
+    golden = inp_torch.float().reshape(Kt, TILE, TILE).sum(dim=0)
+
+    inp_dev = to_dram(inp_torch, device)
+    out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_sum_reduction_kernel()
+    kernel(inp_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# K=1 single iteration: accumulation with one loop iteration.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.requires_device
+def test_l1_acc_single_iteration(device):
+    """K=1: single-iteration += loop. Semantically equivalent to plain store."""
+    M, K, N = TILE, TILE, 2 * TILE
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    out_dev = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_kernel(1, 2, grid=(1, 1))
+    kernel(a_dev, b_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# Consecutive += loops to the same reserve (two input streams).
+# ---------------------------------------------------------------------------
+
+
+def _make_consecutive_acc_kernel(K1, K2):
+    """Two consecutive += loops to one output: out = (a@b summed K1) + (c@d summed K2)."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(a, b, c, d, out):
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2)
+        c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2)
+        d_dfb = ttl.make_dataflow_buffer_like(d, shape=(1, 1), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            out_blk = out_dfb.reserve()
+            for _ in range(K1):
+                a_blk = a_dfb.wait()
+                b_blk = b_dfb.wait()
+                out_blk += a_blk @ b_blk
+                a_blk.pop()
+                b_blk.pop()
+            for _ in range(K2):
+                c_blk = c_dfb.wait()
+                d_blk = d_dfb.wait()
+                out_blk += c_blk @ d_blk
+                c_blk.pop()
+                d_blk.pop()
+            out_blk.push()
+
+        @ttl.datamovement()
+        def reader():
+            for kt in range(K1):
+                with a_dfb.reserve() as blk:
+                    ttl.copy(a[0:1, kt : kt + 1], blk).wait()
+                with b_dfb.reserve() as blk:
+                    ttl.copy(b[kt : kt + 1, 0:1], blk).wait()
+            for kt in range(K2):
+                with c_dfb.reserve() as blk:
+                    ttl.copy(c[0:1, kt : kt + 1], blk).wait()
+                with d_dfb.reserve() as blk:
+                    ttl.copy(d[kt : kt + 1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def writer():
+            with out_dfb.wait() as blk:
+                ttl.copy(blk, out[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.requires_device
+def test_l1_acc_consecutive_loops(device):
+    """Two consecutive += loops to the same reserve block."""
+    K1, K2 = 2, 3
+    a_torch = torch.randn(TILE, K1 * TILE, dtype=torch.bfloat16)
+    b_torch = torch.randn(K1 * TILE, TILE, dtype=torch.bfloat16)
+    c_torch = torch.randn(TILE, K2 * TILE, dtype=torch.bfloat16)
+    d_torch = torch.randn(K2 * TILE, TILE, dtype=torch.bfloat16)
+    golden = (
+        (a_torch.float() @ b_torch.float()) + (c_torch.float() @ d_torch.float())
+    ).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    c_dev = to_dram(c_torch, device)
+    d_dev = to_dram(d_torch, device)
+    out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_consecutive_acc_kernel(K1, K2)
+    kernel(a_dev, b_dev, c_dev, d_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# .store() before loop, += inside loop (overwrite then accumulate).
+# ---------------------------------------------------------------------------
+
+
+def _make_store_then_acc_kernel(total_k):
+    """.store() before the += loop, then K-1 iterations accumulate via +=."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(a, b, out):
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(out, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            out_blk = out_dfb.reserve()
+            a_blk = a_dfb.wait()
+            b_blk = b_dfb.wait()
+            out_blk.store(a_blk @ b_blk)
+            a_blk.pop()
+            b_blk.pop()
+            for _ in range(total_k - 1):
+                a_blk = a_dfb.wait()
+                b_blk = b_dfb.wait()
+                out_blk += a_blk @ b_blk
+                a_blk.pop()
+                b_blk.pop()
+            out_blk.push()
+
+        @ttl.datamovement()
+        def reader():
+            for _ in range(total_k):
+                with a_dfb.reserve() as blk:
+                    ttl.copy(a[0:1, 0:1], blk).wait()
+                with b_dfb.reserve() as blk:
+                    ttl.copy(b[0:1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def writer():
+            with out_dfb.wait() as blk:
+                ttl.copy(blk, out[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.parametrize("total_k", [2, 4], ids=[f"K{k}" for k in [2, 4]])
+@pytest.mark.requires_device
+def test_l1_acc_store_then_acc(total_k, device):
+    """.store() before loop, += inside loop. Result = K * (a @ b)."""
+    a_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16)
+    b_torch = torch.randn(TILE, TILE, dtype=torch.bfloat16)
+    golden = (total_k * (a_torch.float() @ b_torch.float())).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    out_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_store_then_acc_kernel(total_k)
+    kernel(a_dev, b_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
+
+
+# ---------------------------------------------------------------------------
+# Multiple += to different outputs in the same loop.
+# ---------------------------------------------------------------------------
+
+
+def _make_multi_output_kernel(Kt):
+    """One loop with += to two independent outputs."""
+
+    @ttl.operation(grid=(1, 1))
+    def kernel(a, b, c, d, out_a, out_b):
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(1, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, 1), block_count=2)
+        c_dfb = ttl.make_dataflow_buffer_like(c, shape=(1, 1), block_count=2)
+        d_dfb = ttl.make_dataflow_buffer_like(d, shape=(1, 1), block_count=2)
+        out_a_dfb = ttl.make_dataflow_buffer_like(out_a, shape=(1, 1), block_count=2)
+        out_b_dfb = ttl.make_dataflow_buffer_like(out_b, shape=(1, 1), block_count=2)
+
+        @ttl.compute()
+        def compute():
+            blk_a = out_a_dfb.reserve()
+            blk_b = out_b_dfb.reserve()
+            for _ in range(Kt):
+                a_blk = a_dfb.wait()
+                b_blk = b_dfb.wait()
+                blk_a += a_blk @ b_blk
+                a_blk.pop()
+                b_blk.pop()
+                c_blk = c_dfb.wait()
+                d_blk = d_dfb.wait()
+                blk_b += c_blk @ d_blk
+                c_blk.pop()
+                d_blk.pop()
+            blk_a.push()
+            blk_b.push()
+
+        @ttl.datamovement()
+        def reader():
+            for kt in range(Kt):
+                with a_dfb.reserve() as blk:
+                    ttl.copy(a[0:1, kt : kt + 1], blk).wait()
+                with b_dfb.reserve() as blk:
+                    ttl.copy(b[kt : kt + 1, 0:1], blk).wait()
+                with c_dfb.reserve() as blk:
+                    ttl.copy(c[0:1, kt : kt + 1], blk).wait()
+                with d_dfb.reserve() as blk:
+                    ttl.copy(d[kt : kt + 1, 0:1], blk).wait()
+
+        @ttl.datamovement()
+        def writer():
+            with out_a_dfb.wait() as blk:
+                ttl.copy(blk, out_a[0:1, 0:1]).wait()
+            with out_b_dfb.wait() as blk:
+                ttl.copy(blk, out_b[0:1, 0:1]).wait()
+
+    return kernel
+
+
+@pytest.mark.requires_device
+def test_l1_acc_multi_output(device):
+    """Two independent += outputs in the same K loop."""
+    Kt = 4
+    a_torch = torch.randn(TILE, Kt * TILE, dtype=torch.bfloat16)
+    b_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16)
+    c_torch = torch.randn(TILE, Kt * TILE, dtype=torch.bfloat16)
+    d_torch = torch.randn(Kt * TILE, TILE, dtype=torch.bfloat16)
+    golden_a = (a_torch.float() @ b_torch.float()).float()
+    golden_b = (c_torch.float() @ d_torch.float()).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    c_dev = to_dram(c_torch, device)
+    d_dev = to_dram(d_torch, device)
+    out_a_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+    out_b_dev = to_dram(torch.zeros(TILE, TILE, dtype=torch.bfloat16), device)
+
+    kernel = _make_multi_output_kernel(Kt)
+    kernel(a_dev, b_dev, c_dev, d_dev, out_a_dev, out_b_dev)
+
+    result_a = ttnn.to_torch(out_a_dev).float()
+    result_b = ttnn.to_torch(out_b_dev).float()
+    assert_pcc(golden_a, result_a, threshold=0.999)
+    assert_pcc(golden_b, result_b, threshold=0.999)
diff --git a/test/python/test_matmul_l1_acc_multinode.py b/test/python/test_matmul_l1_acc_multinode.py
new file mode 100644
index 000000000..70f3fdab6
--- /dev/null
+++ b/test/python/test_matmul_l1_acc_multinode.py
@@ -0,0 +1,157 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Multinode matmul with L1 packer accumulation via += across K iterations.
+L1-only (no DRAM reads during compute). All input blocks are pre-loaded
+into L1 DFBs before the K reduction loop begins.
+
+Tests multicore configurations with a 2D grid and multiple K blocks.
+"""
+
+import pytest
+import torch
+import ttl
+
+ttnn = pytest.importorskip("ttnn", exc_type=ImportError)
+
+from ttlang_test_utils import to_dram
+from utils.correctness import assert_pcc
+
+TILE = 32
+
+
+def _make_l1_acc_multinode_kernel(block_m, block_n, grid="auto"):
+    """Multinode matmul with L1 accumulation.
+
+    All K blocks are pre-loaded into L1 before compute begins (no DRAM
+    streaming during the K loop). The compute thread reserves the output
+    DFB once, stores K times (triggering L1 accumulation), then pushes.
+    """
+
+    @ttl.operation(grid=grid)
+    def kernel(a, b, out):
+        Mt = a.shape[0] // TILE
+        Kt = a.shape[1] // TILE
+        Nt = b.shape[1] // TILE
+
+        M_num = Mt // block_m
+        N_num = Nt // block_n
+
+        grid_n, grid_m = ttl.grid_size(dims=2)
+        m_per = -(-M_num // grid_m)
+        n_per = -(-N_num // grid_n)
+
+        a_dfb = ttl.make_dataflow_buffer_like(a, shape=(block_m, 1), block_count=2)
+        b_dfb = ttl.make_dataflow_buffer_like(b, shape=(1, block_n), block_count=2)
+        out_dfb = ttl.make_dataflow_buffer_like(
+            out, shape=(block_m, block_n), block_count=2
+        )
+
+        @ttl.compute()
+        def compute():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            out_blk = out_dfb.reserve()
+                            for _ in range(Kt):
+                                a_blk = a_dfb.wait()
+                                b_blk = b_dfb.wait()
+                                out_blk += a_blk @ b_blk
+                                a_blk.pop()
+                                b_blk.pop()
+                            out_blk.push()
+
+        @ttl.datamovement()
+        def reader():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            for kt in range(Kt):
+                                with a_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        a[
+                                            m_off : m_off + block_m,
+                                            kt : kt + 1,
+                                        ],
+                                        blk,
+                                    ).wait()
+
+        @ttl.datamovement()
+        def writer():
+            node_n, node_m = ttl.node(dims=2)
+            for lm in range(m_per):
+                mb = node_m * m_per + lm
+                if mb < M_num:
+                    m_off = mb * block_m
+                    for ln in range(n_per):
+                        nb = node_n * n_per + ln
+                        if nb < N_num:
+                            n_off = nb * block_n
+                            for kt in range(Kt):
+                                with b_dfb.reserve() as blk:
+                                    ttl.copy(
+                                        b[
+                                            kt : kt + 1,
+                                            n_off : n_off + block_n,
+                                        ],
+                                        blk,
+                                    ).wait()
+                            with out_dfb.wait() as blk:
+                                ttl.copy(
+                                    blk,
+                                    out[
+                                        m_off : m_off + block_m,
+                                        n_off : n_off + block_n,
+                                    ],
+                                ).wait()
+
+    return kernel
+
+
+PARAMS = [
+    # (Mt, Kt, Nt, block_m, block_n, grid)
+    (4, 2, 4, 2, 2, (2, 2)),
+    (8, 4, 8, 4, 4, (2, 2)),
+    (8, 4, 8, 4, 4, "auto"),
+    (16, 8, 16, 8, 8, "auto"),
+    # Multi-block per core: M_num=4, N_num=4 on 2x2 grid -> 2 blocks/core/axis.
+    # Exercises per-block L1 acc disable/re-enable across output blocks.
+    (16, 4, 16, 4, 4, (2, 2)),
+]
+
+
+@pytest.mark.parametrize(
+    "Mt,Kt,Nt,block_m,block_n,grid",
+    PARAMS,
+    ids=[
+        f"tiles{mt}x{kt}x{nt}_blk{bm}x{bn}_grid{g}" for mt, kt, nt, bm, bn, g in PARAMS
+    ],
+)
+@pytest.mark.requires_device
+def test_l1_acc_multinode(Mt, Kt, Nt, block_m, block_n, grid, device):
+    """Multinode matmul with L1 packer accumulation across K iterations."""
+    M, K, N = Mt * TILE, Kt * TILE, Nt * TILE
+    a_torch = torch.randn(M, K, dtype=torch.bfloat16)
+    b_torch = torch.randn(K, N, dtype=torch.bfloat16)
+    golden = (a_torch.float() @ b_torch.float()).float()
+
+    a_dev = to_dram(a_torch, device)
+    b_dev = to_dram(b_torch, device)
+    out_dev = to_dram(torch.zeros(M, N, dtype=torch.bfloat16), device)
+
+    kernel = _make_l1_acc_multinode_kernel(block_m, block_n, grid=grid)
+    kernel(a_dev, b_dev, out_dev)
+
+    result = ttnn.to_torch(out_dev).float()
+    assert_pcc(golden, result, threshold=0.999)
diff --git a/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir
new file mode 100644
index 000000000..3e37a056a
--- /dev/null
+++ b/test/ttlang/Conversion/TTLToTTKernel/matmul_subblock_l1_acc.mlir
@@ -0,0 +1,107 @@
+// Matmul with subblocking AND L1 accumulation. Output 3x3 bf16 = 9 tiles
+// exceeds bf16 DST capacity (8), triggering subblocking. The user K loop
+// with {accumulate} triggers L1 acc annotation and pack_reconfig_l1_acc
+// guard insertion.
+
+// RUN: ttlang-opt %s \
+// RUN:   -pass-pipeline='builtin.module( \
+// RUN:     func.func(ttl-annotate-l1-acc-loops, convert-ttl-to-compute, \
+// RUN:       ttl-assign-dst{enable-fpu-binary-ops=0}, \
+// RUN:       ttl-subblock-compute-for-dst, ttl-lower-matmul-block, \
+// RUN:       ttl-lower-to-loops{dst-accumulation=1}, ttl-schedule-operations, \
+// RUN:       ttl-annotate-cb-associations), \
+// RUN:     convert-ttl-to-ttkernel, ttkernel-insert-inits, \
+// RUN:     ttkernel-insert-l1-accumulation, canonicalize, cse)' \
+// RUN:   --split-input-file | FileCheck %s
+
+// CHECK-LABEL: func.func @matmul_3x3_k_loop
+// Disable before the K loop.
+// CHECK-DAG: %[[C0_I32:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C1_I32:.*]] = arith.constant 1 : i32
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+// K loop with subblock loops inside.
+// CHECK: scf.for %[[K_IV:.*]] = %[[K_LB:.*]] to
+// Subblock loop: acquire, matmul, 3 pack_tiles (3x1 subblock), release.
+// CHECK:   scf.for
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.matmul_block
+// CHECK-COUNT-3: ttkernel.pack_tile
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:   }
+// Enable after first K iteration.
+// CHECK:   arith.cmpi eq, %[[K_IV]], %[[K_LB]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[C1_I32]])
+// CHECK: }
+// Disable after push.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+func.func @matmul_3x3_k_loop(
+    %arg0: tensor<3x2x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<2x3x!ttcore.tile<32x32, bf16>>) -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<3x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<3x2x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x3x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x3x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c2 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<3x2x!ttcore.tile<32x32, bf16>>, tensor<2x3x!ttcore.tile<32x32, bf16>> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<3x3x!ttcore.tile<32x32, bf16>>, tensor<3x3x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<3x3x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// 8x8 output (64 tiles >> DST capacity 8) with K=4: heavily subblocked.
+// Verifies that multiple levels of subblock loops all sit inside the
+// K loop's L1 acc guards.
+
+// CHECK-LABEL: func.func @matmul_8x8_k4
+// CHECK-DAG: %[[C0_I32:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C1_I32:.*]] = arith.constant 1 : i32
+// Disable before K loop.
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+// K loop -> subblock row loop -> acquire, matmul K loop, 8x pack, release.
+// CHECK: scf.for %[[K_IV:.*]] = %[[K_LB:.*]] to
+// CHECK:   scf.for
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     scf.for
+// CHECK:       ttkernel.matmul_block
+// CHECK-COUNT-8: ttkernel.pack_tile
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:   }
+// Enable after first K iteration.
+// CHECK:   arith.cmpi eq, %[[K_IV]], %[[K_LB]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[C1_I32]])
+// CHECK: }
+// Disable after push.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc(%[[C0_I32]])
+func.func @matmul_8x8_k4(
+    %arg0: tensor<8x8x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<8x8x!ttcore.tile<32x32, bf16>>) -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<8x8x!ttcore.tile<32x32, bf16>>, !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<8x8x!ttcore.tile<32x32, bf16>>, !ttl.cb<[8, 8], !ttcore.tile<32x32, bf16>, 2>) -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[8, 8], !ttcore.tile<32x32, bf16>, 2> -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<8x8x!ttcore.tile<32x32, bf16>>, tensor<8x8x!ttcore.tile<32x32, bf16>> -> tensor<8x8x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<8x8x!ttcore.tile<32x32, bf16>>, tensor<8x8x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[8, 8], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<8x8x!ttcore.tile<32x32, bf16>>
+}
diff --git a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
index 9ddb49905..ab37a5760 100644
--- a/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
+++ b/test/ttlang/Conversion/TTLToTTKernel/reduce_lowering.mlir
@@ -81,25 +81,28 @@ func.func @reduce_sum_dim0_1x1() attributes {ttl.base_cta_index = 3 : i32, ttl.c
 // FP32-DAG: %[[C0:.*]] = arith.constant 0 : index
 // FP32-DAG: %[[C1:.*]] = arith.constant 1 : index
 // FP32-DAG: %[[C2:.*]] = arith.constant 2 : index
+// FP32-DAG: %[[C0I:.*]] = arith.constant 0 : i32
 // FP32-DAG: %[[C1I:.*]] = arith.constant 1 : i32
 // FP32: %[[CB0:.*]] = ttkernel.get_compile_time_arg_val(0)
 // FP32: %[[CB1:.*]] = ttkernel.get_compile_time_arg_val(1)
 // FP32: %[[CB2:.*]] = ttkernel.get_compile_time_arg_val(2)
+// Disable L1 accumulation before the reduction loop.
+// FP32: ttkernel.pack_reconfig_l1_acc(%[[C0I]])
 // FP32: scf.for %[[IV:.*]] = %[[C0]] to %[[C2]] step %[[C1]]
 // FP32-NEXT:   ttkernel.tile_regs_acquire
-// L1 accumulation guard: enable from second iteration.
-// FP32:   %[[NOT_FIRST:.*]] = arith.cmpi ne, %[[IV]], %[[C0]]
-// FP32-NEXT:   scf.if %[[NOT_FIRST]]
-// FP32-NEXT:     ttkernel.pack_reconfig_l1_acc(%[[C1I]])
-// FP32:        }
 // FP32:   ttkernel.reduce_init({{.*}}<reduce_sum>, <reduce_dim_col>) {full_fp32}
 // FP32:   ttkernel.reduce_tile({{.*}}<reduce_sum>, <reduce_dim_col>) {full_fp32
 // FP32:   ttkernel.reduce_uninit
 // FP32:   ttkernel.pack_tile(%[[C0]], %[[CB2]], %[[C0]], true)
 // FP32:   ttkernel.tile_regs_release
+// L1 accumulation guard: enable once after the first iteration's pack.
+// FP32:   %[[FIRST:.*]] = arith.cmpi eq, %[[IV]], %[[C0]]
+// FP32-NEXT:   scf.if %[[FIRST]]
+// FP32-NEXT:     ttkernel.pack_reconfig_l1_acc(%[[C1I]])
+// FP32:        }
 // FP32: } {ttl.reduction_loop
 // Disable L1 accumulation after reduction loop.
-// FP32: ttkernel.pack_reconfig_l1_acc({{.*}}0{{.*}})
+// FP32: ttkernel.pack_reconfig_l1_acc(%[[C0I]])
 func.func @reduce_2x1_l1_acc() attributes {ttl.base_cta_index = 3 : i32, ttl.crta_indices = [], ttl.kernel_thread = #ttkernel.thread<compute>} {
   %c2 = arith.constant 2 : index
   %c1 = arith.constant 1 : index
diff --git a/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
new file mode 100644
index 000000000..ca2ba92d4
--- /dev/null
+++ b/test/ttlang/Dialect/TTKernel/Transforms/insert_l1_accumulation.mlir
@@ -0,0 +1,567 @@
+// Verifies ttkernel-insert-l1-accumulation: pack_reconfig_l1_acc guards are
+// inserted around reduction loops. The enable call happens once after the
+// first iteration's last pack (iv == lb), and disable guards bracket the
+// accumulation scope.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s
+// Idempotency: running twice produces the same output.
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(ttkernel-insert-l1-accumulation, ttkernel-insert-l1-accumulation)' --split-input-file | FileCheck %s
+
+// Basic L1 acc loop: enable after first iteration, disable before/after loop.
+
+// CHECK-LABEL: func.func @basic_l1_acc_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32)
+// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:   scf.if %[[CMP]]
+// CHECK:     %[[ENABLE:.*]] = arith.constant 1 : i32
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[ENABLE]]) : (i32)
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc(%{{.*}}) : (i32)
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @basic_l1_acc_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Reduction loop fallback (ttl.reduction_loop attribute) with sum reduce.
+
+// CHECK-LABEL: func.func @reduction_loop_fallback
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   arith.cmpi eq
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @reduction_loop_fallback() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_sum>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.reduction_loop}
+  return
+}
+
+// -----
+
+// Max reduce loops should NOT get L1 accumulation guards.
+
+// CHECK-LABEL: func.func @max_reduce_no_l1_acc
+// CHECK-NOT: pack_reconfig_l1_acc
+func.func @max_reduce_no_l1_acc() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_max>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.reduction_loop}
+  return
+}
+
+// -----
+
+// No reduction loop attribute: no transformation.
+
+// CHECK-LABEL: func.func @no_reduction_loop
+// CHECK-NOT: pack_reconfig_l1_acc
+func.func @no_reduction_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  }
+  return
+}
+
+// -----
+
+// Subblocked loop: multiple acquire/release pairs per iteration inside nested
+// loops. The enable guard should appear once after the outermost subblock loop
+// (containing the last release), not after each individual release.
+
+// CHECK-LABEL: func.func @subblocked_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:   scf.for
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:   }
+// CHECK:   %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:   scf.if %[[CMP]]
+// CHECK:     %[[ENABLE:.*]] = arith.constant 1 : i32
+// CHECK:     ttkernel.pack_reconfig_l1_acc(%[[ENABLE]]) : (i32)
+// CHECK: }
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @subblocked_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  scf.for %iv = %c0 to %c4 step %c1 {
+    scf.for %sb = %c0 to %c2 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.matmul_block(%cb, %cb, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    }
+  } {ttl.l1_acc_loop}
+  return
+}
+
+// -----
+
+// L1 acc loop with no tile_regs_acquire/release inside: no guards inserted.
+
+// CHECK-LABEL: func.func @l1_acc_loop_no_sync
+// CHECK-NOT: pack_reconfig_l1_acc
+func.func @l1_acc_loop_no_sync() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+  } {ttl.l1_acc_loop}
+  return
+}
+
+// -----
+
+// L1 acc loop inside an unannotated outer loop (the realistic pattern:
+// outer M/N iteration loop wraps the inner K reduction loop). The disable
+// guards bracket the inner K loop, not the outer loop. Each outer
+// iteration gets a fresh disable-before -> K loop -> disable-after cycle.
+
+// CHECK-LABEL: func.func @l1_acc_inside_outer_loop
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK:   scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.pack_tile
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:     %[[CMP:.*]] = arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:     scf.if %[[CMP]]
+// CHECK:       ttkernel.pack_reconfig_l1_acc
+// CHECK:   }
+// CHECK:   ttkernel.cb_push_back
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @l1_acc_inside_outer_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %outer = %c0 to %c2 step %c1 {
+    scf.for %inner = %c0 to %c4 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.l1_acc_loop}
+    ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  }
+  return
+}
+
+// -----
+
+// Multiple consecutive cb_push_back ops after the loop (multi-output compute).
+// The disable guard should go after the last push.
+
+// CHECK-LABEL: func.func @multi_push_after_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   arith.cmpi eq
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @multi_push_after_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Nested l1_acc loops: reserve is outside both loops, so both are annotated
+// and all iterations accumulate into the same CB slot. Disable guards
+// bracket the outermost loop; enable fires once after the first inner
+// iteration of the first outer iteration.
+
+// CHECK-LABEL: func.func @nested_l1_acc_loops
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:     arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:     scf.if
+// CHECK:       ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @nested_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  scf.for %outer = %c0 to %c2 step %c1 {
+    scf.for %inner = %c0 to %c4 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.l1_acc_loop}
+  } {ttl.l1_acc_loop}
+  return
+}
+
+// -----
+
+// Nested reduction loops (multi-dim reduce): all iterations contribute to
+// a single accumulated result. Same structure as nested l1_acc loops.
+
+// CHECK-LABEL: func.func @nested_reduction_loops
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:     ttkernel.tile_regs_acquire
+// CHECK:     ttkernel.tile_regs_release
+// CHECK:     arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:     scf.if
+// CHECK:       ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @nested_reduction_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb_in = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_scaler = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %cb_out = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  scf.for %row = %c0 to %c2 step %c1 {
+    scf.for %col = %c0 to %c2 step %c1 {
+      ttkernel.tile_regs_acquire() : () -> ()
+      ttkernel.reduce_tile(%cb_in, %cb_scaler, %c0, %c0, %c0, <reduce_sum>, <reduce_dim_col>) : (!ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index, index, index) -> ()
+      ttkernel.tile_regs_commit() : () -> ()
+      ttkernel.tile_regs_wait() : () -> ()
+      ttkernel.pack_tile(%c0, %cb_out, %c0, true) : (index, !ttkernel.cb<1, !ttcore.tile<32x32, bf16>>, index) -> ()
+      ttkernel.tile_regs_release() : () -> ()
+    } {ttl.reduction_loop}
+  } {ttl.reduction_loop}
+  return
+}
+
+// -----
+
+// Two consecutive L1 acc loops writing to the same CB.
+// The reserve/push scope spans both loops. One disable pair brackets the
+// entire scope; only the first loop gets the enable guard.
+
+// CHECK-LABEL: func.func @consecutive_l1_acc_loops
+// CHECK: ttkernel.cb_reserve_back
+// Disable before first loop.
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// First loop with enable guard.
+// CHECK: scf.for %[[IV1:.*]] = %[[LB1:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   arith.cmpi eq, %[[IV1]], %[[LB1]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// No disable between the loops. Unconditional enable re-arms L1 acc
+// after any init ops that may reset packer state.
+// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for %[[IV2:.*]] = %[[LB2:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   arith.cmpi eq, %[[IV2]], %[[LB2]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// Push then disable.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @consecutive_l1_acc_loops() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  ttkernel.cb_reserve_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  scf.for %iv1 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  scf.for %iv2 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Single loop with two independent accumulating outputs.
+// Both pack to different CBs but share one L1 acc enable/disable scope.
+
+// CHECK-LABEL: func.func @two_outputs_one_loop
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for %[[IV:.*]] = %[[LB:.*]] to
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// CHECK:   ttkernel.tile_regs_acquire
+// CHECK:   ttkernel.pack_tile
+// CHECK:   ttkernel.tile_regs_release
+// Enable after the last release (second output).
+// CHECK:   arith.cmpi eq, %[[IV]], %[[LB]]
+// CHECK:   scf.if
+// CHECK:     ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// Two pushes then disable.
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @two_outputs_one_loop() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Idempotency: input already has pack_reconfig_l1_acc guards. Running
+// the pass again should not insert duplicates.
+
+// CHECK-LABEL: func.func @already_guarded
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @already_guarded() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c4_i32 = arith.constant 4 : i32
+  ttkernel.pack_reconfig_l1_acc(%c0_i32) : (i32) -> ()
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+    %cmp = arith.cmpi eq, %iv, %c0 : index
+    scf.if %cmp {
+      ttkernel.pack_reconfig_l1_acc(%c1_i32) : (i32) -> ()
+    }
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  ttkernel.pack_reconfig_l1_acc(%c0_i32) : (i32) -> ()
+  return
+}
+
+// -----
+
+// Two consecutive annotated loops packing to DIFFERENT CBs.
+// Each loop gets its own independent disable pair.
+
+// CHECK-LABEL: func.func @different_cb_siblings
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @different_cb_siblings() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb0 = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb1 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  scf.for %iv1 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb0, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb0, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  scf.for %iv2 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb1, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb1, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
+
+// -----
+
+// Consecutive annotated loops with init ops between them (the real-world
+// pattern from the full pipeline). The scope must span past the init ops
+// to include the push after the second loop.
+
+// CHECK-LABEL: func.func @consecutive_with_init_between
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK-NOT: pack_reconfig_l1_acc(%{{.*}}0
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK: scf.for
+// CHECK:   ttkernel.pack_reconfig_l1_acc
+// CHECK: }
+// CHECK: ttkernel.cb_push_back
+// CHECK: ttkernel.pack_reconfig_l1_acc
+// CHECK-NOT: ttkernel.pack_reconfig_l1_acc
+func.func @consecutive_with_init_between() attributes {ttkernel.thread = #ttkernel.thread<compute>} {
+  %cb = ttkernel.get_compile_time_arg_val(0) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb_in0 = ttkernel.get_compile_time_arg_val(1) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %cb_in1 = ttkernel.get_compile_time_arg_val(2) : () -> !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  ttkernel.cb_reserve_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  scf.for %iv1 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.matmul_block(%cb_in0, %cb_in0, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  // init_short between the two loops (InsertInits emits init_short when
+  // sibling loops share an output CB, to avoid clobbering PACK config).
+  "ttkernel.mm_block_init_short"(%cb_in1, %cb_in1, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32, i32, i32, i32) -> ()
+  scf.for %iv2 = %c0 to %c4 step %c1 {
+    ttkernel.tile_regs_acquire() : () -> ()
+    ttkernel.matmul_block(%cb_in1, %cb_in1, %c0, %c0, %c0, %c0_i32, %c1_i32, %c1_i32, %c1_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index, index, index, i32, i32, i32, i32) -> ()
+    ttkernel.tile_regs_commit() : () -> ()
+    ttkernel.tile_regs_wait() : () -> ()
+    ttkernel.pack_tile(%c0, %cb, %c0, true) : (index, !ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, index) -> ()
+    ttkernel.tile_regs_release() : () -> ()
+  } {ttl.l1_acc_loop}
+  ttkernel.cb_push_back(%cb, %c4_i32) : (!ttkernel.cb<4, !ttcore.tile<32x32, bf16>>, i32) -> ()
+  return
+}
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
new file mode 100644
index 000000000..5042dae62
--- /dev/null
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops.mlir
@@ -0,0 +1,243 @@
+// Verifies ttl-annotate-l1-acc-loops: scf.for loops containing
+// ttl.store with {accumulate} are annotated with ttl.l1_acc_loop.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --split-input-file | FileCheck %s
+
+// Accumulating store inside a loop should annotate the loop.
+
+// CHECK-LABEL: func.func @accumulating_store
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @accumulating_store(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Plain store (no {accumulate}) should NOT annotate, even with external reserve.
+
+// CHECK-LABEL: func.func @plain_store_no_annotation
+// CHECK: scf.for
+// CHECK-NOT: ttl.l1_acc_loop
+// CHECK: }
+func.func @plain_store_no_annotation(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Already-annotated loops should be skipped.
+
+// CHECK-LABEL: func.func @skip_reduction_loop
+// CHECK: scf.for
+// CHECK: } {ttl.reduction_loop
+// CHECK-NOT: ttl.l1_acc_loop
+func.func @skip_reduction_loop(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttl.store %arg0, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  } {ttl.reduction_loop}
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Nested loops: only the innermost loop containing the accumulating store
+// should be annotated, not the outer loop.
+
+// CHECK-LABEL: func.func @nested_only_inner
+// CHECK: scf.for
+// CHECK-NOT: ttl.l1_acc_loop
+// CHECK:   scf.for
+// CHECK:   } {ttl.l1_acc_loop}
+// CHECK: }
+func.func @nested_only_inner(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  // Outer loop (e.g., N-block loop) — should NOT be annotated.
+  scf.for %outer = %c0 to %c4 step %c1 {
+    // Inner loop (K-accumulation loop) — should be annotated.
+    scf.for %iv = %c0 to %c4 step %c1 {
+      %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+      ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    }
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Two consecutive += loops to the same reserve. Both should be annotated.
+
+// CHECK-LABEL: func.func @consecutive_loops_same_reserve
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @consecutive_loops_same_reserve(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg2: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg3: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb3 = ttl.bind_cb {cb_index = 3, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb4 = ttl.bind_cb {cb_index = 4, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %c = ttl.attach_cb %arg2, %cb2 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %d = ttl.attach_cb %arg3, %cb3 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb4 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %c, %d : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Loop with both += and plain .store() to same reserve. The loop
+// contains an accumulating store, so it should be annotated.
+
+// CHECK-LABEL: func.func @store_and_acc_in_same_loop
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @store_and_acc_in_same_loop(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// += with non-matmul RHS (passthrough accumulation).
+
+// CHECK-LABEL: func.func @non_matmul_accumulate
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @non_matmul_accumulate(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb1 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    ttl.store %a, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// K=1 (single iteration loop). Should still be annotated.
+
+// CHECK-LABEL: func.func @single_iteration
+// CHECK: scf.for
+// CHECK: } {ttl.l1_acc_loop}
+func.func @single_iteration(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c1 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// Loop without any store should NOT be annotated.
+
+// CHECK-LABEL: func.func @no_store
+// CHECK: scf.for
+// CHECK-NOT: ttl.l1_acc_loop
+// CHECK: }
+func.func @no_store(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
diff --git a/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir
new file mode 100644
index 000000000..494c1ba1d
--- /dev/null
+++ b/test/ttlang/Dialect/TTL/Transforms/annotate_l1_acc_loops_invalid.mlir
@@ -0,0 +1,28 @@
+// += inside a conditional is not supported.
+
+// RUN: ttlang-opt %s --pass-pipeline='builtin.module(func.func(ttl-annotate-l1-acc-loops))' --verify-diagnostics --split-input-file
+
+// += inside scf.if inside scf.for is rejected.
+
+func.func @acc_inside_conditional(
+    %arg0: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<1x1x!ttcore.tile<32x32, bf16>>,
+    %cond: i1) -> tensor<1x1x!ttcore.tile<32x32, bf16>> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], !ttcore.tile<32x32, bf16>, 2>) -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[1, 1], !ttcore.tile<32x32, bf16>, 2> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+  scf.for %iv = %c0 to %c4 step %c1 {
+    scf.if %cond {
+      %mm = ttl.matmul %a, %b : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>> -> tensor<1x1x!ttcore.tile<32x32, bf16>>
+      // expected-error @below {{+= inside a conditional is not supported (#504)}}
+      ttl.store %mm, %reserve {accumulate} : tensor<1x1x!ttcore.tile<32x32, bf16>>, tensor<1x1x!ttcore.tile<32x32, bf16>>
+    }
+  }
+  func.return %reserve : tensor<1x1x!ttcore.tile<32x32, bf16>>
+}
diff --git a/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir b/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir
new file mode 100644
index 000000000..8c4318acd
--- /dev/null
+++ b/test/ttlang/Dialect/TTL/Transforms/subblock_strict_f32_acc_invalid.mlir
@@ -0,0 +1,61 @@
+// Negative tests for ttl-subblock-compute-for-dst with --strict-f32-acc.
+// The check fires when a user-written accumulation loop (+=) with non-f32
+// output requires subblocking, because bf16 L1 intermediates truncate f32
+// DST partial sums per K step.
+
+// RUN: ttlang-opt %s \
+// RUN:   --pass-pipeline='builtin.module(func.func( \
+// RUN:     ttl-annotate-l1-acc-loops, convert-ttl-to-compute, \
+// RUN:     ttl-assign-dst{enable-fpu-binary-ops=0}, \
+// RUN:     ttl-subblock-compute-for-dst{strict-f32-acc=true}))' \
+// RUN:   --verify-diagnostics --split-input-file
+
+// bf16 output 3x3 = 9 tiles exceeds f32 DST capacity (4): should error.
+
+func.func @strict_f32_subblock_bf16_error(
+    %arg0: tensor<3x2x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<2x3x!ttcore.tile<32x32, bf16>>) -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>, fp32_dest_acc_en} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<3x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[3, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<3x2x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x3x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 3], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x3x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c2 step %c1 {
+    // expected-error @below {{subblocking accumulation loop reduces precision}}
+    %mm = ttl.matmul %a, %b : tensor<3x2x!ttcore.tile<32x32, bf16>>, tensor<2x3x!ttcore.tile<32x32, bf16>> -> tensor<3x3x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<3x3x!ttcore.tile<32x32, bf16>>, tensor<3x3x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[3, 3], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<3x3x!ttcore.tile<32x32, bf16>>
+}
+
+// -----
+
+// bf16 output 2x2 = 4 tiles fits in f32 DST (4): no subblocking, no error.
+
+// expected-no-diagnostics
+func.func @strict_f32_fits_in_dst_ok(
+    %arg0: tensor<2x2x!ttcore.tile<32x32, bf16>>,
+    %arg1: tensor<2x2x!ttcore.tile<32x32, bf16>>) -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+    attributes {ttl.kernel_thread = #ttkernel.thread<compute>, fp32_dest_acc_en} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %cb0 = ttl.bind_cb {cb_index = 0, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb1 = ttl.bind_cb {cb_index = 1, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  %cb2 = ttl.bind_cb {cb_index = 2, block_count = 2} : !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  %a = ttl.attach_cb %arg0, %cb0 : (tensor<2x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+  %b = ttl.attach_cb %arg1, %cb1 : (tensor<2x2x!ttcore.tile<32x32, bf16>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, bf16>, 2>) -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+  %reserve = ttl.cb_reserve %cb2 : <[2, 2], !ttcore.tile<32x32, bf16>, 2> -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+  scf.for %k = %c0 to %c2 step %c1 {
+    %mm = ttl.matmul %a, %b : tensor<2x2x!ttcore.tile<32x32, bf16>>, tensor<2x2x!ttcore.tile<32x32, bf16>> -> tensor<2x2x!ttcore.tile<32x32, bf16>>
+    ttl.store %mm, %reserve {accumulate} : tensor<2x2x!ttcore.tile<32x32, bf16>>, tensor<2x2x!ttcore.tile<32x32, bf16>>
+  }
+  ttl.cb_push %cb2 : <[2, 2], !ttcore.tile<32x32, bf16>, 2>
+  func.return %reserve : tensor<2x2x!ttcore.tile<32x32, bf16>>
+}