From 8570f8214800da6cd04e5165d5710436f2e49634 Mon Sep 17 00:00:00 2001
From: Ilia Shutov <Ilia_Shutov@epam.com>
Date: Fri, 27 Feb 2026 11:29:06 +0100
Subject: [PATCH 1/4] [ttl] Add TRID-aware DMA barrier lowering option

Add use-trid-barriers option to convert-ttl-to-ttkernel pass and
ttl-to-ttkernel-pipeline. When enabled, ttl.copy emits
noc_async_{read,write}_set_trid before DMA operations, and ttl.wait
emits noc_async_{read,write}_barrier_with_trid instead of global
barriers.

Default behavior (use-trid-barriers=false) preserves existing global
barrier semantics from main branch.

Key changes:
- TridAllocator class manages 16 TRID slots with overflow handling
- CopyLowering/WaitLowering patterns respect useTridBarriers flag
- TTKernel cleanup patterns conditionally registered for TRID mode
- SCF structural type conversions enabled for transfer handle types

Addresses: #87
---
 .../Transforms/TTKernelCleanupPatterns.h      |   4 +-
 include/ttlang/Dialect/TTL/Passes.td          |  22 +-
 .../Dialect/TTL/Pipelines/TTLPipelines.h      |   4 +
 .../Transforms/TTKernelCleanupPatterns.cpp    |  47 +++-
 lib/Dialect/TTL/Pipelines/TTLPipelines.cpp    |   6 +-
 lib/Dialect/TTL/Transforms/CMakeLists.txt     |   1 +
 .../TTL/Transforms/ConvertTTLToTTKernel.cpp   | 224 +++++++++++++++---
 7 files changed, 266 insertions(+), 42 deletions(-)

diff --git a/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h b/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h
index 59a36cf35..be6873142 100644
--- a/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h
+++ b/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h
@@ -12,7 +12,9 @@ namespace mlir::tt::ttkernel {
 /// Populate cleanup patterns for TTKernel ops. These patterns optimize
 /// TTKernel code by removing redundant operations (e.g., deduplicating
 /// consecutive barriers of the same type).
-void populateTTKernelCleanupPatterns(RewritePatternSet &patterns);
+/// When useTridBarriers is true, also adds TRID-barrier deduplication patterns.
+void populateTTKernelCleanupPatterns(RewritePatternSet &patterns,
+                                     bool useTridBarriers = false);
 
 } // namespace mlir::tt::ttkernel
 
diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td
index 629e4c940..0d375e476 100644
--- a/include/ttlang/Dialect/TTL/Passes.td
+++ b/include/ttlang/Dialect/TTL/Passes.td
@@ -5,15 +5,31 @@ include "mlir/Pass/PassBase.td"
 
 def TTLConvertTTLToTTKernel
     : Pass<"convert-ttl-to-ttkernel", "::mlir::ModuleOp"> {
-  let summary = "Lower TTL DMA ops to TTKernel using global barriers (temporary)";
+  let summary = "Lower TTL DMA ops to TTKernel noc ops";
   let description = [{
-    Converts TTL DMA ops to TTKernel noc ops. Uses global barriers until TRID
-    barriers are available. Covers bind_cb, copy, wait MVP path.
+    Converts TTL DMA ops to TTKernel noc ops. Covers bind_cb, copy, wait MVP
+    path.
+
+    Two lowering modes are supported:
+    - Default: global barriers (noc_async_{read,write}_barrier).
+    - Optional: TRID-aware barriers (noc_async_*_set_trid +
+      noc_async_*_barrier_with_trid).
+
+    TODO(ttl): Profile both modes on representative benchmarks and consider
+    changing the default.
 
     TODO(ttl): Refine lowering to emit real CB handles and proper NOC addresses.
     Issue: #77 (umbrella issue with subtasks #78-#89).
   }];
 
+  let options = [
+    Option<"useTridBarriers", "use-trid-barriers", "bool", "false",
+           "Use TRID-aware DMA waits (barrier_with_trid) instead of global barriers. "
+           "TRID must be unique per outstanding copy; ordering of TRID values is not "
+           "semantically significant. Generated TRIDs may be nondeterministic when "
+           "patterns are applied in parallel.">,
+  ];
+
   let dependentDialects = [
     "::mlir::arith::ArithDialect",
     "::mlir::tt::ttkernel::TTKernelDialect"
diff --git a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
index 6c1bd61e7..1c1a090a4 100644
--- a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
+++ b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h
@@ -18,6 +18,10 @@ struct TTLToTTKernelPipelineOptions
   Option<bool> lowerToEmitC{*this, "lower-to-emitc",
                             llvm::cl::desc("Lower TTKernel to EmitC."),
                             llvm::cl::init(false)};
+  Option<bool> useTridBarriers{
+      *this, "use-trid-barriers",
+      llvm::cl::desc("Use TRID-aware DMA waits (barrier_with_trid)."),
+      llvm::cl::init(false)};
 };
 
 void createTTLToTTKernelPipeline(mlir::OpPassManager &pm,
diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp
index 295955039..d696f9cda 100644
--- a/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp
+++ b/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp
@@ -7,6 +7,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Support/LogicalResult.h"
 #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h"
+#include "llvm/ADT/STLExtras.h"
 
 namespace mlir::tt::ttkernel {
 
@@ -31,13 +32,57 @@ struct DeduplicateConsecutiveBarriers : OpRewritePattern<BarrierOp> {
   }
 };
 
+/// Erases a TRID barrier that directly follows another barrier of the same op
+/// type carrying identical operands (the same SSA values, in the same order).
+/// Barriers whose operands differ are kept: unlike global barriers, a barrier
+/// on one TRID does not make a barrier on a different TRID redundant.
+template <typename BarrierWithTridOp>
+struct DeduplicateConsecutiveTridBarriers
+    : OpRewritePattern<BarrierWithTridOp> {
+  using OpRewritePattern<BarrierWithTridOp>::OpRewritePattern;
+
+  /// Returns success after erasing `op` when it duplicates its immediate
+  /// predecessor; returns failure (leaving the IR untouched) otherwise.
+  LogicalResult matchAndRewrite(BarrierWithTridOp op,
+                                PatternRewriter &rewriter) const override {
+    // Only the operation immediately before `op` in its block is inspected.
+    // getPrevNode() is null when `op` is the first operation of the block.
+    Operation *previous = op->getPrevNode();
+    if (!previous || !isa<BarrierWithTridOp>(previous)) {
+      return failure();
+    }
+
+    // Element-wise Value comparison; ranges of different length compare
+    // unequal, so no separate operand-count check is needed.
+    const bool sameOperands =
+        llvm::equal(op->getOperands(), previous->getOperands());
+    if (!sameOperands) {
+      return failure();
+    }
+
+    // `op` adds nothing beyond the preceding identical barrier; drop it and
+    // keep the earlier one.
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 } // namespace
 
-void populateTTKernelCleanupPatterns(RewritePatternSet &patterns) {
+void populateTTKernelCleanupPatterns(RewritePatternSet &patterns,
+                                     bool useTridBarriers) {
   patterns.add<DeduplicateConsecutiveBarriers<NocAsyncReadBarrierOp>>(
       patterns.getContext());
   patterns.add<DeduplicateConsecutiveBarriers<NocAsyncWriteBarrierOp>>(
       patterns.getContext());
+  if (useTridBarriers) {
+    patterns
+        .add<DeduplicateConsecutiveTridBarriers<NocAsyncReadBarrierWithTridOp>>(
+            patterns.getContext());
+    patterns.add<
+        DeduplicateConsecutiveTridBarriers<NocAsyncWriteBarrierWithTridOp>>(
+        patterns.getContext());
+  }
 }
 
 } // namespace mlir::tt::ttkernel
diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
index 9a4868bcc..2d8eabcb4 100644
--- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
+++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp
@@ -29,7 +29,11 @@ void createTTLToTTKernelPipeline(OpPassManager &pm,
   pm.addPass(createTTLInsertTileRegsSync());
   pm.addPass(createTTLLowerToLoops());
   pm.addPass(createTTLAnnotateCBAssociations());
-  pm.addPass(createTTLConvertTTLToTTKernel());
+  {
+    TTLConvertTTLToTTKernelOptions passOptions;
+    passOptions.useTridBarriers = options.useTridBarriers;
+    pm.addPass(createTTLConvertTTLToTTKernel(passOptions));
+  }
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
   if (options.lowerToEmitC) {
diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt
index 7cb22ffbe..85f9d313d 100644
--- a/lib/Dialect/TTL/Transforms/CMakeLists.txt
+++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt
@@ -21,6 +21,7 @@ add_mlir_dialect_library(TTLangTTLTransforms
   MLIRLinalgDialect
   MLIRMathDialect
   MLIRSCFDialect
+  MLIRSCFTransforms
   MLIRPass
   MLIRTensorDialect
   MLIRTTKernelDialect
diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp
index 443e3014b..3daf901ff 100644
--- a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp
+++ b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/BuiltinDialect.h"
@@ -61,18 +62,31 @@ class TTLToTTKernelTypeConverter : public TypeConverter {
                               t.getElementType());
     });
     // Tensor -> TensorAccessor for TTKernel when TTNN layout is present.
-    addConversion([](RankedTensorType t) -> Type {
+    addConversion([this](RankedTensorType t) -> Type {
       if (t.getEncoding() &&
           mlir::isa<tt::ttnn::TTNNLayoutAttr>(t.getEncoding())) {
         return ttk::TensorAccessorType::get(t.getContext());
       }
+      // Otherwise, preserve tensor shape/encoding but convert element type.
+      // This is required for cases like tensor<?x!ttl.transfer_handle<read>>
+      // becoming tensor<?xi32> once transfer handles are type-converted.
+      auto convertedElemTy = this->convertType(t.getElementType());
+      if (!convertedElemTy) {
+        return t;
+      }
+      if (convertedElemTy == t.getElementType()) {
+        return t;
+      }
+      return mlir::cast<RankedTensorType>(t.clone(convertedElemTy));
+    });
+    // Catch-all: transfer handles -> i32 TRID, any other type unchanged.
+    // NOTE(review): MLIR tries the newest conversion first; confirm ordering.
+    addConversion([](Type t) -> Type {
+      if (llvm::isa<TransferHandleType>(t)) {
+        return IntegerType::get(t.getContext(), 32);
+      }
       return t;
     });
-    // Preserve transfer handle types so ttl.wait can inspect transfer
-    // direction. TRID-aware lowering will be added later.
-    addConversion([](TransferHandleType t) -> Type { return t; });
-    // Identity fallback must be last.
-    addConversion([](Type t) { return t; });
 
     auto castMaterialization = [](OpBuilder &builder, Type resultType,
                                   ValueRange inputs, Location loc) -> Value {
@@ -393,8 +407,8 @@ static CopyOperandKind classifyOperand(Value v) {
   return CopyOperandKind::Unknown;
 }
 
-static Value makeZeroI32(Location loc, ConversionPatternRewriter &rewriter) {
-  return rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
+static Value makeZeroI8(Location loc, ConversionPatternRewriter &rewriter) {
+  return rewriter.create<arith::ConstantIntOp>(loc, 0, 8);
 }
 
 static std::optional<TransferKind> getTransferKindFromHandleType(Type t) {
@@ -539,14 +553,58 @@ static Value linearizeTileIndex(OpBuilder &builder, Location loc, Value row,
   return builder.create<arith::AddIOp>(loc, rowOffset, col);
 }
 
+/// Round-robin allocator for the TRIDs that tag DMA copies. TRIDs wrap at 16
+/// (4-bit hardware limit). Every slot records whether it has been handed out
+/// without being released, plus the transfer direction it was handed out for.
+/// This bookkeeping must stay independent of greedy pattern rewrite visitation
+/// order; therefore, it is only mutated during copy lowering. If a slot that
+/// is still marked in use comes up again, allocateTrid reports the direction
+/// of the earlier copy so the caller can emit a barrier before reassigning.
+/// TODO: Profile both modes on representative benchmarks; revisit the default.
+/// NOTE(review): no releaseTrid call is visible in this patch -- confirm.
+class TridAllocator {
+public:
+  static constexpr uint32_t kNumTrids = 16;
+
+  struct AllocResult {
+    uint32_t trid;
+    /// Engaged only when this TRID was still marked in use by an earlier copy;
+    /// the caller must emit a barrier_with_trid for that direction first.
+    std::optional<TransferKind> evictDirection;
+  };
+
+  /// Hands out the next TRID in cyclic order and marks it in use.
+  AllocResult allocateTrid(TransferKind direction) {
+    const uint32_t slot = cursor++ % kNumTrids;
+    std::optional<TransferKind> evicted;
+    if (inUse[slot]) {
+      evicted = slotDirection[slot];
+    }
+    inUse[slot] = true;
+    slotDirection[slot] = direction;
+    return AllocResult{slot, evicted};
+  }
+
+  void releaseTrid(uint32_t trid) { inUse[trid % kNumTrids] = false; }
+
+private:
+  uint32_t cursor = 0;
+  bool inUse[kNumTrids] = {};
+  TransferKind slotDirection[kNumTrids] = {};
+};
+
 /// Direction of a tensor<->CB tile copy for NOC operations.
 enum class NocCopyDirection { Read, Write };
 
 /// Lower a tensor_slice<->CB copy in the given direction.
 /// Read: tensor_slice -> CB (noc_async_read_tile, get_write_ptr)
 /// Write: CB -> tensor_slice (noc_async_write_tile, get_read_ptr)
+///
+/// When useTridBarriers is true, emits noc_async_{read,write}_set_trid before
+/// the tile loop to tag NOC operations with the given TRID.
 static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp,
                                        Value cb, NocCopyDirection direction,
+                                       Value tridVal, bool useTridBarriers,
                                        ConversionPatternRewriter &rewriter,
                                        const TypeConverter &typeConverter) {
   auto loc = op.getLoc();
@@ -605,6 +663,17 @@ static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp,
       rewriter.create<arith::ConstantIndexOp>(loc, *pageSizeBytes);
   auto i32Ty = rewriter.getI32Type();
 
+  // Tag subsequent NOC operations with this copy's TRID.
+  // Currently fixed to NOC 0. TODO(ttl): Generalize NOC selection (issue #77).
+  if (useTridBarriers) {
+    Value nocVal = makeZeroI8(loc, rewriter);
+    if (isRead) {
+      rewriter.create<ttk::NocAsyncReadSetTridOp>(loc, tridVal, nocVal);
+    } else {
+      rewriter.create<ttk::NocAsyncWriteSetTridOp>(loc, tridVal, nocVal);
+    }
+  }
+
   emitTileLoop(
       rewriter, loc, cbRows, cbCols,
       [&, tensorTilesX, cbCols](OpBuilder &b, Location bodyLoc, Value loopRow,
@@ -640,7 +709,7 @@ static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp,
         }
       });
 
-  rewriter.replaceOp(op, makeZeroI32(loc, rewriter));
+  rewriter.replaceOp(op, tridVal);
   return success();
 }
 
@@ -662,7 +731,10 @@ struct TensorSliceLowering : OpConversionPattern<TensorSliceOp> {
 };
 
 struct CopyLowering : OpConversionPattern<CopyOp> {
-  using OpConversionPattern::OpConversionPattern;
+  CopyLowering(const TypeConverter &typeConverter, MLIRContext *ctx,
+               TridAllocator *tridAllocator, bool useTridBarriers)
+      : OpConversionPattern(typeConverter, ctx), tridAllocator(tridAllocator),
+        useTridBarriers(useTridBarriers) {}
 
   LogicalResult
   matchAndRewrite(CopyOp op, OpAdaptor adaptor,
@@ -690,6 +762,38 @@ struct CopyLowering : OpConversionPattern<CopyOp> {
       });
     }
 
+    if (!tridAllocator) {
+      return rewriter.notifyMatchFailure(op, "missing TRID allocator");
+    }
+
+    TransferKind direction =
+        (srcIsSlice && dstIsCB) ? TransferKind::read : TransferKind::write;
+
+    Value tridVal;
+    if (useTridBarriers) {
+      auto allocResult = tridAllocator->allocateTrid(direction);
+      // If this TRID was still outstanding, emit a barrier to drain the old
+      // transfer before reusing the TRID.
+      if (allocResult.evictDirection) {
+        Value evictTrid = rewriter.create<arith::ConstantIntOp>(
+            op.getLoc(), allocResult.trid, 32);
+        Value nocVal = makeZeroI8(op.getLoc(), rewriter);
+        if (*allocResult.evictDirection == TransferKind::read) {
+          rewriter.create<ttk::NocAsyncReadBarrierWithTridOp>(
+              op.getLoc(), evictTrid, nocVal);
+        } else {
+          rewriter.create<ttk::NocAsyncWriteBarrierWithTridOp>(
+              op.getLoc(), evictTrid, nocVal);
+        }
+      }
+      tridVal = rewriter.create<arith::ConstantIntOp>(op.getLoc(),
+                                                      allocResult.trid, 32);
+    } else {
+      // Global-barrier mode: the allocation result is unused; handle is 0.
+      tridAllocator->allocateTrid(direction);
+      tridVal = rewriter.create<arith::ConstantIntOp>(op.getLoc(), 0, 32);
+    }
+
     // TensorSlice -> CB: read tiles from tensor into circular buffer.
     if (srcIsSlice && dstIsCB) {
       auto sliceOp = src.getDefiningOp<TensorSliceOp>();
@@ -698,8 +802,8 @@ struct CopyLowering : OpConversionPattern<CopyOp> {
             op, "tensor_slice source must come from ttl.tensor_slice op");
       }
       return lowerTensorCBCopy(op, sliceOp, adaptor.getDst(),
-                               NocCopyDirection::Read, rewriter,
-                               *typeConverter);
+                               NocCopyDirection::Read, tridVal, useTridBarriers,
+                               rewriter, *typeConverter);
     }
 
     // CB -> TensorSlice: write tiles from circular buffer to tensor.
@@ -709,41 +813,71 @@ struct CopyLowering : OpConversionPattern<CopyOp> {
           op, "tensor_slice destination must come from ttl.tensor_slice op");
     }
     return lowerTensorCBCopy(op, sliceOp, adaptor.getSrc(),
-                             NocCopyDirection::Write, rewriter, *typeConverter);
+                             NocCopyDirection::Write, tridVal, useTridBarriers,
+                             rewriter, *typeConverter);
   }
+
+private:
+  TridAllocator *tridAllocator = nullptr;
+  bool useTridBarriers = false;
 };
 
 struct WaitLowering : OpConversionPattern<WaitOp> {
-  using OpConversionPattern::OpConversionPattern;
+  WaitLowering(const TypeConverter &typeConverter, MLIRContext *ctx,
+               bool useTridBarriers)
+      : OpConversionPattern(typeConverter, ctx),
+        useTridBarriers(useTridBarriers) {}
 
   LogicalResult
   matchAndRewrite(WaitOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    // TODO(ttl): Lower ttl.wait to TRID-specific barriers keyed by the transfer
-    // handle (read vs write barrier based on transfer direction). Issue: #87.
+    // Emit a barrier for the handle: per-TRID if useTridBarriers, else global.
     //
-    // MVP behavior: require a direction-typed handle and emit the
-    // corresponding global barrier. Untyped handles are rejected by the
-    // verifier, but we also fail the rewrite defensively.
-    auto kind = getTransferKindFromHandleType(adaptor.getXf().getType());
+    // NOTE: After type conversion, the handle value is an i32 TRID. Transfer
+    // direction is recovered from the original operand type.
+    auto kind = getTransferKindFromHandleType(op.getXf().getType());
     if (!kind) {
       return rewriter.notifyMatchFailure(
           op, "requires direction-typed !ttl.transfer_handle<read|write>");
     }
-    if (*kind == TransferKind::read) {
-      rewriter.create<ttk::NocAsyncReadBarrierOp>(op.getLoc());
-    } else if (*kind == TransferKind::write) {
-      rewriter.create<ttk::NocAsyncWriteBarrierOp>(op.getLoc());
+    if (useTridBarriers) {
+      Value tridVal = adaptor.getXf(); // i32 (after type conversion)
+      if (!tridVal.getType().isInteger(32)) {
+        return rewriter.notifyMatchFailure(
+            op,
+            "transfer handle must be type-converted to i32 before ttl.wait");
+      }
+      // Currently fixed to NOC 0. TODO(ttl): Generalize NOC selection (issue
+      // #77).
+      Value nocVal = makeZeroI8(op.getLoc(), rewriter);
+      if (*kind == TransferKind::read) {
+        rewriter.create<ttk::NocAsyncReadBarrierWithTridOp>(op.getLoc(),
+                                                            tridVal, nocVal);
+      } else if (*kind == TransferKind::write) {
+        rewriter.create<ttk::NocAsyncWriteBarrierWithTridOp>(op.getLoc(),
+                                                             tridVal, nocVal);
+      } else {
+        return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
+          diag << "unsupported TransferKind for ttl.wait lowering";
+        });
+      }
     } else {
-      // Future-proofing: TransferKind is currently {read, write}, but fail
-      // explicitly if it ever expands without updating the lowering.
-      return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
-        diag << "unsupported TransferKind for ttl.wait lowering";
-      });
+      if (*kind == TransferKind::read) {
+        rewriter.create<ttk::NocAsyncReadBarrierOp>(op.getLoc());
+      } else if (*kind == TransferKind::write) {
+        rewriter.create<ttk::NocAsyncWriteBarrierOp>(op.getLoc());
+      } else {
+        return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) {
+          diag << "unsupported TransferKind for ttl.wait lowering";
+        });
+      }
     }
     rewriter.eraseOp(op);
     return success();
   }
+
+private:
+  bool useTridBarriers = false;
 };
 
 //===----------------------------------------------------------------------===//
@@ -841,7 +975,7 @@ struct FuncKernelFinalize : OpRewritePattern<FuncOp> {
 static LogicalResult
 lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx,
                       TTLToTTKernelTypeConverter &typeConverter,
-                      StringRef passName) {
+                      bool useTridBarriers, StringRef passName) {
   ConversionTarget target(ctx);
   target.addIllegalDialect<tt::ttl::TTLDialect>();
   target.addLegalDialect<arith::ArithDialect, BuiltinDialect, scf::SCFDialect,
@@ -881,10 +1015,24 @@ lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx,
   });
 
   RewritePatternSet patterns(&ctx);
-  patterns.add<BindCBLowering, TensorSliceLowering, CopyLowering, WaitLowering,
-               CBReserveLowering, CBPushLowering, CBWaitLowering, CBPopLowering,
-               TileStoreLowering, StoreLowering, CoreXLowering, CoreYLowering>(
-      typeConverter, &ctx);
+  TridAllocator tridAllocator;
+
+  // Patterns with standard (typeConverter, ctx) signature.
+  patterns.add<BindCBLowering, TensorSliceLowering, CBReserveLowering,
+               CBPushLowering, CBWaitLowering, CBPopLowering, TileStoreLowering,
+               StoreLowering, CoreXLowering, CoreYLowering>(typeConverter,
+                                                            &ctx);
+
+  // Patterns with TRID-specific arguments.
+  patterns.add<CopyLowering>(typeConverter, &ctx, &tridAllocator,
+                             useTridBarriers);
+  patterns.add<WaitLowering>(typeConverter, &ctx, useTridBarriers);
+
+  // Convert scf.for/scf.if/etc region signatures when result/iter_arg types
+  // change due to the type converter.
+  mlir::scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter,
+                                                             patterns, target);
+
   populateFunctionOpInterfaceTypeConversionPattern(
       func::FuncOp::getOperationName(), patterns, typeConverter);
 
@@ -898,7 +1046,7 @@ lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx,
 
   // Apply post-conversion cleanup patterns (e.g., barrier deduplication).
   RewritePatternSet cleanupPatterns(&ctx);
-  ttkernel::populateTTKernelCleanupPatterns(cleanupPatterns);
+  ttkernel::populateTTKernelCleanupPatterns(cleanupPatterns, useTridBarriers);
   if (failed(applyPatternsGreedily(mod, std::move(cleanupPatterns)))) {
     return failure();
   }
@@ -1046,13 +1194,17 @@ static void cleanupComputeKernels(ModuleOp mod, MLIRContext &ctx) {
 
 struct TTLConvertTTLToTTKernelPass
     : impl::TTLConvertTTLToTTKernelBase<TTLConvertTTLToTTKernelPass> {
+  using Base = impl::TTLConvertTTLToTTKernelBase<TTLConvertTTLToTTKernelPass>;
+  using Base::Base;
+
   void runOnOperation() override {
     MLIRContext &ctx = getContext();
     ModuleOp mod = getOperation();
     TTLToTTKernelTypeConverter typeConverter;
 
     // Phase 1: Lower TTL ops to TTKernel (bind_cb, copy, wait, cb ops, store)
-    if (failed(lowerTTLOpsToTTKernel(mod, ctx, typeConverter, getName()))) {
+    if (failed(lowerTTLOpsToTTKernel(mod, ctx, typeConverter, useTridBarriers,
+                                     getName()))) {
       signalPassFailure();
       return;
     }

From 1f494499a320059e2ebcc04021868ba3d4259052 Mon Sep 17 00:00:00 2001
From: Ilia Shutov <Ilia_Shutov@epam.com>
Date: Fri, 27 Feb 2026 11:29:18 +0100
Subject: [PATCH 2/4] [test] Add conversion lit tests for TRID and global
 barrier modes

- trid_barriers.mlir: Tests TRID-aware lowering with use-trid-barriers=true
  - Verifies noc_async_{read,write}_set_trid emission
  - Verifies noc_async_{read,write}_barrier_with_trid emission
  - Tests TRID overflow handling (17 copies without waits)

- dma_global_barriers.mlir: Tests default global barrier mode
  - Verifies noc_async_{read,write}_barrier emission (no TRID)
  - Ensures backward compatibility with main branch behavior

- Update existing tests to use explicit use-trid-barriers=true where
  they expect TRID-specific output
---
 .../TTLToTTKernel/dma_global_barriers.mlir    | 117 ++++++++++++++++
 .../TTLToTTKernel/dma_single_core.mlir        |  81 ++++++-----
 .../TTLToTTKernel/loopback_dram_copy.mlir     |  12 +-
 .../TTLToTTKernel/trid_barriers.mlir          | 130 ++++++++++++++++++
 4 files changed, 301 insertions(+), 39 deletions(-)
 create mode 100644 test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir
 create mode 100644 test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir

diff --git a/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir b/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir
new file mode 100644
index 000000000..764fff63f
--- /dev/null
+++ b/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir
@@ -0,0 +1,117 @@
+// RUN: ttlang-opt --convert-ttl-to-ttkernel --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=GLOBAL --implicit-check-not=set_trid --implicit-check-not=barrier_with_trid
+// Summary: Default (non-TRID) path emits global NOC barriers; TRID ops are rejected everywhere via --implicit-check-not.
+// Companion to dma_single_core.mlir which tests use-trid-barriers=1.
+
+#dram = #ttnn.buffer_type<dram>
+#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
+
+// Single-tile read: default mode emits noc_async_read_barrier (no TRID ops).
+// GLOBAL-LABEL: func.func @global_single_tile_read
+// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
+// GLOBAL: ttkernel.noc_async_read_barrier() : () -> ()
+// GLOBAL-NOT: ttkernel.noc_async_read_set_trid
+// GLOBAL-NOT: ttkernel.noc_async_read_barrier_with_trid
+// GLOBAL-NOT: ttkernel.noc_async_write_barrier
+module {
+  func.func @global_single_tile_read(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
+    %c0 = arith.constant 0 : index
+    %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf = ttl.copy %slice, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    ttl.wait %xf : !ttl.transfer_handle<read>
+    func.return
+  }
+}
+
+// -----
+
+#dram = #ttnn.buffer_type<dram>
+#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
+
+// Single-tile write: default mode emits noc_async_write_barrier (no TRID ops).
+// GLOBAL-LABEL: func.func @global_single_tile_write
+// GLOBAL: ttkernel.noc_async_write_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
+// GLOBAL: ttkernel.noc_async_write_barrier() : () -> ()
+// GLOBAL-NOT: ttkernel.noc_async_write_set_trid
+// GLOBAL-NOT: ttkernel.noc_async_write_barrier_with_trid
+// GLOBAL-NOT: ttkernel.noc_async_read_barrier
+module {
+  func.func @global_single_tile_write(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
+    %c0 = arith.constant 0 : index
+    %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf = ttl.copy %cb, %slice : (!ttl.cb<[1, 1], f32, 2>, tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) -> !ttl.transfer_handle<write>
+    ttl.wait %xf : !ttl.transfer_handle<write>
+    func.return
+  }
+}
+
+// -----
+
+#dram = #ttnn.buffer_type<dram>
+#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
+
+// Batched reads: consecutive global barriers are deduplicated to a single barrier.
+// GLOBAL-LABEL: func.func @global_batched_reads
+// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
+// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
+// GLOBAL: ttkernel.noc_async_read_barrier() : () -> ()
+// GLOBAL-NOT: ttkernel.noc_async_read_barrier
+// GLOBAL-NOT: ttkernel.noc_async_read_set_trid
+// GLOBAL-NOT: ttkernel.noc_async_read_barrier_with_trid
+module {
+  func.func @global_batched_reads(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread<noc>} {
+    %c0 = arith.constant 0 : index
+    %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %slice0 = ttl.tensor_slice %t0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %slice1 = ttl.tensor_slice %t1[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf0 = ttl.copy %slice0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %xf1 = ttl.copy %slice1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    ttl.wait %xf0 : !ttl.transfer_handle<read>
+    ttl.wait %xf1 : !ttl.transfer_handle<read>
+    func.return
+  }
+}
+
+// -----
+
+#dram = #ttnn.buffer_type<dram>
+#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
+
+// Loopback copy: read then write in a loop uses global barriers for both.
+// GLOBAL-LABEL: func.func @global_loopback
+// GLOBAL: scf.for
+// GLOBAL:   ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
+// GLOBAL:   ttkernel.noc_async_read_barrier() : () -> ()
+// GLOBAL:   ttkernel.noc_async_write_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
+// GLOBAL:   ttkernel.noc_async_write_barrier() : () -> ()
+// GLOBAL-NOT: noc_async_read_set_trid
+// GLOBAL-NOT: noc_async_write_set_trid
+// GLOBAL-NOT: barrier_with_trid
+module {
+  func.func @global_loopback(%src: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>,
+                             %dst: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>)
+      attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread<noc>} {
+    %c0 = arith.constant 0 : index
+    %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+
+    %src_slice = ttl.tensor_slice %src[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %dst_slice = ttl.tensor_slice %dst[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    scf.for %i = %c0 to %c4 step %c1 {
+      %xf_r = ttl.copy %src_slice, %cb
+        : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>)
+          -> !ttl.transfer_handle<read>
+      ttl.wait %xf_r : !ttl.transfer_handle<read>
+
+      %xf_w = ttl.copy %cb, %dst_slice
+        : (!ttl.cb<[1, 1], f32, 2>, tensor<1x1x!ttcore.tile<32x32, f32>, #layout>)
+          -> !ttl.transfer_handle<write>
+      ttl.wait %xf_w : !ttl.transfer_handle<write>
+    }
+
+    func.return
+  }
+}
diff --git a/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir b/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir
index 98293c787..54837c1bc 100644
--- a/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir
+++ b/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --convert-ttl-to-ttkernel --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL
+// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL
 // Summary: MVP DMA lowering tests for tensor<->CB copies (no pipes).
 
 #dram = #ttnn.buffer_type<dram>
@@ -12,9 +12,11 @@
 // TTKERNEL: %[[SRC_ARGS:.*]] = ttkernel.TensorAccessorArgs({{.*}})
 // TTKERNEL: %[[SRC_ACC:.*]] = ttkernel.TensorAccessor(%[[SRC_ARGS]], %[[BANK_BASE]], {{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor
 // TTKERNEL: %[[CB_PTR:.*]] = ttkernel.get_write_ptr(%[[CB]]) : (!ttkernel.cb<2, f32>) -> i32
+// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> ()
 // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, %[[SRC_ACC]], %[[CB_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 module {
   func.func @dma_single_tile_single_copy(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -38,9 +40,11 @@ module {
 // TTKERNEL: %[[DST_ARGS:.*]] = ttkernel.TensorAccessorArgs({{.*}})
 // TTKERNEL: %[[DST_ACC:.*]] = ttkernel.TensorAccessor(%[[DST_ARGS]], %[[BANK_BASE]], {{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor
 // TTKERNEL: %[[CB_PTR:.*]] = ttkernel.get_read_ptr(%[[CB]]) : (!ttkernel.cb<2, f32>) -> i32
+// TTKERNEL: ttkernel.noc_async_write_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> ()
 // TTKERNEL: ttkernel.noc_async_write_tile({{.*}}, %[[DST_ACC]], %[[CB_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_read_barrier
+// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
 module {
   func.func @cb_to_tensor(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -71,10 +75,12 @@ module {
 // TTKERNEL: ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor
 // TTKERNEL: ttkernel.get_write_ptr({{.*}}) : (!ttkernel.cb<2, f32>) -> i32
 // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// Consecutive barriers are deduplicated to a single barrier.
-// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_read_barrier
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// Each ttl.wait lowers to a TRID-specific barrier; different TRIDs must not be
+// deduplicated.
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 module {
   func.func @dma_batched(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -103,10 +109,11 @@ module {
 // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
 // TTKERNEL: scf.for {{.*}} {
 // TTKERNEL:   ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL:   ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL:   ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
 // TTKERNEL: }
-// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 module {
   func.func @dma_pipelined_loop(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -137,17 +144,17 @@ module {
 //
 // TTKERNEL-LABEL: func.func @dma_single_tile_two_phase_loops
 // TTKERNEL: %[[HANDLES0:.*]] = tensor.empty() : tensor<4x!ttl.transfer_handle<read>>
-// TTKERNEL: %[[CAST:.*]] = tensor.cast %[[HANDLES0]] : tensor<4x!ttl.transfer_handle<read>> to tensor<?x!ttl.transfer_handle<read>>
-// TTKERNEL: %[[HANDLES:.*]] = scf.for {{.*}} iter_args(%[[H:.*]] = %[[CAST]]) -> (tensor<?x!ttl.transfer_handle<read>>) {
-// TTKERNEL:   ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL:   %[[XF:.*]] = builtin.unrealized_conversion_cast {{.*}} : i32 to !ttl.transfer_handle<read>
-// TTKERNEL:   %[[INS:.*]] = tensor.insert %[[XF]] into %[[H]]{{\[}}{{.*}}{{\]}} : tensor<?x!ttl.transfer_handle<read>>
-// TTKERNEL:   scf.yield %[[INS]] : tensor<?x!ttl.transfer_handle<read>>
+// TTKERNEL: scf.for {{.*}} iter_args(%[[H:.*]] = %[[HANDLES0]]) -> (tensor<4x!ttl.transfer_handle<read>>) {
+// TTKERNEL:   %[[XF_HANDLE:.*]] = builtin.unrealized_conversion_cast {{.*}} : i32 to !ttl.transfer_handle<read>
+// TTKERNEL:   %[[INS:.*]] = tensor.insert %[[XF_HANDLE]] into %[[H]]{{\[}}{{.*}}{{\]}} : tensor<4x!ttl.transfer_handle<read>>
+// TTKERNEL:   scf.yield %[[INS]] : tensor<4x!ttl.transfer_handle<read>>
 // TTKERNEL: }
 // TTKERNEL: scf.for {{.*}} {
-// TTKERNEL:   ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL:   %[[XF_I32:.*]] = builtin.unrealized_conversion_cast {{.*}} : !ttl.transfer_handle<read> to i32
+// TTKERNEL:   ttkernel.noc_async_read_barrier_with_trid(%[[XF_I32]], {{.*}}) : (i32, i8) -> ()
 // TTKERNEL: }
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 module {
   func.func @dma_single_tile_two_phase_loops(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -177,12 +184,13 @@ module {
 #layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
 
 // Corner case: waiting twice on the same transfer handle is allowed, but
-// consecutive barriers are deduplicated to a single barrier.
+// consecutive barriers on the same TRID are deduplicated to a single barrier.
 //
 // TTKERNEL-LABEL: func.func @dma_single_tile_double_wait
-// TTKERNEL:      ttkernel.noc_async_read_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_read_barrier
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// TTKERNEL:      ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier_with_trid
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 module {
   func.func @dma_single_tile_double_wait(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -205,8 +213,9 @@ module {
 //
 // TTKERNEL-LABEL: func.func @dma_single_tile_single_element_container
 // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 // TTKERNEL: return
 module {
   func.func @dma_single_tile_single_element_container(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
@@ -260,8 +269,9 @@ module {
 // TTKERNEL:     %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32
 // TTKERNEL:     %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32
 // TTKERNEL:     ttkernel.noc_async_read_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 module {
   func.func @dma_multi_tile_read(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout_tile>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -305,8 +315,9 @@ module {
 // TTKERNEL:     %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32
 // TTKERNEL:     %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32
 // TTKERNEL:     ttkernel.noc_async_write_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_read_barrier
+// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
 module {
   func.func @dma_multi_tile_write(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout_tile>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -350,8 +361,9 @@ module {
 // TTKERNEL:     %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32
 // TTKERNEL:     %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32
 // TTKERNEL:     ttkernel.noc_async_read_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_write_barrier
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
 module {
   func.func @dma_multi_tile_read_cb_shape(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
@@ -396,8 +408,9 @@ module {
 // TTKERNEL:     %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32
 // TTKERNEL:     %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32
 // TTKERNEL:     ttkernel.noc_async_write_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> ()
-// TTKERNEL-NOT: ttkernel.noc_async_read_barrier
+// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
 module {
   func.func @dma_multi_tile_write_rect(%arg0: tensor<3x2x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
     %c0 = arith.constant 0 : index
diff --git a/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir b/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir
index 6e8ece496..11bfaa46a 100644
--- a/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir
+++ b/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir
@@ -1,6 +1,6 @@
-// RUN: ttlang-opt --convert-ttl-to-ttkernel --canonicalize --cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL
-// Summary: Lower a loopback DRAM copy (read → wait → write → wait in a loop)
-// to TTKernel using global NOC barriers (TRID ops not yet available).
+// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize --cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL
+// Summary: Lower a loopback DRAM copy (read -> wait -> write -> wait in a loop)
+// to TTKernel using TRID-specific NOC barriers.
 
 #dram = #ttnn.buffer_type<dram>
 #layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>,
@@ -13,14 +13,16 @@
 // TTKERNEL:   ttkernel.get_common_arg_val({{.*}}) : (index) -> i32
 // TTKERNEL:   %[[ACC_R:.*]] = ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor
 // TTKERNEL:   %[[CB_W_PTR:.*]] = ttkernel.get_write_ptr({{.*}}) : (!ttkernel.cb<2, f32>) -> i32
+// TTKERNEL:   ttkernel.noc_async_read_set_trid({{.*}}, {{.*}}) : (i32, i8) -> ()
 // TTKERNEL:   ttkernel.noc_async_read_tile({{.*}}, %[[ACC_R]], %[[CB_W_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL:   ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL:   ttkernel.noc_async_read_barrier_with_trid({{.*}}, {{.*}}) : (i32, i8) -> ()
 // Write: runtime arg for dst tensor, accessor, read ptr for CB
 // TTKERNEL:   ttkernel.get_common_arg_val({{.*}}) : (index) -> i32
 // TTKERNEL:   %[[ACC_W:.*]] = ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor
 // TTKERNEL:   %[[CB_R_PTR:.*]] = ttkernel.get_read_ptr({{.*}}) : (!ttkernel.cb<2, f32>) -> i32
+// TTKERNEL:   ttkernel.noc_async_write_set_trid({{.*}}, {{.*}}) : (i32, i8) -> ()
 // TTKERNEL:   ttkernel.noc_async_write_tile({{.*}}, %[[ACC_W]], %[[CB_R_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> ()
-// TTKERNEL:   ttkernel.noc_async_write_barrier() : () -> ()
+// TTKERNEL:   ttkernel.noc_async_write_barrier_with_trid({{.*}}, {{.*}}) : (i32, i8) -> ()
 
 module {
   func.func @loopback_dram_copy(%src: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>,
diff --git a/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir b/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir
new file mode 100644
index 000000000..bb110a44c
--- /dev/null
+++ b/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir
@@ -0,0 +1,130 @@
+// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL
+// Summary: Regression tests for TRID-aware ttl.copy/ttl.wait lowering.
+
+#dram = #ttnn.buffer_type<dram>
+#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
+
+// TTKERNEL-LABEL: func.func @trid_single_copy_wait_read
+// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> ()
+// TTKERNEL: ttkernel.noc_async_read_tile(
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: builtin.unrealized_conversion_cast
+module {
+  func.func @trid_single_copy_wait_read(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
+    %c0 = arith.constant 0 : index
+    %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf = ttl.copy %slice, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    ttl.wait %xf : !ttl.transfer_handle<read>  // checked to barrier on the same TRID and NOC operands that the copy's set_trid used
+    func.return
+  }
+}
+
+// -----
+
+#dram = #ttnn.buffer_type<dram>
+#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
+
+// TTKERNEL-LABEL: func.func @trid_two_copies_two_waits_read
+// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID0:.*]], %[[NOC:.*]]) : (i32, i8) -> ()
+// TTKERNEL: ttkernel.noc_async_read_tile(
+// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID1:.*]], %[[NOC]]) : (i32, i8) -> ()
+// TTKERNEL: ttkernel.noc_async_read_tile(
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID0]], %[[NOC]]) : (i32, i8) -> ()
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID1]], %[[NOC]]) : (i32, i8) -> ()
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: builtin.unrealized_conversion_cast
+module {
+  func.func @trid_two_copies_two_waits_read(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread<noc>} {
+    %c0 = arith.constant 0 : index
+    %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %slice0 = ttl.tensor_slice %t0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %slice1 = ttl.tensor_slice %t1[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf0 = ttl.copy %slice0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %xf1 = ttl.copy %slice1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    ttl.wait %xf0 : !ttl.transfer_handle<read>  // checked to barrier on the first copy's TRID
+    ttl.wait %xf1 : !ttl.transfer_handle<read>  // checked to barrier on the second copy's TRID
+    func.return
+  }
+}
+
+// -----
+
+#dram = #ttnn.buffer_type<dram>
+#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>>
+
+// Verify TRID overflow handling: 17 copies without intervening waits exhaust
+// the 16-entry TRID space. The 17th copy (reusing TRID 0) must emit an
+// auto-barrier for TRID 0 before reassigning it.
+//
+// TTKERNEL-LABEL: func.func @trid_overflow_auto_barrier
+// The first 16 copies each get a unique TRID (0..15) with no auto-barrier.
+// TTKERNEL-COUNT-16: ttkernel.noc_async_read_set_trid
+// The 17th copy reuses TRID 0. Because TRID 0 is still outstanding, the pass
+// emits an auto-barrier first.
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid
+// TTKERNEL: ttkernel.noc_async_read_set_trid
+// No global barriers should appear.
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> ()
+// TTKERNEL-NOT: builtin.unrealized_conversion_cast
+module {
+  func.func @trid_overflow_auto_barrier(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} {
+    %c0 = arith.constant 0 : index
+    %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2>
+    %s0 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf0 = ttl.copy %s0, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s1 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf1 = ttl.copy %s1, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s2 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf2 = ttl.copy %s2, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s3 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf3 = ttl.copy %s3, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s4 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf4 = ttl.copy %s4, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s5 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf5 = ttl.copy %s5, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s6 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf6 = ttl.copy %s6, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s7 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf7 = ttl.copy %s7, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s8 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf8 = ttl.copy %s8, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s9 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf9 = ttl.copy %s9, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s10 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf10 = ttl.copy %s10, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s11 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf11 = ttl.copy %s11, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s12 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf12 = ttl.copy %s12, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s13 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf13 = ttl.copy %s13, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s14 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf14 = ttl.copy %s14, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    %s15 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf15 = ttl.copy %s15, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    // 17th copy: no waits have been issued yet, so all 16 TRIDs are still in flight; TRID 0 wraps and an auto-barrier on it is expected here.
+    %s16 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout>
+    %xf16 = ttl.copy %s16, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle<read>
+    ttl.wait %xf0 : !ttl.transfer_handle<read>
+    ttl.wait %xf1 : !ttl.transfer_handle<read>
+    ttl.wait %xf2 : !ttl.transfer_handle<read>
+    ttl.wait %xf3 : !ttl.transfer_handle<read>
+    ttl.wait %xf4 : !ttl.transfer_handle<read>
+    ttl.wait %xf5 : !ttl.transfer_handle<read>
+    ttl.wait %xf6 : !ttl.transfer_handle<read>
+    ttl.wait %xf7 : !ttl.transfer_handle<read>
+    ttl.wait %xf8 : !ttl.transfer_handle<read>
+    ttl.wait %xf9 : !ttl.transfer_handle<read>
+    ttl.wait %xf10 : !ttl.transfer_handle<read>
+    ttl.wait %xf11 : !ttl.transfer_handle<read>
+    ttl.wait %xf12 : !ttl.transfer_handle<read>
+    ttl.wait %xf13 : !ttl.transfer_handle<read>
+    ttl.wait %xf14 : !ttl.transfer_handle<read>
+    ttl.wait %xf15 : !ttl.transfer_handle<read>
+    ttl.wait %xf16 : !ttl.transfer_handle<read>
+    func.return
+  }
+}

From 89153449d98860c8e1799fde61f9e0d45a99e715 Mon Sep 17 00:00:00 2001
From: Ilia Shutov <Ilia_Shutov@epam.com>
Date: Fri, 27 Feb 2026 11:29:29 +0100
Subject: [PATCH 3/4] [test] Update translation lit tests for TRID barrier mode

Enable use-trid-barriers in TTLToCpp translation tests that verify
TRID-specific C++ codegen output. Tests now explicitly request TRID
mode to match their expected noc_async_*_set_trid and
barrier_with_trid output.
---
 .../TTLToCpp/cb_to_tensor_single_tile_write.mlir    |  5 +++--
 .../Translate/TTLToCpp/dma_batched_single_tile.mlir |  9 ++++++---
 .../TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir |  8 +++++---
 .../Translate/TTLToCpp/dma_loop_single_tile.mlir    | 13 +++++++------
 .../dma_multi_tile_batched_in_user_loop.mlir        |  9 ++++++---
 .../Translate/TTLToCpp/dma_multi_tile_read.mlir     |  5 +++--
 .../dma_multi_tile_same_layout_different_cb.mlir    |  8 +++++---
 .../Translate/TTLToCpp/dma_single_tile_read.mlir    |  5 +++--
 .../TTLToCpp/loopback_full_single_tile.mlir         |  8 +++++---
 9 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir
index 910e13824..d3879935e 100644
--- a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir
+++ b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -17,8 +17,9 @@
 // CHECK:   auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>();
 // CHECK:   TensorAccessor [[ACCESSOR:v[0-9]+]] = TensorAccessor([[ARGS]], [[RT_ARG]], [[ADDR]]);
 // CHECK:   int32_t [[CB_PTR:v[0-9]+]] = get_read_ptr(get_compile_time_arg_val(0));
+// CHECK:   noc_async_write_set_trid({{.*}}, {{.*}});
 // CHECK:   noc_async_write_tile([[ZERO]], [[ACCESSOR]], [[CB_PTR]]);
-// CHECK:   noc_async_write_barrier();
+// CHECK:   noc_async_write_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   return;
 // CHECK-NEXT: }
 module {
diff --git a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir
index f902cc788..892fd98fd 100644
--- a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir
+++ b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -18,15 +18,18 @@
 // CHECK:   auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<2, 0>();
 // CHECK:   TensorAccessor [[ACCESSOR0:v[0-9]+]] = TensorAccessor([[ARGS0]], [[RT_ARG0]], [[ADDR]]);
 // CHECK:   int32_t [[CB_PTR0:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0));
+// CHECK:   noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:   noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB_PTR0]]);
 // Tensor 1: get runtime arg, create accessor, get CB write ptr, async read
 // CHECK:   int32_t [[RT_ARG1:v[0-9]+]] = get_common_arg_val<uint32_t>({{v[0-9]+}});
 // CHECK:   auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<3, 1>();
 // CHECK:   TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG1]], [[ADDR]]);
 // CHECK:   int32_t [[CB_PTR1:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(1));
+// CHECK:   noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:   noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]);
-// Consecutive barriers deduplicated to single barrier.
-// CHECK:   noc_async_read_barrier();
+// Each wait lowers to a TRID barrier (no global barrier).
+// CHECK:   noc_async_read_barrier_with_trid({{.*}}, {{.*}});
+// CHECK:   noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   return;
 // CHECK-NEXT: }
 module {
diff --git a/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir b/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir
index 194f185f5..6d963cc31 100644
--- a/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir
+++ b/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -48,6 +48,7 @@
 // Cast CB ptr to size_t for index arithmetic
 // CHECK:     ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR1]];
 // CHECK:     size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]];
+// CHECK:     noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:     for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_Y]] < [[TILES_2]]; [[TILE1_Y]] += [[TILE_STEP]]) {
 // CHECK:       for (size_t [[TILE1_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_X]] < [[TILES_2]]; [[TILE1_X]] += [[TILE_STEP]]) {
 // CHECK:         size_t [[TILE1_OFFSET_Y:v[0-9]+]] = [[TILE1_Y]] * [[TILES_2]];
@@ -63,7 +64,7 @@
 // CHECK:         noc_async_read_tile([[TILE1_OFFSET]], [[ACC1]], [[CB_ADDR1]]);
 // CHECK:       }
 // CHECK:     }
-// CHECK:     noc_async_read_barrier();
+// CHECK:     noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // Second copy: arg1 (96x64) → CB1, accessor with runtime arg index 1
 // CHECK:     int32_t [[RT_ARG2:v[0-9]+]] = get_common_arg_val<uint32_t>([[TILE_STEP]]);
 // CHECK:     auto [[ACC2_ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<3, 1>();
@@ -72,6 +73,7 @@
 // Cast CB ptr to size_t for index arithmetic
 // CHECK:     ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR2]];
 // CHECK:     size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]];
+// CHECK:     noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:     for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_Y]] < [[TILES_3]]; [[TILE2_Y]] += [[TILE_STEP]]) {
 // CHECK:       for (size_t [[TILE2_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_X]] < [[TILES_2]]; [[TILE2_X]] += [[TILE_STEP]]) {
 // CHECK:         size_t [[TILE2_OFFSET_Y:v[0-9]+]] = [[TILE2_Y]] * [[TILES_2]];
@@ -87,7 +89,7 @@
 // CHECK:         noc_async_read_tile([[TILE2_OFFSET]], [[ACC2]], [[CB_ADDR2]]);
 // CHECK:       }
 // CHECK:     }
-// CHECK:     noc_async_read_barrier();
+// CHECK:     noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   }
 // CHECK:   return;
 // CHECK-NEXT: }
diff --git a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir
index 88d1d349b..16165e6a1 100644
--- a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir
+++ b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -21,17 +21,18 @@
 // CHECK:   auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>();
 // CHECK:   TensorAccessor [[ACCESSOR0:v[0-9]+]] = TensorAccessor([[ARGS0]], [[RT_ARG0]], [[ADDR]]);
 // CHECK:   int32_t [[CB_PTR0:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0));
+// CHECK:   noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:   noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB_PTR0]]);
 // CHECK:   for (size_t [[IV:i[0-9]+]] = [[LB]]; [[IV]] < [[UB]]; [[IV]] += [[STEP]]) {
-// In-loop copy: create accessor with runtime arg, get CB write ptr
-// CHECK:     int32_t [[RT_ARG1:v[0-9]+]] = get_common_arg_val<uint32_t>([[LB]]);
+// In-loop copy: create accessor using the same runtime arg and get CB write ptr
 // CHECK:     auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>();
-// CHECK:     TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG1]], [[ADDR]]);
+// CHECK:     TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG0]], [[ADDR]]);
 // CHECK:     int32_t [[CB_PTR1:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0));
+// CHECK:     noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:     noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]);
-// CHECK:     noc_async_read_barrier();
+// CHECK:     noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   }
-// CHECK:   noc_async_read_barrier();
+// CHECK:   noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   return;
 // CHECK-NEXT: }
 module {
diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir
index ac49d3a96..b7aa1625b 100644
--- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir
+++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -49,6 +49,7 @@
 // Cast CB ptr to size_t for index arithmetic
 // CHECK:     ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR1]];
 // CHECK:     size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]];
+// CHECK:     noc_async_read_set_trid({{.*}}, {{.*}});
 // Tile loops: for tile_y in 0..2, for tile_x in 0..2
 // CHECK:     for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[LB]]; [[TILE1_Y]] < [[TILES_BOUND]]; [[TILE1_Y]] += [[STEP]]) {
 // CHECK:       for (size_t [[TILE1_X:[a-z][0-9]+]] = [[LB]]; [[TILE1_X]] < [[TILES_BOUND]]; [[TILE1_X]] += [[STEP]]) {
@@ -74,6 +75,7 @@
 // Cast CB ptr to size_t for index arithmetic
 // CHECK:     ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR2]];
 // CHECK:     size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]];
+// CHECK:     noc_async_read_set_trid({{.*}}, {{.*}});
 // Separate tile loops (same bounds 0..2 x 0..2 but not merged with first copy)
 // CHECK:     for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[LB]]; [[TILE2_Y]] < [[TILES_BOUND]]; [[TILE2_Y]] += [[STEP]]) {
 // CHECK:       for (size_t [[TILE2_X:[a-z][0-9]+]] = [[LB]]; [[TILE2_X]] < [[TILES_BOUND]]; [[TILE2_X]] += [[STEP]]) {
@@ -91,8 +93,9 @@
 // CHECK:       }
 // CHECK:     }
 
-// Consecutive barriers deduplicated to single barrier.
-// CHECK:     noc_async_read_barrier();
+// Each wait lowers to a TRID barrier (no global barrier).
+// CHECK:     noc_async_read_barrier_with_trid({{.*}}, {{.*}});
+// CHECK:     noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   }
 // CHECK:   return;
 // CHECK-NEXT: }
diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir
index 0f3d6a44a..0899250f3 100644
--- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir
+++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -26,6 +26,7 @@
 // Cast CB ptr to size_t for index arithmetic
 // CHECK:   ptrdiff_t [[CB_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR]];
 // CHECK:   size_t [[CB_PTR_IDX:v[0-9]+]] = (size_t) [[CB_PTR_PTRDIFF]];
+// CHECK:   noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:   for (size_t [[TILE_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE_Y]] < [[TILES_BOUND]]; [[TILE_Y]] += [[TILE_STEP]]) {
 // CHECK:     for (size_t [[TILE_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE_X]] < [[TILES_BOUND]]; [[TILE_X]] += [[TILE_STEP]]) {
 // CHECK:       size_t [[TILE_OFFSET_Y:v[0-9]+]] = [[TILE_Y]] * [[TILES_BOUND]];
@@ -41,7 +42,7 @@
 // CHECK:       noc_async_read_tile([[TILE_OFFSET]], [[ACCESSOR]], [[CB_ADDR]]);
 // CHECK:     }
 // CHECK:   }
-// CHECK:   noc_async_read_barrier();
+// CHECK:   noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   return;
 // CHECK-NEXT: }
 module {
diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir
index 0294328ad..135aa030c 100644
--- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir
+++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -32,6 +32,7 @@
 // Cast CB ptr to size_t for index arithmetic
 // CHECK:   ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR1]];
 // CHECK:   size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]];
+// CHECK:   noc_async_read_set_trid({{.*}}, {{.*}});
 // Generated tile loops iterate over tensor grid (2x2)
 // CHECK:   for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_Y]] < [[TILES_BOUND]]; [[TILE1_Y]] += [[TILE_STEP]]) {
 // CHECK:     for (size_t [[TILE1_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_X]] < [[TILES_BOUND]]; [[TILE1_X]] += [[TILE_STEP]]) {
@@ -48,7 +49,7 @@
 // CHECK:       noc_async_read_tile([[TILE1_OFFSET]], [[ACC1]], [[CB_ADDR1]]);
 // CHECK:     }
 // CHECK:   }
-// CHECK:   noc_async_read_barrier();
+// CHECK:   noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 
 // Second copy: 64x64 (2x2 tiles) → CB [4,1] - SAME tensor layout, DIFFERENT CB shape
 // CHECK:   int32_t [[RT_ARG2:v[0-9]+]] = get_common_arg_val<uint32_t>([[TILE_STEP]]);
@@ -58,6 +59,7 @@
 // Cast CB ptr to size_t for index arithmetic
 // CHECK:   ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR2]];
 // CHECK:   size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]];
+// CHECK:   noc_async_read_set_trid({{.*}}, {{.*}});
 // Generated tile loops still iterate over tensor grid (2x2), not CB shape (4x1)
 // CHECK:   for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_Y]] < [[TILES_BOUND]]; [[TILE2_Y]] += [[TILE_STEP]]) {
 // CHECK:     for (size_t [[TILE2_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_X]] < [[TILES_BOUND]]; [[TILE2_X]] += [[TILE_STEP]]) {
@@ -74,7 +76,7 @@
 // CHECK:       noc_async_read_tile([[TILE2_OFFSET]], [[ACC2]], [[CB_ADDR2]]);
 // CHECK:     }
 // CHECK:   }
-// CHECK:   noc_async_read_barrier();
+// CHECK:   noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   return;
 // CHECK-NEXT: }
 
diff --git a/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir b/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir
index 6211d84ea..0e6f191a6 100644
--- a/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir
+++ b/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -17,8 +17,9 @@
 // CHECK:   auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>();
 // CHECK:   TensorAccessor [[ACCESSOR:v[0-9]+]] = TensorAccessor([[ARGS]], [[RT_ARG]], [[ADDR]]);
 // CHECK:   int32_t [[CB_PTR:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0));
+// CHECK:   noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:   noc_async_read_tile([[ZERO]], [[ACCESSOR]], [[CB_PTR]]);
-// CHECK:   noc_async_read_barrier();
+// CHECK:   noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   return;
 // CHECK-NEXT: }
 module {
diff --git a/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir
index 5306158c8..5b9f16a9b 100644
--- a/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir
+++ b/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir
@@ -1,4 +1,4 @@
-// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir
+// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir
 // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir
 // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir
 // RUN: FileCheck %s --input-file=%t.cpp
@@ -22,15 +22,17 @@
 // CHECK:     auto [[ARGS_READ:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>();
 // CHECK:     TensorAccessor [[ACC_READ:v[0-9]+]] = TensorAccessor([[ARGS_READ]], [[RT_ARG_R]], [[ADDR]]);
 // CHECK:     int32_t [[CB_WRITE_PTR:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0));
+// CHECK:     noc_async_read_set_trid({{.*}}, {{.*}});
 // CHECK:     noc_async_read_tile([[ZERO]], [[ACC_READ]], [[CB_WRITE_PTR]]);
-// CHECK:     noc_async_read_barrier();
+// CHECK:     noc_async_read_barrier_with_trid({{.*}}, {{.*}});
 // Write: CB → tensor (uses get_read_ptr for CB source)
 // CHECK:     int32_t [[RT_ARG_W:v[0-9]+]] = get_common_arg_val<uint32_t>([[STEP]]);
 // CHECK:     auto [[ARGS_WRITE:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<2, 1>();
 // CHECK:     TensorAccessor [[ACC_WRITE:v[0-9]+]] = TensorAccessor([[ARGS_WRITE]], [[RT_ARG_W]], [[ADDR]]);
 // CHECK:     int32_t [[CB_READ_PTR:v[0-9]+]] = get_read_ptr(get_compile_time_arg_val(0));
+// CHECK:     noc_async_write_set_trid({{.*}}, {{.*}});
 // CHECK:     noc_async_write_tile([[ZERO]], [[ACC_WRITE]], [[CB_READ_PTR]]);
-// CHECK:     noc_async_write_barrier();
+// CHECK:     noc_async_write_barrier_with_trid({{.*}}, {{.*}});
 // CHECK:   }
 // CHECK:   return;
 // CHECK-NEXT: }

From 2dba10ef61c105e41dbdaefbf2fb07e9d3ecd9dd Mon Sep 17 00:00:00 2001
From: Ilia Shutov <Ilia_Shutov@epam.com>
Date: Fri, 27 Feb 2026 11:29:42 +0100
Subject: [PATCH 4/4] [test] Parameterize ME2E tests with use_trid_barriers
 option

Add use_trid_barriers to E2EConfig and TestConfig to enable runtime
testing of both barrier modes:

- E2EConfig.use_trid_barriers controls pipeline pass option
- TestConfig includes use_trid_barriers for test ID disambiguation
- Pipeline builder forwards option to convert-ttl-to-ttkernel
- Runner includes use_trid_barriers in kernel cache key
- CONFIGS includes one TRID-enabled config for coverage

Test IDs now include a _trid suffix when use_trid_barriers=True so that
pytest node IDs stay unique.
---
 test/me2e/README.md           |  3 +++
 test/me2e/builder/pipeline.py | 16 ++++++++++++++--
 test/me2e/config.py           |  3 +++
 test/me2e/config_specs.py     |  4 ++++
 test/me2e/runner.py           |  9 +++++++--
 test/me2e/test_compute_ops.py |  5 ++++-
 6 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/test/me2e/README.md b/test/me2e/README.md
index e31954dbc..2c73a16c4 100644
--- a/test/me2e/README.md
+++ b/test/me2e/README.md
@@ -441,8 +441,11 @@ class E2EConfig:
     buffer_factor: int = 2                 # 1=single, 2=double buffer (default)
     memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED
     buffer_type: BufferType = BufferType.DRAM
+    use_trid_barriers: bool = False        # TRID-aware DMA barriers (pass option)
 ```
 
+Setting `use_trid_barriers=True` makes the pipeline builder run `convert-ttl-to-ttkernel` with the pass option `use-trid-barriers=1`, giving runtime coverage of the TRID barrier lowering. `CONFIGS` in `config_specs.py` includes one TRID-enabled config.
+
 ### Memory Configuration
 
 The `memory_layout` and `buffer_type` fields control MLIR layout attribute generation:
diff --git a/test/me2e/builder/pipeline.py b/test/me2e/builder/pipeline.py
index ed0c773a7..ca311fae8 100644
--- a/test/me2e/builder/pipeline.py
+++ b/test/me2e/builder/pipeline.py
@@ -17,7 +17,11 @@
 from .device_arch import get_mock_arch_from_device
 
 
-def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Module:
+def compile_ttl_to_ttkernel(
+    module: Module,
+    device: Optional[Any] = None,
+    use_trid_barriers: bool = False,
+) -> Module:
     """
     Run the TTL-to-TTKernel pass pipeline on the module.
 
@@ -26,6 +30,8 @@ def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Mod
     Args:
         module: TTL MLIR module to compile.
         device: Optional TTNN device for architecture detection.
+        use_trid_barriers: If True, use TRID-aware DMA barriers (pass option
+            use-trid-barriers=1). Default False matches pass default.
 
     Returns:
         Compiled module with TTKernel/EmitC ops.
@@ -34,6 +40,12 @@ def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Mod
     mock_arch = get_mock_arch_from_device(device)
     device_pass = f"ttcore-register-device{{mock-system-desc-arch={mock_arch}}}"
 
+    ttkernel_pass = (
+        "convert-ttl-to-ttkernel{use-trid-barriers=1}"
+        if use_trid_barriers
+        else "convert-ttl-to-ttkernel"
+    )
+
     pipeline_str = (
         f"builtin.module("
         f"{device_pass},"
@@ -44,7 +56,7 @@ def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Mod
         f"ttl-lower-to-loops,"
         f"ttl-annotate-cb-associations),"
         # TTL to TTKernel conversion (module-level pass).
-        f"convert-ttl-to-ttkernel,"
+        f"{ttkernel_pass},"
         f"canonicalize,"
         f"cse,"
         # Lower to EmitC.
diff --git a/test/me2e/config.py b/test/me2e/config.py
index 9df00d8c7..502ac691a 100644
--- a/test/me2e/config.py
+++ b/test/me2e/config.py
@@ -59,6 +59,9 @@ class E2EConfig:
     memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED
     buffer_type: BufferType = BufferType.DRAM
 
+    # TTL-to-TTKernel: use TRID-aware DMA barriers (default matches pass default).
+    use_trid_barriers: bool = False
+
     @property
     def num_tiles(self) -> int:
         """Total number of tiles in the grid."""
diff --git a/test/me2e/config_specs.py b/test/me2e/config_specs.py
index 84fc9927f..1a126d77e 100644
--- a/test/me2e/config_specs.py
+++ b/test/me2e/config_specs.py
@@ -93,6 +93,7 @@ class TestConfig:
     num_tiles: int = 64
     buffer_factor: int = 2
     memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED
+    use_trid_barriers: bool = False
 
     def __str__(self) -> str:
         """
@@ -132,12 +133,15 @@ def to_e2e_config(self) -> E2EConfig:
             dtype=self.dtype,
             buffer_factor=self.buffer_factor,
             memory_layout=self.memory_layout,
+            use_trid_barriers=self.use_trid_barriers,
         )
 
 
 CONFIGS = [
     # Single tile config.
     TestConfig(num_tiles=1, block_h=1, block_w=1),  # 1x1 grid (single tile)
+    # Single tile with TRID barriers (runtime coverage for use-trid-barriers).
+    TestConfig(num_tiles=1, block_h=1, block_w=1, use_trid_barriers=True),
     # Multi-tile configs with loop generation.
     TestConfig(num_tiles=4, block_h=2, block_w=2),  # 2x2 grid (4 tiles)
     # TODO(#123): Enable 8x8 config once tile index lowering is fixed.
diff --git a/test/me2e/runner.py b/test/me2e/runner.py
index 28cfc9df3..6f8159010 100644
--- a/test/me2e/runner.py
+++ b/test/me2e/runner.py
@@ -47,6 +47,7 @@ def get_compute_kernel(
     """
     cache_key = (
         f"{op.name}_{op.ttl_op}_{config.block_h}x{config.block_w}_{config.dtype}"
+        f"_trid{int(config.use_trid_barriers)}"
     )
     if cache_key in _kernel_cache:
         return _kernel_cache[cache_key]
@@ -56,7 +57,9 @@ def get_compute_kernel(
     module = build_e2e_module(op.name, op.arity, e2e_config)
 
     # Run TTL pass pipeline to get EmitC.
-    compiled_module = compile_ttl_to_ttkernel(module, device)
+    compiled_module = compile_ttl_to_ttkernel(
+        module, device, e2e_config.use_trid_barriers
+    )
 
     # Translate to C++ kernels.
     noc_kernels, compute_kernel = translate_module_to_kernels(compiled_module)
@@ -111,7 +114,9 @@ def run_compute_test(
     # 3. Build full ME2E module to get reader/writer kernels.
     # We need the full module to extract all kernels (reader, compute, writer).
     module = build_e2e_module(op.name, op.arity, e2e_config)
-    compiled_module = compile_ttl_to_ttkernel(module, device)
+    compiled_module = compile_ttl_to_ttkernel(
+        module, device, e2e_config.use_trid_barriers
+    )
     noc_kernels, compute_kernel_spec = translate_module_to_kernels(compiled_module)
 
     # Replace compute kernel source with cached/generated one.
diff --git a/test/me2e/test_compute_ops.py b/test/me2e/test_compute_ops.py
index 8582be01b..952b74926 100644
--- a/test/me2e/test_compute_ops.py
+++ b/test/me2e/test_compute_ops.py
@@ -23,7 +23,10 @@
 @pytest.mark.parametrize(
     "config",
     CONFIGS,
-    ids=lambda c: f"{c.block_h}x{c.block_w}_buf{c.buffer_factor}_{c.memory_layout.value}",
+    ids=lambda c: (
+        f"{c.block_h}x{c.block_w}_buf{c.buffer_factor}_{c.memory_layout.value}"
+        + ("_trid" if c.use_trid_barriers else "")
+    ),
 )
 @pytest.mark.parametrize("dtype", get_test_dtypes(), ids=get_dtype_ids())
 @pytest.mark.requires_device