From 8570f8214800da6cd04e5165d5710436f2e49634 Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Fri, 27 Feb 2026 11:29:06 +0100 Subject: [PATCH 1/4] [ttl] Add TRID-aware DMA barrier lowering option Add use-trid-barriers option to convert-ttl-to-ttkernel pass and ttl-to-ttkernel-pipeline. When enabled, ttl.copy emits noc_async_{read,write}_set_trid before DMA operations, and ttl.wait emits noc_async_{read,write}_barrier_with_trid instead of global barriers. Default behavior (use-trid-barriers=false) preserves existing global barrier semantics from main branch. Key changes: - TridAllocator class manages 16 TRID slots with overflow handling - CopyLowering/WaitLowering patterns respect useTridBarriers flag - TTKernel cleanup patterns conditionally registered for TRID mode - SCF structural type conversions enabled for transfer handle types Addresses: #87 --- .../Transforms/TTKernelCleanupPatterns.h | 4 +- include/ttlang/Dialect/TTL/Passes.td | 22 +- .../Dialect/TTL/Pipelines/TTLPipelines.h | 4 + .../Transforms/TTKernelCleanupPatterns.cpp | 47 +++- lib/Dialect/TTL/Pipelines/TTLPipelines.cpp | 6 +- lib/Dialect/TTL/Transforms/CMakeLists.txt | 1 + .../TTL/Transforms/ConvertTTLToTTKernel.cpp | 224 +++++++++++++++--- 7 files changed, 266 insertions(+), 42 deletions(-) diff --git a/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h b/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h index 59a36cf35..be6873142 100644 --- a/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h +++ b/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h @@ -12,7 +12,9 @@ namespace mlir::tt::ttkernel { /// Populate cleanup patterns for TTKernel ops. These patterns optimize /// TTKernel code by removing redundant operations (e.g., deduplicating /// consecutive barriers of the same type). 
-void populateTTKernelCleanupPatterns(RewritePatternSet &patterns); +/// When useTridBarriers is true, also adds TRID-barrier deduplication patterns. +void populateTTKernelCleanupPatterns(RewritePatternSet &patterns, + bool useTridBarriers = false); } // namespace mlir::tt::ttkernel diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index 629e4c940..0d375e476 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -5,15 +5,31 @@ include "mlir/Pass/PassBase.td" def TTLConvertTTLToTTKernel : Pass<"convert-ttl-to-ttkernel", "::mlir::ModuleOp"> { - let summary = "Lower TTL DMA ops to TTKernel using global barriers (temporary)"; + let summary = "Lower TTL DMA ops to TTKernel noc ops"; let description = [{ - Converts TTL DMA ops to TTKernel noc ops. Uses global barriers until TRID - barriers are available. Covers bind_cb, copy, wait MVP path. + Converts TTL DMA ops to TTKernel noc ops. Covers bind_cb, copy, wait MVP + path. + + Two lowering modes are supported: + - Default: global barriers (noc_async_{read,write}_barrier). + - Optional: TRID-aware barriers (noc_async_*_set_trid + + noc_async_*_barrier_with_trid). + + TODO(ttl): Profile both modes on representative benchmarks and consider + changing the default. TODO(ttl): Refine lowering to emit real CB handles and proper NOC addresses. Issue: #77 (umbrella issue with subtasks #78-#89). }]; + let options = [ + Option<"useTridBarriers", "use-trid-barriers", "bool", "false", + "Use TRID-aware DMA waits (barrier_with_trid) instead of global barriers. " + "TRID must be unique per outstanding copy; ordering of TRID values is not " + "semantically significant. 
Generated TRIDs may be nondeterministic when " + "patterns are applied in parallel.">, + ]; + let dependentDialects = [ "::mlir::arith::ArithDialect", "::mlir::tt::ttkernel::TTKernelDialect" diff --git a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h index 6c1bd61e7..1c1a090a4 100644 --- a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h +++ b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h @@ -18,6 +18,10 @@ struct TTLToTTKernelPipelineOptions Option lowerToEmitC{*this, "lower-to-emitc", llvm::cl::desc("Lower TTKernel to EmitC."), llvm::cl::init(false)}; + Option useTridBarriers{ + *this, "use-trid-barriers", + llvm::cl::desc("Use TRID-aware DMA waits (barrier_with_trid)."), + llvm::cl::init(false)}; }; void createTTLToTTKernelPipeline(mlir::OpPassManager &pm, diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp index 295955039..d696f9cda 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp @@ -7,6 +7,7 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/Support/LogicalResult.h" #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h" +#include "llvm/ADT/STLExtras.h" namespace mlir::tt::ttkernel { @@ -31,13 +32,57 @@ struct DeduplicateConsecutiveBarriers : OpRewritePattern { } }; +/// Deduplicate consecutive TRID barriers of the same type *only* when they +/// target the same TRID (and optional NOC). Unlike global barriers, barriers +/// with different TRIDs are not redundant and must not be removed. 
+template +struct DeduplicateConsecutiveTridBarriers + : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BarrierWithTridOp op, + PatternRewriter &rewriter) const override { + auto *prev = op->getPrevNode(); + if (!prev) { + return failure(); + } + auto prevBarrier = dyn_cast(prev); + if (!prevBarrier) { + return failure(); + } + + if (op->getNumOperands() != prevBarrier->getNumOperands()) { + return failure(); + } + + for (auto [a, b] : + llvm::zip_equal(op->getOperands(), prevBarrier->getOperands())) { + if (a != b) { + return failure(); + } + } + + rewriter.eraseOp(op); + return success(); + } +}; + } // namespace -void populateTTKernelCleanupPatterns(RewritePatternSet &patterns) { +void populateTTKernelCleanupPatterns(RewritePatternSet &patterns, + bool useTridBarriers) { patterns.add>( patterns.getContext()); patterns.add>( patterns.getContext()); + if (useTridBarriers) { + patterns + .add>( + patterns.getContext()); + patterns.add< + DeduplicateConsecutiveTridBarriers>( + patterns.getContext()); + } } } // namespace mlir::tt::ttkernel diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp index 9a4868bcc..2d8eabcb4 100644 --- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp +++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp @@ -29,7 +29,11 @@ void createTTLToTTKernelPipeline(OpPassManager &pm, pm.addPass(createTTLInsertTileRegsSync()); pm.addPass(createTTLLowerToLoops()); pm.addPass(createTTLAnnotateCBAssociations()); - pm.addPass(createTTLConvertTTLToTTKernel()); + { + TTLConvertTTLToTTKernelOptions passOptions; + passOptions.useTridBarriers = options.useTridBarriers; + pm.addPass(createTTLConvertTTLToTTKernel(passOptions)); + } pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); if (options.lowerToEmitC) { diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt index 7cb22ffbe..85f9d313d 100644 --- 
a/lib/Dialect/TTL/Transforms/CMakeLists.txt +++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt @@ -21,6 +21,7 @@ add_mlir_dialect_library(TTLangTTLTransforms MLIRLinalgDialect MLIRMathDialect MLIRSCFDialect + MLIRSCFTransforms MLIRPass MLIRTensorDialect MLIRTTKernelDialect diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp index 443e3014b..3daf901ff 100644 --- a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp +++ b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp @@ -9,6 +9,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/BuiltinDialect.h" @@ -61,18 +62,31 @@ class TTLToTTKernelTypeConverter : public TypeConverter { t.getElementType()); }); // Tensor -> TensorAccessor for TTKernel when TTNN layout is present. - addConversion([](RankedTensorType t) -> Type { + addConversion([this](RankedTensorType t) -> Type { if (t.getEncoding() && mlir::isa(t.getEncoding())) { return ttk::TensorAccessorType::get(t.getContext()); } + // Otherwise, preserve tensor shape/encoding but convert element type. + // This is required for cases like tensor<4x!ttl.transfer_handle<read>> + // becoming tensor<4xi32> once transfer handles are type-converted. + auto convertedElemTy = this->convertType(t.getElementType()); + if (!convertedElemTy) { + return t; + } + if (convertedElemTy == t.getElementType()) { + return t; + } + return mlir::cast(t.clone(convertedElemTy)); + }); + // Identity fallback must be last, but also handle conversion of transfer + // handles to TRID SSA values (i32). + addConversion([](Type t) -> Type { + if (llvm::isa(t)) { + return IntegerType::get(t.getContext(), 32); + } return t; }); - // Preserve transfer handle types so ttl.wait can inspect transfer - // direction. TRID-aware lowering will be added later. 
- addConversion([](TransferHandleType t) -> Type { return t; }); - // Identity fallback must be last. - addConversion([](Type t) { return t; }); auto castMaterialization = [](OpBuilder &builder, Type resultType, ValueRange inputs, Location loc) -> Value { @@ -393,8 +407,8 @@ static CopyOperandKind classifyOperand(Value v) { return CopyOperandKind::Unknown; } -static Value makeZeroI32(Location loc, ConversionPatternRewriter &rewriter) { - return rewriter.create(loc, 0, 32); +static Value makeZeroI8(Location loc, ConversionPatternRewriter &rewriter) { + return rewriter.create(loc, 0, 8); } static std::optional getTransferKindFromHandleType(Type t) { @@ -539,14 +553,58 @@ static Value linearizeTileIndex(OpBuilder &builder, Location loc, Value row, return builder.create(loc, rowOffset, col); } +/// Allocates TRIDs for DMA barriers. TRIDs wrap at 16 (4-bit hardware limit). +/// Tracks which TRIDs are in use by lowered copies and their transfer +/// direction. This bookkeeping must stay independent of greedy pattern rewrite +/// visitation order; therefore, it is only mutated during copy lowering. +/// When a TRID would be reused while still in use, the caller must emit a +/// barrier for the old transfer before reassigning. +/// +/// TODO: Profile both modes on representative benchmarks and consider changing +/// the default. +class TridAllocator { +public: + static constexpr uint32_t kNumTrids = 16; + + struct AllocResult { + uint32_t trid; + /// If set, this TRID was still outstanding from a previous copy. The caller + /// must emit a barrier_with_trid for this direction before reusing. 
+ std::optional evictDirection; + }; + + AllocResult allocateTrid(TransferKind direction) { + uint32_t trid = nextTrid % kNumTrids; + AllocResult result{trid, std::nullopt}; + if (outstanding[trid]) { + result.evictDirection = direction_[trid]; + } + outstanding[trid] = true; + direction_[trid] = direction; + ++nextTrid; + return result; + } + + void releaseTrid(uint32_t trid) { outstanding[trid % kNumTrids] = false; } + +private: + uint32_t nextTrid = 0; + bool outstanding[kNumTrids] = {}; + TransferKind direction_[kNumTrids] = {}; +}; + /// Direction of a tensor<->CB tile copy for NOC operations. enum class NocCopyDirection { Read, Write }; /// Lower a tensor_slice<->CB copy in the given direction. /// Read: tensor_slice -> CB (noc_async_read_tile, get_write_ptr) /// Write: CB -> tensor_slice (noc_async_write_tile, get_read_ptr) +/// +/// When useTridBarriers is true, emits noc_async_{read,write}_set_trid before +/// the tile loop to tag NOC operations with the given TRID. static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp, Value cb, NocCopyDirection direction, + Value tridVal, bool useTridBarriers, ConversionPatternRewriter &rewriter, const TypeConverter &typeConverter) { auto loc = op.getLoc(); @@ -605,6 +663,17 @@ static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp, rewriter.create(loc, *pageSizeBytes); auto i32Ty = rewriter.getI32Type(); + // Tag subsequent NOC operations with this copy's TRID. + // Currently fixed to NOC 0. TODO(ttl): Generalize NOC selection (issue #77). 
+ if (useTridBarriers) { + Value nocVal = makeZeroI8(loc, rewriter); + if (isRead) { + rewriter.create(loc, tridVal, nocVal); + } else { + rewriter.create(loc, tridVal, nocVal); + } + } + emitTileLoop( rewriter, loc, cbRows, cbCols, [&, tensorTilesX, cbCols](OpBuilder &b, Location bodyLoc, Value loopRow, @@ -640,7 +709,7 @@ static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp, } }); - rewriter.replaceOp(op, makeZeroI32(loc, rewriter)); + rewriter.replaceOp(op, tridVal); return success(); } @@ -662,7 +731,10 @@ struct TensorSliceLowering : OpConversionPattern { }; struct CopyLowering : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + CopyLowering(const TypeConverter &typeConverter, MLIRContext *ctx, + TridAllocator *tridAllocator, bool useTridBarriers) + : OpConversionPattern(typeConverter, ctx), tridAllocator(tridAllocator), + useTridBarriers(useTridBarriers) {} LogicalResult matchAndRewrite(CopyOp op, OpAdaptor adaptor, @@ -690,6 +762,38 @@ struct CopyLowering : OpConversionPattern { }); } + if (!tridAllocator) { + return rewriter.notifyMatchFailure(op, "missing TRID allocator"); + } + + TransferKind direction = + (srcIsSlice && dstIsCB) ? TransferKind::read : TransferKind::write; + + Value tridVal; + if (useTridBarriers) { + auto allocResult = tridAllocator->allocateTrid(direction); + // If this TRID was still outstanding, emit a barrier to drain the old + // transfer before reusing the TRID. + if (allocResult.evictDirection) { + Value evictTrid = rewriter.create( + op.getLoc(), allocResult.trid, 32); + Value nocVal = makeZeroI8(op.getLoc(), rewriter); + if (*allocResult.evictDirection == TransferKind::read) { + rewriter.create( + op.getLoc(), evictTrid, nocVal); + } else { + rewriter.create( + op.getLoc(), evictTrid, nocVal); + } + } + tridVal = rewriter.create(op.getLoc(), + allocResult.trid, 32); + } else { + // In global-barrier mode, allocate but direction does not matter. 
+ tridAllocator->allocateTrid(direction); + tridVal = rewriter.create(op.getLoc(), 0, 32); + } + // TensorSlice -> CB: read tiles from tensor into circular buffer. if (srcIsSlice && dstIsCB) { auto sliceOp = src.getDefiningOp(); @@ -698,8 +802,8 @@ struct CopyLowering : OpConversionPattern { op, "tensor_slice source must come from ttl.tensor_slice op"); } return lowerTensorCBCopy(op, sliceOp, adaptor.getDst(), - NocCopyDirection::Read, rewriter, - *typeConverter); + NocCopyDirection::Read, tridVal, useTridBarriers, + rewriter, *typeConverter); } // CB -> TensorSlice: write tiles from circular buffer to tensor. @@ -709,41 +813,71 @@ struct CopyLowering : OpConversionPattern { op, "tensor_slice destination must come from ttl.tensor_slice op"); } return lowerTensorCBCopy(op, sliceOp, adaptor.getSrc(), - NocCopyDirection::Write, rewriter, *typeConverter); + NocCopyDirection::Write, tridVal, useTridBarriers, + rewriter, *typeConverter); } + +private: + TridAllocator *tridAllocator = nullptr; + bool useTridBarriers = false; }; struct WaitLowering : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + WaitLowering(const TypeConverter &typeConverter, MLIRContext *ctx, + bool useTridBarriers) + : OpConversionPattern(typeConverter, ctx), + useTridBarriers(useTridBarriers) {} LogicalResult matchAndRewrite(WaitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - // TODO(ttl): Lower ttl.wait to TRID-specific barriers keyed by the transfer - // handle (read vs write barrier based on transfer direction). Issue: #87. + // Emit TRID-specific barriers keyed by the transfer handle. // - // MVP behavior: require a direction-typed handle and emit the - // corresponding global barrier. Untyped handles are rejected by the - // verifier, but we also fail the rewrite defensively. - auto kind = getTransferKindFromHandleType(adaptor.getXf().getType()); + // NOTE: After type conversion, the handle value is an i32 TRID. 
Transfer + // direction is recovered from the original operand type. + auto kind = getTransferKindFromHandleType(op.getXf().getType()); if (!kind) { return rewriter.notifyMatchFailure( op, "requires direction-typed !ttl.transfer_handle"); } - if (*kind == TransferKind::read) { - rewriter.create(op.getLoc()); - } else if (*kind == TransferKind::write) { - rewriter.create(op.getLoc()); + if (useTridBarriers) { + Value tridVal = adaptor.getXf(); // i32 (after type conversion) + if (!tridVal.getType().isInteger(32)) { + return rewriter.notifyMatchFailure( + op, + "transfer handle must be type-converted to i32 before ttl.wait"); + } + // Currently fixed to NOC 0. TODO(ttl): Generalize NOC selection (issue + // #77). + Value nocVal = makeZeroI8(op.getLoc(), rewriter); + if (*kind == TransferKind::read) { + rewriter.create(op.getLoc(), + tridVal, nocVal); + } else if (*kind == TransferKind::write) { + rewriter.create(op.getLoc(), + tridVal, nocVal); + } else { + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "unsupported TransferKind for ttl.wait lowering"; + }); + } } else { - // Future-proofing: TransferKind is currently {read, write}, but fail - // explicitly if it ever expands without updating the lowering. 
- return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "unsupported TransferKind for ttl.wait lowering"; - }); + if (*kind == TransferKind::read) { + rewriter.create(op.getLoc()); + } else if (*kind == TransferKind::write) { + rewriter.create(op.getLoc()); + } else { + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "unsupported TransferKind for ttl.wait lowering"; + }); + } } rewriter.eraseOp(op); return success(); } + +private: + bool useTridBarriers = false; }; //===----------------------------------------------------------------------===// @@ -841,7 +975,7 @@ struct FuncKernelFinalize : OpRewritePattern { static LogicalResult lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx, TTLToTTKernelTypeConverter &typeConverter, - StringRef passName) { + bool useTridBarriers, StringRef passName) { ConversionTarget target(ctx); target.addIllegalDialect(); target.addLegalDialect( - typeConverter, &ctx); + TridAllocator tridAllocator; + + // Patterns with standard (typeConverter, ctx) signature. + patterns.add(typeConverter, + &ctx); + + // Patterns with TRID-specific arguments. + patterns.add(typeConverter, &ctx, &tridAllocator, + useTridBarriers); + patterns.add(typeConverter, &ctx, useTridBarriers); + + // Convert scf.for/scf.if/etc region signatures when result/iter_arg types + // change due to the type converter. + mlir::scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter, + patterns, target); + populateFunctionOpInterfaceTypeConversionPattern( func::FuncOp::getOperationName(), patterns, typeConverter); @@ -898,7 +1046,7 @@ lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx, // Apply post-conversion cleanup patterns (e.g., barrier deduplication). 
RewritePatternSet cleanupPatterns(&ctx); - ttkernel::populateTTKernelCleanupPatterns(cleanupPatterns); + ttkernel::populateTTKernelCleanupPatterns(cleanupPatterns, useTridBarriers); if (failed(applyPatternsGreedily(mod, std::move(cleanupPatterns)))) { return failure(); } @@ -1046,13 +1194,17 @@ static void cleanupComputeKernels(ModuleOp mod, MLIRContext &ctx) { struct TTLConvertTTLToTTKernelPass : impl::TTLConvertTTLToTTKernelBase { + using Base = impl::TTLConvertTTLToTTKernelBase; + using Base::Base; + void runOnOperation() override { MLIRContext &ctx = getContext(); ModuleOp mod = getOperation(); TTLToTTKernelTypeConverter typeConverter; // Phase 1: Lower TTL ops to TTKernel (bind_cb, copy, wait, cb ops, store) - if (failed(lowerTTLOpsToTTKernel(mod, ctx, typeConverter, getName()))) { + if (failed(lowerTTLOpsToTTKernel(mod, ctx, typeConverter, useTridBarriers, + getName()))) { signalPassFailure(); return; } From 1f494499a320059e2ebcc04021868ba3d4259052 Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Fri, 27 Feb 2026 11:29:18 +0100 Subject: [PATCH 2/4] [test] Add conversion lit tests for TRID and global barrier modes - trid_barriers.mlir: Tests TRID-aware lowering with use-trid-barriers=true - Verifies noc_async_{read,write}_set_trid emission - Verifies noc_async_{read,write}_barrier_with_trid emission - Tests TRID overflow handling (17 copies without waits) - dma_global_barriers.mlir: Tests default global barrier mode - Verifies noc_async_{read,write}_barrier emission (no TRID) - Ensures backward compatibility with main branch behavior - Update existing tests to use explicit use-trid-barriers=true where they expect TRID-specific output --- .../TTLToTTKernel/dma_global_barriers.mlir | 117 ++++++++++++++++ .../TTLToTTKernel/dma_single_core.mlir | 81 ++++++----- .../TTLToTTKernel/loopback_dram_copy.mlir | 12 +- .../TTLToTTKernel/trid_barriers.mlir | 130 ++++++++++++++++++ 4 files changed, 301 insertions(+), 39 deletions(-) create mode 100644 
test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir create mode 100644 test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir diff --git a/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir b/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir new file mode 100644 index 000000000..764fff63f --- /dev/null +++ b/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir @@ -0,0 +1,117 @@ +// RUN: ttlang-opt --convert-ttl-to-ttkernel --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=GLOBAL +// Summary: Verify the default (non-TRID) code path emits global NOC barriers. +// Companion to dma_single_core.mlir which tests use-trid-barriers=1. + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Single-tile read: default mode emits noc_async_read_barrier (no TRID ops). +// GLOBAL-LABEL: func.func @global_single_tile_read +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_barrier() : () -> () +// GLOBAL-NOT: ttkernel.noc_async_read_set_trid +// GLOBAL-NOT: ttkernel.noc_async_read_barrier_with_trid +// GLOBAL-NOT: ttkernel.noc_async_write_barrier +module { + func.func @global_single_tile_read(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf = ttl.copy %slice, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, 
d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Single-tile write: default mode emits noc_async_write_barrier (no TRID ops). +// GLOBAL-LABEL: func.func @global_single_tile_write +// GLOBAL: ttkernel.noc_async_write_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_write_barrier() : () -> () +// GLOBAL-NOT: ttkernel.noc_async_write_set_trid +// GLOBAL-NOT: ttkernel.noc_async_write_barrier_with_trid +// GLOBAL-NOT: ttkernel.noc_async_read_barrier +module { + func.func @global_single_tile_write(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf = ttl.copy %cb, %slice : (!ttl.cb<[1, 1], f32, 2>, tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Batched reads: consecutive global barriers are deduplicated to a single barrier. 
+// GLOBAL-LABEL: func.func @global_batched_reads +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_barrier() : () -> () +// GLOBAL-NOT: ttkernel.noc_async_read_barrier +// GLOBAL-NOT: ttkernel.noc_async_read_set_trid +// GLOBAL-NOT: ttkernel.noc_async_read_barrier_with_trid +module { + func.func @global_batched_reads(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice0 = ttl.tensor_slice %t0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %slice1 = ttl.tensor_slice %t1[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf0 = ttl.copy %slice0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %xf1 = ttl.copy %slice1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf0 : !ttl.transfer_handle + ttl.wait %xf1 : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Loopback copy: read then write in a loop uses global barriers for both. 
+// GLOBAL-LABEL: func.func @global_loopback +// GLOBAL: scf.for +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_barrier() : () -> () +// GLOBAL: ttkernel.noc_async_write_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_write_barrier() : () -> () +// GLOBAL-NOT: noc_async_read_set_trid +// GLOBAL-NOT: noc_async_write_set_trid +// GLOBAL-NOT: barrier_with_trid +module { + func.func @global_loopback(%src: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, + %dst: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) + attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + + %src_slice = ttl.tensor_slice %src[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %dst_slice = ttl.tensor_slice %dst[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + scf.for %i = %c0 to %c4 step %c1 { + %xf_r = ttl.copy %src_slice, %cb + : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) + -> !ttl.transfer_handle + ttl.wait %xf_r : !ttl.transfer_handle + + %xf_w = ttl.copy %cb, %dst_slice + : (!ttl.cb<[1, 1], f32, 2>, tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) + -> !ttl.transfer_handle + ttl.wait %xf_w : !ttl.transfer_handle + } + + func.return + } +} diff --git a/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir b/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir index 98293c787..54837c1bc 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --convert-ttl-to-ttkernel 
--canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL +// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL // Summary: MVP DMA lowering tests for tensor<->CB copies (no pipes). #dram = #ttnn.buffer_type @@ -12,9 +12,11 @@ // TTKERNEL: %[[SRC_ARGS:.*]] = ttkernel.TensorAccessorArgs({{.*}}) // TTKERNEL: %[[SRC_ACC:.*]] = ttkernel.TensorAccessor(%[[SRC_ARGS]], %[[BANK_BASE]], {{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_PTR:.*]] = ttkernel.get_write_ptr(%[[CB]]) : (!ttkernel.cb<2, f32>) -> i32 +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, %[[SRC_ACC]], %[[CB_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_single_tile_single_copy(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -38,9 +40,11 @@ module { // TTKERNEL: %[[DST_ARGS:.*]] = ttkernel.TensorAccessorArgs({{.*}}) // TTKERNEL: %[[DST_ACC:.*]] = ttkernel.TensorAccessor(%[[DST_ARGS]], %[[BANK_BASE]], {{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_PTR:.*]] = ttkernel.get_read_ptr(%[[CB]]) : (!ttkernel.cb<2, f32>) -> i32 +// TTKERNEL: ttkernel.noc_async_write_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_write_tile({{.*}}, %[[DST_ACC]], %[[CB_PTR]]) : (i32, 
!ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () module { func.func @cb_to_tensor(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -71,10 +75,12 @@ module { // TTKERNEL: ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: ttkernel.get_write_ptr({{.*}}) : (!ttkernel.cb<2, f32>) -> i32 // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () -// Consecutive barriers are deduplicated to a single barrier. -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// Each ttl.wait lowers to a TRID-specific barrier; different TRIDs must not be +// deduplicated. 
+// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_batched(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -103,10 +109,11 @@ module { // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () // TTKERNEL: scf.for {{.*}} { // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () // TTKERNEL: } -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_pipelined_loop(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -137,17 +144,17 @@ module { // // TTKERNEL-LABEL: func.func @dma_single_tile_two_phase_loops // TTKERNEL: %[[HANDLES0:.*]] = tensor.empty() : tensor<4x!ttl.transfer_handle> -// TTKERNEL: %[[CAST:.*]] = tensor.cast %[[HANDLES0]] : tensor<4x!ttl.transfer_handle> to tensor> -// TTKERNEL: %[[HANDLES:.*]] = scf.for {{.*}} iter_args(%[[H:.*]] = %[[CAST]]) -> (tensor>) { -// TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, 
!ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: %[[XF:.*]] = builtin.unrealized_conversion_cast {{.*}} : i32 to !ttl.transfer_handle -// TTKERNEL: %[[INS:.*]] = tensor.insert %[[XF]] into %[[H]]{{\[}}{{.*}}{{\]}} : tensor> -// TTKERNEL: scf.yield %[[INS]] : tensor> +// TTKERNEL: scf.for {{.*}} iter_args(%[[H:.*]] = %[[HANDLES0]]) -> (tensor<4x!ttl.transfer_handle>) { +// TTKERNEL: %[[XF_HANDLE:.*]] = builtin.unrealized_conversion_cast {{.*}} : i32 to !ttl.transfer_handle +// TTKERNEL: %[[INS:.*]] = tensor.insert %[[XF_HANDLE]] into %[[H]]{{\[}}{{.*}}{{\]}} : tensor<4x!ttl.transfer_handle> +// TTKERNEL: scf.yield %[[INS]] : tensor<4x!ttl.transfer_handle> // TTKERNEL: } // TTKERNEL: scf.for {{.*}} { -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL: %[[XF_I32:.*]] = builtin.unrealized_conversion_cast {{.*}} : !ttl.transfer_handle to i32 +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[XF_I32]], {{.*}}) : (i32, i8) -> () // TTKERNEL: } -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_single_tile_two_phase_loops(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -177,12 +184,13 @@ module { #layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Corner case: waiting twice on the same transfer handle is allowed, but -// consecutive barriers are deduplicated to a single barrier. +// consecutive barriers are deduplicated to a single TRID barrier. 
// // TTKERNEL-LABEL: func.func @dma_single_tile_double_wait -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier_with_trid +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_single_tile_double_wait(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -205,8 +213,9 @@ module { // // TTKERNEL-LABEL: func.func @dma_single_tile_single_element_container // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () // TTKERNEL: return module { func.func @dma_single_tile_single_element_container(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { @@ -260,8 +269,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_read_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: 
ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_multi_tile_read(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout_tile>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -305,8 +315,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_write_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () module { func.func @dma_multi_tile_write(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout_tile>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -350,8 +361,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_read_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func 
@dma_multi_tile_read_cb_shape(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -396,8 +408,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_write_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () module { func.func @dma_multi_tile_write_rect(%arg0: tensor<3x2x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index diff --git a/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir b/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir index 6e8ece496..11bfaa46a 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir @@ -1,6 +1,6 @@ -// RUN: ttlang-opt --convert-ttl-to-ttkernel --canonicalize --cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL -// Summary: Lower a loopback DRAM copy (read → wait → write → wait in a loop) -// to TTKernel using global NOC barriers (TRID ops not yet available). +// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize --cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL +// Summary: Lower a loopback DRAM copy (read -> wait -> write -> wait in a loop) +// to TTKernel using TRID-specific NOC barriers. 
#dram = #ttnn.buffer_type #layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, @@ -13,14 +13,16 @@ // TTKERNEL: ttkernel.get_common_arg_val({{.*}}) : (index) -> i32 // TTKERNEL: %[[ACC_R:.*]] = ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_W_PTR:.*]] = ttkernel.get_write_ptr({{.*}}) : (!ttkernel.cb<2, f32>) -> i32 +// TTKERNEL: ttkernel.noc_async_read_set_trid({{.*}}, {{.*}}) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, %[[ACC_R]], %[[CB_W_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}, {{.*}}) : (i32, i8) -> () // Write: runtime arg for dst tensor, accessor, read ptr for CB // TTKERNEL: ttkernel.get_common_arg_val({{.*}}) : (index) -> i32 // TTKERNEL: %[[ACC_W:.*]] = ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_R_PTR:.*]] = ttkernel.get_read_ptr({{.*}}) : (!ttkernel.cb<2, f32>) -> i32 +// TTKERNEL: ttkernel.noc_async_write_set_trid({{.*}}, {{.*}}) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_write_tile({{.*}}, %[[ACC_W]], %[[CB_R_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}, {{.*}}) : (i32, i8) -> () module { func.func @loopback_dram_copy(%src: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, diff --git a/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir b/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir new file mode 100644 index 000000000..bb110a44c --- /dev/null +++ b/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir @@ -0,0 +1,130 @@ +// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL +// Summary: Regression tests 
for TRID-aware ttl.copy/ttl.wait lowering. + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// TTKERNEL-LABEL: func.func @trid_single_copy_wait_read +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_tile( +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: builtin.unrealized_conversion_cast +module { + func.func @trid_single_copy_wait_read(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf = ttl.copy %slice, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// TTKERNEL-LABEL: func.func @trid_two_copies_two_waits_read +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID0:.*]], %[[NOC:.*]]) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_tile( +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID1:.*]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_tile( +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID0]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID1]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: 
builtin.unrealized_conversion_cast +module { + func.func @trid_two_copies_two_waits_read(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice0 = ttl.tensor_slice %t0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %slice1 = ttl.tensor_slice %t1[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf0 = ttl.copy %slice0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %xf1 = ttl.copy %slice1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf0 : !ttl.transfer_handle + ttl.wait %xf1 : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Verify TRID overflow handling: 17 copies without intervening waits exhaust +// the 16-entry TRID space. The 17th copy (reusing TRID 0) must emit an +// auto-barrier for TRID 0 before reassigning it. +// +// TTKERNEL-LABEL: func.func @trid_overflow_auto_barrier +// The first 16 copies each get a unique TRID (0..15) with no auto-barrier. +// TTKERNEL-COUNT-16: ttkernel.noc_async_read_set_trid +// The 17th copy reuses TRID 0. Because TRID 0 is still outstanding, the pass +// emits an auto-barrier first. +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid +// TTKERNEL: ttkernel.noc_async_read_set_trid +// No global barriers should appear. 
+// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: builtin.unrealized_conversion_cast +module { + func.func @trid_overflow_auto_barrier(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %s0 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf0 = ttl.copy %s0, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s1 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf1 = ttl.copy %s1, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s2 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf2 = ttl.copy %s2, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s3 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf3 = ttl.copy %s3, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s4 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf4 = ttl.copy %s4, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s5 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf5 = ttl.copy %s5, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) 
-> !ttl.transfer_handle + %s6 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf6 = ttl.copy %s6, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s7 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf7 = ttl.copy %s7, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s8 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf8 = ttl.copy %s8, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s9 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf9 = ttl.copy %s9, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s10 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf10 = ttl.copy %s10, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s11 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf11 = ttl.copy %s11, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s12 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf12 = ttl.copy %s12, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s13 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> 
tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf13 = ttl.copy %s13, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s14 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf14 = ttl.copy %s14, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s15 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf15 = ttl.copy %s15, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + // 17th copy: TRID 0 wraps. Auto-barrier expected here. + %s16 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf16 = ttl.copy %s16, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf0 : !ttl.transfer_handle + ttl.wait %xf1 : !ttl.transfer_handle + ttl.wait %xf2 : !ttl.transfer_handle + ttl.wait %xf3 : !ttl.transfer_handle + ttl.wait %xf4 : !ttl.transfer_handle + ttl.wait %xf5 : !ttl.transfer_handle + ttl.wait %xf6 : !ttl.transfer_handle + ttl.wait %xf7 : !ttl.transfer_handle + ttl.wait %xf8 : !ttl.transfer_handle + ttl.wait %xf9 : !ttl.transfer_handle + ttl.wait %xf10 : !ttl.transfer_handle + ttl.wait %xf11 : !ttl.transfer_handle + ttl.wait %xf12 : !ttl.transfer_handle + ttl.wait %xf13 : !ttl.transfer_handle + ttl.wait %xf14 : !ttl.transfer_handle + ttl.wait %xf15 : !ttl.transfer_handle + ttl.wait %xf16 : !ttl.transfer_handle + func.return + } +} From 89153449d98860c8e1799fde61f9e0d45a99e715 Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Fri, 27 Feb 2026 11:29:29 +0100 Subject: [PATCH 3/4] [test] Update translation lit tests for TRID barrier mode Enable use-trid-barriers in TTLToCpp translation tests that verify 
TRID-specific C++ codegen output. Tests now explicitly request TRID mode to match their expected noc_async_*_set_trid and barrier_with_trid output. --- .../TTLToCpp/cb_to_tensor_single_tile_write.mlir | 5 +++-- .../Translate/TTLToCpp/dma_batched_single_tile.mlir | 9 ++++++--- .../TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir | 8 +++++--- .../Translate/TTLToCpp/dma_loop_single_tile.mlir | 13 +++++++------ .../dma_multi_tile_batched_in_user_loop.mlir | 9 ++++++--- .../Translate/TTLToCpp/dma_multi_tile_read.mlir | 5 +++-- .../dma_multi_tile_same_layout_different_cb.mlir | 8 +++++--- .../Translate/TTLToCpp/dma_single_tile_read.mlir | 5 +++-- .../TTLToCpp/loopback_full_single_tile.mlir | 8 +++++--- 9 files changed, 43 insertions(+), 27 deletions(-) diff --git a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir index 910e13824..d3879935e 100644 --- a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir +++ b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -17,8 +17,9 @@ // CHECK: auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR:v[0-9]+]] = TensorAccessor([[ARGS]], [[RT_ARG]], [[ADDR]]); // CHECK: int32_t [[CB_PTR:v[0-9]+]] = get_read_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_write_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_write_tile([[ZERO]], [[ACCESSOR]], [[CB_PTR]]); -// CHECK: noc_async_write_barrier(); +// CHECK: 
noc_async_write_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir index f902cc788..892fd98fd 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -18,15 +18,18 @@ // CHECK: auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<2, 0>(); // CHECK: TensorAccessor [[ACCESSOR0:v[0-9]+]] = TensorAccessor([[ARGS0]], [[RT_ARG0]], [[ADDR]]); // CHECK: int32_t [[CB_PTR0:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB_PTR0]]); // Tensor 1: get runtime arg, create accessor, get CB write ptr, async read // CHECK: int32_t [[RT_ARG1:v[0-9]+]] = get_common_arg_val({{v[0-9]+}}); // CHECK: auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<3, 1>(); // CHECK: TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG1]], [[ADDR]]); // CHECK: int32_t [[CB_PTR1:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(1)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]); -// Consecutive barriers deduplicated to single barrier. -// CHECK: noc_async_read_barrier(); +// Each wait lowers to a TRID barrier (no global barrier). 
+// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir b/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir index 194f185f5..6d963cc31 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -48,6 +48,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR1]]; // CHECK: size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_Y]] < [[TILES_2]]; [[TILE1_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE1_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_X]] < [[TILES_2]]; [[TILE1_X]] += [[TILE_STEP]]) { // CHECK: size_t [[TILE1_OFFSET_Y:v[0-9]+]] = [[TILE1_Y]] * [[TILES_2]]; @@ -63,7 +64,7 @@ // CHECK: noc_async_read_tile([[TILE1_OFFSET]], [[ACC1]], [[CB_ADDR1]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // Second copy: arg1 (96x64) → CB1, accessor with runtime arg index 1 // CHECK: int32_t [[RT_ARG2:v[0-9]+]] = get_common_arg_val([[TILE_STEP]]); // CHECK: auto [[ACC2_ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<3, 1>(); @@ -72,6 +73,7 
@@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR2]]; // CHECK: size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_Y]] < [[TILES_3]]; [[TILE2_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE2_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_X]] < [[TILES_2]]; [[TILE2_X]] += [[TILE_STEP]]) { // CHECK: size_t [[TILE2_OFFSET_Y:v[0-9]+]] = [[TILE2_Y]] * [[TILES_2]]; @@ -87,7 +89,7 @@ // CHECK: noc_async_read_tile([[TILE2_OFFSET]], [[ACC2]], [[CB_ADDR2]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } // CHECK: return; // CHECK-NEXT: } diff --git a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir index 88d1d349b..16165e6a1 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -21,17 +21,18 @@ // CHECK: auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR0:v[0-9]+]] = TensorAccessor([[ARGS0]], [[RT_ARG0]], [[ADDR]]); // CHECK: int32_t [[CB_PTR0:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB_PTR0]]); // CHECK: for (size_t 
[[IV:i[0-9]+]] = [[LB]]; [[IV]] < [[UB]]; [[IV]] += [[STEP]]) { -// In-loop copy: create accessor with runtime arg, get CB write ptr -// CHECK: int32_t [[RT_ARG1:v[0-9]+]] = get_common_arg_val([[LB]]); +// In-loop copy: create accessor using the same runtime arg and get CB write ptr // CHECK: auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); -// CHECK: TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG1]], [[ADDR]]); +// CHECK: TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG0]], [[ADDR]]); // CHECK: int32_t [[CB_PTR1:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]); -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir index ac49d3a96..b7aa1625b 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -49,6 +49,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR1]]; // CHECK: size_t 
[[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Tile loops: for tile_y in 0..2, for tile_x in 0..2 // CHECK: for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[LB]]; [[TILE1_Y]] < [[TILES_BOUND]]; [[TILE1_Y]] += [[STEP]]) { // CHECK: for (size_t [[TILE1_X:[a-z][0-9]+]] = [[LB]]; [[TILE1_X]] < [[TILES_BOUND]]; [[TILE1_X]] += [[STEP]]) { @@ -74,6 +75,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR2]]; // CHECK: size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Separate tile loops (same bounds 0..2 x 0..2 but not merged with first copy) // CHECK: for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[LB]]; [[TILE2_Y]] < [[TILES_BOUND]]; [[TILE2_Y]] += [[STEP]]) { // CHECK: for (size_t [[TILE2_X:[a-z][0-9]+]] = [[LB]]; [[TILE2_X]] < [[TILES_BOUND]]; [[TILE2_X]] += [[STEP]]) { @@ -91,8 +93,9 @@ // CHECK: } // CHECK: } -// Consecutive barriers deduplicated to single barrier. -// CHECK: noc_async_read_barrier(); +// Each wait lowers to a TRID barrier (no global barrier). 
+// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } // CHECK: return; // CHECK-NEXT: } diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir index 0f3d6a44a..0899250f3 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -26,6 +26,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR]]; // CHECK: size_t [[CB_PTR_IDX:v[0-9]+]] = (size_t) [[CB_PTR_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: for (size_t [[TILE_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE_Y]] < [[TILES_BOUND]]; [[TILE_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE_X]] < [[TILES_BOUND]]; [[TILE_X]] += [[TILE_STEP]]) { // CHECK: size_t [[TILE_OFFSET_Y:v[0-9]+]] = [[TILE_Y]] * [[TILES_BOUND]]; @@ -41,7 +42,7 @@ // CHECK: noc_async_read_tile([[TILE_OFFSET]], [[ACCESSOR]], [[CB_ADDR]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir index 0294328ad..135aa030c 100644 --- 
a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -32,6 +32,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR1]]; // CHECK: size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Generated tile loops iterate over tensor grid (2x2) // CHECK: for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_Y]] < [[TILES_BOUND]]; [[TILE1_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE1_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_X]] < [[TILES_BOUND]]; [[TILE1_X]] += [[TILE_STEP]]) { @@ -48,7 +49,7 @@ // CHECK: noc_async_read_tile([[TILE1_OFFSET]], [[ACC1]], [[CB_ADDR1]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // Second copy: 64x64 (2x2 tiles) → CB [4,1] - SAME tensor layout, DIFFERENT CB shape // CHECK: int32_t [[RT_ARG2:v[0-9]+]] = get_common_arg_val([[TILE_STEP]]); @@ -58,6 +59,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB_PTR2]]; // CHECK: size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Generated tile loops still iterate over tensor grid (2x2), not CB shape (4x1) // CHECK: for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_Y]] < [[TILES_BOUND]]; 
[[TILE2_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE2_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_X]] < [[TILES_BOUND]]; [[TILE2_X]] += [[TILE_STEP]]) { @@ -74,7 +76,7 @@ // CHECK: noc_async_read_tile([[TILE2_OFFSET]], [[ACC2]], [[CB_ADDR2]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } diff --git a/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir b/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir index 6211d84ea..0e6f191a6 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -17,8 +17,9 @@ // CHECK: auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR:v[0-9]+]] = TensorAccessor([[ARGS]], [[RT_ARG]], [[ADDR]]); // CHECK: int32_t [[CB_PTR:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR]], [[CB_PTR]]); -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir index 5306158c8..5b9f16a9b 100644 --- a/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir @@ 
-1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -22,15 +22,17 @@ // CHECK: auto [[ARGS_READ:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACC_READ:v[0-9]+]] = TensorAccessor([[ARGS_READ]], [[RT_ARG_R]], [[ADDR]]); // CHECK: int32_t [[CB_WRITE_PTR:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACC_READ]], [[CB_WRITE_PTR]]); -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // Write: CB → tensor (uses get_read_ptr for CB source) // CHECK: int32_t [[RT_ARG_W:v[0-9]+]] = get_common_arg_val([[STEP]]); // CHECK: auto [[ARGS_WRITE:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<2, 1>(); // CHECK: TensorAccessor [[ACC_WRITE:v[0-9]+]] = TensorAccessor([[ARGS_WRITE]], [[RT_ARG_W]], [[ADDR]]); // CHECK: int32_t [[CB_READ_PTR:v[0-9]+]] = get_read_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_write_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_write_tile([[ZERO]], [[ACC_WRITE]], [[CB_READ_PTR]]); -// CHECK: noc_async_write_barrier(); +// CHECK: noc_async_write_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } // CHECK: return; // CHECK-NEXT: } From 2dba10ef61c105e41dbdaefbf2fb07e9d3ecd9dd Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Fri, 27 Feb 2026 11:29:42 +0100 Subject: [PATCH 4/4] [test] Parameterize ME2E tests with use_trid_barriers option Add use_trid_barriers to E2EConfig and TestConfig to enable runtime testing of both barrier modes: - E2EConfig.use_trid_barriers 
controls pipeline pass option - TestConfig includes use_trid_barriers for test ID disambiguation - Pipeline builder forwards option to convert-ttl-to-ttkernel - Runner includes use_trid_barriers in kernel cache key - CONFIGS includes one TRID-enabled config for coverage Test IDs now include _trid suffix when use_trid_barriers=True to ensure unique pytest node IDs. --- test/me2e/README.md | 3 +++ test/me2e/builder/pipeline.py | 16 ++++++++++++++-- test/me2e/config.py | 3 +++ test/me2e/config_specs.py | 4 ++++ test/me2e/runner.py | 9 +++++++-- test/me2e/test_compute_ops.py | 5 ++++- 6 files changed, 35 insertions(+), 5 deletions(-) diff --git a/test/me2e/README.md b/test/me2e/README.md index e31954dbc..2c73a16c4 100644 --- a/test/me2e/README.md +++ b/test/me2e/README.md @@ -441,8 +441,11 @@ class E2EConfig: buffer_factor: int = 2 # 1=single, 2=double buffer (default) memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED buffer_type: BufferType = BufferType.DRAM + use_trid_barriers: bool = False # TRID-aware DMA barriers (pass option) ``` +`use_trid_barriers` enables the convert-ttl-to-ttkernel pass option `use-trid-barriers=1` for runtime coverage of TRID barrier lowering; some ME2E configs (e.g. in CONFIGS) set it to True. + ### Memory Configuration The `memory_layout` and `buffer_type` fields control MLIR layout attribute generation: diff --git a/test/me2e/builder/pipeline.py b/test/me2e/builder/pipeline.py index ed0c773a7..ca311fae8 100644 --- a/test/me2e/builder/pipeline.py +++ b/test/me2e/builder/pipeline.py @@ -17,7 +17,11 @@ from .device_arch import get_mock_arch_from_device -def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Module: +def compile_ttl_to_ttkernel( + module: Module, + device: Optional[Any] = None, + use_trid_barriers: bool = False, +) -> Module: """ Run the TTL-to-TTKernel pass pipeline on the module. 
@@ -26,6 +30,8 @@ def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Mod Args: module: TTL MLIR module to compile. device: Optional TTNN device for architecture detection. + use_trid_barriers: If True, use TRID-aware DMA barriers (pass option + use-trid-barriers=1). Default False matches pass default. Returns: Compiled module with TTKernel/EmitC ops. @@ -34,6 +40,12 @@ def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Mod mock_arch = get_mock_arch_from_device(device) device_pass = f"ttcore-register-device{{mock-system-desc-arch={mock_arch}}}" + ttkernel_pass = ( + "convert-ttl-to-ttkernel{use-trid-barriers=1}" + if use_trid_barriers + else "convert-ttl-to-ttkernel" + ) + pipeline_str = ( f"builtin.module(" f"{device_pass}," @@ -44,7 +56,7 @@ def compile_ttl_to_ttkernel(module: Module, device: Optional[Any] = None) -> Mod f"ttl-lower-to-loops," f"ttl-annotate-cb-associations)," # TTL to TTKernel conversion (module-level pass). - f"convert-ttl-to-ttkernel," + f"{ttkernel_pass}," f"canonicalize," f"cse," # Lower to EmitC. diff --git a/test/me2e/config.py b/test/me2e/config.py index 9df00d8c7..502ac691a 100644 --- a/test/me2e/config.py +++ b/test/me2e/config.py @@ -59,6 +59,9 @@ class E2EConfig: memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED buffer_type: BufferType = BufferType.DRAM + # TTL-to-TTKernel: use TRID-aware DMA barriers (default matches pass default). 
+ use_trid_barriers: bool = False + @property def num_tiles(self) -> int: """Total number of tiles in the grid.""" diff --git a/test/me2e/config_specs.py b/test/me2e/config_specs.py index 84fc9927f..1a126d77e 100644 --- a/test/me2e/config_specs.py +++ b/test/me2e/config_specs.py @@ -93,6 +93,7 @@ class TestConfig: num_tiles: int = 64 buffer_factor: int = 2 memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED + use_trid_barriers: bool = False def __str__(self) -> str: """ @@ -132,12 +133,15 @@ def to_e2e_config(self) -> E2EConfig: dtype=self.dtype, buffer_factor=self.buffer_factor, memory_layout=self.memory_layout, + use_trid_barriers=self.use_trid_barriers, ) CONFIGS = [ # Single tile config. TestConfig(num_tiles=1, block_h=1, block_w=1), # 1x1 grid (single tile) + # Single tile with TRID barriers (runtime coverage for use-trid-barriers). + TestConfig(num_tiles=1, block_h=1, block_w=1, use_trid_barriers=True), # Multi-tile configs with loop generation. TestConfig(num_tiles=4, block_h=2, block_w=2), # 2x2 grid (4 tiles) # TODO(#123): Enable 8x8 config once tile index lowering is fixed. diff --git a/test/me2e/runner.py b/test/me2e/runner.py index 28cfc9df3..6f8159010 100644 --- a/test/me2e/runner.py +++ b/test/me2e/runner.py @@ -47,6 +47,7 @@ def get_compute_kernel( """ cache_key = ( f"{op.name}_{op.ttl_op}_{config.block_h}x{config.block_w}_{config.dtype}" + f"_trid{int(config.use_trid_barriers)}" ) if cache_key in _kernel_cache: return _kernel_cache[cache_key] @@ -56,7 +57,9 @@ def get_compute_kernel( module = build_e2e_module(op.name, op.arity, e2e_config) # Run TTL pass pipeline to get EmitC. - compiled_module = compile_ttl_to_ttkernel(module, device) + compiled_module = compile_ttl_to_ttkernel( + module, device, e2e_config.use_trid_barriers + ) # Translate to C++ kernels. noc_kernels, compute_kernel = translate_module_to_kernels(compiled_module) @@ -111,7 +114,9 @@ def run_compute_test( # 3. Build full ME2E module to get reader/writer kernels. 
# We need the full module to extract all kernels (reader, compute, writer). module = build_e2e_module(op.name, op.arity, e2e_config) - compiled_module = compile_ttl_to_ttkernel(module, device) + compiled_module = compile_ttl_to_ttkernel( + module, device, e2e_config.use_trid_barriers + ) noc_kernels, compute_kernel_spec = translate_module_to_kernels(compiled_module) # Replace compute kernel source with cached/generated one. diff --git a/test/me2e/test_compute_ops.py b/test/me2e/test_compute_ops.py index 8582be01b..952b74926 100644 --- a/test/me2e/test_compute_ops.py +++ b/test/me2e/test_compute_ops.py @@ -23,7 +23,10 @@ @pytest.mark.parametrize( "config", CONFIGS, - ids=lambda c: f"{c.block_h}x{c.block_w}_buf{c.buffer_factor}_{c.memory_layout.value}", + ids=lambda c: ( + f"{c.block_h}x{c.block_w}_buf{c.buffer_factor}_{c.memory_layout.value}" + + ("_trid" if c.use_trid_barriers else "") + ), ) @pytest.mark.parametrize("dtype", get_test_dtypes(), ids=get_dtype_ids()) @pytest.mark.requires_device