From 2c5b527042f2e679ea12c159def90469b80fe61e Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Tue, 5 May 2026 06:54:30 +0200 Subject: [PATCH 1/7] [ttl] Add TRID-aware DMA barrier lowering option Add use-trid-barriers option to convert-ttl-to-ttkernel pass and ttl-to-ttkernel-pipeline. When enabled, ttl.copy emits noc_async_{read,write}_set_trid before DMA operations, and ttl.wait emits noc_async_{read,write}_barrier_with_trid instead of global barriers. Default behavior (use-trid-barriers=false) preserves existing global barrier semantics from main branch. Key changes: - TridAllocator class manages 16 TRID slots with overflow handling - lowerTensorCBCopy unified function supports both modes - CopyLowering/WaitLowering patterns respect useTridBarriers flag - TTKernel cleanup patterns conditionally registered for TRID mode - SCF structural type conversions enabled for transfer handle types TODO: Profile both modes on representative benchmarks and consider changing the default. --- .../Transforms/TTKernelCleanupPatterns.h | 8 +- include/ttlang/Dialect/TTL/Passes.td | 19 +- .../Dialect/TTL/Pipelines/TTLPipelines.h | 4 + .../Transforms/TTKernelCleanupPatterns.cpp | 47 +++- lib/Dialect/TTL/Pipelines/TTLPipelines.cpp | 1 + .../TTL/Transforms/ConvertTTLToTTKernel.cpp | 215 ++++++++++++++---- 6 files changed, 249 insertions(+), 45 deletions(-) diff --git a/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h b/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h index df733726e..be6873142 100644 --- a/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h +++ b/include/ttlang/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.h @@ -9,8 +9,12 @@ namespace mlir::tt::ttkernel { -/// Populate cleanup patterns for TTKernel ops. -void populateTTKernelCleanupPatterns(RewritePatternSet &patterns); +/// Populate cleanup patterns for TTKernel ops. 
These patterns optimize +/// TTKernel code by removing redundant operations (e.g., deduplicating +/// consecutive barriers of the same type). +/// When useTridBarriers is true, also adds TRID-barrier deduplication patterns. +void populateTTKernelCleanupPatterns(RewritePatternSet &patterns, + bool useTridBarriers = false); } // namespace mlir::tt::ttkernel diff --git a/include/ttlang/Dialect/TTL/Passes.td b/include/ttlang/Dialect/TTL/Passes.td index da500af5c..38840ef78 100644 --- a/include/ttlang/Dialect/TTL/Passes.td +++ b/include/ttlang/Dialect/TTL/Passes.td @@ -119,16 +119,29 @@ def TTLInsertCopyWait def TTLConvertTTLToTTKernel : Pass<"convert-ttl-to-ttkernel", "::mlir::ModuleOp"> { - let summary = "Lower TTL DMA ops to TTKernel using global barriers (temporary)"; + let summary = "Lower TTL DMA ops to TTKernel noc ops"; let description = [{ - Converts TTL DMA ops to TTKernel noc ops. Uses global barriers until TRID - barriers are available. Covers bind_cb, copy, wait MVP path. + Converts TTL DMA ops to TTKernel noc ops. Covers bind_cb, copy, wait MVP + path. + + Two lowering modes are supported: + - Default: global barriers (noc_async_{read,write}_barrier). + - Optional: TRID-aware barriers (noc_async_*_set_trid + + noc_async_*_barrier_with_trid). + + TODO(ttl): Profile both modes on representative benchmarks and consider + changing the default. TODO(ttl): Refine lowering to emit real CB handles and proper NOC addresses. Issue: #77 (umbrella issue with subtasks #78-#89). }]; let options = [ + Option<"useTridBarriers", "use-trid-barriers", "bool", "false", + "Use TRID-aware DMA waits (barrier_with_trid) instead of global barriers. " + "TRID must be unique per outstanding copy; ordering of TRID values is not " + "semantically significant. 
Generated TRIDs may be nondeterministic when " + "patterns are applied in parallel.">, Option<"reduceFullFp32", "reduce-full-fp32", "bool", "true", "Enable FP32 accumulation for reduce operations."> ]; diff --git a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h index 611a10772..cc2e310dc 100644 --- a/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h +++ b/include/ttlang/Dialect/TTL/Pipelines/TTLPipelines.h @@ -54,6 +54,10 @@ struct TTLToTTKernelPipelineOptions "computations. When disabled, emit an error if any " "operation requires a compiler-allocated DFB."), llvm::cl::init(true)}; + Option useTridBarriers{ + *this, "use-trid-barriers", + llvm::cl::desc("Use TRID-aware DMA waits (barrier_with_trid)."), + llvm::cl::init(false)}; }; void createTTLToTTKernelPipeline(mlir::OpPassManager &pm, diff --git a/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp b/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp index 295955039..d696f9cda 100644 --- a/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp +++ b/lib/Dialect/TTKernel/Transforms/TTKernelCleanupPatterns.cpp @@ -7,6 +7,7 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/Support/LogicalResult.h" #include "ttmlir/Dialect/TTKernel/IR/TTKernelOps.h" +#include "llvm/ADT/STLExtras.h" namespace mlir::tt::ttkernel { @@ -31,13 +32,57 @@ struct DeduplicateConsecutiveBarriers : OpRewritePattern { } }; +/// Deduplicate consecutive TRID barriers of the same type *only* when they +/// target the same TRID (and optional NOC). Unlike global barriers, barriers +/// with different TRIDs are not redundant and must not be removed. 
+template +struct DeduplicateConsecutiveTridBarriers + : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BarrierWithTridOp op, + PatternRewriter &rewriter) const override { + auto *prev = op->getPrevNode(); + if (!prev) { + return failure(); + } + auto prevBarrier = dyn_cast(prev); + if (!prevBarrier) { + return failure(); + } + + if (op->getNumOperands() != prevBarrier->getNumOperands()) { + return failure(); + } + + for (auto [a, b] : + llvm::zip_equal(op->getOperands(), prevBarrier->getOperands())) { + if (a != b) { + return failure(); + } + } + + rewriter.eraseOp(op); + return success(); + } +}; + } // namespace -void populateTTKernelCleanupPatterns(RewritePatternSet &patterns) { +void populateTTKernelCleanupPatterns(RewritePatternSet &patterns, + bool useTridBarriers) { patterns.add>( patterns.getContext()); patterns.add>( patterns.getContext()); + if (useTridBarriers) { + patterns + .add>( + patterns.getContext()); + patterns.add< + DeduplicateConsecutiveTridBarriers>( + patterns.getContext()); + } } } // namespace mlir::tt::ttkernel diff --git a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp index 3204992bb..0d6185656 100644 --- a/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp +++ b/lib/Dialect/TTL/Pipelines/TTLPipelines.cpp @@ -55,6 +55,7 @@ void createTTLToTTKernelPipeline(OpPassManager &pm, pm.addPass(createTTLErasePipeNetScopes()); { TTLConvertTTLToTTKernelOptions ttkOpts; + ttkOpts.useTridBarriers = options.useTridBarriers; ttkOpts.reduceFullFp32 = options.reduceFullFp32; pm.addPass(createTTLConvertTTLToTTKernel(ttkOpts)); } diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp index dc21fe501..667732861 100644 --- a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp +++ b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/Arith/Utils/Utils.h" #include 
"mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" #include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/BuiltinDialect.h" @@ -68,17 +69,30 @@ class TTLToTTKernelTypeConverter : public TypeConverter { t.getElementType()); }); // Tensor -> TensorAccessor for TTKernel when TTL layout is present. - addConversion([](RankedTensorType t) -> Type { + addConversion([this](RankedTensorType t) -> Type { if (t.getEncoding() && mlir::isa(t.getEncoding())) { return ttk::TensorAccessorType::get(t.getContext()); } + // Otherwise, preserve tensor shape/encoding but convert element type. + // This is required for cases like tensor> + // becoming tensor once transfer handles are type-converted. + auto convertedElemTy = this->convertType(t.getElementType()); + if (!convertedElemTy) { + return t; + } + if (convertedElemTy == t.getElementType()) { + return t; + } + return mlir::cast(t.clone(convertedElemTy)); + }); + // Identity fallback must be last, but also handle conversion of transfer + // handles to TRID SSA values (i32). + addConversion([](Type t) -> Type { + if (llvm::isa(t)) { + return IntegerType::get(t.getContext(), 32); + } return t; }); - // Preserve transfer handle types so ttl.wait can inspect transfer - // direction. TRID-aware lowering will be added later. - addConversion([](TransferHandleType t) -> Type { return t; }); - // Identity fallback must be last. 
- addConversion([](Type t) { return t; }); auto castMaterialization = [](OpBuilder &builder, Type resultType, ValueRange inputs, Location loc) -> Value { @@ -490,8 +504,8 @@ static CopyOperandKind classifyOperand(Value v) { return CopyOperandKind::Unknown; } -static Value makeZeroI32(Location loc, ConversionPatternRewriter &rewriter) { - return arith::ConstantIntOp::create(rewriter, loc, 0, 32); +static Value makeZeroI8(Location loc, ConversionPatternRewriter &rewriter) { + return rewriter.create(loc, 0, 8); } static std::optional getTransferKindFromHandleType(Type t) { @@ -651,14 +665,58 @@ static Value linearizeNDIndex(OpBuilder &builder, Location loc, return result; } +/// Allocates TRIDs for DMA barriers. TRIDs wrap at 16 (4-bit hardware limit). +/// Tracks which TRIDs are in use by lowered copies and their transfer +/// direction. This bookkeeping must stay independent of greedy pattern rewrite +/// visitation order; therefore, it is only mutated during copy lowering. +/// When a TRID would be reused while still in use, the caller must emit a +/// barrier for the old transfer before reassigning. +/// +/// TODO: Profile both modes on representative benchmarks and consider changing +/// the default. +class TridAllocator { +public: + static constexpr uint32_t kNumTrids = 16; + + struct AllocResult { + uint32_t trid; + /// If set, this TRID was still outstanding from a previous copy. The caller + /// must emit a barrier_with_trid for this direction before reusing. 
+ std::optional evictDirection; + }; + + AllocResult allocateTrid(TransferKind direction) { + uint32_t trid = nextTrid % kNumTrids; + AllocResult result{trid, std::nullopt}; + if (outstanding[trid]) { + result.evictDirection = direction_[trid]; + } + outstanding[trid] = true; + direction_[trid] = direction; + ++nextTrid; + return result; + } + + void releaseTrid(uint32_t trid) { outstanding[trid % kNumTrids] = false; } + +private: + uint32_t nextTrid = 0; + bool outstanding[kNumTrids] = {}; + TransferKind direction_[kNumTrids] = {}; +}; + /// Direction of a tensor<->CB tile copy for NOC operations. enum class NocCopyDirection { Read, Write }; /// Lower a tensor_slice<->CB copy in the given direction. /// Read: tensor_slice -> CB (noc_async_read_tile, get_write_ptr) /// Write: CB -> tensor_slice (noc_async_write_tile, get_read_ptr) +/// +/// When useTridBarriers is true, emits noc_async_{read,write}_set_trid before +/// the tile loop to tag NOC operations with the given TRID. static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp, Value cb, NocCopyDirection direction, + Value tridVal, bool useTridBarriers, ConversionPatternRewriter &rewriter, const TypeConverter &typeConverter) { auto loc = op.getLoc(); @@ -727,6 +785,17 @@ static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp, SmallVector cbBounds(cbShape.begin(), cbShape.end()); + // Tag subsequent NOC operations with this copy's TRID. + // Currently fixed to NOC 0. TODO(ttl): Generalize NOC selection (issue #77). 
+ if (useTridBarriers) { + Value nocVal = makeZeroI8(loc, rewriter); + if (isRead) { + rewriter.create(loc, tridVal, nocVal); + } else { + rewriter.create(loc, tridVal, nocVal); + } + } + emitTileLoop( rewriter, loc, cbBounds, [&](OpBuilder &b, Location bodyLoc, ValueRange cbIVs) { @@ -763,7 +832,7 @@ static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp, } }); - rewriter.replaceOp(op, makeZeroI32(loc, rewriter)); + rewriter.replaceOp(op, tridVal); return success(); } @@ -786,8 +855,10 @@ struct TensorSliceLowering : OpConversionPattern { struct CopyLowering : OpConversionPattern { CopyLowering(const TypeConverter &typeConverter, MLIRContext *context, - const PipeGraph *pipeGraph) - : OpConversionPattern(typeConverter, context), pipeGraph(pipeGraph) {} + const PipeGraph *pipeGraph, TridAllocator *tridAllocator, + bool useTridBarriers) + : OpConversionPattern(typeConverter, context), pipeGraph(pipeGraph), + tridAllocator(tridAllocator), useTridBarriers(useTridBarriers) {} LogicalResult matchAndRewrite(CopyOp op, OpAdaptor adaptor, @@ -852,6 +923,38 @@ struct CopyLowering : OpConversionPattern { }); } + if (!tridAllocator) { + return rewriter.notifyMatchFailure(op, "missing TRID allocator"); + } + + TransferKind direction = + (srcIsSlice && dstIsCB) ? TransferKind::read : TransferKind::write; + + Value tridVal; + if (useTridBarriers) { + auto allocResult = tridAllocator->allocateTrid(direction); + // If this TRID was still outstanding, emit a barrier to drain the old + // transfer before reusing the TRID. 
+ if (allocResult.evictDirection) { + Value evictTrid = rewriter.create( + op.getLoc(), allocResult.trid, 32); + Value nocVal = makeZeroI8(op.getLoc(), rewriter); + if (*allocResult.evictDirection == TransferKind::read) { + rewriter.create( + op.getLoc(), evictTrid, nocVal); + } else { + rewriter.create( + op.getLoc(), evictTrid, nocVal); + } + } + tridVal = rewriter.create(op.getLoc(), + allocResult.trid, 32); + } else { + // In global-barrier mode, allocate but direction does not matter. + tridAllocator->allocateTrid(direction); + tridVal = rewriter.create(op.getLoc(), 0, 32); + } + // TensorSlice -> CB: read tiles from tensor into circular buffer. if (srcIsSlice && dstIsCB) { auto sliceOp = src.getDefiningOp(); @@ -860,8 +963,8 @@ struct CopyLowering : OpConversionPattern { op, "tensor_slice source must come from ttl.tensor_slice op"); } return lowerTensorCBCopy(op, sliceOp, adaptor.getDst(), - NocCopyDirection::Read, rewriter, - *typeConverter); + NocCopyDirection::Read, tridVal, useTridBarriers, + rewriter, *typeConverter); } // CB -> TensorSlice: write tiles from circular buffer to tensor. 
@@ -871,46 +974,69 @@ struct CopyLowering : OpConversionPattern { op, "tensor_slice destination must come from ttl.tensor_slice op"); } return lowerTensorCBCopy(op, sliceOp, adaptor.getSrc(), - NocCopyDirection::Write, rewriter, *typeConverter); + NocCopyDirection::Write, tridVal, useTridBarriers, + rewriter, *typeConverter); } private: const PipeGraph *pipeGraph; + TridAllocator *tridAllocator = nullptr; + bool useTridBarriers = false; }; struct WaitLowering : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + WaitLowering(const TypeConverter &typeConverter, MLIRContext *ctx, + bool useTridBarriers) + : OpConversionPattern(typeConverter, ctx), + useTridBarriers(useTridBarriers) {} LogicalResult matchAndRewrite(WaitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - // TODO(ttl): Lower ttl.wait to TRID-specific barriers keyed by the transfer - // handle (read vs write barrier based on transfer direction). Issue: #87. - // - // MVP behavior: emit the corresponding global barrier based on transfer - // direction. Untyped handles (no kind) are no-ops - used for pipe receives - // where data arrives via multicast and no local barrier is needed. - auto kind = getTransferKindFromHandleType(adaptor.getXf().getType()); + // Emit TRID-specific barriers keyed by the transfer handle when enabled; + // otherwise emit global barriers. Transfer direction is read from the + // original operand type (not the converted i32). Untyped handles (no kind) + // are no-ops (e.g. pipe receive via multicast). + auto kind = getTransferKindFromHandleType(op.getXf().getType()); if (!kind) { // No transfer kind means no barrier needed (e.g., pipe receive where // data arrives via multicast from source core). 
rewriter.eraseOp(op); return success(); } - if (*kind == TransferKind::read) { - ttk::NocAsyncReadBarrierOp::create(rewriter, op.getLoc()); - } else if (*kind == TransferKind::write) { - ttk::NocAsyncWriteBarrierOp::create(rewriter, op.getLoc()); + if (useTridBarriers) { + Value tridVal = adaptor.getXf(); // i32 (type converter guarantees this) + assert(tridVal.getType().isInteger(32) && + "transfer handle must be type-converted to i32 before ttl.wait"); + Value nocVal = makeZeroI8(op.getLoc(), rewriter); + if (*kind == TransferKind::read) { + rewriter.create(op.getLoc(), + tridVal, nocVal); + } else if (*kind == TransferKind::write) { + rewriter.create(op.getLoc(), + tridVal, nocVal); + } else { + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "unsupported TransferKind for ttl.wait lowering"; + }); + } } else { - // Future-proofing: TransferKind is currently {read, write}, but fail - // explicitly if it ever expands without updating the lowering. - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "unsupported TransferKind for ttl.wait lowering"; - }); + if (*kind == TransferKind::read) { + rewriter.create(op.getLoc()); + } else if (*kind == TransferKind::write) { + rewriter.create(op.getLoc()); + } else { + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "unsupported TransferKind for ttl.wait lowering"; + }); + } } rewriter.eraseOp(op); return success(); } + +private: + bool useTridBarriers = false; }; //===----------------------------------------------------------------------===// @@ -1009,7 +1135,7 @@ struct FuncKernelFinalize : OpRewritePattern { static LogicalResult lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx, TTLToTTKernelTypeConverter &typeConverter, - StringRef passName) { + bool useTridBarriers, StringRef passName) { ConversionTarget target(ctx); target.addIllegalDialect(); target.addLegalDialect(typeConverter, &ctx, &pipeGraph); - patterns.add( + TridAllocator 
tridAllocator; + patterns.add(typeConverter, &ctx); + patterns.add(typeConverter, &ctx); + patterns.add(typeConverter, &ctx, &pipeGraph, &tridAllocator, + useTridBarriers); + patterns.add(typeConverter, &ctx, useTridBarriers); + patterns + .add( + typeConverter, &ctx); + patterns.add( typeConverter, &ctx); populatePipeLoweringPatterns(patterns, typeConverter); + + // Convert scf.for/scf.if/etc region signatures when result/iter_arg types + // change due to the type converter. + mlir::scf::populateSCFStructuralTypeConversionsAndLegality(typeConverter, + patterns, target); populateFunctionOpInterfaceTypeConversionPattern( func::FuncOp::getOperationName(), patterns, typeConverter); @@ -1075,7 +1211,7 @@ lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx, // Apply post-conversion cleanup patterns (e.g., barrier deduplication). RewritePatternSet cleanupPatterns(&ctx); - ttkernel::populateTTKernelCleanupPatterns(cleanupPatterns); + ttkernel::populateTTKernelCleanupPatterns(cleanupPatterns, useTridBarriers); if (failed(applyPatternsGreedily(mod, std::move(cleanupPatterns)))) { return failure(); } @@ -1297,7 +1433,8 @@ struct TTLConvertTTLToTTKernelPass expandDstSections(mod); // Phase 1: Lower TTL ops to TTKernel (bind_cb, copy, wait, cb ops, store) - if (failed(lowerTTLOpsToTTKernel(mod, ctx, typeConverter, getName()))) { + if (failed(lowerTTLOpsToTTKernel(mod, ctx, typeConverter, useTridBarriers, + getName()))) { signalPassFailure(); return; } From a2f11d249d4625f18111e03c841a3988931abb5b Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Tue, 5 May 2026 06:54:31 +0200 Subject: [PATCH 2/7] [test] Add lit tests for TRID and global barrier modes - trid_barriers.mlir: Tests TRID-aware lowering with use-trid-barriers=true - Verifies noc_async_{read,write}_set_trid emission - Verifies noc_async_{read,write}_barrier_with_trid emission - Tests TRID overflow handling (17 copies without waits) - dma_global_barriers.mlir: Tests default global barrier mode - Verifies 
noc_async_{read,write}_barrier emission (no TRID) - Ensures backward compatibility with main branch behavior - Update existing tests to use explicit use-trid-barriers=true where they expect TRID-specific output --- .../TTLToTTKernel/compute_fused_chain.mlir | 1 - .../TTLToTTKernel/dma_global_barriers.mlir | 117 ++++++++++++++++ .../TTLToTTKernel/dma_single_core.mlir | 124 +++++++++-------- .../TTLToTTKernel/loopback_dram_copy.mlir | 17 ++- .../TTLToTTKernel/trid_barriers.mlir | 130 ++++++++++++++++++ 5 files changed, 325 insertions(+), 64 deletions(-) create mode 100644 test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir create mode 100644 test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir diff --git a/test/ttlang/Conversion/TTLToTTKernel/compute_fused_chain.mlir b/test/ttlang/Conversion/TTLToTTKernel/compute_fused_chain.mlir index aef30b6ad..c661bc307 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/compute_fused_chain.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/compute_fused_chain.mlir @@ -100,7 +100,6 @@ // SFPU-NOT: ttl.attach_cb // SFPU-NOT: ttl.copy_tile // SFPU-NOT: ttkernel.add_tiles - func.func @fused_chain_lowering(%a: tensor<2x2x!ttcore.tile<32x32, f32>>, %b: tensor<2x2x!ttcore.tile<32x32, f32>>) -> tensor<2x2x!ttcore.tile<32x32, f32>> { diff --git a/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir b/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir new file mode 100644 index 000000000..764fff63f --- /dev/null +++ b/test/ttlang/Conversion/TTLToTTKernel/dma_global_barriers.mlir @@ -0,0 +1,117 @@ +// RUN: ttlang-opt --convert-ttl-to-ttkernel --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=GLOBAL +// Summary: Verify the default (non-TRID) code path emits global NOC barriers. +// Companion to dma_single_core.mlir which tests use-trid-barriers=1. 
+ +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Single-tile read: default mode emits noc_async_read_barrier (no TRID ops). +// GLOBAL-LABEL: func.func @global_single_tile_read +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_barrier() : () -> () +// GLOBAL-NOT: ttkernel.noc_async_read_set_trid +// GLOBAL-NOT: ttkernel.noc_async_read_barrier_with_trid +// GLOBAL-NOT: ttkernel.noc_async_write_barrier +module { + func.func @global_single_tile_read(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf = ttl.copy %slice, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Single-tile write: default mode emits noc_async_write_barrier (no TRID ops). 
+// GLOBAL-LABEL: func.func @global_single_tile_write +// GLOBAL: ttkernel.noc_async_write_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_write_barrier() : () -> () +// GLOBAL-NOT: ttkernel.noc_async_write_set_trid +// GLOBAL-NOT: ttkernel.noc_async_write_barrier_with_trid +// GLOBAL-NOT: ttkernel.noc_async_read_barrier +module { + func.func @global_single_tile_write(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf = ttl.copy %cb, %slice : (!ttl.cb<[1, 1], f32, 2>, tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Batched reads: consecutive global barriers are deduplicated to a single barrier. 
+// GLOBAL-LABEL: func.func @global_batched_reads +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_barrier() : () -> () +// GLOBAL-NOT: ttkernel.noc_async_read_barrier +// GLOBAL-NOT: ttkernel.noc_async_read_set_trid +// GLOBAL-NOT: ttkernel.noc_async_read_barrier_with_trid +module { + func.func @global_batched_reads(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice0 = ttl.tensor_slice %t0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %slice1 = ttl.tensor_slice %t1[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf0 = ttl.copy %slice0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %xf1 = ttl.copy %slice1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf0 : !ttl.transfer_handle + ttl.wait %xf1 : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > + +// Loopback copy: read then write in a loop uses global barriers for both. 
+// GLOBAL-LABEL: func.func @global_loopback +// GLOBAL: scf.for +// GLOBAL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_read_barrier() : () -> () +// GLOBAL: ttkernel.noc_async_write_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () +// GLOBAL: ttkernel.noc_async_write_barrier() : () -> () +// GLOBAL-NOT: noc_async_read_set_trid +// GLOBAL-NOT: noc_async_write_set_trid +// GLOBAL-NOT: barrier_with_trid +module { + func.func @global_loopback(%src: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, + %dst: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) + attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + + %src_slice = ttl.tensor_slice %src[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %dst_slice = ttl.tensor_slice %dst[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + scf.for %i = %c0 to %c4 step %c1 { + %xf_r = ttl.copy %src_slice, %cb + : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) + -> !ttl.transfer_handle + ttl.wait %xf_r : !ttl.transfer_handle + + %xf_w = ttl.copy %cb, %dst_slice + : (!ttl.cb<[1, 1], f32, 2>, tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) + -> !ttl.transfer_handle + ttl.wait %xf_w : !ttl.transfer_handle + } + + func.return + } +} diff --git a/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir b/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir index 9160585b4..23d0fe4ab 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/dma_single_core.mlir @@ -1,10 +1,9 @@ -// RUN: ttlang-opt --allow-unregistered-dialect 
--convert-ttl-to-ttkernel --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL +// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL // Summary: MVP DMA lowering tests for tensor<->CB copies (no pipes). -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> -#layout_tile = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > +#layout_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // TTKERNEL-LABEL: func.func @dma_single_tile_single_copy // TTKERNEL-DAG: %[[C0_IDX:.*]] = arith.constant 0 : index @@ -13,9 +12,11 @@ // TTKERNEL: %[[SRC_ARGS:.*]] = ttkernel.TensorAccessorArgs({{.*}}) // TTKERNEL: %[[SRC_ACC:.*]] = ttkernel.TensorAccessor(%[[SRC_ARGS]], %[[BANK_BASE]], {{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_PTR:.*]] = ttkernel.get_write_ptr(%[[CB]]) : (!ttkernel.cb<2, !ttcore.tile<32x32, f32>>) -> i32 +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, %[[SRC_ACC]], %[[CB_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_single_tile_single_copy(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index 
@@ -29,8 +30,8 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // TTKERNEL-LABEL: func.func @cb_to_tensor // TTKERNEL-DAG: %[[C0_IDX:.*]] = arith.constant 0 : index @@ -39,9 +40,11 @@ module { // TTKERNEL: %[[DST_ARGS:.*]] = ttkernel.TensorAccessorArgs({{.*}}) // TTKERNEL: %[[DST_ACC:.*]] = ttkernel.TensorAccessor(%[[DST_ARGS]], %[[BANK_BASE]], {{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_PTR:.*]] = ttkernel.get_read_ptr(%[[CB]]) : (!ttkernel.cb<2, !ttcore.tile<32x32, f32>>) -> i32 +// TTKERNEL: ttkernel.noc_async_write_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_write_tile({{.*}}, %[[DST_ACC]], %[[CB_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () module { func.func @cb_to_tensor(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -55,8 +58,8 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Batched transfer pattern: issue multiple transfers, then wait on all of them. // Mirrors TT-Metal kernels that batch NOC async operations for throughput. 
@@ -72,10 +75,12 @@ module { // TTKERNEL: ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: ttkernel.get_write_ptr({{.*}}) : (!ttkernel.cb<2, !ttcore.tile<32x32, f32>>) -> i32 // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () -// Consecutive barriers are deduplicated to a single barrier. -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// Each ttl.wait lowers to a TRID-specific barrier; different TRIDs must not be +// deduplicated. +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_batched(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -93,8 +98,8 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Pipelined loop pattern: wait on the previous transfer while issuing the next. 
// This approximates "copies in one loop, waits in another" by separating the wait @@ -106,10 +111,11 @@ module { // TTKERNEL: scf.for {{.*}} { // TTKERNEL: ttkernel.get_write_ptr({{.*}}) : (!ttkernel.cb<2, !ttcore.tile<32x32, f32>>) -> i32 // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () // TTKERNEL: } -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_pipelined_loop(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -131,8 +137,8 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Two-phase pattern: issue all copies in one loop, then wait on all handles in // a second loop. 
This mirrors TT-Metal kernels that batch NOC async ops and then @@ -149,9 +155,11 @@ module { // TTKERNEL: scf.yield %[[INS]] : tensor> // TTKERNEL: } // TTKERNEL: scf.for {{.*}} { -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL: %[[XF_I32:.*]] = builtin.unrealized_conversion_cast {{.*}} : !ttl.transfer_handle to i32 +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[XF_I32]], {{.*}}) : (i32, i8) -> () // TTKERNEL: } -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_single_tile_two_phase_loops(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -177,16 +185,17 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Corner case: waiting twice on the same transfer handle is allowed, but -// consecutive barriers are deduplicated to a single barrier. +// consecutive barriers are deduplicated to a single TRID barrier. 
// // TTKERNEL-LABEL: func.func @dma_single_tile_double_wait -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier_with_trid +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_single_tile_double_wait(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -201,16 +210,17 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Corner case: one-element handle batching via tensor.insert, then waiting // outside of a loop. 
// // TTKERNEL-LABEL: func.func @dma_single_tile_single_element_container // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, {{.*}}, {{.*}}) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () // TTKERNEL: return module { func.func @dma_single_tile_single_element_container(%t: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { @@ -234,10 +244,9 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> -#layout_tile = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > +#layout_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Multi-tile read should emit nested scf.for over tile grid with correct offset computation. 
// Tensor: 64x64xf32 (2x2 tiles), CB: [1,1] (single tile) @@ -265,8 +274,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_read_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_multi_tile_read(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout_tile>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -280,10 +290,9 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> -#layout_tile = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > +#layout_tile = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Multi-tile write should emit nested scf.for over tile grid with correct offset computation. 
// Tensor: 64x64xf32 (2x2 tiles), CB: [1,1] (single tile) @@ -311,8 +320,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_write_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () module { func.func @dma_multi_tile_write(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout_tile>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -326,8 +336,8 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Multi-tile read with larger CB shape still loops over tile grid with correct offset computation. 
// Tensor: 64x64xf32 (2x2 tiles), CB: [2,1] (2x1 tiles) @@ -356,8 +366,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_read_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_write_barrier +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () module { func.func @dma_multi_tile_read_cb_shape(%arg0: tensor<2x2x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index @@ -371,8 +382,8 @@ module { // ----- -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // Rectangular multi-tile write to exercise non-square tile grids (96x64 = 3x2 tiles) with correct offset computation. 
// Tensor: 96x64xf32 (3x2 tiles - 3 rows, 2 columns), CB: [1,1] (single tile) @@ -402,8 +413,9 @@ module { // TTKERNEL: %[[TILE_OFFSET_I32:.*]] = arith.index_cast %[[TILE_OFFSET_X]] : index to i32 // TTKERNEL: %[[CB_ADDR:.*]] = arith.index_cast %[[CB_ADDR_IDX]] : index to i32 // TTKERNEL: ttkernel.noc_async_write_tile(%[[TILE_OFFSET_I32]], %[[ACC]], %[[CB_ADDR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () -// TTKERNEL-NOT: ttkernel.noc_async_read_barrier +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () module { func.func @dma_multi_tile_write_rect(%arg0: tensor<3x2x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { %c0 = arith.constant 0 : index diff --git a/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir b/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir index 7f8944500..d8adedd31 100644 --- a/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir +++ b/test/ttlang/Conversion/TTLToTTKernel/loopback_dram_copy.mlir @@ -1,9 +1,10 @@ -// RUN: ttlang-opt --allow-unregistered-dialect --convert-ttl-to-ttkernel --canonicalize --cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL -// Summary: Lower a loopback DRAM copy (read → wait → write → wait in a loop) -// to TTKernel using global NOC barriers (TRID ops not yet available). +// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize --cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL +// Summary: Lower a loopback DRAM copy (read -> wait -> write -> wait in a loop) +// to TTKernel using TRID-specific NOC barriers. 
-#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, + memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // TTKERNEL-LABEL: func.func @loopback_dram_copy // Verify runtime args and CB pointers are used for both read and write operations. @@ -12,14 +13,16 @@ // TTKERNEL: ttkernel.get_common_arg_val({{.*}}) : (index) -> i32 // TTKERNEL: %[[ACC_R:.*]] = ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_W_PTR:.*]] = ttkernel.get_write_ptr({{.*}}) : (!ttkernel.cb<2, !ttcore.tile<32x32, f32>>) -> i32 +// TTKERNEL: ttkernel.noc_async_read_set_trid({{.*}}, {{.*}}) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_read_tile({{.*}}, %[[ACC_R]], %[[CB_W_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid({{.*}}, {{.*}}) : (i32, i8) -> () // Write: runtime arg for dst tensor, accessor, read ptr for CB // TTKERNEL: ttkernel.get_common_arg_val({{.*}}) : (index) -> i32 // TTKERNEL: %[[ACC_W:.*]] = ttkernel.TensorAccessor({{.*}}) : (!ttkernel.TensorAccessorArgs, i32, i32) -> !ttkernel.TensorAccessor // TTKERNEL: %[[CB_R_PTR:.*]] = ttkernel.get_read_ptr({{.*}}) : (!ttkernel.cb<2, !ttcore.tile<32x32, f32>>) -> i32 +// TTKERNEL: ttkernel.noc_async_write_set_trid({{.*}}, {{.*}}) : (i32, i8) -> () // TTKERNEL: ttkernel.noc_async_write_tile({{.*}}, %[[ACC_W]], %[[CB_R_PTR]]) : (i32, !ttkernel.TensorAccessor, i32) -> () -// TTKERNEL: ttkernel.noc_async_write_barrier() : () -> () +// TTKERNEL: ttkernel.noc_async_write_barrier_with_trid({{.*}}, {{.*}}) : (i32, i8) -> () module { func.func @loopback_dram_copy(%src: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, diff --git a/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir b/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir new file mode 
100644 index 000000000..bb110a44c --- /dev/null +++ b/test/ttlang/Conversion/TTLToTTKernel/trid_barriers.mlir @@ -0,0 +1,130 @@ +// RUN: ttlang-opt --convert-ttl-to-ttkernel="use-trid-barriers=1" --canonicalize -cse --split-input-file %s | FileCheck %s --check-prefix=TTKERNEL +// Summary: Regression tests for TRID-aware ttl.copy/ttl.wait lowering. + +#dram = #ttnn.buffer_type<dram> +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>> + +// TTKERNEL-LABEL: func.func @trid_single_copy_wait_read +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID:.*]], %[[NOC:.*]]) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_tile( +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: builtin.unrealized_conversion_cast +module { + func.func @trid_single_copy_wait_read(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf = ttl.copy %slice, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type<dram> +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>> + +// TTKERNEL-LABEL: func.func @trid_two_copies_two_waits_read +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID0:.*]], %[[NOC:.*]]) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_tile( +// TTKERNEL: ttkernel.noc_async_read_set_trid(%[[TRID1:.*]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL: 
ttkernel.noc_async_read_tile( +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID0]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid(%[[TRID1]], %[[NOC]]) : (i32, i8) -> () +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: builtin.unrealized_conversion_cast +module { + func.func @trid_two_copies_two_waits_read(%t0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, %t1: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 2 : i32, ttl.crta_indices = [0, 1], ttl.kernel_thread = #ttkernel.thread<noc>} { + %c0 = arith.constant 0 : index + %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %slice0 = ttl.tensor_slice %t0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %slice1 = ttl.tensor_slice %t1[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf0 = ttl.copy %slice0, %cb0 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %xf1 = ttl.copy %slice1, %cb1 : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf0 : !ttl.transfer_handle + ttl.wait %xf1 : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type<dram> +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>> + +// Verify TRID overflow handling: 17 copies without intervening waits exhaust +// the 16-entry TRID space. The 17th copy (reusing TRID 0) must emit an +// auto-barrier for TRID 0 before reassigning it. +// +// TTKERNEL-LABEL: func.func @trid_overflow_auto_barrier +// The first 16 copies each get a unique TRID (0..15) with no auto-barrier. 
+// TTKERNEL-COUNT-16: ttkernel.noc_async_read_set_trid +// The 17th copy reuses TRID 0. Because TRID 0 is still outstanding, the pass +// emits an auto-barrier first. +// TTKERNEL: ttkernel.noc_async_read_barrier_with_trid +// TTKERNEL: ttkernel.noc_async_read_set_trid +// No global barriers should appear. +// TTKERNEL-NOT: ttkernel.noc_async_read_barrier() : () -> () +// TTKERNEL-NOT: builtin.unrealized_conversion_cast +module { + func.func @trid_overflow_auto_barrier(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} { + %c0 = arith.constant 0 : index + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %s0 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf0 = ttl.copy %s0, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s1 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf1 = ttl.copy %s1, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s2 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf2 = ttl.copy %s2, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s3 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf3 = ttl.copy %s3, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s4 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf4 = ttl.copy %s4, %cb : 
(tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s5 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf5 = ttl.copy %s5, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s6 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf6 = ttl.copy %s6, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s7 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf7 = ttl.copy %s7, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s8 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf8 = ttl.copy %s8, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s9 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf9 = ttl.copy %s9, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s10 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf10 = ttl.copy %s10, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s11 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf11 = ttl.copy %s11, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s12 = ttl.tensor_slice %arg0[%c0, %c0] : 
tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf12 = ttl.copy %s12, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s13 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf13 = ttl.copy %s13, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s14 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf14 = ttl.copy %s14, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + %s15 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf15 = ttl.copy %s15, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + // 17th copy: TRID 0 wraps. Auto-barrier expected here. 
+ %s16 = ttl.tensor_slice %arg0[%c0, %c0] : tensor<1x1x!ttcore.tile<32x32, f32>, #layout> -> tensor<1x1x!ttcore.tile<32x32, f32>, #layout> + %xf16 = ttl.copy %s16, %cb : (tensor<1x1x!ttcore.tile<32x32, f32>, #layout>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf0 : !ttl.transfer_handle + ttl.wait %xf1 : !ttl.transfer_handle + ttl.wait %xf2 : !ttl.transfer_handle + ttl.wait %xf3 : !ttl.transfer_handle + ttl.wait %xf4 : !ttl.transfer_handle + ttl.wait %xf5 : !ttl.transfer_handle + ttl.wait %xf6 : !ttl.transfer_handle + ttl.wait %xf7 : !ttl.transfer_handle + ttl.wait %xf8 : !ttl.transfer_handle + ttl.wait %xf9 : !ttl.transfer_handle + ttl.wait %xf10 : !ttl.transfer_handle + ttl.wait %xf11 : !ttl.transfer_handle + ttl.wait %xf12 : !ttl.transfer_handle + ttl.wait %xf13 : !ttl.transfer_handle + ttl.wait %xf14 : !ttl.transfer_handle + ttl.wait %xf15 : !ttl.transfer_handle + ttl.wait %xf16 : !ttl.transfer_handle + func.return + } +} From cf626ff8288e4b465c2091ba4734e3061c35d31a Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Tue, 5 May 2026 06:54:31 +0200 Subject: [PATCH 3/7] [test] Update translate tests for TRID barrier mode Enable use-trid-barriers in TTLToCpp translation tests that verify TRID-specific C++ codegen output. Tests now explicitly request TRID mode to match their expected noc_async_*_set_trid and barrier_with_trid output. 
--- .../cb_to_tensor_single_tile_write.mlir | 15 +- .../TTLToCpp/compute_fused_chain_to_cpp.mlir | 159 +++--- .../TTLToCpp/compute_with_data_movement.mlir | 476 +++++++----------- .../TTLToCpp/dma_batched_single_tile.mlir | 30 +- .../dma_loop_multi_tile_nontrivial_cb.mlir | 8 +- .../TTLToCpp/dma_loop_single_tile.mlir | 27 +- .../dma_multi_tile_batched_in_user_loop.mlir | 9 +- .../TTLToCpp/dma_multi_tile_read.mlir | 5 +- ...a_multi_tile_same_layout_different_cb.mlir | 8 +- .../TTLToCpp/dma_single_tile_read.mlir | 15 +- .../TTLToCpp/loopback_full_single_tile.mlir | 28 +- 11 files changed, 316 insertions(+), 464 deletions(-) diff --git a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir index 9b32fae3b..60398f35d 100644 --- a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir +++ b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --allow-unregistered-dialect --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -6,19 +6,20 @@ // Test: Single DMA write operation (CB → tensor) // Validates write barrier placement and ensures no read barrier -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // CHECK: // cb_to_tensor // CHECK: void kernel_main() { // CHECK-DAG: int32_t [[ZERO:v[0-9]+]] = 0; // CHECK-DAG: int32_t [[ADDR:v[0-9]+]] = 4096; -// CHECK: experimental::CircularBuffer 
[[CB:.*]](get_compile_time_arg_val(0)); // CHECK: int32_t [[RT_ARG:v[0-9]+]] = get_common_arg_val([[RT_ARG_IDX:v[0-9]+]]); -// CHECK: auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); +// CHECK: auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR:v[0-9]+]] = TensorAccessor([[ARGS]], [[RT_ARG]], [[ADDR]]); -// CHECK-NEXT: noc_async_write_tile([[ZERO]], [[ACCESSOR]], [[CB]].get_read_ptr()); -// CHECK: noc_async_write_barrier(); +// CHECK: int32_t [[CB_PTR:v[0-9]+]] = get_read_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_write_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_write_tile([[ZERO]], [[ACCESSOR]], [[CB_PTR]]); +// CHECK: noc_async_write_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/compute_fused_chain_to_cpp.mlir b/test/ttlang/Translate/TTLToCpp/compute_fused_chain_to_cpp.mlir index 069036e8d..cbef58eb0 100644 --- a/test/ttlang/Translate/TTLToCpp/compute_fused_chain_to_cpp.mlir +++ b/test/ttlang/Translate/TTLToCpp/compute_fused_chain_to_cpp.mlir @@ -1,128 +1,97 @@ -// FPU path (default): add uses add_tiles (reads from CB), mul uses SFPU. 
// RUN: ttlang-opt %s \ -// RUN: -pass-pipeline='builtin.module(func.func(ttl-set-compute-kernel-config{enable-fpu-binary-ops=1 matmul-full-fp32=0 reduce-full-fp32=0}, ttl-assign-dst, ttl-lower-to-loops, ttl-annotate-cb-associations), convert-ttl-to-ttkernel, ttkernel-insert-inits, canonicalize, cse, lower-affine)' \ +// RUN: -pass-pipeline='builtin.module(func.func(ttl-assign-dst, ttl-insert-tile-regs-sync, ttl-lower-to-loops, ttl-annotate-cb-associations), convert-ttl-to-ttkernel{use-trid-barriers=1}, canonicalize, cse, lower-affine)' \ // RUN: -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir -// RUN: FileCheck %s --input-file=%t.cpp --check-prefix=FPU - -// SFPU path: all binary ops use copy_tile + SFPU binary ops. -// RUN: ttlang-opt %s \ -// RUN: -pass-pipeline='builtin.module(func.func(ttl-set-compute-kernel-config{enable-fpu-binary-ops=0 matmul-full-fp32=0 reduce-full-fp32=0}, ttl-assign-dst, ttl-lower-to-loops, ttl-annotate-cb-associations), convert-ttl-to-ttkernel, ttkernel-insert-inits, canonicalize, cse, lower-affine)' \ -// RUN: -o %t.sfpu.ttkernel.mlir -// RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.sfpu.ttkernel.mlir -o %t.sfpu.emitc.mlir -// RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.sfpu.cpp %t.sfpu.emitc.mlir -// RUN: FileCheck %s --input-file=%t.sfpu.cpp --check-prefix=SFPU +// RUN: FileCheck %s --input-file=%t.cpp // Purpose: end-to-end TTL -> TTKernel -> emitc -> C++ for fused chain. -// Verifies: add + mul + exp fused compute with CB-based data flow. +// Verifies: add + exp fused compute with CB-based data flow. 
#map = affine_map<(d0, d1) -> (d0, d1)> -// ============================================================================= -// FPU path: binary_op_init_common, add_tiles, copy_tile (for mul rhs), mul_binary_tile, exp -// ============================================================================= -// FPU-LABEL: void kernel_main() - -// FPU-DAG: int32_t [[TILES:v[0-9]+]] = 4 -// FPU-DAG: size_t [[BOUND:v[0-9]+]] = 2 -// FPU-DAG: size_t [[STEP:v[0-9]+]] = 1 -// FPU-DAG: size_t [[ZERO:v[0-9]+]] = 0 - -// CB wrappers declared at top of kernel -// FPU: experimental::CircularBuffer [[FPU_CB0:.*]](get_compile_time_arg_val(0)); -// FPU: experimental::CircularBuffer [[FPU_CB1:.*]](get_compile_time_arg_val(1)); -// FPU: experimental::CircularBuffer [[FPU_CB2:.*]](get_compile_time_arg_val(2)); -// FPU: [[FPU_CB2]].reserve_back([[TILES]]); -// FPU: binary_op_init_common(get_compile_time_arg_val(0), get_compile_time_arg_val(1), get_compile_time_arg_val(2)); - -// FPU: for (size_t [[I:.*]] = [[ZERO]]; [[I]] < [[BOUND]]; [[I]] += [[STEP]]) { -// FPU-NEXT: for (size_t [[J:.*]] = [[ZERO]]; [[J]] < [[BOUND]]; [[J]] += [[STEP]]) { - -// FPU: tile_regs_acquire(); - -// FPU: add_tiles_init(get_compile_time_arg_val(0), get_compile_time_arg_val(1)); -// FPU-NEXT: add_tiles(get_compile_time_arg_val(0), get_compile_time_arg_val(1), - -// mul rhs from CB needs copy_tile -// FPU: copy_tile_init(get_compile_time_arg_val(1)); -// FPU-NEXT: copy_tile(get_compile_time_arg_val(1), - -// FPU: mul_binary_tile_init(); -// FPU-NEXT: mul_binary_tile( +// CHECK-LABEL: void kernel_main() -// FPU: exp_tile_init(); -// FPU-NEXT: exp_tile( +// --- Constants --- +// CHECK-DAG: int32_t [[TILES:v[0-9]+]] = 4 +// CHECK-DAG: size_t [[BOUND:v[0-9]+]] = 2 +// CHECK-DAG: size_t [[STEP:v[0-9]+]] = 1 +// CHECK-DAG: size_t [[ZERO:v[0-9]+]] = 0 -// FPU: tile_regs_commit(); -// FPU-NEXT: tile_regs_wait(); -// FPU: pack_tile([[ZERO]], get_compile_time_arg_val(2), -// FPU: tile_regs_release(); +// --- Nested loops over 
2x2 tile grid --- +// CHECK: for (size_t [[I:.*]] = [[ZERO]]; [[I]] < [[BOUND]]; [[I]] += [[STEP]]) { +// CHECK-NEXT: for (size_t [[J:.*]] = [[ZERO]]; [[J]] < [[BOUND]]; [[J]] += [[STEP]]) { -// FPU-NOT: init_sfpu -// FPU-NOT: add_binary_tile +// --- Compute linear tile index: i * cols + j --- +// CHECK: size_t [[COL_SIZE:.*]] = 2; +// CHECK-NEXT: size_t [[IOFF:.*]] = [[I]] * [[COL_SIZE]]; +// CHECK-NEXT: size_t [[LINIDX:.*]] = [[IOFF]] + [[J]]; -// ============================================================================= -// SFPU path: init_sfpu, copy_tile + add_binary_tile, mul_binary_tile, exp -// ============================================================================= -// SFPU-LABEL: void kernel_main() +// --- DST register lifecycle (acquire inside loop) --- +// CHECK-NEXT: tile_regs_acquire(); -// SFPU-DAG: int32_t [[TILES:v[0-9]+]] = 4 -// SFPU-DAG: size_t [[BOUND:v[0-9]+]] = 2 -// SFPU-DAG: size_t [[STEP:v[0-9]+]] = 1 -// SFPU-DAG: size_t [[ZERO:v[0-9]+]] = 0 +// --- Load tiles into DST (at first use: CB0 first, then CB1) --- +// CHECK-NEXT: copy_tile_init(get_compile_time_arg_val(0)); +// CHECK-NEXT: copy_tile(get_compile_time_arg_val(0), [[LINIDX]], [[ZERO]]); +// CHECK-NEXT: copy_tile_init(get_compile_time_arg_val(1)); +// CHECK-NEXT: copy_tile(get_compile_time_arg_val(1), [[LINIDX]], [[STEP]]); -// CB wrappers declared at top of kernel -// SFPU: experimental::CircularBuffer [[SFPU_CB0:.*]](get_compile_time_arg_val(0)); -// SFPU: experimental::CircularBuffer [[SFPU_CB1:.*]](get_compile_time_arg_val(1)); -// SFPU: experimental::CircularBuffer [[SFPU_CB2:.*]](get_compile_time_arg_val(2)); -// SFPU: [[SFPU_CB2]].reserve_back([[TILES]]); -// SFPU: init_sfpu(get_compile_time_arg_val(0), get_compile_time_arg_val(2)); +// --- Add: DST[0] + DST[1] -> DST[0] --- +// CHECK-NEXT: add_binary_tile_init(); +// CHECK-NEXT: add_binary_tile([[ZERO]], [[STEP]], [[ZERO]]); -// SFPU: for (size_t [[I:.*]] = [[ZERO]]; [[I]] < [[BOUND]]; [[I]] += [[STEP]]) { -// 
SFPU-NEXT: for (size_t [[J:.*]] = [[ZERO]]; [[J]] < [[BOUND]]; [[J]] += [[STEP]]) { +// --- Mul: DST[0] * DST[1] -> DST[0] --- +// CHECK-NEXT: mul_binary_tile_init(); +// CHECK-NEXT: mul_binary_tile([[ZERO]], [[STEP]], [[ZERO]]); -// SFPU: tile_regs_acquire(); +// --- Exp: exp(DST[0]) -> DST[0] --- +// CHECK-NEXT: exp_tile_init(); +// CHECK-NEXT: exp_tile([[ZERO]]); -// SFPU: copy_tile_init(get_compile_time_arg_val(0)); -// SFPU-NEXT: copy_tile(get_compile_time_arg_val(0), {{.*}}, [[ZERO]]); -// SFPU-NEXT: copy_tile_init(get_compile_time_arg_val(1)); -// SFPU-NEXT: copy_tile(get_compile_time_arg_val(1), {{.*}}, [[STEP]]); +// --- Reserve output CB2 for packing (before commit) --- +// CHECK-NEXT: cb_reserve_back(get_compile_time_arg_val(2), [[TILES]]); -// SFPU-NEXT: add_binary_tile_init(); -// SFPU-NEXT: add_binary_tile([[ZERO]], [[STEP]], [[ZERO]]); +// --- DST register synchronization --- +// CHECK-NEXT: tile_regs_commit(); +// CHECK-NEXT: tile_regs_wait(); -// SFPU-NEXT: mul_binary_tile_init(); -// SFPU-NEXT: mul_binary_tile([[ZERO]], [[STEP]], [[ZERO]]); +// --- Compute CB tile index: i * 2 + j (linearized row-major index) --- +// CHECK: size_t [[CB_OFF_I:v[0-9]+]] = [[I]] * {{.*}}; +// CHECK-NEXT: size_t [[CB_IDX:v[0-9]+]] = [[CB_OFF_I]] + [[J]]; -// SFPU-NEXT: exp_tile_init(); -// SFPU-NEXT: exp_tile([[ZERO]]); +// --- Pack DST[0] to output CB2 --- +// CHECK-NEXT: pack_tile([[ZERO]], get_compile_time_arg_val(2), [[CB_IDX]]); -// SFPU-NEXT: tile_regs_commit(); -// SFPU-NEXT: tile_regs_wait(); +// --- Push to signal data ready --- +// CHECK-NEXT: cb_push_back(get_compile_time_arg_val(2), [[TILES]]); -// SFPU: pack_tile([[ZERO]], get_compile_time_arg_val(2), -// SFPU: tile_regs_release(); +// --- DST register lifecycle (release inside loop) --- +// CHECK-NEXT: tile_regs_release(); -// SFPU-NOT: binary_op_init_common -// SFPU-NOT: add_tiles +// --- End of inner and outer loops --- +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return; +// --- Verify no 
tensor operations remain --- +// CHECK-NOT: tensor.extract +// CHECK-NOT: tensor.insert +// CHECK-NOT: tensor.empty func.func @fused_chain_lowering(%a: tensor<2x2x!ttcore.tile<32x32, f32>>, %b: tensor<2x2x!ttcore.tile<32x32, f32>>) -> tensor<2x2x!ttcore.tile<32x32, f32>> attributes {ttl.kernel_thread = #ttkernel.thread} { %output = tensor.empty() : tensor<2x2x!ttcore.tile<32x32, f32>> - %cb0 = ttl.bind_cb {cb_index = 0, block_count = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> - %cb1 = ttl.bind_cb {cb_index = 1, block_count = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> - %cb2 = ttl.bind_cb {cb_index = 2, block_count = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> + %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> + %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> + %cb2 = ttl.bind_cb {cb_index = 2, buffer_factor = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> // Wait for input CBs (entire blocks) before compute. 
%a_ready = ttl.cb_wait %cb0 : <[2, 2], !ttcore.tile<32x32, f32>, 1> -> tensor<2x2x!ttcore.tile<32x32, f32>> %b_ready = ttl.cb_wait %cb1 : <[2, 2], !ttcore.tile<32x32, f32>, 1> -> tensor<2x2x!ttcore.tile<32x32, f32>> %output_cb = ttl.attach_cb %output, %cb2 : (tensor<2x2x!ttcore.tile<32x32, f32>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1>) -> tensor<2x2x!ttcore.tile<32x32, f32>> - %result_view = ttl.cb_reserve %cb2 : <[2, 2], !ttcore.tile<32x32, f32>, 1> -> tensor<2x2x!ttcore.tile<32x32, f32>> %result = ttl.compute ins(%a_ready, %b_ready : tensor<2x2x!ttcore.tile<32x32, f32>>, tensor<2x2x!ttcore.tile<32x32, f32>>) @@ -132,15 +101,13 @@ func.func @fused_chain_lowering(%a: tensor<2x2x!ttcore.tile<32x32, f32>>, ^bb0(%a_tile: !ttcore.tile<32x32, f32>, %b_tile: !ttcore.tile<32x32, f32>, %out_tile: !ttcore.tile<32x32, f32>): - %i = ttl.iter_index 0 : index - %j = ttl.iter_index 1 : index - %c0 = arith.constant 0 : index - %sum = ttl.tile_add %a_tile, %b_tile into dst[%c0] : !ttcore.tile<32x32, f32>, !ttcore.tile<32x32, f32> -> !ttcore.tile<32x32, f32> - %mul = ttl.tile_mul %sum, %b_tile into dst[%c0] : !ttcore.tile<32x32, f32>, !ttcore.tile<32x32, f32> -> !ttcore.tile<32x32, f32> - %exp = ttl.tile_exp %mul into dst[%c0] : !ttcore.tile<32x32, f32> -> !ttcore.tile<32x32, f32> - ttl.tile_store %exp, %result_view[%i, %j] from dst[%c0] : !ttcore.tile<32x32, f32>, tensor<2x2x!ttcore.tile<32x32, f32>> + %sum = ttl.tile_add %a_tile, %b_tile : !ttcore.tile<32x32, f32> + %mul = ttl.tile_mul %sum, %b_tile : !ttcore.tile<32x32, f32> + %exp = ttl.tile_exp %mul : !ttcore.tile<32x32, f32> + %result_view = ttl.cb_reserve %cb2 : <[2, 2], !ttcore.tile<32x32, f32>, 1> -> tensor<2x2x!ttcore.tile<32x32, f32>> + ttl.tile_store %exp, %result_view : !ttcore.tile<32x32, f32>, tensor<2x2x!ttcore.tile<32x32, f32>> ttl.cb_push %cb2 : <[2, 2], !ttcore.tile<32x32, f32>, 1> - ttl.yield + ttl.yield %exp : !ttcore.tile<32x32, f32> } -> tensor<2x2x!ttcore.tile<32x32, f32>> func.return %result : 
tensor<2x2x!ttcore.tile<32x32, f32>> diff --git a/test/ttlang/Translate/TTLToCpp/compute_with_data_movement.mlir b/test/ttlang/Translate/TTLToCpp/compute_with_data_movement.mlir index f6b50cedb..79c059005 100644 --- a/test/ttlang/Translate/TTLToCpp/compute_with_data_movement.mlir +++ b/test/ttlang/Translate/TTLToCpp/compute_with_data_movement.mlir @@ -1,310 +1,80 @@ -// FPU path (default): add uses add_tiles (reads from CB), no copy_tile for add. // RUN: ttlang-opt %s \ -// RUN: -pass-pipeline='builtin.module(func.func(convert-ttl-to-compute,ttl-set-compute-kernel-config{enable-fpu-binary-ops=1 matmul-full-fp32=0 reduce-full-fp32=0}, ttl-assign-dst,ttl-lower-to-loops,ttl-annotate-cb-associations),convert-ttl-to-ttkernel,ttkernel-insert-inits,canonicalize,cse,lower-affine)' \ +// RUN: -pass-pipeline='builtin.module(func.func(convert-ttl-to-compute,ttl-assign-dst,ttl-insert-tile-regs-sync,ttl-lower-to-loops,ttl-annotate-cb-associations),convert-ttl-to-ttkernel{use-trid-barriers=1},canonicalize,cse,lower-affine)' \ // RUN: -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir -// RUN: FileCheck %s --input-file=%t.cpp --check-prefix=FPU - -// SFPU path: all binary ops use copy_tile + SFPU binary ops. 
-// RUN: ttlang-opt %s \ -// RUN: -pass-pipeline='builtin.module(func.func(convert-ttl-to-compute,ttl-set-compute-kernel-config{enable-fpu-binary-ops=0 matmul-full-fp32=0 reduce-full-fp32=0}, ttl-assign-dst,ttl-lower-to-loops,ttl-annotate-cb-associations),convert-ttl-to-ttkernel,ttkernel-insert-inits,canonicalize,cse,lower-affine)' \ -// RUN: -o %t.sfpu.ttkernel.mlir -// RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.sfpu.ttkernel.mlir -o %t.sfpu.emitc.mlir -// RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.sfpu.cpp %t.sfpu.emitc.mlir -// RUN: FileCheck %s --input-file=%t.sfpu.cpp --check-prefix=SFPU +// RUN: FileCheck %s --input-file=%t.cpp // Purpose: Complete example with reader, compute, and writer threads. -// Pattern: reader (NOC) -> CBs -> compute (MATH) -> CB -> writer (NOC) +// Pattern: reader (NOC) → CBs → compute (MATH) → CB → writer (NOC) // Operation: f(A + B) where f is exp, matching the C++ example pattern. -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x2x!ttcore.tile<32x32, f32>, #dram>, > #map = affine_map<(d0, d1) -> (d0, d1)> -// ============================================================================= -// FPU path: reader kernel (same for both paths) -// ============================================================================= -// FPU-LABEL: // reader_binary -// FPU: void kernel_main() { -// FPU-DAG: size_t [[BOUND:v[0-9]+]] = 2 -// FPU-DAG: size_t [[ONE:v[0-9]+]] = 1 -// FPU-DAG: size_t [[PAGE_SIZE:v[0-9]+]] = 4096 -// FPU-DAG: size_t [[ZERO:v[0-9]+]] = 0 - -// CB wrappers declared at top of kernel -// FPU: experimental::CircularBuffer [[FPU_R_CB0:.*]](get_compile_time_arg_val(0)); -// FPU: experimental::CircularBuffer [[FPU_R_CB1:.*]](get_compile_time_arg_val(1)); - -// Read tensor A into CB0 -// FPU: int32_t [[RT_ARG_A:.*]] = get_common_arg_val([[ZERO]]); 
-// FPU-NEXT: auto [[ARGS_A:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); -// FPU-NEXT: TensorAccessor [[ACC_A:.*]] = TensorAccessor([[ARGS_A]], [[RT_ARG_A]], -// CB pointer casting chain: ptrdiff_t -> size_t -// FPU-NEXT: ptrdiff_t [[CB0_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[FPU_R_CB0]].get_write_ptr(); -// FPU-NEXT: size_t [[CB0_PTR_IDX:v[0-9]+]] = (size_t) [[CB0_PTR_PTRDIFF]]; -// FPU: for (size_t [[I_A:.*]] = [[ZERO]]; [[I_A]] < [[BOUND]]; [[I_A]] += [[ONE]]) { -// FPU-NEXT: for (size_t [[J_A:.*]] = [[ZERO]]; [[J_A]] < [[BOUND]]; [[J_A]] += [[ONE]]) { -// Tile offset: linearize 2D index (i * bound + j) -// FPU: size_t [[TILE_OFF_A_Y:v[0-9]+]] = [[I_A]] * [[BOUND]]; -// FPU-NEXT: size_t [[TILE_OFF_A:v[0-9]+]] = [[TILE_OFF_A_Y]] + [[J_A]]; -// Byte offset: tile_offset * page_size + cb_base -// FPU-NEXT: size_t [[BYTE_OFF_A:v[0-9]+]] = [[TILE_OFF_A]] * [[PAGE_SIZE]]; -// FPU-NEXT: size_t [[CB_ADDR_A_IDX:v[0-9]+]] = [[CB0_PTR_IDX]] + [[BYTE_OFF_A]]; -// Cast tile offset and CB address to int32_t for noc_async_read_tile -// FPU-NEXT: ptrdiff_t [[TILE_OFF_A_PD:v[0-9]+]] = (ptrdiff_t) [[TILE_OFF_A]]; -// FPU-NEXT: int32_t [[TILE_OFF_A_I32:v[0-9]+]] = (int32_t) [[TILE_OFF_A_PD]]; -// FPU-NEXT: ptrdiff_t [[CB_ADDR_A_PD:v[0-9]+]] = (ptrdiff_t) [[CB_ADDR_A_IDX]]; -// FPU-NEXT: int32_t [[CB_ADDR_A:v[0-9]+]] = (int32_t) [[CB_ADDR_A_PD]]; -// FPU-NEXT: noc_async_read_tile([[TILE_OFF_A_I32]], [[ACC_A]], [[CB_ADDR_A]]); -// FPU: } -// FPU-NEXT: } -// FPU-NEXT: noc_async_read_barrier(); - -// Read tensor B into CB1 -// FPU: int32_t [[RT_ARG_B:.*]] = get_common_arg_val([[ONE]]); -// FPU-NEXT: auto [[ARGS_B:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 1>(); -// FPU-NEXT: TensorAccessor [[ACC_B:.*]] = TensorAccessor([[ARGS_B]], [[RT_ARG_B]], -// CB pointer casting chain: ptrdiff_t -> size_t -// FPU-NEXT: ptrdiff_t [[CB1_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[FPU_R_CB1]].get_write_ptr(); -// FPU-NEXT: size_t [[CB1_PTR_IDX:v[0-9]+]] = (size_t) 
[[CB1_PTR_PTRDIFF]]; -// FPU: for (size_t [[I_B:.*]] = [[ZERO]]; [[I_B]] < [[BOUND]]; [[I_B]] += [[ONE]]) { -// FPU-NEXT: for (size_t [[J_B:.*]] = [[ZERO]]; [[J_B]] < [[BOUND]]; [[J_B]] += [[ONE]]) { -// Tile offset: linearize 2D index (i * bound + j) -// FPU: size_t [[TILE_OFF_B_Y:v[0-9]+]] = [[I_B]] * [[BOUND]]; -// FPU-NEXT: size_t [[TILE_OFF_B:v[0-9]+]] = [[TILE_OFF_B_Y]] + [[J_B]]; -// Byte offset: tile_offset * page_size + cb_base -// FPU-NEXT: size_t [[BYTE_OFF_B:v[0-9]+]] = [[TILE_OFF_B]] * [[PAGE_SIZE]]; -// FPU-NEXT: size_t [[CB_ADDR_B_IDX:v[0-9]+]] = [[CB1_PTR_IDX]] + [[BYTE_OFF_B]]; -// Cast tile offset and CB address to int32_t for noc_async_read_tile -// FPU-NEXT: ptrdiff_t [[TILE_OFF_B_PD:v[0-9]+]] = (ptrdiff_t) [[TILE_OFF_B]]; -// FPU-NEXT: int32_t [[TILE_OFF_B_I32:v[0-9]+]] = (int32_t) [[TILE_OFF_B_PD]]; -// FPU-NEXT: ptrdiff_t [[CB_ADDR_B_PD:v[0-9]+]] = (ptrdiff_t) [[CB_ADDR_B_IDX]]; -// FPU-NEXT: int32_t [[CB_ADDR_B:v[0-9]+]] = (int32_t) [[CB_ADDR_B_PD]]; -// FPU-NEXT: noc_async_read_tile([[TILE_OFF_B_I32]], [[ACC_B]], [[CB_ADDR_B]]); -// FPU: } -// FPU-NEXT: } -// FPU-NEXT: noc_async_read_barrier(); -// FPU-NEXT: return; - -// ============================================================================= -// FPU path: compute kernel -- binary_op_init_common, add_tiles, exp -// ============================================================================= -// FPU-LABEL: // compute_fused -// FPU: void kernel_main() { -// FPU-DAG: int32_t [[TILES:v[0-9]+]] = 4 -// FPU-DAG: size_t [[STEP:v[0-9]+]] = 1 -// FPU-DAG: size_t [[CBOUND:v[0-9]+]] = 2 -// FPU-DAG: size_t [[CZERO:v[0-9]+]] = 0 - -// CB wrappers declared at top of kernel -// FPU: experimental::CircularBuffer [[FPU_C_CB0:.*]](get_compile_time_arg_val(0)); -// FPU: experimental::CircularBuffer [[FPU_C_CB1:.*]](get_compile_time_arg_val(1)); -// FPU: experimental::CircularBuffer [[FPU_C_CB2:.*]](get_compile_time_arg_val(2)); -// FPU: [[FPU_C_CB0]].wait_front([[TILES]]); -// FPU-NEXT: 
[[FPU_C_CB1]].wait_front([[TILES]]); -// FPU-NEXT: [[FPU_C_CB2]].reserve_back([[TILES]]); -// FPU-NEXT: binary_op_init_common(get_compile_time_arg_val(0), get_compile_time_arg_val(1), get_compile_time_arg_val(2)); - -// FPU: for (size_t [[CI:.*]] = [[CZERO]]; [[CI]] < [[CBOUND]]; [[CI]] += [[STEP]]) { -// FPU-NEXT: for (size_t [[CJ:.*]] = [[CZERO]]; [[CJ]] < [[CBOUND]]; [[CJ]] += [[STEP]]) { -// FPU: tile_regs_acquire(); -// Linearized CB index for add_tiles: i * 2 + j (2 cols per row) -// FPU: size_t [[CSTRIDE:v[0-9]+]] = 2; -// FPU-NEXT: size_t [[CTILE_Y:v[0-9]+]] = [[CI]] * [[CSTRIDE]]; -// FPU-NEXT: size_t [[CTILE_IDX:v[0-9]+]] = [[CTILE_Y]] + [[CJ]]; -// No copy_tile for FPU add -- operands read directly from CB -// FPU-NOT: copy_tile -// FPU: add_tiles_init(get_compile_time_arg_val(0), get_compile_time_arg_val(1)); -// FPU-NEXT: add_tiles(get_compile_time_arg_val(0), get_compile_time_arg_val(1), [[CTILE_IDX]], [[CTILE_IDX]], [[CZERO]]); -// FPU-NEXT: exp_tile_init(); -// FPU-NEXT: exp_tile([[CZERO]]); -// FPU-NEXT: tile_regs_commit(); -// FPU-NEXT: tile_regs_wait(); -// pack_tile reuses the same linearized CB index as add_tiles. 
-// FPU: pack_tile([[CZERO]], get_compile_time_arg_val(2), [[CTILE_IDX]]); -// FPU-NEXT: [[FPU_C_CB2]].push_back([[TILES]]); -// FPU-NEXT: tile_regs_release(); - -// FPU-NOT: init_sfpu -// FPU-NOT: add_binary_tile - -// ============================================================================= -// FPU path: writer kernel -// ============================================================================= -// FPU-LABEL: // writer_unary -// FPU: void kernel_main() { -// FPU-DAG: size_t [[WBOUND:v[0-9]+]] = 2 -// FPU-DAG: size_t [[WONE:v[0-9]+]] = 1 -// FPU-DAG: size_t [[WPAGE:v[0-9]+]] = 4096 -// FPU-DAG: size_t [[WZERO:v[0-9]+]] = 0 -// CB wrapper declared at top of kernel -// FPU: experimental::CircularBuffer [[FPU_W_CB2:.*]](get_compile_time_arg_val(2)); -// FPU: int32_t [[WRT_ARG:.*]] = get_common_arg_val([[WZERO]]); -// FPU-NEXT: auto [[WARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); -// FPU-NEXT: TensorAccessor [[WACC:.*]] = TensorAccessor([[WARGS]], [[WRT_ARG]], -// CB pointer casting chain: ptrdiff_t -> size_t -// FPU-NEXT: ptrdiff_t [[WR_PTR_PD:v[0-9]+]] = (ptrdiff_t) [[FPU_W_CB2]].get_read_ptr(); -// FPU-NEXT: size_t [[WR_PTR_IDX:v[0-9]+]] = (size_t) [[WR_PTR_PD]]; -// FPU: for (size_t [[WI:.*]] = [[WZERO]]; [[WI]] < [[WBOUND]]; [[WI]] += [[WONE]]) { -// FPU-NEXT: for (size_t [[WJ:.*]] = [[WZERO]]; [[WJ]] < [[WBOUND]]; [[WJ]] += [[WONE]]) { -// Tile offset: linearize 2D index (i * bound + j) -// FPU: size_t [[WTILE_Y:v[0-9]+]] = [[WI]] * [[WBOUND]]; -// FPU-NEXT: size_t [[WTILE_OFF:v[0-9]+]] = [[WTILE_Y]] + [[WJ]]; -// Byte offset: tile_offset * page_size + cb_base -// FPU-NEXT: size_t [[WBYTE_OFF:v[0-9]+]] = [[WTILE_OFF]] * [[WPAGE]]; -// FPU-NEXT: size_t [[WCB_ADDR_IDX:v[0-9]+]] = [[WR_PTR_IDX]] + [[WBYTE_OFF]]; -// Cast tile offset and CB address to int32_t for noc_async_write_tile -// FPU-NEXT: ptrdiff_t [[WTILE_PD:v[0-9]+]] = (ptrdiff_t) [[WTILE_OFF]]; -// FPU-NEXT: int32_t [[WTILE_I32:v[0-9]+]] = (int32_t) [[WTILE_PD]]; -// FPU-NEXT: 
ptrdiff_t [[WCB_ADDR_PD:v[0-9]+]] = (ptrdiff_t) [[WCB_ADDR_IDX]]; -// FPU-NEXT: int32_t [[WCB_ADDR:v[0-9]+]] = (int32_t) [[WCB_ADDR_PD]]; -// FPU-NEXT: noc_async_write_tile([[WTILE_I32]], [[WACC]], [[WCB_ADDR]]); -// FPU: } -// FPU-NEXT: } -// FPU-NEXT: noc_async_write_barrier(); - -// ============================================================================= -// SFPU path: reader kernel (same for both paths) -// ============================================================================= -// SFPU-LABEL: // reader_binary -// SFPU: void kernel_main() { -// SFPU-DAG: size_t [[BOUND:v[0-9]+]] = 2 -// SFPU-DAG: size_t [[ONE:v[0-9]+]] = 1 -// SFPU-DAG: size_t [[PAGE_SIZE:v[0-9]+]] = 4096 -// SFPU-DAG: size_t [[ZERO:v[0-9]+]] = 0 - -// CB wrappers declared at top of kernel -// SFPU: experimental::CircularBuffer [[SFPU_R_CB0:.*]](get_compile_time_arg_val(0)); -// SFPU: experimental::CircularBuffer [[SFPU_R_CB1:.*]](get_compile_time_arg_val(1)); +// CHECK-LABEL: // reader_binary +// CHECK: void kernel_main() { +// CHECK-DAG: size_t [[ONE:.*]] = 1; +// CHECK-DAG: size_t [[BOUND:.*]] = 2; +// CHECK-DAG: size_t [[PAGE_SIZE:.*]] = 4096; +// CHECK-DAG: size_t [[ZERO:.*]] = 0; // Read tensor A into CB0 -// SFPU: int32_t [[RT_ARG_A:.*]] = get_common_arg_val([[ZERO]]); -// SFPU-NEXT: auto [[ARGS_A:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); -// SFPU-NEXT: TensorAccessor [[ACC_A:.*]] = TensorAccessor([[ARGS_A]], [[RT_ARG_A]], -// CB pointer casting chain: ptrdiff_t -> size_t -// SFPU-NEXT: ptrdiff_t [[CB0_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[SFPU_R_CB0]].get_write_ptr(); -// SFPU-NEXT: size_t [[CB0_PTR_IDX:v[0-9]+]] = (size_t) [[CB0_PTR_PTRDIFF]]; -// SFPU: for (size_t [[I_A:.*]] = [[ZERO]]; [[I_A]] < [[BOUND]]; [[I_A]] += [[ONE]]) { -// SFPU-NEXT: for (size_t [[J_A:.*]] = [[ZERO]]; [[J_A]] < [[BOUND]]; [[J_A]] += [[ONE]]) { -// Tile offset: linearize 2D index (i * bound + j) -// SFPU: size_t [[TILE_OFF_A_Y:v[0-9]+]] = [[I_A]] * [[BOUND]]; -// SFPU-NEXT: size_t 
[[TILE_OFF_A:v[0-9]+]] = [[TILE_OFF_A_Y]] + [[J_A]]; -// Byte offset: tile_offset * page_size + cb_base -// SFPU-NEXT: size_t [[BYTE_OFF_A:v[0-9]+]] = [[TILE_OFF_A]] * [[PAGE_SIZE]]; -// SFPU-NEXT: size_t [[CB_ADDR_A_IDX:v[0-9]+]] = [[CB0_PTR_IDX]] + [[BYTE_OFF_A]]; -// Cast tile offset and CB address to int32_t for noc_async_read_tile -// SFPU-NEXT: ptrdiff_t [[TILE_OFF_A_PD:v[0-9]+]] = (ptrdiff_t) [[TILE_OFF_A]]; -// SFPU-NEXT: int32_t [[TILE_OFF_A_I32:v[0-9]+]] = (int32_t) [[TILE_OFF_A_PD]]; -// SFPU-NEXT: ptrdiff_t [[CB_ADDR_A_PD:v[0-9]+]] = (ptrdiff_t) [[CB_ADDR_A_IDX]]; -// SFPU-NEXT: int32_t [[CB_ADDR_A:v[0-9]+]] = (int32_t) [[CB_ADDR_A_PD]]; -// SFPU-NEXT: noc_async_read_tile([[TILE_OFF_A_I32]], [[ACC_A]], [[CB_ADDR_A]]); -// SFPU: } -// SFPU-NEXT: } -// SFPU-NEXT: noc_async_read_barrier(); +// CHECK: int32_t [[RT_ARG_A:.*]] = get_common_arg_val([[ZERO]]); +// CHECK-NEXT: auto [[ARGS_A:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<2, 0>(); +// CHECK-NEXT: TensorAccessor [[ACC_A:.*]] = TensorAccessor([[ARGS_A]], [[RT_ARG_A]], +// CHECK: int32_t [[CB0_PTR:.*]] = get_write_ptr(get_compile_time_arg_val(0)); +// Cast CB ptr to size_t for index arithmetic +// CHECK-NEXT: ptrdiff_t [[CB0_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB0_PTR]]; +// CHECK-NEXT: size_t [[CB0_PTR_IDX:v[0-9]+]] = (size_t) [[CB0_PTR_PTRDIFF]]; +// CHECK-NEXT: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK-NEXT: for (size_t [[I_A:.*]] = [[ZERO]]; [[I_A]] < [[BOUND]]; [[I_A]] += [[ONE]]) { +// CHECK-NEXT: for (size_t [[J_A:.*]] = [[ZERO]]; [[J_A]] < [[BOUND]]; [[J_A]] += [[ONE]]) { +// Tile offset computation: i * cols + j +// CHECK: size_t [[TILE_OFF_A_Y:v[0-9]+]] = [[I_A]] * [[BOUND]]; +// CHECK-NEXT: size_t [[TILE_OFF_A_X:v[0-9]+]] = [[TILE_OFF_A_Y]] + [[J_A]]; +// CB address computation: cb_ptr + tile_offset * page_size (all size_t arithmetic) +// CHECK-NEXT: size_t [[BYTE_OFF_A:v[0-9]+]] = [[TILE_OFF_A_X]] * [[PAGE_SIZE]]; +// CHECK-NEXT: size_t [[CB_ADDR_A_IDX:v[0-9]+]] = 
[[CB0_PTR_IDX]] + [[BYTE_OFF_A]]; +// Cast to i32 for NOC operation +// CHECK-NEXT: ptrdiff_t [[TILE_OFF_A_PTR:v[0-9]+]] = (ptrdiff_t) [[TILE_OFF_A_X]]; +// CHECK-NEXT: int32_t [[TILE_OFF_A:v[0-9]+]] = (int32_t) [[TILE_OFF_A_PTR]]; +// CHECK-NEXT: ptrdiff_t [[CB_ADDR_A_PTR:v[0-9]+]] = (ptrdiff_t) [[CB_ADDR_A_IDX]]; +// CHECK-NEXT: int32_t [[CB_ADDR_A:v[0-9]+]] = (int32_t) [[CB_ADDR_A_PTR]]; +// CHECK-NEXT: noc_async_read_tile([[TILE_OFF_A]], [[ACC_A]], [[CB_ADDR_A]]); +// CHECK: } +// CHECK-NEXT: } +// CHECK-NEXT: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // Read tensor B into CB1 -// SFPU: int32_t [[RT_ARG_B:.*]] = get_common_arg_val([[ONE]]); -// SFPU-NEXT: auto [[ARGS_B:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 1>(); -// SFPU-NEXT: TensorAccessor [[ACC_B:.*]] = TensorAccessor([[ARGS_B]], [[RT_ARG_B]], -// CB pointer casting chain: ptrdiff_t -> size_t -// SFPU-NEXT: ptrdiff_t [[CB1_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[SFPU_R_CB1]].get_write_ptr(); -// SFPU-NEXT: size_t [[CB1_PTR_IDX:v[0-9]+]] = (size_t) [[CB1_PTR_PTRDIFF]]; -// SFPU: for (size_t [[I_B:.*]] = [[ZERO]]; [[I_B]] < [[BOUND]]; [[I_B]] += [[ONE]]) { -// SFPU-NEXT: for (size_t [[J_B:.*]] = [[ZERO]]; [[J_B]] < [[BOUND]]; [[J_B]] += [[ONE]]) { -// Tile offset: linearize 2D index (i * bound + j) -// SFPU: size_t [[TILE_OFF_B_Y:v[0-9]+]] = [[I_B]] * [[BOUND]]; -// SFPU-NEXT: size_t [[TILE_OFF_B:v[0-9]+]] = [[TILE_OFF_B_Y]] + [[J_B]]; -// Byte offset: tile_offset * page_size + cb_base -// SFPU-NEXT: size_t [[BYTE_OFF_B:v[0-9]+]] = [[TILE_OFF_B]] * [[PAGE_SIZE]]; -// SFPU-NEXT: size_t [[CB_ADDR_B_IDX:v[0-9]+]] = [[CB1_PTR_IDX]] + [[BYTE_OFF_B]]; -// Cast tile offset and CB address to int32_t for noc_async_read_tile -// SFPU-NEXT: ptrdiff_t [[TILE_OFF_B_PD:v[0-9]+]] = (ptrdiff_t) [[TILE_OFF_B]]; -// SFPU-NEXT: int32_t [[TILE_OFF_B_I32:v[0-9]+]] = (int32_t) [[TILE_OFF_B_PD]]; -// SFPU-NEXT: ptrdiff_t [[CB_ADDR_B_PD:v[0-9]+]] = (ptrdiff_t) [[CB_ADDR_B_IDX]]; -// SFPU-NEXT: int32_t 
[[CB_ADDR_B:v[0-9]+]] = (int32_t) [[CB_ADDR_B_PD]]; -// SFPU-NEXT: noc_async_read_tile([[TILE_OFF_B_I32]], [[ACC_B]], [[CB_ADDR_B]]); -// SFPU: } -// SFPU-NEXT: } -// SFPU-NEXT: noc_async_read_barrier(); -// SFPU-NEXT: return; - -// ============================================================================= -// SFPU path: compute kernel -- init_sfpu, copy_tile, add_binary_tile, exp -// ============================================================================= -// SFPU-LABEL: // compute_fused -// SFPU: void kernel_main() { -// SFPU-DAG: int32_t [[TILES:v[0-9]+]] = 4 -// SFPU-DAG: size_t [[STEP:v[0-9]+]] = 1 -// SFPU-DAG: size_t [[CBOUND:v[0-9]+]] = 2 -// SFPU-DAG: size_t [[CZERO:v[0-9]+]] = 0 - -// CB wrappers declared at top of kernel -// SFPU: experimental::CircularBuffer [[SFPU_C_CB0:.*]](get_compile_time_arg_val(0)); -// SFPU: experimental::CircularBuffer [[SFPU_C_CB1:.*]](get_compile_time_arg_val(1)); -// SFPU: experimental::CircularBuffer [[SFPU_C_CB2:.*]](get_compile_time_arg_val(2)); -// SFPU: [[SFPU_C_CB0]].wait_front([[TILES]]); -// SFPU-NEXT: [[SFPU_C_CB1]].wait_front([[TILES]]); -// SFPU-NEXT: [[SFPU_C_CB2]].reserve_back([[TILES]]); -// SFPU-NEXT: init_sfpu(get_compile_time_arg_val(0), get_compile_time_arg_val(2)); - -// SFPU: for (size_t [[CI:.*]] = [[CZERO]]; [[CI]] < [[CBOUND]]; [[CI]] += [[STEP]]) { -// SFPU-NEXT: for (size_t [[CJ:.*]] = [[CZERO]]; [[CJ]] < [[CBOUND]]; [[CJ]] += [[STEP]]) { -// SFPU: tile_regs_acquire(); -// Linearized index for copy_tile CB index (from affine.linearize_index, lowered) -// SFPU: size_t [[CTILE_Y:v[0-9]+]] = [[CI]] * {{.*}}; -// SFPU-NEXT: size_t [[CTILE_IDX:v[0-9]+]] = [[CTILE_Y]] + [[CJ]]; -// SFPU-NEXT: copy_tile_init(get_compile_time_arg_val(0)); -// SFPU-NEXT: copy_tile(get_compile_time_arg_val(0), [[CTILE_IDX]], [[CZERO]]); -// SFPU-NEXT: copy_tile_init(get_compile_time_arg_val(1)); -// SFPU-NEXT: copy_tile(get_compile_time_arg_val(1), [[CTILE_IDX]], [[STEP]]); -// SFPU-NEXT: add_binary_tile_init(); -// 
SFPU-NEXT: add_binary_tile([[CZERO]], [[STEP]], [[CZERO]]); -// SFPU-NEXT: exp_tile_init(); -// SFPU-NEXT: exp_tile([[CZERO]]); -// SFPU-NEXT: tile_regs_commit(); -// SFPU-NEXT: tile_regs_wait(); -// SFPU-NEXT: pack_tile([[CZERO]], get_compile_time_arg_val(2), [[CTILE_IDX]]); -// SFPU-NEXT: [[SFPU_C_CB2]].push_back([[TILES]]); -// SFPU-NEXT: tile_regs_release(); - -// SFPU-NOT: binary_op_init_common -// SFPU-NOT: add_tiles - -// ============================================================================= -// SFPU path: writer kernel -// ============================================================================= -// SFPU-LABEL: // writer_unary -// SFPU: void kernel_main() { -// SFPU-DAG: size_t [[WBOUND:v[0-9]+]] = 2 -// SFPU-DAG: size_t [[WONE:v[0-9]+]] = 1 -// SFPU-DAG: size_t [[WPAGE:v[0-9]+]] = 4096 -// SFPU-DAG: size_t [[WZERO:v[0-9]+]] = 0 -// CB wrapper declared at top of kernel -// SFPU: experimental::CircularBuffer [[SFPU_W_CB2:.*]](get_compile_time_arg_val(2)); -// SFPU: int32_t [[WRT_ARG:.*]] = get_common_arg_val([[WZERO]]); -// SFPU-NEXT: auto [[WARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); -// SFPU-NEXT: TensorAccessor [[WACC:.*]] = TensorAccessor([[WARGS]], [[WRT_ARG]], -// CB pointer casting chain: ptrdiff_t -> size_t -// SFPU-NEXT: ptrdiff_t [[WR_PTR_PD:v[0-9]+]] = (ptrdiff_t) [[SFPU_W_CB2]].get_read_ptr(); -// SFPU-NEXT: size_t [[WR_PTR_IDX:v[0-9]+]] = (size_t) [[WR_PTR_PD]]; -// SFPU: for (size_t [[WI:.*]] = [[WZERO]]; [[WI]] < [[WBOUND]]; [[WI]] += [[WONE]]) { -// SFPU-NEXT: for (size_t [[WJ:.*]] = [[WZERO]]; [[WJ]] < [[WBOUND]]; [[WJ]] += [[WONE]]) { -// Tile offset: linearize 2D index (i * bound + j) -// SFPU: size_t [[WTILE_Y:v[0-9]+]] = [[WI]] * [[WBOUND]]; -// SFPU-NEXT: size_t [[WTILE_OFF:v[0-9]+]] = [[WTILE_Y]] + [[WJ]]; -// Byte offset: tile_offset * page_size + cb_base -// SFPU-NEXT: size_t [[WBYTE_OFF:v[0-9]+]] = [[WTILE_OFF]] * [[WPAGE]]; -// SFPU-NEXT: size_t [[WCB_ADDR_IDX:v[0-9]+]] = [[WR_PTR_IDX]] + 
[[WBYTE_OFF]]; -// Cast tile offset and CB address to int32_t for noc_async_write_tile -// SFPU-NEXT: ptrdiff_t [[WTILE_PD:v[0-9]+]] = (ptrdiff_t) [[WTILE_OFF]]; -// SFPU-NEXT: int32_t [[WTILE_I32:v[0-9]+]] = (int32_t) [[WTILE_PD]]; -// SFPU-NEXT: ptrdiff_t [[WCB_ADDR_PD:v[0-9]+]] = (ptrdiff_t) [[WCB_ADDR_IDX]]; -// SFPU-NEXT: int32_t [[WCB_ADDR:v[0-9]+]] = (int32_t) [[WCB_ADDR_PD]]; -// SFPU-NEXT: noc_async_write_tile([[WTILE_I32]], [[WACC]], [[WCB_ADDR]]); -// SFPU: } -// SFPU-NEXT: } -// SFPU-NEXT: noc_async_write_barrier(); +// CHECK: int32_t [[RT_ARG_B:.*]] = get_common_arg_val([[ONE]]); +// CHECK-NEXT: auto [[ARGS_B:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<3, 1>(); +// CHECK-NEXT: TensorAccessor [[ACC_B:.*]] = TensorAccessor([[ARGS_B]], [[RT_ARG_B]], +// CHECK: int32_t [[CB1_PTR:.*]] = get_write_ptr(get_compile_time_arg_val(1)); +// Cast CB ptr to size_t for index arithmetic +// CHECK-NEXT: ptrdiff_t [[CB1_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB1_PTR]]; +// CHECK-NEXT: size_t [[CB1_PTR_IDX:v[0-9]+]] = (size_t) [[CB1_PTR_PTRDIFF]]; +// CHECK-NEXT: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK-NEXT: for (size_t [[I_B:.*]] = [[ZERO]]; [[I_B]] < [[BOUND]]; [[I_B]] += [[ONE]]) { +// CHECK-NEXT: for (size_t [[J_B:.*]] = [[ZERO]]; [[J_B]] < [[BOUND]]; [[J_B]] += [[ONE]]) { +// Tile offset computation: i * cols + j +// CHECK: size_t [[TILE_OFF_B_Y:v[0-9]+]] = [[I_B]] * [[BOUND]]; +// CHECK-NEXT: size_t [[TILE_OFF_B_X:v[0-9]+]] = [[TILE_OFF_B_Y]] + [[J_B]]; +// CB address computation: cb_ptr + tile_offset * page_size (all size_t arithmetic) +// CHECK-NEXT: size_t [[BYTE_OFF_B:v[0-9]+]] = [[TILE_OFF_B_X]] * [[PAGE_SIZE]]; +// CHECK-NEXT: size_t [[CB_ADDR_B_IDX:v[0-9]+]] = [[CB1_PTR_IDX]] + [[BYTE_OFF_B]]; +// Cast to i32 for NOC operation +// CHECK-NEXT: ptrdiff_t [[TILE_OFF_B_PTR:v[0-9]+]] = (ptrdiff_t) [[TILE_OFF_B_X]]; +// CHECK-NEXT: int32_t [[TILE_OFF_B:v[0-9]+]] = (int32_t) [[TILE_OFF_B_PTR]]; +// CHECK-NEXT: ptrdiff_t [[CB_ADDR_B_PTR:v[0-9]+]] = 
(ptrdiff_t) [[CB_ADDR_B_IDX]]; +// CHECK-NEXT: int32_t [[CB_ADDR_B:v[0-9]+]] = (int32_t) [[CB_ADDR_B_PTR]]; +// CHECK-NEXT: noc_async_read_tile([[TILE_OFF_B]], [[ACC_B]], [[CB_ADDR_B]]); +// CHECK: } +// CHECK-NEXT: } +// CHECK-NEXT: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); +// CHECK-NEXT: return; +// CHECK-NEXT: } // Reader kernel: reads A and B from DRAM, pushes to CB0 and CB1 func.func @reader_binary(%a: tensor<2x2x!ttcore.tile<32x32, f32>, #layout>, %b: tensor<2x2x!ttcore.tile<32x32, f32>, #layout>) @@ -326,17 +96,81 @@ func.func @reader_binary(%a: tensor<2x2x!ttcore.tile<32x32, f32>, #layout>, %b: func.return } +// CHECK-LABEL: // compute_fused +// CHECK: void kernel_main() { +// CHECK-DAG: int32_t [[TILES:.*]] = 4; +// CHECK-DAG: size_t [[BOUND:.*]] = 2; +// CHECK-DAG: size_t [[ONE:.*]] = 1; +// CHECK-DAG: size_t [[ZERO:.*]] = 0; + +// Wait for inputs from reader +// CHECK: cb_wait_front(get_compile_time_arg_val(0), [[TILES]]); +// CHECK-NEXT: cb_wait_front(get_compile_time_arg_val(1), [[TILES]]); + +// Initialize SFPU for CB data formats +// CHECK-NEXT: init_sfpu(get_compile_time_arg_val(0), get_compile_time_arg_val(2)); + +// Nested loops over 2x2 tile grid +// CHECK-NEXT: for (size_t [[I:.*]] = [[ZERO]]; [[I]] < [[BOUND]]; [[I]] += [[ONE]]) { +// CHECK-NEXT: for (size_t [[J:.*]] = [[ZERO]]; [[J]] < [[BOUND]]; [[J]] += [[ONE]]) { + +// Compute linear tile index: i * cols + j +// CHECK: size_t [[COL_SIZE:.*]] = 2; +// CHECK-NEXT: size_t [[IOFF:.*]] = [[I]] * [[COL_SIZE]]; +// CHECK-NEXT: size_t [[LINIDX:.*]] = [[IOFF]] + [[J]]; + +// Acquire DST registers (inside loop) +// CHECK-NEXT: tile_regs_acquire(); + +// Load tiles into DST (at first use: CB0 first, then CB1) +// CHECK-NEXT: copy_tile_init(get_compile_time_arg_val(0)); +// CHECK-NEXT: copy_tile(get_compile_time_arg_val(0), [[LINIDX]], [[ZERO]]); +// CHECK-NEXT: copy_tile_init(get_compile_time_arg_val(1)); +// CHECK-NEXT: copy_tile(get_compile_time_arg_val(1), [[LINIDX]], [[ONE]]); + +// 
Compute: A + B +// CHECK-NEXT: add_binary_tile_init(); +// CHECK-NEXT: add_binary_tile([[ZERO]], [[ONE]], [[ZERO]]); + +// Compute: exp(A + B) +// CHECK-NEXT: exp_tile_init(); +// CHECK-NEXT: exp_tile([[ZERO]]); + +// Reserve output CB2 (before commit) +// CHECK-NEXT: cb_reserve_back(get_compile_time_arg_val(2), [[TILES]]); + +// Synchronize DST registers before pack +// CHECK-NEXT: tile_regs_commit(); +// CHECK-NEXT: tile_regs_wait(); + +// Compute CB tile index: i * 2 + j (linearized row-major index) +// CHECK: size_t [[CB_OFF_I:v[0-9]+]] = [[I]] * {{.*}}; +// CHECK-NEXT: size_t [[CB_IDX:v[0-9]+]] = [[CB_OFF_I]] + [[J]]; + +// Pack result to output CB2 +// CHECK-NEXT: pack_tile([[ZERO]], get_compile_time_arg_val(2), [[CB_IDX]]); + +// Push to signal data ready +// CHECK-NEXT: cb_push_back(get_compile_time_arg_val(2), [[TILES]]); + +// Release DST registers (inside loop) +// CHECK-NEXT: tile_regs_release(); + +// End loops +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return; + // Compute kernel: reads from CB0, CB1, computes f(A+B), writes to CB2 func.func @compute_fused(%a: tensor<2x2x!ttcore.tile<32x32, f32>>, %b: tensor<2x2x!ttcore.tile<32x32, f32>>) -> tensor<2x2x!ttcore.tile<32x32, f32>> attributes {ttl.base_cta_index = 3 : i32, ttl.crta_indices = [], ttl.kernel_thread = #ttkernel.thread} { - %c0 = arith.constant 0 : index %output = tensor.empty() : tensor<2x2x!ttcore.tile<32x32, f32>> - %cb0 = ttl.bind_cb {cb_index = 0, block_count = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> - %cb1 = ttl.bind_cb {cb_index = 1, block_count = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> - %cb2 = ttl.bind_cb {cb_index = 2, block_count = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> + %cb0 = ttl.bind_cb {cb_index = 0, buffer_factor = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> + %cb1 = ttl.bind_cb {cb_index = 1, buffer_factor = 1} : !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1> + %cb2 = ttl.bind_cb {cb_index = 2, buffer_factor = 1} : !ttl.cb<[2, 2], 
!ttcore.tile<32x32, f32>, 1> // Wait for inputs from reader thread %a_ready = ttl.cb_wait %cb0 : <[2, 2], !ttcore.tile<32x32, f32>, 1> -> tensor<2x2x!ttcore.tile<32x32, f32>> @@ -344,7 +178,6 @@ func.func @compute_fused(%a: tensor<2x2x!ttcore.tile<32x32, f32>>, %output_cb = ttl.attach_cb %output, %cb2 : (tensor<2x2x!ttcore.tile<32x32, f32>>, !ttl.cb<[2, 2], !ttcore.tile<32x32, f32>, 1>) -> tensor<2x2x!ttcore.tile<32x32, f32>> // Fused computation: f(A + B) where f is exp - %result_view = ttl.cb_reserve %cb2 : <[2, 2], !ttcore.tile<32x32, f32>, 1> -> tensor<2x2x!ttcore.tile<32x32, f32>> %result = ttl.compute ins(%a_ready, %b_ready : tensor<2x2x!ttcore.tile<32x32, f32>>, tensor<2x2x!ttcore.tile<32x32, f32>>) @@ -354,18 +187,53 @@ func.func @compute_fused(%a: tensor<2x2x!ttcore.tile<32x32, f32>>, ^bb0(%a_tile: !ttcore.tile<32x32, f32>, %b_tile: !ttcore.tile<32x32, f32>, %out_tile: !ttcore.tile<32x32, f32>): - %i = ttl.iter_index 0 : index - %j = ttl.iter_index 1 : index - %sum = ttl.tile_add %a_tile, %b_tile into dst[%c0] : !ttcore.tile<32x32, f32>, !ttcore.tile<32x32, f32> -> !ttcore.tile<32x32, f32> - %exp = ttl.tile_exp %sum into dst[%c0] : !ttcore.tile<32x32, f32> -> !ttcore.tile<32x32, f32> - ttl.tile_store %exp, %result_view[%i, %j] from dst[%c0] : !ttcore.tile<32x32, f32>, tensor<2x2x!ttcore.tile<32x32, f32>> + %sum = ttl.tile_add %a_tile, %b_tile : !ttcore.tile<32x32, f32> + %exp = ttl.tile_exp %sum : !ttcore.tile<32x32, f32> + %result_view = ttl.cb_reserve %cb2 : <[2, 2], !ttcore.tile<32x32, f32>, 1> -> tensor<2x2x!ttcore.tile<32x32, f32>> + ttl.tile_store %exp, %result_view : !ttcore.tile<32x32, f32>, tensor<2x2x!ttcore.tile<32x32, f32>> ttl.cb_push %cb2 : <[2, 2], !ttcore.tile<32x32, f32>, 1> - ttl.yield + ttl.yield %exp : !ttcore.tile<32x32, f32> } -> tensor<2x2x!ttcore.tile<32x32, f32>> func.return %result : tensor<2x2x!ttcore.tile<32x32, f32>> } +// CHECK-LABEL: // writer_unary +// CHECK: void kernel_main() { +// CHECK-DAG: size_t [[ONE:.*]] = 1; +// 
CHECK-DAG: size_t [[BOUND:.*]] = 2; +// CHECK-DAG: size_t [[PAGE_SIZE:.*]] = 4096; +// CHECK-DAG: size_t [[ZERO:.*]] = 0; + +// Write output to DRAM from CB2 +// CHECK: int32_t [[RT_ARG_OUT:.*]] = get_common_arg_val([[ZERO]]); +// CHECK-NEXT: auto [[ARGS_OUT:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); +// CHECK-NEXT: TensorAccessor [[ACC_OUT:.*]] = TensorAccessor([[ARGS_OUT]], [[RT_ARG_OUT]], +// CHECK: int32_t [[CB2_PTR:.*]] = get_read_ptr(get_compile_time_arg_val(2)); +// Cast CB ptr to size_t for index arithmetic +// CHECK-NEXT: ptrdiff_t [[CB2_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB2_PTR]]; +// CHECK-NEXT: size_t [[CB2_PTR_IDX:v[0-9]+]] = (size_t) [[CB2_PTR_PTRDIFF]]; +// CHECK-NEXT: noc_async_write_set_trid({{.*}}, {{.*}}); +// CHECK-NEXT: for (size_t [[I_OUT:.*]] = [[ZERO]]; [[I_OUT]] < [[BOUND]]; [[I_OUT]] += [[ONE]]) { +// CHECK-NEXT: for (size_t [[J_OUT:.*]] = [[ZERO]]; [[J_OUT]] < [[BOUND]]; [[J_OUT]] += [[ONE]]) { +// Tile offset computation: i * cols + j +// CHECK: size_t [[TILE_OFF_OUT_Y:v[0-9]+]] = [[I_OUT]] * [[BOUND]]; +// CHECK-NEXT: size_t [[TILE_OFF_OUT_X:v[0-9]+]] = [[TILE_OFF_OUT_Y]] + [[J_OUT]]; +// CB address computation: cb_ptr + tile_offset * page_size (all size_t arithmetic) +// CHECK-NEXT: size_t [[BYTE_OFF_OUT:v[0-9]+]] = [[TILE_OFF_OUT_X]] * [[PAGE_SIZE]]; +// CHECK-NEXT: size_t [[CB_ADDR_OUT_IDX:v[0-9]+]] = [[CB2_PTR_IDX]] + [[BYTE_OFF_OUT]]; +// Cast to i32 for NOC operation +// CHECK-NEXT: ptrdiff_t [[TILE_OFF_OUT_PTR:v[0-9]+]] = (ptrdiff_t) [[TILE_OFF_OUT_X]]; +// CHECK-NEXT: int32_t [[TILE_OFF_OUT:v[0-9]+]] = (int32_t) [[TILE_OFF_OUT_PTR]]; +// CHECK-NEXT: ptrdiff_t [[CB_ADDR_OUT_PTR:v[0-9]+]] = (ptrdiff_t) [[CB_ADDR_OUT_IDX]]; +// CHECK-NEXT: int32_t [[CB_ADDR_OUT:v[0-9]+]] = (int32_t) [[CB_ADDR_OUT_PTR]]; +// CHECK-NEXT: noc_async_write_tile([[TILE_OFF_OUT]], [[ACC_OUT]], [[CB_ADDR_OUT]]); +// CHECK: } +// CHECK-NEXT: } +// CHECK-NEXT: noc_async_write_barrier_with_trid({{.*}}, {{.*}}); +// CHECK-NEXT: return; +// 
CHECK-NEXT: } + // Writer kernel: pops from CB2, writes to DRAM func.func @writer_unary(%out: tensor<2x2x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread} { diff --git a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir index ce2c2a6e2..64568756e 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -6,28 +6,30 @@ // Test: Batched DMA operations // Validates multiple async operations with proper barrier placement -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // CHECK: // dma_batched // CHECK: void kernel_main() { // CHECK-DAG: int32_t [[ZERO:v[0-9]+]] = 0; // CHECK-DAG: int32_t [[ADDR:v[0-9]+]] = 4096; -// CB wrappers declared at top of kernel -// CHECK: experimental::CircularBuffer [[CB0:.*]](get_compile_time_arg_val(0)); -// CHECK: experimental::CircularBuffer [[CB1:.*]](get_compile_time_arg_val(1)); -// Tensor 0: get runtime arg, create accessor, get CB write ptr, cast chain, async read +// Tensor 0: get runtime arg, create accessor, get CB write ptr, async read // CHECK: int32_t [[RT_ARG0:v[0-9]+]] = get_common_arg_val({{v[0-9]+}}); -// CHECK: auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 
0>(); +// CHECK: auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<2, 0>(); // CHECK: TensorAccessor [[ACCESSOR0:v[0-9]+]] = TensorAccessor([[ARGS0]], [[RT_ARG0]], [[ADDR]]); -// CHECK-NEXT: noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB0]].get_write_ptr()); -// Tensor 1: get runtime arg, create accessor, get CB write ptr, cast chain, async read +// CHECK: int32_t [[CB_PTR0:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB_PTR0]]); +// Tensor 1: get runtime arg, create accessor, get CB write ptr, async read // CHECK: int32_t [[RT_ARG1:v[0-9]+]] = get_common_arg_val({{v[0-9]+}}); -// CHECK: auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 1>(); +// CHECK: auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<3, 1>(); // CHECK: TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG1]], [[ADDR]]); -// CHECK-NEXT: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB1]].get_write_ptr()); -// Consecutive barriers deduplicated to single barrier. -// CHECK: noc_async_read_barrier(); +// CHECK: int32_t [[CB_PTR1:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(1)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]); +// Each wait lowers to a TRID barrier (no global barrier). 
+// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir b/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir index f8718498f..7325522ed 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_loop_multi_tile_nontrivial_cb.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -51,6 +51,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB0]].get_write_ptr(); // CHECK: size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_Y]] < [[TILES_2]]; [[TILE1_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE1_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_X]] < [[TILES_2]]; [[TILE1_X]] += [[TILE_STEP]]) { // CHECK: size_t [[TILE1_OFFSET_Y:v[0-9]+]] = [[TILE1_Y]] * [[TILES_2]]; @@ -66,7 +67,7 @@ // CHECK: noc_async_read_tile([[TILE1_OFFSET]], [[ACC1]], [[CB_ADDR1]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // Second copy: arg1 (96x64) → CB1, accessor with runtime arg index 1 // CHECK: int32_t [[RT_ARG2:v[0-9]+]] = get_common_arg_val([[TILE_STEP]]); // CHECK: auto [[ACC2_ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 1>(); @@ 
-74,6 +75,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB1]].get_write_ptr(); // CHECK: size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_Y]] < [[TILES_3]]; [[TILE2_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE2_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_X]] < [[TILES_2]]; [[TILE2_X]] += [[TILE_STEP]]) { // CHECK: size_t [[TILE2_OFFSET_Y:v[0-9]+]] = [[TILE2_Y]] * [[TILES_2]]; @@ -89,7 +91,7 @@ // CHECK: noc_async_read_tile([[TILE2_OFFSET]], [[ACC2]], [[CB_ADDR2]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } // CHECK: return; // CHECK-NEXT: } diff --git a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir index babaa0ffd..07c42d8c6 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -6,8 +6,8 @@ // Test: DMA operations inside loop // Validates scf.for → C++ for loop with DMA operations -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // CHECK: // dma_pipelined_loop // CHECK: void kernel_main() { @@ -16,20 +16,23 @@ // 
CHECK-DAG: size_t [[STEP:v[0-9]+]] = 1; // CHECK-DAG: size_t [[UB:v[0-9]+]] = 3; // CHECK-DAG: size_t [[LB:v[0-9]+]] = 0; -// CHECK: experimental::CircularBuffer [[CB:.*]](get_compile_time_arg_val(0)); -// Pre-loop copy: create accessor with runtime arg, get CB write ptr, cast chain +// Pre-loop copy: create accessor with runtime arg, get CB write ptr // CHECK: int32_t [[RT_ARG0:v[0-9]+]] = get_common_arg_val([[LB]]); -// CHECK: auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); +// CHECK: auto [[ARGS0:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR0:v[0-9]+]] = TensorAccessor([[ARGS0]], [[RT_ARG0]], [[ADDR]]); -// CHECK-NEXT: noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB]].get_write_ptr()); +// CHECK: int32_t [[CB_PTR0:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB_PTR0]]); // CHECK: for (size_t [[IV:i[0-9]+]] = [[LB]]; [[IV]] < [[UB]]; [[IV]] += [[STEP]]) { -// In-loop copy: create accessor reusing hoisted runtime arg, get CB write ptr, cast chain -// CHECK: auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); +// In-loop copy: create accessor using the same runtime arg and get CB write ptr +// CHECK: auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG0]], [[ADDR]]); -// CHECK-NEXT: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB]].get_write_ptr()); -// CHECK: noc_async_read_barrier(); +// CHECK: int32_t [[CB_PTR1:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: 
noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir index 4a787ee08..49cacd2a3 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -52,6 +52,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB0]].get_write_ptr(); // CHECK: size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Tile loops: for tile_y in 0..2, for tile_x in 0..2 // CHECK: for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[LB]]; [[TILE1_Y]] < [[TILES_BOUND]]; [[TILE1_Y]] += [[STEP]]) { // CHECK: for (size_t [[TILE1_X:[a-z][0-9]+]] = [[LB]]; [[TILE1_X]] < [[TILES_BOUND]]; [[TILE1_X]] += [[STEP]]) { @@ -76,6 +77,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB1]].get_write_ptr(); // CHECK: size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Separate tile loops (same bounds 0..2 x 0..2 but not merged with first copy) // CHECK: for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[LB]]; [[TILE2_Y]] < [[TILES_BOUND]]; [[TILE2_Y]] += [[STEP]]) { // CHECK: for (size_t [[TILE2_X:[a-z][0-9]+]] = [[LB]]; [[TILE2_X]] < 
[[TILES_BOUND]]; [[TILE2_X]] += [[STEP]]) { @@ -93,8 +95,9 @@ // CHECK: } // CHECK: } -// Consecutive barriers deduplicated to single barrier. -// CHECK: noc_async_read_barrier(); +// Each wait lowers to a TRID barrier (no global barrier). +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } // CHECK: return; // CHECK-NEXT: } diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir index bd01bae5d..b7555220a 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_read.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -26,6 +26,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB]].get_write_ptr(); // CHECK: size_t [[CB_PTR_IDX:v[0-9]+]] = (size_t) [[CB_PTR_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: for (size_t [[TILE_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE_Y]] < [[TILES_BOUND]]; [[TILE_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE_X]] < [[TILES_BOUND]]; [[TILE_X]] += [[TILE_STEP]]) { // CHECK: size_t [[TILE_OFFSET_Y:v[0-9]+]] = [[TILE_Y]] * [[TILES_BOUND]]; @@ -41,7 +42,7 @@ // CHECK: noc_async_read_tile([[TILE_OFFSET]], [[ACCESSOR]], [[CB_ADDR]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { 
diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir index 03b9cc6ba..04f99d991 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_same_layout_different_cb.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -35,6 +35,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR1_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB0]].get_write_ptr(); // CHECK: size_t [[CB_PTR1_IDX:v[0-9]+]] = (size_t) [[CB_PTR1_PTRDIFF]]; +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Generated tile loops iterate over tensor grid (2x2) // CHECK: for (size_t [[TILE1_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_Y]] < [[TILES_BOUND]]; [[TILE1_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE1_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE1_X]] < [[TILES_BOUND]]; [[TILE1_X]] += [[TILE_STEP]]) { @@ -51,7 +52,7 @@ // CHECK: noc_async_read_tile([[TILE1_OFFSET]], [[ACC1]], [[CB_ADDR1]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // Second copy: 64x64 (2x2 tiles) → CB [4,1] - SAME tensor layout, DIFFERENT CB shape // CHECK: int32_t [[RT_ARG2:v[0-9]+]] = get_common_arg_val([[TILE_STEP]]); @@ -60,6 +61,7 @@ // Cast CB ptr to size_t for index arithmetic // CHECK: ptrdiff_t [[CB_PTR2_PTRDIFF:v[0-9]+]] = (ptrdiff_t) [[CB1]].get_write_ptr(); // CHECK: size_t [[CB_PTR2_IDX:v[0-9]+]] = (size_t) [[CB_PTR2_PTRDIFF]]; 
+// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // Generated tile loops still iterate over tensor grid (2x2), not CB shape (4x1) // CHECK: for (size_t [[TILE2_Y:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_Y]] < [[TILES_BOUND]]; [[TILE2_Y]] += [[TILE_STEP]]) { // CHECK: for (size_t [[TILE2_X:[a-z][0-9]+]] = [[TILE_LB]]; [[TILE2_X]] < [[TILES_BOUND]]; [[TILE2_X]] += [[TILE_STEP]]) { @@ -76,7 +78,7 @@ // CHECK: noc_async_read_tile([[TILE2_OFFSET]], [[ACC2]], [[CB_ADDR2]]); // CHECK: } // CHECK: } -// CHECK: noc_async_read_barrier(); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } diff --git a/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir b/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir index 73bf8dd18..b06d21092 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_single_tile_read.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -6,19 +6,20 @@ // Test: Single-tile DMA read operation (tensor → CB) // Validates TTL→TTKernel→EmitC→C++ pipeline for basic single-tile DMA read -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // CHECK: // dma_single // CHECK: void kernel_main() { // CHECK-DAG: int32_t [[ZERO:v[0-9]+]] = 0; // CHECK-DAG: int32_t [[ADDR:v[0-9]+]] = 4096; -// CHECK: experimental::CircularBuffer [[CB:.*]](get_compile_time_arg_val(0)); // CHECK: int32_t [[RT_ARG:v[0-9]+]] = 
get_common_arg_val([[RT_ARG_IDX:v[0-9]+]]); -// CHECK: auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); +// CHECK: auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR:v[0-9]+]] = TensorAccessor([[ARGS]], [[RT_ARG]], [[ADDR]]); -// CHECK-NEXT: noc_async_read_tile([[ZERO]], [[ACCESSOR]], [[CB]].get_write_ptr()); -// CHECK: noc_async_read_barrier(); +// CHECK: int32_t [[CB_PTR:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR]], [[CB_PTR]]); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir index 1585aa72c..aab784f91 100644 --- a/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/loopback_full_single_tile.mlir @@ -1,4 +1,4 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir // RUN: FileCheck %s --input-file=%t.cpp @@ -6,8 +6,8 @@ // Test: Full loopback pattern (read from DRAM, write back to DRAM) // Validates complete pattern matching production kernel structure -#layout = #ttl.layout, - buffer = dram, grid = [1, 1], memory = interleaved> +#dram = #ttnn.buffer_type +#layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, > // CHECK: // loopback // CHECK: void kernel_main() { @@ -16,21 +16,23 @@ // CHECK-DAG: size_t [[STEP:v[0-9]+]] = 1; // CHECK-DAG: size_t 
[[UB:v[0-9]+]] = 4; // CHECK-DAG: size_t [[LB:v[0-9]+]] = 0; -// CB wrapper declared at top of kernel -// CHECK: experimental::CircularBuffer [[CB:.*]](get_compile_time_arg_val(0)); // CHECK: for (size_t [[IV:i[0-9]+]] = [[LB]]; [[IV]] < [[UB]]; [[IV]] += [[STEP]]) { -// Read: tensor -> CB (uses get_write_ptr for CB destination) +// Read: tensor → CB (uses get_write_ptr for CB destination) // CHECK: int32_t [[RT_ARG_R:v[0-9]+]] = get_common_arg_val([[LB]]); -// CHECK: auto [[ARGS_READ:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 0>(); +// CHECK: auto [[ARGS_READ:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACC_READ:v[0-9]+]] = TensorAccessor([[ARGS_READ]], [[RT_ARG_R]], [[ADDR]]); -// CHECK-NEXT: noc_async_read_tile([[ZERO]], [[ACC_READ]], [[CB]].get_write_ptr()); -// CHECK: noc_async_read_barrier(); -// Write: CB -> tensor (uses get_read_ptr for CB source) +// CHECK: int32_t [[CB_WRITE_PTR:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_read_tile([[ZERO]], [[ACC_READ]], [[CB_WRITE_PTR]]); +// CHECK: noc_async_read_barrier_with_trid({{.*}}, {{.*}}); +// Write: CB → tensor (uses get_read_ptr for CB source) // CHECK: int32_t [[RT_ARG_W:v[0-9]+]] = get_common_arg_val([[STEP]]); -// CHECK: auto [[ARGS_WRITE:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs(), 1>(); +// CHECK: auto [[ARGS_WRITE:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<2, 1>(); // CHECK: TensorAccessor [[ACC_WRITE:v[0-9]+]] = TensorAccessor([[ARGS_WRITE]], [[RT_ARG_W]], [[ADDR]]); -// CHECK-NEXT: noc_async_write_tile([[ZERO]], [[ACC_WRITE]], [[CB]].get_read_ptr()); -// CHECK: noc_async_write_barrier(); +// CHECK: int32_t [[CB_READ_PTR:v[0-9]+]] = get_read_ptr(get_compile_time_arg_val(0)); +// CHECK: noc_async_write_set_trid({{.*}}, {{.*}}); +// CHECK: noc_async_write_tile([[ZERO]], [[ACC_WRITE]], [[CB_READ_PTR]]); +// CHECK: 
noc_async_write_barrier_with_trid({{.*}}, {{.*}}); // CHECK: } // CHECK: return; // CHECK-NEXT: } From 97a703190a7b75d7bb18fac4c262d315d72a95ba Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Tue, 5 May 2026 06:54:31 +0200 Subject: [PATCH 4/7] [test] Parameterize ME2E tests with use_trid_barriers option Add use_trid_barriers to E2EConfig and TestConfig to enable runtime testing of both barrier modes: - E2EConfig.use_trid_barriers controls pipeline pass option - TestConfig includes use_trid_barriers for test ID disambiguation - Pipeline builder forwards option to convert-ttl-to-ttkernel - Runner includes use_trid_barriers in kernel cache key - CONFIGS includes one TRID-enabled config for coverage Test IDs now include _trid suffix when use_trid_barriers=True to ensure unique pytest node IDs. --- test/me2e/README.md | 3 +++ test/me2e/builder/pipeline.py | 11 ++++++++++- test/me2e/config.py | 3 +++ test/me2e/config_specs.py | 4 ++++ test/me2e/runner.py | 3 +++ 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/test/me2e/README.md b/test/me2e/README.md index 49fae419b..ac2263ed8 100644 --- a/test/me2e/README.md +++ b/test/me2e/README.md @@ -441,8 +441,11 @@ class E2EConfig: block_count: int = 2 # 1=single, 2=double buffer (default) memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED buffer_type: BufferType = BufferType.DRAM + use_trid_barriers: bool = False # TRID-aware DMA barriers (pass option) ``` +`use_trid_barriers` enables the convert-ttl-to-ttkernel pass option `use-trid-barriers=1` for runtime coverage of TRID barrier lowering; some ME2E configs (e.g. in CONFIGS) set it to True. 
+ ### Memory Configuration The `memory_layout` and `buffer_type` fields control MLIR layout attribute generation: diff --git a/test/me2e/builder/pipeline.py b/test/me2e/builder/pipeline.py index 520476466..18c8e2de9 100644 --- a/test/me2e/builder/pipeline.py +++ b/test/me2e/builder/pipeline.py @@ -20,6 +20,7 @@ def compile_ttl_to_ttkernel( device: Optional[Any] = None, maximize_dst: bool = True, enable_fpu_binary_ops: bool = True, + use_trid_barriers: bool = False, ) -> Module: """ Run the TTL-to-TTKernel pass pipeline on the module. @@ -31,6 +32,8 @@ def compile_ttl_to_ttkernel( device: Optional TTNN device (unused, kept for API compat). maximize_dst: Enable DST maximization (subblocking + scheduling). enable_fpu_binary_ops: Enable FPU binary op detection (add_tiles, etc). + use_trid_barriers: If True, use TRID-aware DMA barriers (pass option + use-trid-barriers=1). Default False matches pass default. Returns: Compiled module with TTKernel/EmitC ops. @@ -57,6 +60,12 @@ def compile_ttl_to_ttkernel( func_passes.append("ttl-schedule-operations") func_pipeline = ",".join(func_passes) + ttkernel_pass = ( + "convert-ttl-to-ttkernel{use-trid-barriers=1}" + if use_trid_barriers + else "convert-ttl-to-ttkernel" + ) + pipeline_str = ( f"builtin.module(" f"func.func({func_pipeline})," @@ -64,7 +73,7 @@ def compile_ttl_to_ttkernel( f"func.func(ttl-annotate-cb-associations)," f"ttl-verify-pipenet-guards," f"ttl-erase-pipenet-scopes," - f"convert-ttl-to-ttkernel," + f"{ttkernel_pass}," f"ttkernel-insert-inits," f"canonicalize," f"cse," diff --git a/test/me2e/config.py b/test/me2e/config.py index e9df38794..7f10057f3 100644 --- a/test/me2e/config.py +++ b/test/me2e/config.py @@ -59,6 +59,9 @@ class E2EConfig: memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED buffer_type: BufferType = BufferType.DRAM + # TTL-to-TTKernel: use TRID-aware DMA barriers (default matches pass default). 
+ use_trid_barriers: bool = False + @property def num_tiles(self) -> int: """Total number of tiles in the grid.""" diff --git a/test/me2e/config_specs.py b/test/me2e/config_specs.py index eeece2b18..db1c662a2 100644 --- a/test/me2e/config_specs.py +++ b/test/me2e/config_specs.py @@ -114,6 +114,7 @@ class TestConfig: num_tiles: int = 64 block_count: int = 2 memory_layout: MemoryLayout = MemoryLayout.INTERLEAVED + use_trid_barriers: bool = False # Pipeline options. maximize_dst: bool = True @@ -165,12 +166,15 @@ def to_e2e_config(self) -> E2EConfig: dtype=self.dtype, block_count=self.block_count, memory_layout=self.memory_layout, + use_trid_barriers=self.use_trid_barriers, ) CONFIGS = [ # Single tile config. TestConfig(num_tiles=1, block_h=1, block_w=1), # 1x1 grid (single tile) + # Single tile with TRID barriers (runtime coverage for use-trid-barriers). + TestConfig(num_tiles=1, block_h=1, block_w=1, use_trid_barriers=True), # Multi-tile configs with loop generation. TestConfig(num_tiles=4, block_h=2, block_w=2), # 2x2 grid (4 tiles) # Maximize-DST disabled: no subblocking or scheduling (basic loop lowering). diff --git a/test/me2e/runner.py b/test/me2e/runner.py index 0b21e6529..bbc85fa22 100644 --- a/test/me2e/runner.py +++ b/test/me2e/runner.py @@ -48,6 +48,7 @@ def get_compute_kernel( cache_key = ( f"{op.name}_{op.ttl_op}_{config.block_h}x{config.block_w}_{config.dtype}" f"_dst{config.maximize_dst}_fpu{config.enable_fpu_binary_ops}" + f"_trid{int(config.use_trid_barriers)}" ) if cache_key in _kernel_cache: return _kernel_cache[cache_key] @@ -62,6 +63,7 @@ def get_compute_kernel( device, maximize_dst=config.maximize_dst, enable_fpu_binary_ops=config.enable_fpu_binary_ops, + use_trid_barriers=config.use_trid_barriers, ) # Translate to C++ kernels. 
@@ -122,6 +124,7 @@ def run_compute_test( device, maximize_dst=config.maximize_dst, enable_fpu_binary_ops=config.enable_fpu_binary_ops, + use_trid_barriers=config.use_trid_barriers, ) noc_kernels, compute_kernel_spec = translate_module_to_kernels(compiled_module) From f8da7a72707070014299d36bdb43ada0c65e1f37 Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Tue, 5 May 2026 06:54:32 +0200 Subject: [PATCH 5/7] [test] Enable TRID barriers in Python lit tests Update Python hardware execution tests to use use_trid_barriers=True for consistent TRID-mode testing. These tests exercise the full compilation and execution path with TRID-aware DMA barriers. --- test/python/many_fused_ops.py | 2 +- test/python/simple_add.py | 2 +- test/python/simple_add_dram.py | 2 +- test/python/simple_add_with_stmt.py | 2 +- test/python/simple_fused.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/python/many_fused_ops.py b/test/python/many_fused_ops.py index 325fbfc4e..88f87aebe 100644 --- a/test/python/many_fused_ops.py +++ b/test/python/many_fused_ops.py @@ -158,7 +158,7 @@ def dm_write(): # CHECK-CPP: tile_regs_wait(); # Pack result -# CHECK-CPP: pack_tile( +# CHECK-CPP: pack_tile( # Pop inputs, push output # CHECK-CPP: [[CB0]].pop_front( diff --git a/test/python/simple_add.py b/test/python/simple_add.py index 11c7ee54b..e5c33640a 100644 --- a/test/python/simple_add.py +++ b/test/python/simple_add.py @@ -165,7 +165,7 @@ def dm_write(): # CHECK-CPP: tile_regs_wait(); # Pack result -# CHECK-CPP: pack_tile( +# CHECK-CPP: pack_tile( # Release regs # CHECK-CPP: tile_regs_release(); diff --git a/test/python/simple_add_dram.py b/test/python/simple_add_dram.py index 6170ef0c0..0cebe0789 100644 --- a/test/python/simple_add_dram.py +++ b/test/python/simple_add_dram.py @@ -148,7 +148,7 @@ def dm_write(): # CHECK-CPP: add_binary_tile( # CHECK-CPP: tile_regs_commit(); # CHECK-CPP: tile_regs_wait(); -# CHECK-CPP: pack_tile( +# CHECK-CPP: pack_tile( # CHECK-CPP: 
tile_regs_release(); # DFB finalization diff --git a/test/python/simple_add_with_stmt.py b/test/python/simple_add_with_stmt.py index fa716a6f9..172f5d0cc 100644 --- a/test/python/simple_add_with_stmt.py +++ b/test/python/simple_add_with_stmt.py @@ -152,7 +152,7 @@ def dm_write(): # CHECK-CPP: add_binary_tile( # CHECK-CPP: tile_regs_commit(); # CHECK-CPP: tile_regs_wait(); -# CHECK-CPP: pack_tile( +# CHECK-CPP: pack_tile( # CHECK-CPP: tile_regs_release(); # Push output, pop inputs (reverse order from 'with' exit) diff --git a/test/python/simple_fused.py b/test/python/simple_fused.py index 2b3a15bcc..79204c3ac 100644 --- a/test/python/simple_fused.py +++ b/test/python/simple_fused.py @@ -130,7 +130,7 @@ def dm_write(): # CHECK-CPP: tile_regs_wait(); # Pack result -# CHECK-CPP: pack_tile( +# CHECK-CPP: pack_tile( # Release regs # CHECK-CPP: tile_regs_release(); From 9bf88612a7c7f918ff299b7f454b93f917f61460 Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Tue, 5 May 2026 06:54:32 +0200 Subject: [PATCH 6/7] [ttl] Address PR267 review comments (TRID lowering) - Remove unused releaseTrid; use SmallVector + trailing underscore in TridAllocator - Replace tridAllocator check with assert; remove allocateTrid in non-TRID branch - Add emitNocBarrier helper; assert i32 for handle in WaitLowering - cb_to_tensor_single_tile_write: default RUN + TRID RUN with TRID: prefix - dma_loop_single_tile: relax CHECK for in-loop runtime arg variable - config_specs: add multi-tile config with use_trid_barriers=True Addresses: #87 --- .../TTL/Transforms/ConvertTTLToTTKernel.cpp | 105 ++++++++++-------- test/me2e/config_specs.py | 2 + .../cb_to_tensor_single_tile_write.mlir | 22 +++- .../TTLToCpp/dma_loop_single_tile.mlir | 4 +- 4 files changed, 78 insertions(+), 55 deletions(-) diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp index 667732861..4764723fb 100644 --- a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp +++ 
b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp @@ -38,11 +38,14 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/JSON.h" #include "llvm/Support/raw_ostream.h" #include +#include + namespace mlir::tt::ttl { #define GEN_PASS_DEF_TTLCONVERTTTLTOTTKERNEL #include "ttlang/Dialect/TTL/Passes.h.inc" @@ -508,6 +511,36 @@ static Value makeZeroI8(Location loc, ConversionPatternRewriter &rewriter) { return rewriter.create<arith::ConstantIntOp>(loc, 0, 8); } +/// Emits NOC barrier: TRID-scoped (barrier_with_trid) when useTridBarriers and +/// tridVal present, otherwise global barrier. Returns failure() for unsupported +/// TransferKind. +static LogicalResult emitNocBarrier(ConversionPatternRewriter &rewriter, + Location loc, TransferKind kind, + std::optional<Value> tridVal, + bool useTridBarriers) { + Value nocVal = makeZeroI8(loc, rewriter); + if (useTridBarriers && tridVal) { + if (kind == TransferKind::read) { + rewriter.create<ttk::NocAsyncReadBarrierWithTridOp>(loc, *tridVal, + nocVal); + } else if (kind == TransferKind::write) { + rewriter.create<ttk::NocAsyncWriteBarrierWithTridOp>(loc, *tridVal, + nocVal); + } else { + return failure(); + } + } else { + if (kind == TransferKind::read) { + rewriter.create<ttk::NocAsyncReadBarrierOp>(loc); + } else if (kind == TransferKind::write) { + rewriter.create<ttk::NocAsyncWriteBarrierOp>(loc); + } else { + return failure(); + } + } + return success(); +} + static std::optional getTransferKindFromHandleType(Type t) { auto transferHandle = llvm::dyn_cast(t); if (!transferHandle) { @@ -686,23 +719,23 @@ class TridAllocator { }; AllocResult allocateTrid(TransferKind direction) { - uint32_t trid = nextTrid % kNumTrids; + uint32_t trid = nextTrid_ % kNumTrids; AllocResult result{trid, std::nullopt}; - if (outstanding[trid]) { + if (outstanding_[trid]) { result.evictDirection = direction_[trid]; } - outstanding[trid] = true; + outstanding_[trid] = true; direction_[trid] = direction; - ++nextTrid; + ++nextTrid_; return result; } - void
releaseTrid(uint32_t trid) { outstanding[trid % kNumTrids] = false; } - private: - uint32_t nextTrid = 0; - bool outstanding[kNumTrids] = {}; - TransferKind direction_[kNumTrids] = {}; + uint32_t nextTrid_ = 0; + llvm::SmallVector outstanding_ = + llvm::SmallVector(kNumTrids, false); + llvm::SmallVector direction_ = + llvm::SmallVector(kNumTrids, TransferKind::read); }; /// Direction of a tensor<->CB tile copy for NOC operations. @@ -923,9 +956,7 @@ struct CopyLowering : OpConversionPattern { }); } - if (!tridAllocator) { - return rewriter.notifyMatchFailure(op, "missing TRID allocator"); - } + assert(tridAllocator && "CopyLowering requires TRID allocator"); TransferKind direction = (srcIsSlice && dstIsCB) ? TransferKind::read : TransferKind::write; @@ -938,20 +969,17 @@ struct CopyLowering : OpConversionPattern { if (allocResult.evictDirection) { Value evictTrid = rewriter.create<arith::ConstantIntOp>( op.getLoc(), allocResult.trid, 32); - Value nocVal = makeZeroI8(op.getLoc(), rewriter); - if (*allocResult.evictDirection == TransferKind::read) { - rewriter.create<ttk::NocAsyncReadBarrierWithTridOp>( - op.getLoc(), evictTrid, nocVal); - } else { - rewriter.create<ttk::NocAsyncWriteBarrierWithTridOp>( - op.getLoc(), evictTrid, nocVal); + if (failed(emitNocBarrier(rewriter, op.getLoc(), + *allocResult.evictDirection, + std::optional(evictTrid), + /*useTridBarriers=*/true))) { + return rewriter.notifyMatchFailure(op, "unsupported evict direction"); } } tridVal = rewriter.create<arith::ConstantIntOp>(op.getLoc(), allocResult.trid, 32); } else { - // In global-barrier mode, allocate but direction does not matter. - tridAllocator->allocateTrid(direction); + // Global-barrier mode: no TRID tracking; handle is always constant 0.
tridVal = rewriter.create<arith::ConstantIntOp>(op.getLoc(), 0, 32); } @@ -1004,32 +1032,15 @@ struct WaitLowering : OpConversionPattern { rewriter.eraseOp(op); return success(); } - if (useTridBarriers) { - Value tridVal = adaptor.getXf(); // i32 (type converter guarantees this) - assert(tridVal.getType().isInteger(32) && - "transfer handle must be type-converted to i32 before ttl.wait"); - Value nocVal = makeZeroI8(op.getLoc(), rewriter); - if (*kind == TransferKind::read) { - rewriter.create<ttk::NocAsyncReadBarrierWithTridOp>(op.getLoc(), - tridVal, nocVal); - } else if (*kind == TransferKind::write) { - rewriter.create<ttk::NocAsyncWriteBarrierWithTridOp>(op.getLoc(), - tridVal, nocVal); - } else { - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "unsupported TransferKind for ttl.wait lowering"; - }); - } - } else { - if (*kind == TransferKind::read) { - rewriter.create<ttk::NocAsyncReadBarrierOp>(op.getLoc()); - } else if (*kind == TransferKind::write) { - rewriter.create<ttk::NocAsyncWriteBarrierOp>(op.getLoc()); - } else { - return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { - diag << "unsupported TransferKind for ttl.wait lowering"; - }); - } + Value tridVal = adaptor.getXf(); // i32 (type converter guarantees this) + assert(tridVal.getType().isInteger(32) && + "transfer handle must be type-converted to i32 before ttl.wait"); + if (failed(emitNocBarrier(rewriter, op.getLoc(), *kind, + std::optional(tridVal), + useTridBarriers))) { + return rewriter.notifyMatchFailure(op, [&](Diagnostic &diag) { + diag << "unsupported TransferKind for ttl.wait lowering"; + }); } rewriter.eraseOp(op); return success(); diff --git a/test/me2e/config_specs.py b/test/me2e/config_specs.py index db1c662a2..770e49c29 100644 --- a/test/me2e/config_specs.py +++ b/test/me2e/config_specs.py @@ -177,6 +177,8 @@ def to_e2e_config(self) -> E2EConfig: TestConfig(num_tiles=1, block_h=1, block_w=1, use_trid_barriers=True), # Multi-tile configs with loop generation. TestConfig(num_tiles=4, block_h=2, block_w=2), # 2x2 grid (4 tiles) + # Multi-tile with TRID barriers.
+ TestConfig(num_tiles=4, block_h=2, block_w=2, use_trid_barriers=True), # Maximize-DST disabled: no subblocking or scheduling (basic loop lowering). TestConfig(num_tiles=4, block_h=2, block_w=2, maximize_dst=False), # SFPU path: FPU binary detection disabled (all binary ops use copy_tile + SFPU). diff --git a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir index 60398f35d..0d4607604 100644 --- a/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir +++ b/test/ttlang/Translate/TTLToCpp/cb_to_tensor_single_tile_write.mlir @@ -1,10 +1,14 @@ -// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.ttkernel.mlir +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --canonicalize %s -o %t.ttkernel.mlir // RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.ttkernel.mlir -o %t.emitc.mlir // RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.cpp %t.emitc.mlir -// RUN: FileCheck %s --input-file=%t.cpp +// RUN: FileCheck %s --check-prefix=CHECK --input-file=%t.cpp +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline="use-trid-barriers=1" --canonicalize %s -o %t.trid.ttkernel.mlir +// RUN: ttlang-opt --allow-unregistered-dialect --convert-ttkernel-to-emitc %t.trid.ttkernel.mlir -o %t.trid.emitc.mlir +// RUN: ttlang-translate --allow-unregistered-dialect --ttkernel-to-cpp -o %t.trid.cpp %t.trid.emitc.mlir +// RUN: FileCheck %s --check-prefix=TRID --input-file=%t.trid.cpp -// Test: Single DMA write operation (CB → tensor) -// Validates write barrier placement and ensures no read barrier +// Test: Single DMA write operation (CB -> tensor) +// Default RUN verifies global barrier lowering; TRID RUN verifies TRID-scoped barriers. 
#dram = #ttnn.buffer_type<dram> #layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x1x!ttcore.tile<32x32, f32>, #dram>, <interleaved>> @@ -17,11 +21,17 @@ // CHECK: auto [[ARGS:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); // CHECK: TensorAccessor [[ACCESSOR:v[0-9]+]] = TensorAccessor([[ARGS]], [[RT_ARG]], [[ADDR]]); // CHECK: int32_t [[CB_PTR:v[0-9]+]] = get_read_ptr(get_compile_time_arg_val(0)); -// CHECK: noc_async_write_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_write_tile([[ZERO]], [[ACCESSOR]], [[CB_PTR]]); -// CHECK: noc_async_write_barrier_with_trid({{.*}}, {{.*}}); +// CHECK: noc_async_write_barrier(); // CHECK: return; // CHECK-NEXT: } +// CHECK-NOT: set_trid +// CHECK-NOT: barrier_with_trid + +// TRID: noc_async_write_set_trid({{.*}}, {{.*}}); +// TRID: noc_async_write_tile( +// TRID: noc_async_write_barrier_with_trid({{.*}}, {{.*}}); + module { func.func @cb_to_tensor(%arg0: tensor<1x1x!ttcore.tile<32x32, f32>, #layout>) attributes {ttl.base_cta_index = 1 : i32, ttl.crta_indices = [0], ttl.kernel_thread = #ttkernel.thread<noc>} { %c0 = arith.constant 0 : index diff --git a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir index 07c42d8c6..99757df68 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_loop_single_tile.mlir @@ -24,9 +24,9 @@ // CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR0]], [[CB_PTR0]]); // CHECK: for (size_t [[IV:i[0-9]+]] = [[LB]]; [[IV]] < [[UB]]; [[IV]] += [[STEP]]) { -// In-loop copy: create accessor using the same runtime arg and get CB write ptr +// In-loop copy: create accessor using runtime arg and get CB write ptr // CHECK: auto [[ARGS1:tensor_accessor_args_[0-9]+]] = TensorAccessorArgs<1, 0>(); -// CHECK: TensorAccessor [[ACCESSOR1:v[0-9]+]] = TensorAccessor([[ARGS1]], [[RT_ARG0]], [[ADDR]]); +// CHECK: TensorAccessor [[ACCESSOR1:v[0-9]+]] =
TensorAccessor([[ARGS1]], [[RT_ARG_LOOP:v[0-9]+]], [[ADDR]]); // CHECK: int32_t [[CB_PTR1:v[0-9]+]] = get_write_ptr(get_compile_time_arg_val(0)); // CHECK: noc_async_read_set_trid({{.*}}, {{.*}}); // CHECK: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]); From def5a1fda1a4cf84ce25171e44a3b1864e29c417 Mon Sep 17 00:00:00 2001 From: Ilia Shutov Date: Wed, 13 May 2026 05:57:07 +0200 Subject: [PATCH 7/7] [ttl] Replace deprecated rewriter.create in copy lowering Migrate TRID/barrier and constant op construction in ConvertTTLToTTKernel to the modern Op::create API requested in PR267 review. --- .../TTL/Transforms/ConvertTTLToTTKernel.cpp | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp index 4764723fb..3e502eb4e 100644 --- a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp +++ b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp @@ -508,7 +508,7 @@ static CopyOperandKind classifyOperand(Value v) { } static Value makeZeroI8(Location loc, ConversionPatternRewriter &rewriter) { - return rewriter.create<arith::ConstantIntOp>(loc, 0, 8); + return arith::ConstantIntOp::create(rewriter, loc, 0, 8); } /// Emits NOC barrier: TRID-scoped (barrier_with_trid) when useTridBarriers and @@ -521,19 +521,19 @@ static LogicalResult emitNocBarrier(ConversionPatternRewriter &rewriter, Value nocVal = makeZeroI8(loc, rewriter); if (useTridBarriers && tridVal) { if (kind == TransferKind::read) { - rewriter.create<ttk::NocAsyncReadBarrierWithTridOp>(loc, *tridVal, - nocVal); + ttk::NocAsyncReadBarrierWithTridOp::create(rewriter, loc, *tridVal, + nocVal); } else if (kind == TransferKind::write) { - rewriter.create<ttk::NocAsyncWriteBarrierWithTridOp>(loc, *tridVal, - nocVal); + ttk::NocAsyncWriteBarrierWithTridOp::create(rewriter, loc, *tridVal, + nocVal); } else { return failure(); } } else { if (kind == TransferKind::read) { - rewriter.create<ttk::NocAsyncReadBarrierOp>(loc); + ttk::NocAsyncReadBarrierOp::create(rewriter, loc); } else if (kind ==
TransferKind::write) { - rewriter.create<ttk::NocAsyncWriteBarrierOp>(loc); + ttk::NocAsyncWriteBarrierOp::create(rewriter, loc); } else { return failure(); } @@ -823,9 +823,9 @@ static LogicalResult lowerTensorCBCopy(CopyOp op, TensorSliceOp sliceOp, if (useTridBarriers) { Value nocVal = makeZeroI8(loc, rewriter); if (isRead) { - rewriter.create<ttk::NocAsyncReadSetTridOp>(loc, tridVal, nocVal); + ttk::NocAsyncReadSetTridOp::create(rewriter, loc, tridVal, nocVal); } else { - rewriter.create<ttk::NocAsyncWriteSetTridOp>(loc, tridVal, nocVal); + ttk::NocAsyncWriteSetTridOp::create(rewriter, loc, tridVal, nocVal); } } @@ -967,8 +967,8 @@ struct CopyLowering : OpConversionPattern { // If this TRID was still outstanding, emit a barrier to drain the old // transfer before reusing the TRID. if (allocResult.evictDirection) { - Value evictTrid = rewriter.create<arith::ConstantIntOp>( - op.getLoc(), allocResult.trid, 32); + Value evictTrid = arith::ConstantIntOp::create( + rewriter, op.getLoc(), allocResult.trid, 32); if (failed(emitNocBarrier(rewriter, op.getLoc(), *allocResult.evictDirection, std::optional(evictTrid), @@ -976,11 +976,12 @@ struct CopyLowering : OpConversionPattern { return rewriter.notifyMatchFailure(op, "unsupported evict direction"); } } - tridVal = rewriter.create<arith::ConstantIntOp>(op.getLoc(), - allocResult.trid, 32); + tridVal = + arith::ConstantIntOp::create(rewriter, op.getLoc(), allocResult.trid, + 32); } else { // Global-barrier mode: no TRID tracking; handle is always constant 0. - tridVal = rewriter.create<arith::ConstantIntOp>(op.getLoc(), 0, 32); + tridVal = arith::ConstantIntOp::create(rewriter, op.getLoc(), 0, 32); } // TensorSlice -> CB: read tiles from tensor into circular buffer.