From 15d21512c6c9dfeaee9dd7ccc65c6a88e792aaaf Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Mon, 22 Jun 2026 10:47:43 +0800 Subject: [PATCH] fix(sync): keep TLoad WAW barriers for same buffer --- .../InsertSync/InsertSyncAnalysis.cpp | 50 +++++++++++-- ...667_distinct_subviews_no_mte2_barrier.pto} | 8 +-- ..._waw_same_buffer_requires_mte2_barrier.pto | 71 +++++++++++++++++++ .../samples/Sync/test_if_else_tile_result.pto | 2 +- 4 files changed, 121 insertions(+), 10 deletions(-) rename test/lit/pto/{issue667_tload_overlap_no_mte2_barrier.pto => issue667_distinct_subviews_no_mte2_barrier.pto} (92%) create mode 100644 test/lit/pto/tload_waw_same_buffer_requires_mte2_barrier.pto diff --git a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp index 0d93b1d61c..4bc3282d23 100644 --- a/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp +++ b/lib/PTO/Transforms/InsertSync/InsertSyncAnalysis.cpp @@ -184,13 +184,47 @@ static bool isTLoadCompound(const CompoundInstanceElement *compound) { return compound && compound->elementOp && isa(compound->elementOp); } -static bool isTLoadToTLoadWAWExempt(const CompoundInstanceElement *nowCompound, - const CompoundInstanceElement *frontCompound) { +static bool isMTE2TLoadPair(const CompoundInstanceElement *nowCompound, + const CompoundInstanceElement *frontCompound) { return isTLoadCompound(nowCompound) && isTLoadCompound(frontCompound) && nowCompound->kPipeValue == PipelineType::PIPE_MTE2 && frontCompound->kPipeValue == PipelineType::PIPE_MTE2; } +static std::optional getDirectSubviewSource(Value value) { + if (!value) return std::nullopt; + if (auto op = value.getDefiningOp()) + return op.getSource(); + if (auto op = value.getDefiningOp()) + return op.getSource(); + return std::nullopt; +} + +static bool isDistinctSiblingSubviewPair(const BaseMemInfo *lhs, + const BaseMemInfo *rhs) { + if (!lhs || !rhs) return false; + if (lhs->baseBuffer == rhs->baseBuffer) return false; + auto lhsSource = getDirectSubviewSource(lhs->baseBuffer); + auto rhsSource = getDirectSubviewSource(rhs->baseBuffer); + return lhsSource && rhsSource && *lhsSource == *rhsSource; +} + +static bool isTLoadToTLoadWAWExempt( + const CompoundInstanceElement *nowCompound, + const CompoundInstanceElement *frontCompound, + const DepBaseMemInfoPairVec &wawDepVec) { + if (!isMTE2TLoadPair(nowCompound, frontCompound)) return false; + + // PTOAS treats sibling subview SSA values of the same direct parent as + // non-overlapping by IR contract. Keep the exemption limited to that exact + // provenance so same-tile, nested-view, root-vs-view, different-parent, and + // unknown aliases still get an MTE2 pipe barrier. + return !wawDepVec.empty() && + llvm::all_of(wawDepVec, [](const auto &pair) { + return isDistinctSiblingSubviewPair(pair.first, pair.second); + }); +} + // ============================================================================== // 1. Entry Point // ============================================================================== @@ -476,9 +510,15 @@ bool InsertSyncAnalysis::IsMemInfoHasDependency( depBaseMemInfosVec); hasDependency |= memAnalyzer_.DepBetween(nowCompound->defVec, frontCompound->useVec, depBaseMemInfosVec); - if (!isTLoadToTLoadWAWExempt(nowCompound, frontCompound)) { - hasDependency |= memAnalyzer_.DepBetween(nowCompound->defVec, frontCompound->defVec, - depBaseMemInfosVec); + + DepBaseMemInfoPairVec wawDepVec; + bool hasWAWDependency = + memAnalyzer_.DepBetween(nowCompound->defVec, frontCompound->defVec, + wawDepVec); + if (hasWAWDependency && + !isTLoadToTLoadWAWExempt(nowCompound, frontCompound, wawDepVec)) { + depBaseMemInfosVec.append(wawDepVec.begin(), wawDepVec.end()); + hasDependency = true; } // Special hazard: ACC (L0C) read/read cross-pipe ordering. diff --git a/test/lit/pto/issue667_tload_overlap_no_mte2_barrier.pto b/test/lit/pto/issue667_distinct_subviews_no_mte2_barrier.pto similarity index 92% rename from test/lit/pto/issue667_tload_overlap_no_mte2_barrier.pto rename to test/lit/pto/issue667_distinct_subviews_no_mte2_barrier.pto index fb51009558..571e573604 100644 --- a/test/lit/pto/issue667_tload_overlap_no_mte2_barrier.pto +++ b/test/lit/pto/issue667_distinct_subviews_no_mte2_barrier.pto @@ -1,11 +1,11 @@ // RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s | FileCheck %s module { - func.func @issue667_tload_overlap_does_not_need_mte2_barrier( + func.func @issue667_distinct_subviews_do_not_need_mte2_barrier( %src0: memref<16x128xf32, #pto.address_space>, %src1: memref<16x128xf32, #pto.address_space>) { %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index %c0_i64 = arith.constant 0 : i64 %tile = pto.alloc_tile addr = %c0_i64 : @@ -14,7 +14,7 @@ module { %s0 = pto.subview %tile[%c0, %c0] sizes [16, 128] : !pto.tile_buf -> !pto.tile_buf - %s1 = pto.subview %tile[%c0, %c64] sizes [16, 128] : + %s1 = pto.subview %tile[%c0, %c128] sizes [16, 128] : !pto.tile_buf -> !pto.tile_buf @@ -45,7 +45,7 @@ module { } } -// CHECK-LABEL: AICORE void issue667_tload_overlap_does_not_need_mte2_barrier( +// CHECK-LABEL: AICORE void issue667_distinct_subviews_do_not_need_mte2_barrier( // CHECK: TLOAD( // CHECK-NOT: pipe_barrier(PIPE_MTE2); // CHECK: TLOAD( diff --git a/test/lit/pto/tload_waw_same_buffer_requires_mte2_barrier.pto b/test/lit/pto/tload_waw_same_buffer_requires_mte2_barrier.pto new file mode 100644 index 0000000000..6059ce4096 --- /dev/null +++ b/test/lit/pto/tload_waw_same_buffer_requires_mte2_barrier.pto @@ -0,0 +1,71 @@ +// RUN: ptoas --pto-arch=a3 --pto-level=level3 --enable-insert-sync %s | FileCheck %s + +module { + func.func @tload_waw_same_tile_requires_mte2_barrier( + %src0: memref<16x128xf32, #pto.address_space>, + %src1: memref<16x128xf32, #pto.address_space>) { + %c0_i64 = arith.constant 0 : i64 + %tile = pto.alloc_tile addr = %c0_i64 : + !pto.tile_buf + + pto.tload ins(%src0 : memref<16x128xf32, #pto.address_space>) + outs(%tile : !pto.tile_buf) + pto.tload ins(%src1 : memref<16x128xf32, #pto.address_space>) + outs(%tile : !pto.tile_buf) + return + } + + func.func @tload_waw_same_subview_requires_mte2_barrier( + %src0: memref<16x128xf32, #pto.address_space>, + %src1: memref<16x128xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %tile = pto.alloc_tile addr = %c0_i64 : + !pto.tile_buf + %view = pto.subview %tile[%c0, %c0] sizes [16, 128] : + !pto.tile_buf + -> !pto.tile_buf + + pto.tload ins(%src0 : memref<16x128xf32, #pto.address_space>) + outs(%view : !pto.tile_buf) + pto.tload ins(%src1 : memref<16x128xf32, #pto.address_space>) + outs(%view : !pto.tile_buf) + return + } + + func.func @tload_waw_nested_subview_requires_mte2_barrier( + %src0: memref<16x64xf32, #pto.address_space>, + %src1: memref<16x64xf32, #pto.address_space>) { + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %tile = pto.alloc_tile addr = %c0_i64 : + !pto.tile_buf + %parent = pto.subview %tile[%c0, %c0] sizes [16, 64] : + !pto.tile_buf + -> !pto.tile_buf + %child = pto.subview %parent[%c0, %c0] sizes [16, 64] : + !pto.tile_buf + -> !pto.tile_buf + + pto.tload ins(%src0 : memref<16x64xf32, #pto.address_space>) + outs(%parent : !pto.tile_buf) + pto.tload ins(%src1 : memref<16x64xf32, #pto.address_space>) + outs(%child : !pto.tile_buf) + return + } +} + +// CHECK-LABEL: AICORE void tload_waw_same_tile_requires_mte2_barrier( +// CHECK: TLOAD( +// CHECK-NEXT: pipe_barrier(PIPE_MTE2); +// CHECK-NEXT: TLOAD( + +// CHECK-LABEL: AICORE void tload_waw_same_subview_requires_mte2_barrier( +// CHECK: TLOAD( +// CHECK-NEXT: pipe_barrier(PIPE_MTE2); +// CHECK-NEXT: TLOAD( + +// CHECK-LABEL: AICORE void tload_waw_nested_subview_requires_mte2_barrier( +// CHECK: TLOAD( +// CHECK-NEXT: pipe_barrier(PIPE_MTE2); +// CHECK-NEXT: TLOAD( diff --git a/test/samples/Sync/test_if_else_tile_result.pto b/test/samples/Sync/test_if_else_tile_result.pto index 04a0506251..b496beb527 100644 --- a/test/samples/Sync/test_if_else_tile_result.pto +++ b/test/samples/Sync/test_if_else_tile_result.pto @@ -1,5 +1,5 @@ module attributes {"pto.device-spec" = "Ascend910B1"} { - func.func @test_if_else_tile_result(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: i32, %arg3: !pto.ptr) { + func.func @test_if_else_tile_result(%arg0: !pto.ptr, %arg1: !pto.ptr, %arg2: i32, %arg3: !pto.ptr) attributes {pto.entry} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c4 = arith.constant 4 : index