From f96297fd258d747e304de757df6a395704650fa4 Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Sat, 27 Jun 2026 14:52:56 +0800
Subject: [PATCH 1/9] Fix TPut release fence before TNotify

---
 include/PTO/IR/PTOAttrs.td                    |  15 +
 include/PTO/IR/PTOOps.td                      |  46 ++
 .../PTO/Transforms/MemoryConsistencyAttrs.h   |  35 ++
 include/PTO/Transforms/Passes.h               |   1 +
 include/PTO/Transforms/Passes.td              |  19 +
 lib/PTO/Transforms/CMakeLists.txt             |   1 +
 lib/PTO/Transforms/PTOMemoryConsistency.cpp   | 493 ++++++++++++++++++
 lib/PTO/Transforms/PTOToEmitC.cpp             | 234 ++++-----
 test/lit/pto/issue711_tnotify_mte_drain.pto   |  95 ++++
 .../lit/pto/issue872_tput_tnotify_release.pto | 169 ++++++
 test/lit/pto/memory_consistency_invalid.pto   |  88 ++++
 .../pto/signal_payload_cache_consistency.pto  | 197 +++++++
 tools/ptoas/ptoas.cpp                         |   1 +
 13 files changed, 1259 insertions(+), 135 deletions(-)
 create mode 100644 include/PTO/Transforms/MemoryConsistencyAttrs.h
 create mode 100644 lib/PTO/Transforms/PTOMemoryConsistency.cpp
 create mode 100644 test/lit/pto/issue872_tput_tnotify_release.pto
 create mode 100644 test/lit/pto/memory_consistency_invalid.pto
 create mode 100644 test/lit/pto/signal_payload_cache_consistency.pto

diff --git a/include/PTO/IR/PTOAttrs.td b/include/PTO/IR/PTOAttrs.td
index 0675f71085..85ec3b6a0f 100644
--- a/include/PTO/IR/PTOAttrs.td
+++ b/include/PTO/IR/PTOAttrs.td
@@ -302,6 +302,21 @@ def PTO_MemBarAttr : PTO_Attr<"MemBar", "membar"> {
   }];
 }
 
+def PTO_FENCE_SCOPE_DDR : I32EnumAttrCase<"DDR", 0, "ddr">;
+// Enum of fence scopes; DDR is the only case defined here.
+def PTO_FenceScopeEnum : PTO_I32Enum<
+  "FenceScope", "PTO memory fence scope", [
+    PTO_FENCE_SCOPE_DDR
+  ]>;
+// Attribute carrying a FenceScope, e.g. `#pto.fence_scope<ddr>`.
+def PTO_FenceScopeAttr : PTO_Attr<"FenceScope", "fence_scope"> {
+  let parameters = (ins EnumParameter<PTO_FenceScopeEnum>:$scope);
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    Memory visibility scope for PTO fence operations.
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Sync Op Type (High Level Abstraction)
 //===----------------------------------------------------------------------===//
diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td
index 9fb753b04e..d662bc0ddd 100644
--- a/include/PTO/IR/PTOOps.td
+++ b/include/PTO/IR/PTOOps.td
@@ -2676,6 +2676,52 @@ def BarrierOp : PTO_Op<"barrier"> {
   let assemblyFormat = "$pipe attr-dict";
 }
 
+def CmoCleanOp : PTO_Op<"cmo.clean"> {
+  let summary = "Clean cache lines for a memory space";
+  let description = [{
+    Cache maintenance operation that writes dirty cache lines back to the
+    specified memory space. The first version supports whole-cache GM clean and
+    lowers to `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT)`.
+  }];
+  // Written as: pto.cmo.clean all #pto.address_space<gm>
+  let arguments = (ins PTO_AddressSpaceAttr:$space);
+  let assemblyFormat = "`all` $space attr-dict";
+}
+// Whole-cache invalidate; written as pto.cmo.invalidate all #pto.address_space<gm>.
+def CmoInvalidateOp : PTO_Op<"cmo.invalidate"> {
+  let summary = "Invalidate cache lines for a memory space";
+  let description = [{
+    Cache maintenance operation that invalidates cache lines for the specified
+    memory space. The first version supports whole-cache GM invalidate and
+    lowers to `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE)`.
+  }];
+
+  let arguments = (ins PTO_AddressSpaceAttr:$space);
+  let assemblyFormat = "`all` $space attr-dict";
+}
+// Release-side fence; written as pto.fence.release #pto.fence_scope<ddr>.
+def FenceReleaseOp : PTO_Op<"fence.release"> {
+  let summary = "Release memory fence";
+  let description = [{
+    Release fence for publishing payload writes before a following signal
+    operation. `scope = ddr` lowers to `dsb(DSB_DDR)`.
+  }];
+
+  let arguments = (ins PTO_FenceScopeAttr:$scope);
+  let assemblyFormat = "$scope attr-dict";
+}
+// Acquire-side fence; takes the same scope attribute as fence.release.
+def FenceAcquireOp : PTO_Op<"fence.acquire"> {
+  let summary = "Acquire memory fence";
+  let description = [{
+    Acquire fence for ordering signal observation before following payload
+    reads. `scope = ddr` lowers to `dsb(DSB_DDR)`.
+  }];
+
+  let arguments = (ins PTO_FenceScopeAttr:$scope);
+  let assemblyFormat = "$scope attr-dict";
+}
+
 def TSyncOp : PTO_TOp<"tsync"> {
   let summary = "Direct TSYNC mapping (variadic operands).";
   let description = [{
diff --git a/include/PTO/Transforms/MemoryConsistencyAttrs.h b/include/PTO/Transforms/MemoryConsistencyAttrs.h
new file mode 100644
index 0000000000..74279c5614
--- /dev/null
+++ b/include/PTO/Transforms/MemoryConsistencyAttrs.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#ifndef MLIR_DIALECT_PTO_TRANSFORMS_MEMORYCONSISTENCYATTRS_H
+#define MLIR_DIALECT_PTO_TRANSFORMS_MEMORYCONSISTENCYATTRS_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace mlir {
+namespace pto {
+// Attribute names shared by the memory-consistency pass and EmitC lowering.
+inline constexpr llvm::StringLiteral kTNotifyDrainMte2AttrName =
+    "__pto.emitc.tnotify_drain_mte2";
+inline constexpr llvm::StringLiteral kTNotifyDrainMte3AttrName =
+    "__pto.emitc.tnotify_drain_mte3";
+inline constexpr llvm::StringLiteral kTNotifyDsbDdrAttrName =
+    "__pto.emitc.tnotify_dsb_ddr";
+inline constexpr llvm::StringLiteral kTNotifyCleanGmCacheAttrName =
+    "__pto.emitc.tnotify_clean_gm_cache";
+inline constexpr llvm::StringLiteral kAcquireCleanGmCacheAttrName =
+    "__pto.emitc.acquire_clean_gm_cache";
+inline constexpr llvm::StringLiteral kAcquireDsbDdrAttrName =
+    "__pto.emitc.acquire_dsb_ddr";
+inline constexpr llvm::StringLiteral kAcquireInvalidateGmCacheAttrName =
+    "__pto.emitc.acquire_invalidate_gm_cache";
+
+} // namespace pto
+} // namespace mlir
+
+#endif // MLIR_DIALECT_PTO_TRANSFORMS_MEMORYCONSISTENCYATTRS_H
diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h
index 85970756c5..a9eec25035 100644
--- a/include/PTO/Transforms/Passes.h
+++ b/include/PTO/Transforms/Passes.h
@@ -75,6 +75,7 @@ std::unique_ptr<Pass> createPTORemoveRedundantBarrierPass();
 std::unique_ptr<Pass> createPTOViewToMemrefPass();
 std::unique_ptr<Pass> createPTOValidateIntToPtrUsesPass();
 std::unique_ptr<Pass> createPTOMaterializeTileHandlesPass();
+std::unique_ptr<Pass> createPTOMemoryConsistencyPass();
 std::unique_ptr<Pass> createInferPTOLayoutPass();
 std::unique_ptr<Pass> createPTOA5NormalizeTMovPass();
 std::unique_ptr<Pass> createPreFusionAnalysisPass();
diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td
index bcc165674a..e895b06c20 100644
--- a/include/PTO/Transforms/Passes.td
+++ b/include/PTO/Transforms/Passes.td
@@ -667,6 +667,25 @@ def PTOMaterializeTileHandles : Pass<"pto-materialize-tile-handles", "ModuleOp">
   ];
 }
 
+def PTOMemoryConsistency : Pass<"pto-memory-consistency", "ModuleOp"> {
+  let summary = "Annotate PTO memory consistency actions before backend lowering";
+  let description = [{
+    Checks signal/payload ordering around TNotify and TWait/TTest. TNotify is
+    annotated with the MTE2 drain it still needs; a missing explicit MTE3
+    barrier, GM cache clean, or DDR release fence before it is an error, as is
+    a missing GM cache invalidate before a scalar GM load after TWait/TTest.
+    Sources: direct MTE ops, macro-op MTE3 phases, and scalar GM stores.
+  }];
+
+  let constructor = "mlir::pto::createPTOMemoryConsistencyPass()";
+
+  let dependentDialects = [
+    "mlir::pto::PTODialect",
+    "mlir::func::FuncDialect",
+    "mlir::scf::SCFDialect"
+  ];
+}
+
 def PTOUnrollSIMTFor : Pass<"pto-unroll-simt-for", "func::FuncOp"> {
   let summary =
       "Unroll small constant-trip-count scf.for loops in pto.simt_entry functions";
diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt
index a7059674df..1e3f8d74dc 100644
--- a/lib/PTO/Transforms/CMakeLists.txt
+++ b/lib/PTO/Transforms/CMakeLists.txt
@@ -58,6 +58,7 @@ add_mlir_dialect_library(PTOTransforms
   PTOA5NormalizeTMovPass.cpp
   PTOCanonicalizeIR.cpp
   PTOMaterializeTileHandles.cpp
+  PTOMemoryConsistency.cpp
   BufferizableOpInterfaceImpl.cpp
   ConvertToPTOOp.cpp
   PTOAssignDefaultFrontendPipeIdPass.cpp
diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
new file mode 100644
index 0000000000..10854e4ea0
--- /dev/null
+++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
@@ -0,0 +1,493 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+#include "PTO/IR/PTO.h"
+#include "PTO/Transforms/InsertSync/SyncMacroModel.h"
+#include "PTO/Transforms/MemoryConsistencyAttrs.h"
+#include "PTO/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace pto {
+#define GEN_PASS_DEF_PTOMEMORYCONSISTENCY
+#include "PTO/Transforms/Passes.h.inc"
+} // namespace pto
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::pto;
+
+namespace {
+// Returns true for the address spaces this pass treats as GM: GM and Zero.
+static bool isGmAddressSpace(pto::AddressSpace space) {
+  return space == pto::AddressSpace::GM || space == pto::AddressSpace::Zero;
+}
+// Release-side work still outstanding before a TNotify may publish a signal.
+struct TNotifyReleaseState {
+  bool drainMte2 = false;    // PIPE_MTE2 work seen, not yet covered by a barrier
+  bool drainMte3 = false;    // PIPE_MTE3 work seen, not yet covered by a barrier
+  bool cleanGmCache = false; // scalar GM store seen, GM cache not yet cleaned
+  bool needsDsbDdr = false;  // DDR release fence still owed
+
+  void merge(const TNotifyReleaseState &other) { // flag-wise OR
+    drainMte2 |= other.drainMte2;
+    drainMte3 |= other.drainMte3;
+    cleanGmCache |= other.cleanGmCache;
+    needsDsbDdr |= other.needsDsbDdr;
+  }
+
+  void clear() { // forget every pending requirement
+    drainMte2 = false;
+    drainMte3 = false;
+    cleanGmCache = false;
+    needsDsbDdr = false;
+  }
+
+  void applyBarrier(pto::PIPE pipe) { // retire the drains this barrier covers
+    switch (pipe) {
+    case pto::PIPE::PIPE_MTE2:
+      drainMte2 = false;
+      break;
+    case pto::PIPE::PIPE_MTE3:
+      drainMte3 = false;
+      break;
+    case pto::PIPE::PIPE_ALL:
+      drainMte2 = false;
+      drainMte3 = false;
+      break;
+    default: // barriers on other pipes retire nothing
+      break;
+    }
+  }
+
+  void applyCmoClean(pto::AddressSpace space) { // a GM clean retires the flag
+    if (isGmAddressSpace(space))
+      cleanGmCache = false;
+  }
+
+  void applyFenceRelease(pto::FenceScope scope) {
+    if (scope != pto::FenceScope::DDR)
+      return;
+    if (drainMte3 || cleanGmCache) // fence is too early while these are pending
+      return;
+    needsDsbDdr = false;
+  }
+};
+// Acquire-side cache state tracked from TWait/TTest to later scalar GM loads.
+struct SignalAcquireState {
+  bool pendingInvalidateGmCache = false; // TWait/TTest seen, no invalidate yet
+  bool dirtyGmCache = false;             // scalar GM store seen, no clean yet
+  bool cleanNeedsFence = false;          // dirty cache cleaned, no fence yet
+
+  void merge(const SignalAcquireState &other) { // flag-wise OR
+    pendingInvalidateGmCache |= other.pendingInvalidateGmCache;
+    dirtyGmCache |= other.dirtyGmCache;
+    cleanNeedsFence |= other.cleanNeedsFence;
+  }
+
+  void consumeAcquire() { // reset all three flags
+    pendingInvalidateGmCache = false;
+    dirtyGmCache = false;
+    cleanNeedsFence = false;
+  }
+
+  void applyCmoClean(pto::AddressSpace space) {
+    if (!isGmAddressSpace(space))
+      return;
+    if (dirtyGmCache) // cleaning dirty lines must be followed by a fence
+      cleanNeedsFence = true;
+    dirtyGmCache = false;
+  }
+
+  void applyFenceRelease(pto::FenceScope scope) {
+    if (scope == pto::FenceScope::DDR && !dirtyGmCache) // no effect while dirty
+      cleanNeedsFence = false;
+  }
+
+  void applyCmoInvalidate(pto::AddressSpace space) {
+    if (!isGmAddressSpace(space) || dirtyGmCache || cleanNeedsFence)
+      return; // invalidate counts only for GM with the cache clean and fenced
+    pendingInvalidateGmCache = false;
+  }
+};
+// Reports whether `type` is a pointer or memref that this pass treats as GM.
+static bool isGmScalarMemory(Type type) {
+  if (auto pointerType = dyn_cast<pto::PtrType>(type))
+    return isGmAddressSpace(pointerType.getMemorySpace().getAddressSpace());
+
+  auto memrefType = dyn_cast<MemRefType>(type);
+  if (!memrefType)
+    return false;
+
+  // A memref without a PTO address-space attribute is handled as GM.
+  auto ptoSpace =
+      dyn_cast_or_null<pto::AddressSpaceAttr>(memrefType.getMemorySpace());
+  return !ptoSpace || isGmAddressSpace(ptoSpace.getAddressSpace());
+}
+// Translates the pipe an op runs on into the release work owed to TNotify:
+//   PIPE_MTE2 -> MTE2 drain
+//   PIPE_MTE3 -> MTE3 drain and DDR fence
+//   PIPE_ALL  -> all of the above; any other pipe -> nothing
+static TNotifyReleaseState getReleaseStateForPipe(pto::PIPE pipe) {
+  const bool coversAll = pipe == pto::PIPE::PIPE_ALL;
+  const bool usesMte2 = coversAll || pipe == pto::PIPE::PIPE_MTE2;
+  const bool usesMte3 = coversAll || pipe == pto::PIPE::PIPE_MTE3;
+
+  TNotifyReleaseState pending;
+
+  // MTE2 work only has to be drained.
+  pending.drainMte2 = usesMte2;
+
+  // MTE3 work has to be drained and then made visible with a DDR fence.
+  pending.drainMte3 = usesMte3;
+  pending.needsDsbDdr = usesMte3;
+
+  // cleanGmCache is never implied by a pipe and keeps its default.
+  return pending;
+}
+// Release state derived from the op's sync macro model; empty if it has none.
+static TNotifyReleaseState getReleaseStateForMacroModel(Operation *op) {
+  TNotifyReleaseState state;
+  auto model = getSyncMacroModel(op);
+  if (!model)
+    return state;
+
+  for (const SyncMacroPhase &phase : model->phases) {
+    // Macro MTE3 phases write GM payloads internally. A following TNotify must
+    // publish its signal only after those stores are drained and DDR-visible.
+    if (phase.pipe == PipelineType::PIPE_MTE3) {
+      state.drainMte3 = true;
+      state.needsDsbDdr = true;
+    }
+  }
+  return state;
+}
+// Release work contributed by `op` itself; nested regions are not inspected.
+static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) {
+  if (isa<pto::BarrierOp, pto::CmoCleanOp, pto::CmoInvalidateOp,
+          pto::FenceReleaseOp, pto::FenceAcquireOp>(op))
+    return {}; // sync/cache-maintenance ops add no pending work
+  // A scalar store to GM owes a cache clean and a DDR fence.
+  if (auto store = dyn_cast<pto::StoreScalarOp>(op)) {
+    if (isGmScalarMemory(store.getPtr().getType())) {
+      TNotifyReleaseState state;
+      state.cleanGmCache = true;
+      state.needsDsbDdr = true;
+      return state;
+    }
+  }
+  // A macro model that reports work wins over the op's declared pipe.
+  TNotifyReleaseState macroState = getReleaseStateForMacroModel(op);
+  if (macroState.drainMte3 || macroState.cleanGmCache ||
+      macroState.needsDsbDdr)
+    return macroState;
+  // Otherwise use the pipe reported through OpPipeInterface, if any.
+  if (auto pipeOp = dyn_cast<pto::OpPipeInterface>(op))
+    return getReleaseStateForPipe(pipeOp.getPipe());
+  return {};
+}
+// Unions the direct release state of `op` and of every op nested inside it.
+// Barriers, cleans and fences are not applied, so this over-approximates.
+static TNotifyReleaseState collectTNotifyReleaseState(Operation *op) {
+  TNotifyReleaseState merged;
+  op->walk([&merged](Operation *visited) {
+    merged.merge(getDirectTNotifyReleaseState(visited));
+  });
+  return merged;
+}
+// True for the scf ops this pass treats as loops.
+static bool isLoopLikeOp(Operation *op) {
+  return isa<scf::ForOp, scf::WhileOp, scf::ParallelOp, scf::ForallOp>(op);
+}
+// Resets the TNotify release attrs, then re-adds drain markers from `state`.
+static void setTNotifyReleaseAttrs(pto::TNotifyOp op,
+                                   const TNotifyReleaseState &state) {
+  op->removeAttr(kTNotifyDrainMte2AttrName);
+  op->removeAttr(kTNotifyDrainMte3AttrName);
+  op->removeAttr(kTNotifyDsbDdrAttrName);       // cleared, not re-added here
+  op->removeAttr(kTNotifyCleanGmCacheAttrName); // cleared, not re-added here
+  if (state.drainMte2)
+    op->setAttr(kTNotifyDrainMte2AttrName, UnitAttr::get(op.getContext()));
+  if (state.drainMte3)
+    op->setAttr(kTNotifyDrainMte3AttrName, UnitAttr::get(op.getContext()));
+}
+// Attaches only the MTE2 drain marker; all other flags are dropped here.
+static void setTNotifyPipeDrainAttrs(pto::TNotifyOp op,
+                                     const TNotifyReleaseState &state) {
+  TNotifyReleaseState emitState;
+  emitState.drainMte2 = state.drainMte2;
+  setTNotifyReleaseAttrs(op, emitState);
+}
+// Reports the first unmet release requirement on `op` and sets `hasFailure`.
+static void diagnoseTNotifyRelease(pto::TNotifyOp op,
+                                   const TNotifyReleaseState &state,
+                                   bool &hasFailure) {
+  if (state.cleanGmCache) {
+    op.emitOpError()
+        << "requires explicit `pto.cmo.clean all #pto.address_space<gm>` "
+           "before publishing a signal after cacheable GM stores";
+    hasFailure = true;
+    return;
+  }
+  if (state.drainMte3) {
+    op.emitOpError()
+        << "requires an explicit `pto.barrier <PIPE_MTE3>` before "
+           "`pto.fence.release #pto.fence_scope<ddr>` when publishing a "
+           "signal after MTE3 GM writes";
+    hasFailure = true;
+    return;
+  }
+  if (state.needsDsbDdr) {
+    op.emitOpError()
+        << "requires explicit `pto.fence.release #pto.fence_scope<ddr>` "
+           "before publishing a signal after GM writes or cache clean";
+    hasFailure = true;
+  }
+}
+// Diagnoses and annotates every TNotify nested under `op` with one state.
+static void markNestedTNotifyWithState(Operation *op,
+                                       const TNotifyReleaseState &state,
+                                       bool &hasFailure) {
+  op->walk([&](pto::TNotifyOp notify) {
+    diagnoseTNotifyRelease(notify, state, hasFailure);
+    setTNotifyPipeDrainAttrs(notify, state);
+  });
+}
+// Walks `block` in order, tracking release work pending before each TNotify.
+static TNotifyReleaseState
+annotateTNotifyReleaseForBlock(Block &block,
+                               TNotifyReleaseState entryPendingState,
+                               TNotifyReleaseState loopCarriedState,
+                               bool &hasFailure) {
+  TNotifyReleaseState pendingState = entryPendingState;
+  for (Operation &op : block) {
+    if (auto notify = dyn_cast<pto::TNotifyOp>(op)) {
+      TNotifyReleaseState notifyState = pendingState;
+      notifyState.merge(loopCarriedState);
+      diagnoseTNotifyRelease(notify, notifyState, hasFailure);
+      setTNotifyPipeDrainAttrs(notify, notifyState);
+      pendingState.clear(); // the next notify starts from a clean slate
+    }
+    // Work issued by this op becomes pending for the notifies after it.
+    pendingState.merge(getDirectTNotifyReleaseState(&op));
+    // Each region starts from the state at the op; exit states are merged.
+    TNotifyReleaseState regionEntryState = pendingState;
+    TNotifyReleaseState combinedRegionExitState;
+    for (Region &region : op.getRegions()) {
+      TNotifyReleaseState nestedLoopCarriedState = loopCarriedState;
+      if (isLoopLikeOp(&op)) // whole-loop summary; barriers are not applied
+        nestedLoopCarriedState.merge(collectTNotifyReleaseState(&op));
+
+      if (region.hasOneBlock()) {
+        combinedRegionExitState.merge(annotateTNotifyReleaseForBlock(
+            region.front(), regionEntryState, nestedLoopCarriedState,
+            hasFailure));
+      } else { // multi-block region: assume any work in `op` may precede
+        TNotifyReleaseState regionState = collectTNotifyReleaseState(&op);
+        TNotifyReleaseState nestedNotifyState = regionEntryState;
+        nestedNotifyState.merge(nestedLoopCarriedState);
+        nestedNotifyState.merge(regionState);
+        markNestedTNotifyWithState(&op, nestedNotifyState, hasFailure);
+        // NOTE(review): the walk above visits every region of `op`.
+        TNotifyReleaseState regionExitState = regionEntryState;
+        regionExitState.merge(regionState);
+        combinedRegionExitState.merge(regionExitState);
+      }
+    }
+    pendingState.merge(combinedRegionExitState);
+    // Explicit barrier / clean / fence ops retire the work they cover.
+    if (auto barrier = dyn_cast<pto::BarrierOp>(op))
+      pendingState.applyBarrier(barrier.getPipe().getPipe());
+    if (auto cmo = dyn_cast<pto::CmoCleanOp>(op))
+      pendingState.applyCmoClean(cmo.getSpace().getAddressSpace());
+    if (auto fence = dyn_cast<pto::FenceReleaseOp>(op))
+      pendingState.applyFenceRelease(fence.getScope().getScope());
+  }
+  return pendingState;
+}
+// Runs the release check on each function; returns true if an error was hit.
+static bool annotateTNotifyRelease(ModuleOp module) {
+  bool hasFailure = false;
+  for (auto func : module.getOps<func::FuncOp>()) {
+    if (func.getBody().hasOneBlock()) {
+      (void)annotateTNotifyReleaseForBlock(func.getBody().front(),
+                                           TNotifyReleaseState{},
+                                           TNotifyReleaseState{},
+                                           hasFailure);
+      continue;
+    }
+
+    // Be conservative for pre-existing CFG: without a path-sensitive CFG data
+    // flow here, every TNotify may observe any release-relevant work in the
+    // function.
+    TNotifyReleaseState funcState =
+        collectTNotifyReleaseState(func.getOperation());
+    markNestedTNotifyWithState(func.getOperation(), funcState, hasFailure);
+  }
+  return hasFailure;
+}
+// Strips the three acquire attrs from a scalar load.
+static void clearAcquireAttrs(pto::LoadScalarOp op) {
+  op->removeAttr(kAcquireCleanGmCacheAttrName);
+  op->removeAttr(kAcquireDsbDdrAttrName);
+  op->removeAttr(kAcquireInvalidateGmCacheAttrName);
+}
+// Errors on a GM scalar load reached while an acquire invalidate is pending.
+static void diagnoseAcquireLoad(pto::LoadScalarOp op,
+                                const SignalAcquireState &state,
+                                bool &hasFailure) {
+  if (!state.pendingInvalidateGmCache ||
+      !isGmScalarMemory(op.getPtr().getType()))
+    return;
+  if (state.dirtyGmCache) {
+    op.emitOpError()
+        << "requires explicit `pto.cmo.clean all #pto.address_space<gm>`, "
+           "`pto.fence.release #pto.fence_scope<ddr>`, and "
+           "`pto.cmo.invalidate all #pto.address_space<gm>` before a "
+           "cacheable GM load after signal acquire when dirty GM cache may "
+           "exist";
+    hasFailure = true;
+    return;
+  }
+  if (state.cleanNeedsFence) {
+    op.emitOpError()
+        << "requires explicit `pto.fence.release #pto.fence_scope<ddr>` "
+           "after GM cache clean and before acquire invalidate";
+    hasFailure = true;
+    return;
+  }
+  op.emitOpError()
+      << "requires explicit `pto.cmo.invalidate all #pto.address_space<gm>` "
+         "before a cacheable GM load after `pto.comm.twait` or successful "
+         "`pto.comm.ttest`";
+  hasFailure = true;
+}
+// Resets all of `state` when an acquire is pending (presumably to report once).
+static void consumeAcquireAfterDiagnostic(SignalAcquireState &state) {
+  if (state.pendingInvalidateGmCache)
+    state.consumeAcquire();
+}
+// Order-insensitive summary of `op` and of every op nested inside it.
+static SignalAcquireState collectSignalAcquireState(Operation *op) {
+  SignalAcquireState state;
+  if (isa<pto::TWaitOp, pto::TTestOp>(op))
+    state.pendingInvalidateGmCache = true;
+  if (auto store = dyn_cast<pto::StoreScalarOp>(op);
+      store && isGmScalarMemory(store.getPtr().getType()))
+    state.dirtyGmCache = true;
+  if (auto notify = dyn_cast<pto::TNotifyOp>(op);
+      notify && notify->hasAttr(kTNotifyCleanGmCacheAttrName))
+    state.dirtyGmCache = false;
+  if (auto cmo = dyn_cast<pto::CmoCleanOp>(op))
+    state.applyCmoClean(cmo.getSpace().getAddressSpace());
+  if (auto fence = dyn_cast<pto::FenceReleaseOp>(op))
+    state.applyFenceRelease(fence.getScope().getScope());
+  if (auto cmo = dyn_cast<pto::CmoInvalidateOp>(op))
+    state.applyCmoInvalidate(cmo.getSpace().getAddressSpace());
+  // Each nested op is summarized from a fresh state and OR-ed in.
+  for (Region &region : op->getRegions())
+    for (Block &block : region)
+      for (Operation &nested : block)
+        state.merge(collectSignalAcquireState(&nested));
+  return state;
+}
+// Checks nested scalar loads in walk order; only a GM load uses up `state`.
+static void markNestedAcquireLoadsWithState(
+    Operation *op, SignalAcquireState state, bool &hasFailure) {
+  op->walk([&](pto::LoadScalarOp load) {
+    clearAcquireAttrs(load);
+    diagnoseAcquireLoad(load, state, hasFailure);
+    if (isGmScalarMemory(load.getPtr().getType()))
+      consumeAcquireAfterDiagnostic(state);
+  });
+}
+// Walks `block` in order and diagnoses GM scalar loads with a pending acquire.
+static SignalAcquireState
+annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState,
+                              bool &hasFailure) {
+  SignalAcquireState state = entryState;
+  for (Operation &op : block) {
+    if (auto load = dyn_cast<pto::LoadScalarOp>(op)) {
+      clearAcquireAttrs(load);
+      diagnoseAcquireLoad(load, state, hasFailure);
+      // Only a GM load uses up the acquire; a non-GM load must not hide it.
+      if (isGmScalarMemory(load.getPtr().getType()))
+        consumeAcquireAfterDiagnostic(state);
+    }
+    if (auto store = dyn_cast<pto::StoreScalarOp>(op);
+        store && isGmScalarMemory(store.getPtr().getType()))
+      state.dirtyGmCache = true;
+    if (isa<pto::TWaitOp, pto::TTestOp>(op))
+      state.pendingInvalidateGmCache = true;
+    // Explicit cache-maintenance and fence ops retire what they cover.
+    if (auto notify = dyn_cast<pto::TNotifyOp>(op);
+        notify && notify->hasAttr(kTNotifyCleanGmCacheAttrName))
+      state.dirtyGmCache = false;
+    if (auto cmo = dyn_cast<pto::CmoCleanOp>(op))
+      state.applyCmoClean(cmo.getSpace().getAddressSpace());
+    if (auto fence = dyn_cast<pto::FenceReleaseOp>(op))
+      state.applyFenceRelease(fence.getScope().getScope());
+    if (auto cmo = dyn_cast<pto::CmoInvalidateOp>(op))
+      state.applyCmoInvalidate(cmo.getSpace().getAddressSpace());
+    // Nested regions start from the current state; exits are merged back.
+    SignalAcquireState combinedRegionExitState;
+    for (Region &region : op.getRegions()) {
+      if (region.hasOneBlock()) {
+        combinedRegionExitState.merge(
+            annotateSignalAcquireForBlock(region.front(), state, hasFailure));
+      } else { // NOTE(review): in-region TWait/TTest is not in `state` here
+        markNestedAcquireLoadsWithState(&op, state, hasFailure);
+        SignalAcquireState regionState = collectSignalAcquireState(&op);
+        SignalAcquireState regionExitState = state;
+        regionExitState.merge(regionState);
+        combinedRegionExitState.merge(regionExitState);
+      }
+    }
+
+    if (isLoopLikeOp(&op))
+      combinedRegionExitState.merge(state);
+    state.merge(combinedRegionExitState);
+  }
+  return state;
+}
+// Runs the acquire check on each function; returns true if an error was hit.
+static bool annotateSignalAcquire(ModuleOp module) {
+  bool hasFailure = false;
+  for (auto func : module.getOps<func::FuncOp>()) {
+    if (func.getBody().hasOneBlock()) {
+      (void)annotateSignalAcquireForBlock(func.getBody().front(),
+                                          SignalAcquireState{}, hasFailure);
+      continue;
+    }
+    // Multi-block body: check loads against a whole-function summary instead.
+    SignalAcquireState funcState =
+        collectSignalAcquireState(func.getOperation());
+    markNestedAcquireLoadsWithState(func.getOperation(), funcState, hasFailure);
+  }
+  return hasFailure;
+}
+// Module pass that runs both checks and fails if either reported an error.
+struct PTOMemoryConsistencyPass
+    : public mlir::pto::impl::PTOMemoryConsistencyBase<
+          PTOMemoryConsistencyPass> {
+  void runOnOperation() override {
+    ModuleOp module = getOperation();
+    bool releaseFailed = annotateTNotifyRelease(module); // always runs
+    bool acquireFailed = annotateSignalAcquire(module);  // always runs
+    if (releaseFailed || acquireFailed)
+      signalPassFailure();
+  }
+};
+
+} // namespace
+// Factory for the pass; declared in PTO/Transforms/Passes.h.
+std::unique_ptr<Pass> mlir::pto::createPTOMemoryConsistencyPass() {
+  return std::make_unique<PTOMemoryConsistencyPass>();
+}
diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp
index 963b01c89c..0b23317ee4 100644
--- a/lib/PTO/Transforms/PTOToEmitC.cpp
+++ b/lib/PTO/Transforms/PTOToEmitC.cpp
@@ -18,6 +18,7 @@
 #include "PTO/IR/PTO.h"
 #include "PTO/IR/PTOTypeUtils.h"
 #include "PTO/IR/PTOSyncUtils.h"
+#include "PTO/Transforms/MemoryConsistencyAttrs.h"
 #include "PTO/Transforms/Passes.h"
 
 #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
@@ -131,15 +132,6 @@ static constexpr llvm::StringLiteral kForceDynamicValidShapeAttrName =
     "__pto.force_dynamic_valid_shape";
 static constexpr llvm::StringLiteral kGlobalTensorStridesAttrName =
     "__pto.globaltensor_strides";
-static constexpr llvm::StringLiteral kTNotifyDrainMte2AttrName =
-    "__pto.emitc.tnotify_drain_mte2";
-static constexpr llvm::StringLiteral kTNotifyDrainMte3AttrName =
-    "__pto.emitc.tnotify_drain_mte3";
-
-enum TNotifyMteDrainMask : unsigned {
-  kDrainMte2 = 1U << 0,
-  kDrainMte3 = 1U << 1,
-};
 static constexpr llvm::StringLiteral kLastUseAttrName = "pto.last_use";
 static constexpr llvm::StringLiteral kLastUseMarkerPrefix = "PTOAS__LAST_USE__";
 
@@ -272,104 +264,6 @@ static Value peelUnrealized(Value v) {
   return v;
 }
 
-static unsigned getMteDrainMaskForPipe(pto::PIPE pipe) {
-  switch (pipe) {
-  case pto::PIPE::PIPE_MTE2:
-    return kDrainMte2;
-  case pto::PIPE::PIPE_MTE3:
-    return kDrainMte3;
-  case pto::PIPE::PIPE_ALL:
-    return kDrainMte2 | kDrainMte3;
-  default:
-    return 0;
-  }
-}
-
-static unsigned getDirectMteDrainMask(Operation *op) {
-  if (auto pipeOp = dyn_cast<pto::OpPipeInterface>(op))
-    return getMteDrainMaskForPipe(pipeOp.getPipe());
-  return 0;
-}
-
-static unsigned collectMteDrainMask(Operation *op) {
-  unsigned mask = getDirectMteDrainMask(op);
-  for (Region &region : op->getRegions())
-    for (Block &block : region)
-      for (Operation &nested : block)
-        mask |= collectMteDrainMask(&nested);
-  return mask;
-}
-
-static bool isLoopLikeOp(Operation *op) {
-  return isa<scf::ForOp, scf::WhileOp, scf::ParallelOp, scf::ForallOp>(op);
-}
-
-static void setTNotifyDrainAttrs(pto::TNotifyOp op, unsigned mask) {
-  op->removeAttr(kTNotifyDrainMte2AttrName);
-  op->removeAttr(kTNotifyDrainMte3AttrName);
-  if (mask & kDrainMte2)
-    op->setAttr(kTNotifyDrainMte2AttrName, UnitAttr::get(op.getContext()));
-  if (mask & kDrainMte3)
-    op->setAttr(kTNotifyDrainMte3AttrName, UnitAttr::get(op.getContext()));
-}
-
-static void markNestedTNotifyWithMask(Operation *op, unsigned mask) {
-  op->walk([&](pto::TNotifyOp notify) { setTNotifyDrainAttrs(notify, mask); });
-}
-
-static unsigned annotateTNotifyMteDrainForBlock(Block &block,
-                                                unsigned entryPendingMask,
-                                                unsigned loopCarriedMask) {
-  unsigned pendingMask = entryPendingMask;
-  for (Operation &op : block) {
-    if (auto notify = dyn_cast<pto::TNotifyOp>(op)) {
-      setTNotifyDrainAttrs(notify, pendingMask | loopCarriedMask);
-      pendingMask = 0;
-    }
-
-    pendingMask |= getDirectMteDrainMask(&op);
-
-    unsigned regionEntryMask = pendingMask;
-    unsigned combinedRegionExitMask = 0;
-    for (Region &region : op.getRegions()) {
-      unsigned nestedLoopCarriedMask = loopCarriedMask;
-      if (isLoopLikeOp(&op))
-        nestedLoopCarriedMask |= collectMteDrainMask(&op);
-
-      if (region.hasOneBlock()) {
-        combinedRegionExitMask |= annotateTNotifyMteDrainForBlock(
-            region.front(), regionEntryMask, nestedLoopCarriedMask);
-      } else {
-        unsigned regionMask = collectMteDrainMask(&op);
-        markNestedTNotifyWithMask(&op, regionEntryMask | nestedLoopCarriedMask |
-                                           regionMask);
-        combinedRegionExitMask |= regionEntryMask | regionMask;
-      }
-    }
-    pendingMask |= combinedRegionExitMask;
-
-    if (auto barrier = dyn_cast<pto::BarrierOp>(op))
-      pendingMask &= ~getMteDrainMaskForPipe(barrier.getPipe().getPipe());
-  }
-  return pendingMask;
-}
-
-static void annotateTNotifyMteDrain(ModuleOp module) {
-  for (auto func : module.getOps<func::FuncOp>()) {
-    if (func.getBody().hasOneBlock()) {
-      (void)annotateTNotifyMteDrainForBlock(func.getBody().front(),
-                                            /*entryPendingMask=*/0,
-                                            /*loopCarriedMask=*/0);
-      continue;
-    }
-
-    // Be conservative for pre-existing CFG: without a path-sensitive CFG data
-    // flow here, every TNotify may observe any MTE work in the function.
-    unsigned funcMask = collectMteDrainMask(func.getOperation());
-    markNestedTNotifyWithMask(func.getOperation(), funcMask);
-  }
-}
-
 static Value buildGlobalTensorFromMemref(ConversionPatternRewriter &rewriter,
                                          Location loc, Value basePtr,
                                          MemRefType mrTy, Operation *anchor,
@@ -5405,6 +5299,13 @@ static std::string getAutoSyncTailModeToken(Operation *op) {
 //===----------------------------------------------------------------------===//
 // pto.barrier lowering -> pipe_barrier(...)
 //===----------------------------------------------------------------------===//
+static void emitDsbDdr(ConversionPatternRewriter &rewriter, Location loc) {
+  auto *ctx = rewriter.getContext(); // builds the opaque call dsb(DSB_DDR)
+  auto args = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "DSB_DDR")});
+  rewriter.create<emitc::CallOpaqueOp>(loc, TypeRange{}, "dsb", args,
+                                       ArrayAttr{}, ValueRange{});
+}
+
 struct PTOBarrierToEmitC : public OpConversionPattern<pto::BarrierOp> {
   using OpConversionPattern<pto::BarrierOp>::OpConversionPattern;
 
@@ -5446,6 +5347,22 @@ struct PTOBarrierToEmitC : public OpConversionPattern<pto::BarrierOp> {
   }
 };
 
+template <typename FenceOp>
+struct PTOFenceToEmitC : public OpConversionPattern<FenceOp> {
+  using OpConversionPattern<FenceOp>::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(FenceOp op, typename FenceOp::Adaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    (void)adaptor;
+    if (op.getScope().getScope() != pto::FenceScope::DDR)
+      return rewriter.notifyMatchFailure(op, "unsupported fence scope");
+
+    emitDsbDdr(rewriter, op.getLoc());
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Sync lowering (robust for bracket form pto.set_flag[...] / pto.wait_flag[...])
 // Replace your PTOSyncToRuntimeCall with the code below.
@@ -6589,6 +6506,62 @@ struct PTOTAssignToEmitC : public OpConversionPattern<pto::TAssignOp> {
 // pto.load_scalar / pto.store_scalar lowering -> ptr[offset]
 //===----------------------------------------------------------------------===//
 
+static void emitCleanGmCache(ConversionPatternRewriter &rewriter,
+                             Location loc) {
+  auto *ctx = rewriter.getContext();
+  auto args = rewriter.getArrayAttr({
+      emitc::OpaqueAttr::get(ctx, "(__gm__ void*)0"),
+      emitc::OpaqueAttr::get(ctx, "ENTIRE_DATA_CACHE"),
+      emitc::OpaqueAttr::get(ctx, "CACHELINE_OUT"),
+  });
+  rewriter.create<emitc::CallOpaqueOp>(loc, TypeRange{}, "dcci", args,
+                                       ArrayAttr{}, ValueRange{});
+}
+
+static void emitInvalidateGmCache(ConversionPatternRewriter &rewriter,
+                                  Location loc) {
+  auto *ctx = rewriter.getContext();
+  auto args = rewriter.getArrayAttr({
+      emitc::OpaqueAttr::get(ctx, "(__gm__ void*)0"),
+      emitc::OpaqueAttr::get(ctx, "ENTIRE_DATA_CACHE"),
+  });
+  rewriter.create<emitc::CallOpaqueOp>(loc, TypeRange{}, "dcci", args,
+                                       ArrayAttr{}, ValueRange{});
+}
+
+static bool isGmCmoSpace(pto::AddressSpace space) {
+  return space == pto::AddressSpace::GM || space == pto::AddressSpace::Zero;
+}
+
+struct PTOCmoCleanToEmitC : public OpConversionPattern<pto::CmoCleanOp> {
+  using OpConversionPattern<pto::CmoCleanOp>::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(pto::CmoCleanOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    (void)adaptor;
+    if (!isGmCmoSpace(op.getSpace().getAddressSpace()))
+      return rewriter.notifyMatchFailure(op, "unsupported CMO clean space");
+    emitCleanGmCache(rewriter, op.getLoc());
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+struct PTOCmoInvalidateToEmitC
+    : public OpConversionPattern<pto::CmoInvalidateOp> {
+  using OpConversionPattern<pto::CmoInvalidateOp>::OpConversionPattern;
+
+  LogicalResult matchAndRewrite(pto::CmoInvalidateOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &rewriter) const override {
+    (void)adaptor;
+    if (!isGmCmoSpace(op.getSpace().getAddressSpace()))
+      return rewriter.notifyMatchFailure(op, "unsupported CMO invalidate space");
+    emitInvalidateGmCache(rewriter, op.getLoc());
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 static Type getPointerLikeElementType(Type type) {
   if (auto ptrTy = dyn_cast<pto::PtrType>(type))
     return ptrTy.getElementType();
@@ -7070,28 +7043,20 @@ static void emitPipeBarrier(ConversionPatternRewriter &rewriter, Location loc,
                                        ArrayAttr{}, ValueRange{});
 }
 
-static void emitDsbDdr(ConversionPatternRewriter &rewriter, Location loc) {
-  auto *ctx = rewriter.getContext();
-  auto args = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "DSB_DDR")});
-  rewriter.create<emitc::CallOpaqueOp>(loc, TypeRange{}, "dsb", args,
-                                       ArrayAttr{}, ValueRange{});
-}
-
 // Issue #711: TNOTIFY writes its signal on the scalar pipe, and
 // TNOTIFY_IMPL's trailing pipe_barrier(PIPE_ALL) runs *after* that store.
-// If prior pto.tload / pto.tstore work is still in flight on an MTE pipe when
-// the signal lands, the receiver's matching TWAIT can return before the data
-// is visible. Emit only the MTE pipe drains that the pre-lowering analysis
-// proved may be needed before this TNotify. Issue #744: prior MTE3 stores also
-// need a DDR-domain release fence before publishing the notification signal.
-static void emitTNotifyMteDrain(ConversionPatternRewriter &rewriter,
-                                Location loc, unsigned mask) {
-  if (mask & kDrainMte2)
+// If prior MTE work is still in flight when the signal lands, the receiver's
+// matching TWAIT can return before the producer-side payload operation is
+// complete. MemoryConsistency now validates explicit CMO/fence operations for
+// DDR visibility; lowering only keeps the pipe-drain actions that the pass may
+// still annotate automatically.
+static void emitTNotifyReleaseActions(ConversionPatternRewriter &rewriter,
+                                      Location loc, bool drainMte2,
+                                      bool drainMte3) {
+  if (drainMte2)
     emitPipeBarrier(rewriter, loc, "PIPE_MTE2");
-  if (mask & kDrainMte3) {
+  if (drainMte3)
     emitPipeBarrier(rewriter, loc, "PIPE_MTE3");
-    emitDsbDdr(rewriter, loc);
-  }
 }
 
 static std::string waitCmpTok(pto::WaitCmp cmp) {
@@ -7348,14 +7313,11 @@ struct PTOSignalCommToEmitC : public OpConversionPattern<SignalOp> {
           rewriter, op.getLoc(), notifyTy, notifyOpTok(op.getNotifyOp()));
       SmallVector<Value> operands{*signalGT, peelUnrealized(adaptor.getValue()),
                                   notifyOp};
-      // See emitTNotifyMteDrain comment: drain in-flight MTE work before the
+      // See emitTNotifyReleaseActions: drain in-flight MTE work before the
       // scalar-pipe signal store so the notify/wait handshake is honored.
-      unsigned drainMask = 0;
-      if (op->hasAttr(kTNotifyDrainMte2AttrName))
-        drainMask |= kDrainMte2;
-      if (op->hasAttr(kTNotifyDrainMte3AttrName))
-        drainMask |= kDrainMte3;
-      emitTNotifyMteDrain(rewriter, op.getLoc(), drainMask);
+      bool drainMte2 = op->hasAttr(kTNotifyDrainMte2AttrName);
+      bool drainMte3 = op->hasAttr(kTNotifyDrainMte3AttrName);
+      emitTNotifyReleaseActions(rewriter, op.getLoc(), drainMte2, drainMte3);
       rewriter.create<emitc::CallOpaqueOp>(op.getLoc(), TypeRange{}, callee,
                                            ArrayAttr{}, ArrayAttr{}, operands);
       rewriter.eraseOp(op);
@@ -13746,7 +13708,11 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns,
     PTOTGemvMXToTGEMV_MX,
     PTOTGemvMXAccToTGEMV_MX,
     PTOTGemvMXBiasToTGEMV_MX,
-    PTOBarrierToEmitC
+    PTOBarrierToEmitC,
+    PTOFenceToEmitC<pto::FenceReleaseOp>,
+    PTOFenceToEmitC<pto::FenceAcquireOp>,
+    PTOCmoCleanToEmitC,
+    PTOCmoInvalidateToEmitC
   >(typeConverter, ctx);
 
   patterns.add<CallToEmitC, ReturnToEmitC>(typeConverter, ctx);
@@ -14007,8 +13973,6 @@ static AICORE inline void ptoas_auto_sync_tail(
       }
     }
 
-    annotateTNotifyMteDrain(mop);
-
     // 3. 配置转换目标
     ConversionTarget target(*ctx);
 
diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto
index 1985744777..4940f88ea5 100644
--- a/test/lit/pto/issue711_tnotify_mte_drain.pto
+++ b/test/lit/pto/issue711_tnotify_mte_drain.pto
@@ -49,6 +49,9 @@ module {
               outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
     pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
                outs(%dst : !pto.partition_tensor_view<1x32xf32>)
+    pto.barrier <PIPE_MTE2>
+    pto.barrier <PIPE_MTE3>
+    pto.fence.release #pto.fence_scope<ddr>
 
     %sig_view = pto.make_tensor_view %signal_ptr,
       shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
@@ -127,6 +130,86 @@ module {
     return
   }
 
+  // A user/pass-provided MTE3 barrier drains the pending store; the explicit
+  // DDR release fence completes the publish sequence.
+  func.func @tnotify_after_existing_mte3_barrier_and_release(
+      %src_ptr: !pto.ptr<f32>,
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %tile = pto.alloc_tile :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %src = pto.partition_view %src_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+
+    pto.tload ins(%src : !pto.partition_tensor_view<1x32xf32>)
+              outs(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+    pto.barrier <PIPE_MTE2>
+    pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<1x32xf32>)
+    pto.barrier <PIPE_MTE3>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
+  // If the user already writes the complete release sequence, the pass should
+  // not emit a second DDR fence before TNotify.
+  func.func @tnotify_after_existing_mte3_barrier_and_fence(
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %tile = pto.alloc_tile :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+
+    pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<1x32xf32>)
+    pto.barrier <PIPE_MTE3>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
   // tnotify without prior MTE-side work does not need a release drain.
   func.func @tnotify_no_mte_drain(
       %signal_ptr: !pto.ptr<i32>)
@@ -188,6 +271,18 @@ module {
 // CHECK-NOT:  pipe_barrier(
 // CHECK:      pto::comm::TNOTIFY(
 
+// CHECK-LABEL: AICORE void tnotify_after_existing_mte3_barrier_and_release(
+// CHECK:      TSTORE(
+// CHECK:      pipe_barrier(PIPE_MTE3);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
+
+// CHECK-LABEL: AICORE void tnotify_after_existing_mte3_barrier_and_fence(
+// CHECK:      TSTORE(
+// CHECK:      pipe_barrier(PIPE_MTE3);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
+
 // CHECK-LABEL: AICORE void tnotify_no_mte_drain(
 // CHECK-NOT:  pipe_barrier(
 // CHECK:      pto::comm::TNOTIFY(
diff --git a/test/lit/pto/issue872_tput_tnotify_release.pto b/test/lit/pto/issue872_tput_tnotify_release.pto
new file mode 100644
index 0000000000..ecc62b3ecc
--- /dev/null
+++ b/test/lit/pto/issue872_tput_tnotify_release.pto
@@ -0,0 +1,169 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Regression for issue #872: pto.comm.tput is a macro that can issue MTE3 GM
+// stores internally. A following TNotify publishes a cross-rank signal, so those
+// TPUT payload stores must be drained and made DDR-visible before the signal.
+
+// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s
+
+module {
+  func.func @tput_tnotify_release(
+      %src_ptr: !pto.ptr<f32>,
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c32 = arith.constant 32 : index
+    %c64 = arith.constant 64 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view<?x?xf32>
+    %src = pto.partition_view %src_view,
+      offsets = [%c0, %c0], sizes = [%c8, %c64]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c8, %c64]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+
+    %stage = pto.alloc_tile valid_row = %c2 valid_col = %c32 :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=2, cols=32, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.comm.tput(%dst, %src, buf(%stage) :
+      !pto.partition_tensor_view<8x64xf32>,
+      !pto.partition_tensor_view<8x64xf32>,
+      !pto.tile_buf<loc=vec, dtype=f32, rows=2, cols=32, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      {atomicType = #pto<atomic_type atomic_none>}
+    pto.barrier <PIPE_MTE3>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op atomic_add>}
+    return
+  }
+
+  func.func @tput_existing_pipe_all_still_dsb(
+      %src_ptr: !pto.ptr<f32>,
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c8 = arith.constant 8 : index
+    %c32 = arith.constant 32 : index
+    %c64 = arith.constant 64 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view<?x?xf32>
+    %src = pto.partition_view %src_view,
+      offsets = [%c0, %c0], sizes = [%c8, %c64]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c8, %c64]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<8x64xf32>
+
+    %stage = pto.alloc_tile valid_row = %c2 valid_col = %c32 :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=2, cols=32, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+    pto.comm.tput(%dst, %src, buf(%stage) :
+      !pto.partition_tensor_view<8x64xf32>,
+      !pto.partition_tensor_view<8x64xf32>,
+      !pto.tile_buf<loc=vec, dtype=f32, rows=2, cols=32, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+      {atomicType = #pto<atomic_type atomic_none>}
+    pto.barrier <PIPE_ALL>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op atomic_add>}
+    return
+  }
+
+  func.func @tbroadcast_tnotify_release(
+      %src_ptr: !pto.ptr<f32>,
+      %peer_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %ping = pto.alloc_tile :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %src = pto.partition_view %src_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+
+    %peer_view = pto.make_tensor_view %peer_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %peer = pto.partition_view %peer_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+
+    pto.comm.tbroadcast(%src, recv(%ping), group(%peer) :
+        !pto.partition_tensor_view<1x32xf32>,
+        !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+        !pto.partition_tensor_view<1x32xf32>) {root = 0 : i32}
+    pto.barrier <PIPE_MTE3>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op atomic_add>}
+    return
+  }
+}
+
+// CHECK-LABEL: AICORE void tput_tnotify_release(
+// CHECK:      pto::comm::TPUT(
+// CHECK-NOT:  pipe_barrier(PIPE_MTE2);
+// CHECK:      pipe_barrier(PIPE_MTE3);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
+
+// CHECK-LABEL: AICORE void tput_existing_pipe_all_still_dsb(
+// CHECK:      pto::comm::TPUT(
+// CHECK-NEXT: pipe_barrier(PIPE_ALL);
+// CHECK-NOT:  pipe_barrier(PIPE_MTE2);
+// CHECK-NOT:  pipe_barrier(PIPE_MTE3);
+// CHECK:      dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
+
+// CHECK-LABEL: AICORE void tbroadcast_tnotify_release(
+// CHECK:      pto::comm::TBROADCAST(
+// CHECK-NOT:  pipe_barrier(PIPE_MTE2);
+// CHECK:      pipe_barrier(PIPE_MTE3);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
diff --git a/test/lit/pto/memory_consistency_invalid.pto b/test/lit/pto/memory_consistency_invalid.pto
new file mode 100644
index 0000000000..f84652f82d
--- /dev/null
+++ b/test/lit/pto/memory_consistency_invalid.pto
@@ -0,0 +1,88 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s
+
+module {
+  func.func @missing_mte3_release(
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %tile = pto.alloc_tile :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+
+    pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<1x32xf32>)
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
+  func.func @missing_scalar_clean(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 7 : i32
+
+    pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
+  func.func @missing_acquire_invalidate(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {cmp = #pto<wait_cmp ge>}
+
+    %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr<i32> -> i32
+    pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    return
+  }
+}
+
+// CHECK: requires an explicit `pto.barrier <PIPE_MTE3>`
+// CHECK: requires explicit `pto.cmo.clean all #pto.address_space<gm>`
+// CHECK: requires explicit `pto.cmo.invalidate all #pto.address_space<gm>`
diff --git a/test/lit/pto/signal_payload_cache_consistency.pto b/test/lit/pto/signal_payload_cache_consistency.pto
new file mode 100644
index 0000000000..9056e739f3
--- /dev/null
+++ b/test/lit/pto/signal_payload_cache_consistency.pto
@@ -0,0 +1,197 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// Signal/payload memory-consistency regressions for cacheable scalar GM paths.
+// They check correctness using whole-cache clean/invalidate; range-limited CMOs and performance are out of scope.
+
+// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s
+
+module {
+  func.func @scalar_store_tnotify_release(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 7 : i32
+
+    pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    pto.cmo.clean all #pto.address_space<gm>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
+  func.func @scalar_store_clean_and_fence_suppress_release(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 7 : i32
+
+    pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    pto.cmo.clean all #pto.address_space<gm>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
+  func.func @twait_load_scalar_acquire(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {cmp = #pto<wait_cmp ge>}
+
+    pto.cmo.invalidate all #pto.address_space<gm>
+    %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr<i32> -> i32
+    pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    return
+  }
+
+  func.func @twait_user_invalidate_suppresses_acquire(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {cmp = #pto<wait_cmp ge>}
+
+    pto.cmo.invalidate all #pto.address_space<gm>
+    %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr<i32> -> i32
+    pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    return
+  }
+
+  func.func @explicit_fence_acquire(
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    pto.fence.acquire #pto.fence_scope<ddr>
+    return
+  }
+
+  func.func @ttest_load_scalar_conservative_acquire(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    %ready = pto.comm.ttest(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {cmp = #pto<wait_cmp ge>} -> i1
+
+    pto.cmo.invalidate all #pto.address_space<gm>
+    %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr<i32> -> i32
+    scf.if %ready {
+      pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    }
+    return
+  }
+
+  func.func @dirty_store_before_acquire_is_cleaned(
+      %payload_ptr: !pto.ptr<i32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 3 : i32
+
+    pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    pto.cmo.clean all #pto.address_space<gm>
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {cmp = #pto<wait_cmp ge>}
+
+    pto.cmo.invalidate all #pto.address_space<gm>
+    %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr<i32> -> i32
+    pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr<i32>, i32
+    return
+  }
+}
+
+// CHECK-LABEL: AICORE void scalar_store_tnotify_release(
+// CHECK:      {{.*}}[{{.*}}] =
+// CHECK-NEXT: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
+
+// CHECK-LABEL: AICORE void scalar_store_clean_and_fence_suppress_release(
+// CHECK:      {{.*}}[{{.*}}] =
+// CHECK-NEXT: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
+
+// CHECK-LABEL: AICORE void twait_load_scalar_acquire(
+// CHECK:      pto::comm::TWAIT(
+// CHECK:      dcci((__gm__ void*)0, ENTIRE_DATA_CACHE);
+// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}];
+
+// CHECK-LABEL: AICORE void twait_user_invalidate_suppresses_acquire(
+// CHECK:      pto::comm::TWAIT(
+// CHECK-NEXT: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE);
+// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}];
+
+// CHECK-LABEL: AICORE void explicit_fence_acquire(
+// CHECK:      dsb(DSB_DDR);
+
+// CHECK-LABEL: AICORE void ttest_load_scalar_conservative_acquire(
+// CHECK:      pto::comm::TTEST(
+// CHECK:      dcci((__gm__ void*)0, ENTIRE_DATA_CACHE);
+// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}];
+
+// CHECK-LABEL: AICORE void dirty_store_before_acquire_is_cleaned(
+// CHECK:      {{.*}}[{{.*}}] =
+// CHECK:      dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK:      pto::comm::TWAIT(
+// CHECK:      dcci((__gm__ void*)0, ENTIRE_DATA_CACHE);
+// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}];
diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp
index 6443f1ca72..3b6f0f08d2 100644
--- a/tools/ptoas/ptoas.cpp
+++ b/tools/ptoas/ptoas.cpp
@@ -1915,6 +1915,7 @@ int mlir::pto::compilePTOASModule(
   pm.addPass(pto::createPTOInlineBackendHelpersPass());
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
+  pm.addPass(pto::createPTOMemoryConsistencyPass());
   if (failed(applyConfiguredPassManagerCLOptions(pm, "main PTOAS pipeline")))
     return 1;
 

From dc31dc39ddc4aca83cfdedd14bfb703fce14e48b Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Tue, 30 Jun 2026 16:01:20 +0800
Subject: [PATCH 2/9] Guard unsupported VPTO memory consistency lowering

---
 lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp | 31 +++++++++++++++++++
 lib/PTO/Transforms/VPTOLLVMEmitter.cpp        | 31 +++++++++++++++++++
 .../memory_consistency_cmo_unsupported.pto    | 21 +++++++++++++
 .../memory_consistency_fence_unsupported.pto  | 21 +++++++++++++
 4 files changed, 104 insertions(+)
 create mode 100644 test/lit/vpto/memory_consistency_cmo_unsupported.pto
 create mode 100644 test/lit/vpto/memory_consistency_fence_unsupported.pto

diff --git a/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp b/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp
index 8362aea64b..46409520de 100644
--- a/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp
+++ b/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp
@@ -8616,6 +8616,31 @@ class LowerMemBarOpPattern final : public OpConversionPattern<pto::MemBarOp> {
   LoweringState &state;
 };
 
+template <typename MemoryConsistencyOp> // Fail-fast stub pattern: diagnoses the matched op instead of lowering it.
+class LowerUnsupportedMemoryConsistencyOpPattern final
+    : public OpConversionPattern<MemoryConsistencyOp> {
+public:
+  explicit LowerUnsupportedMemoryConsistencyOpPattern(
+      TypeConverter &typeConverter, MLIRContext *context,
+      LoweringState &state)
+      : OpConversionPattern<MemoryConsistencyOp>(typeConverter, context) {
+    (void)state; // Accepted but unused; presumably kept so registration matches the other patterns (TODO confirm).
+  }
+
+  LogicalResult
+  matchAndRewrite(MemoryConsistencyOp op,
+                  typename MemoryConsistencyOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    (void)adaptor; // No operands are inspected.
+    (void)rewriter; // Nothing is rewritten; this pattern only emits a diagnostic.
+    op.emitOpError() // Attach the error to the unsupported op itself.
+        << "is not supported by the VPTO backend yet; PTOAS validates the "
+           "memory-consistency contract, but VPTO lowering still needs a "
+           "confirmed DSB/DCCI intrinsic ABI";
+    return failure(); // Always fails: this pattern never produces a lowering.
+  }
+};
+
 template <typename BufSyncOp>
 class LowerBufSyncOpPattern final : public OpConversionPattern<BufSyncOp> {
 public:
@@ -9981,6 +10006,10 @@ static void populateVPTOOpLoweringPatterns(VPTOTypeConverter &typeConverter,
                LowerPipeEventDynSyncOpPattern<pto::SetFlagDynOp>,
                LowerPipeEventDynSyncOpPattern<pto::WaitFlagDynOp>,
                LowerBarrierOpPattern, LowerMemBarOpPattern,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::CmoCleanOp>,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::CmoInvalidateOp>,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::FenceReleaseOp>,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::FenceAcquireOp>,
                LowerBufSyncOpPattern<pto::GetBufOp>,
                LowerBufSyncOpPattern<pto::RlsBufOp>,
                LowerRuntimeQueryOpPattern<pto::GetBlockIdxOp>,
@@ -10042,6 +10071,8 @@ static void configureVPTOOpLoweringTarget(ConversionTarget &target,
   target.addLegalOp<UnrealizedConversionCastOp>();
   target.addIllegalOp<pto::SetFlagOp, pto::WaitFlagOp, pto::SetFlagDynOp, pto::WaitFlagDynOp, pto::SyncSetOp,
                       pto::SyncWaitOp, pto::BarrierOp, pto::MemBarOp,
+                      pto::CmoCleanOp, pto::CmoInvalidateOp,
+                      pto::FenceReleaseOp, pto::FenceAcquireOp,
                       pto::GetBufOp, pto::RlsBufOp>();
   target.addIllegalOp<pto::GetBlockIdxOp, pto::GetSubBlockIdxOp,
                       pto::GetBlockNumOp, pto::GetSubBlockNumOp,
diff --git a/lib/PTO/Transforms/VPTOLLVMEmitter.cpp b/lib/PTO/Transforms/VPTOLLVMEmitter.cpp
index 35f8cc51a3..5376399455 100644
--- a/lib/PTO/Transforms/VPTOLLVMEmitter.cpp
+++ b/lib/PTO/Transforms/VPTOLLVMEmitter.cpp
@@ -8560,6 +8560,31 @@ class LowerMemBarOpPattern final : public OpConversionPattern<pto::MemBarOp> {
   LoweringState &state;
 };
 
+template <typename MemoryConsistencyOp> // Fail-fast stub pattern: diagnoses the matched op instead of lowering it.
+class LowerUnsupportedMemoryConsistencyOpPattern final
+    : public OpConversionPattern<MemoryConsistencyOp> {
+public:
+  explicit LowerUnsupportedMemoryConsistencyOpPattern(
+      TypeConverter &typeConverter, MLIRContext *context,
+      LoweringState &state)
+      : OpConversionPattern<MemoryConsistencyOp>(typeConverter, context) {
+    (void)state; // Accepted but unused; presumably kept so registration matches the other patterns (TODO confirm).
+  }
+
+  LogicalResult
+  matchAndRewrite(MemoryConsistencyOp op,
+                  typename MemoryConsistencyOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    (void)adaptor; // No operands are inspected.
+    (void)rewriter; // Nothing is rewritten; this pattern only emits a diagnostic.
+    op.emitOpError() // Attach the error to the unsupported op itself.
+        << "is not supported by the VPTO backend yet; PTOAS validates the "
+           "memory-consistency contract, but VPTO lowering still needs a "
+           "confirmed DSB/DCCI intrinsic ABI";
+    return failure(); // Always fails: this pattern never produces a lowering.
+  }
+};
+
 template <typename BufSyncOp>
 class LowerBufSyncOpPattern final : public OpConversionPattern<BufSyncOp> {
 public:
@@ -9927,6 +9952,10 @@ static void populateVPTOOpLoweringPatterns(VPTOTypeConverter &typeConverter,
                LowerPipeEventDynSyncOpPattern<pto::SetFlagDynOp>,
                LowerPipeEventDynSyncOpPattern<pto::WaitFlagDynOp>,
                LowerBarrierOpPattern, LowerMemBarOpPattern,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::CmoCleanOp>,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::CmoInvalidateOp>,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::FenceReleaseOp>,
+               LowerUnsupportedMemoryConsistencyOpPattern<pto::FenceAcquireOp>,
                LowerBufSyncOpPattern<pto::GetBufOp>,
                LowerBufSyncOpPattern<pto::RlsBufOp>,
                LowerRuntimeQueryOpPattern<pto::GetBlockIdxOp>,
@@ -9988,6 +10017,8 @@ static void configureVPTOOpLoweringTarget(ConversionTarget &target,
   target.addLegalOp<UnrealizedConversionCastOp>();
   target.addIllegalOp<pto::SetFlagOp, pto::WaitFlagOp, pto::SetFlagDynOp, pto::WaitFlagDynOp, pto::SyncSetOp,
                       pto::SyncWaitOp, pto::BarrierOp, pto::MemBarOp,
+                      pto::CmoCleanOp, pto::CmoInvalidateOp,
+                      pto::FenceReleaseOp, pto::FenceAcquireOp,
                       pto::GetBufOp, pto::RlsBufOp>();
   target.addIllegalOp<pto::GetBlockIdxOp, pto::GetSubBlockIdxOp,
                       pto::GetBlockNumOp, pto::GetSubBlockNumOp,
diff --git a/test/lit/vpto/memory_consistency_cmo_unsupported.pto b/test/lit/vpto/memory_consistency_cmo_unsupported.pto
new file mode 100644
index 0000000000..12e7c1f059
--- /dev/null
+++ b/test/lit/vpto/memory_consistency_cmo_unsupported.pto
@@ -0,0 +1,21 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto-llvm-ir %s -o - 2>&1 | FileCheck %s
+// RUN: not ptoas --cann-output-version=9.0.0 --pto-arch=a5 --pto-backend=vpto --emit-vpto-llvm-ir %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @unsupported_cmo_clean() attributes {pto.kernel} {
+    pto.cmo.clean all #pto.address_space<gm>
+    return
+  }
+}
+
+// CHECK: pto.cmo.clean
+// CHECK-SAME: is not supported by the VPTO backend yet
+// CHECK: VPTO lowering still needs a confirmed DSB/DCCI intrinsic ABI
diff --git a/test/lit/vpto/memory_consistency_fence_unsupported.pto b/test/lit/vpto/memory_consistency_fence_unsupported.pto
new file mode 100644
index 0000000000..b8e1bce069
--- /dev/null
+++ b/test/lit/vpto/memory_consistency_fence_unsupported.pto
@@ -0,0 +1,21 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto-llvm-ir %s -o - 2>&1 | FileCheck %s
+// RUN: not ptoas --cann-output-version=9.0.0 --pto-arch=a5 --pto-backend=vpto --emit-vpto-llvm-ir %s -o - 2>&1 | FileCheck %s
+
+module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind<vector>} {
+  func.func @unsupported_fence_release() attributes {pto.kernel} {
+    pto.fence.release #pto.fence_scope<ddr>
+    return
+  }
+}
+
+// CHECK: pto.fence.release
+// CHECK-SAME: is not supported by the VPTO backend yet
+// CHECK: VPTO lowering still needs a confirmed DSB/DCCI intrinsic ABI

From ead6b4f73a33e0422f38c42696aa62fcf1437541 Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Tue, 30 Jun 2026 16:06:54 +0800
Subject: [PATCH 3/9] Document PTOAS memory consistency design

---
 .../ptoas-memory-consistency-design.md        | 337 ++++++++++++++++++
 1 file changed, 337 insertions(+)
 create mode 100644 docs/designs/ptoas-memory-consistency-design.md

diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md
new file mode 100644
index 0000000000..3a8619b262
--- /dev/null
+++ b/docs/designs/ptoas-memory-consistency-design.md
@@ -0,0 +1,337 @@
+# PTOAS 内存一致性设计
+
+本文说明 PTOAS 如何建模并校验 GM payload 与 signal 之间的内存一致性要求。
+
+这里讨论的是内存一致性，不是自动同步。自动同步负责 pipe 之间的执行顺序，例如
+`set_flag`、`wait_flag` 和 `pipe_barrier`。内存一致性负责回答另一个问题：当 signal
+已经被对端观察到时，signal 之前发布的 payload 是否已经对正确的观察方可见。
+
+## 1. 背景
+
+`pto.comm.tnotify` 用来发布一个 signal。对端通过 `pto.comm.twait` 或
+`pto.comm.ttest` 观察这个 signal，然后读取对应的 payload。
+
+一个容易误解的点是：signal ready 不等价于 payload 一定已经可见。原因是 signal
+和 payload 可能走不同的硬件路径：
+
+- signal 通常是一个较小的通信同步标记。
+- payload 通常是更大的 GM 数据，可能由 MTE3、TPUT 或 cacheable scalar store 写出。
+- 不同路径之间只靠源码顺序不一定形成完整的可见性关系。
+
+因此，PTOAS 需要在发布 signal 前校验 release 侧动作，在消费 signal 后校验
+acquire 侧动作。
+
+## 2. 关键概念
+
+### 2.1 Payload
+
+payload 是真正要被对端或后续代码读取的数据。例如：
+
+- `TStore` 写出的 GM 数据。
+- `TPUT` 内部写出的 peer GM 数据。
+- `store_scalar` 写出的 GM 数据。
+
+### 2.2 Signal
+
+signal 是通知对端 payload 已经准备好的标记。例如：
+
+- `TNotify` 发布 signal。
+- `TWait` 等待 signal。
+- `TTest` 轮询 signal 是否 ready。
+
+signal 只表达“通知发生了”。如果 signal 前没有正确的 release 动作，signal 可能先被
+对端观察到，而 payload 仍然没有进入对端能够正确读取的可见性状态。
+
+### 2.3 Pipe drain
+
+pipe drain 用来保证某条 pipe 上已经发出的工作完成到该 pipe 的边界。典型指令是：
+
+```mlir
+pto.barrier #pto.pipe<PIPE_MTE3>
+```
+
+它解决的是 pipe 内工作排空问题。它不等价于 cache clean，也不等价于 DDR-domain
+visibility fence。
+
+### 2.4 Cache maintenance operation
+
+cache maintenance operation 用来处理 cacheable GM 访问造成的 cache line 状态。
+当前 PTOAS 暴露两个语义 op：
+
+```mlir
+pto.cmo.clean all #pto.address_space<gm>
+pto.cmo.invalidate all #pto.address_space<gm>
+```
+
+第一阶段采用 whole-cache 形式。也就是说，它不指定精确地址范围，而是对整个 GM
+相关 data cache 做保守处理。这样优先保证正确性，后续再优化成精确 range。
+
+### 2.5 DDR fence
+
+DDR fence 用来把已经完成的 GM 写入或 cache maintenance 操作推进到 DDR visibility
+domain，并约束它们发生在后续 signal publish 之前。当前 PTOAS 暴露两个语义 op：
+
+```mlir
+pto.fence.release #pto.fence_scope<ddr>
+pto.fence.acquire #pto.fence_scope<ddr>
+```
+
+当前 release 和 acquire 都使用同一个 `ddr` scope。语义上，release 侧用于发布
+payload，acquire 侧用于约束观察 signal 后的 payload 读取。
+
+## 3. 整体模型
+
+生产端的正确顺序是：
+
+```mermaid
+flowchart LR
+  A["payload write"] --> B["pipe drain or cache clean"]
+  B --> C["DDR release fence"]
+  C --> D["TNotify publishes signal"]
+```
+
+消费端的正确顺序是：
+
+```mermaid
+flowchart LR
+  A["TWait or successful TTest observes signal"] --> B["cache invalidate if needed"]
+  B --> C["payload read"]
+```
+
+这两个方向配合起来，才能保证 signal 和 payload 的顺序关系对观察方成立。
+
+## 4. 显式 IR 接口
+
+PTOAS 选择把 cache maintenance 和 DDR fence 暴露成显式 PTO IR，而不是在 lowering
+阶段偷偷插入 `dcci` 和 `dsb`。
+
+原因如下：
+
+- 这类动作有实际运行时成本，尤其 whole-cache CMO 成本较高。
+- 用户或 PyPTO 更清楚 payload 的发布边界。
+- PTOAS 可以负责校验契约，避免漏插或乱序，而不是猜测所有场景。
+- VPTO 后端当前还没有确认的 DSB 和 DCCI intrinsic ABI，显式 IR 可以先稳定上层契约。
+
+当前新增的语义 op 是：
+
+| PTO IR | 语义 | EmitC lowering |
+| --- | --- | --- |
+| `pto.cmo.clean all #pto.address_space<gm>` | 清理 GM 相关 dirty cache line | `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT)` |
+| `pto.cmo.invalidate all #pto.address_space<gm>` | 失效 GM 相关 stale cache line | `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE)` |
+| `pto.fence.release #pto.fence_scope<ddr>` | release 侧 DDR visibility fence | `dsb(DSB_DDR)` |
+| `pto.fence.acquire #pto.fence_scope<ddr>` | acquire 侧 DDR visibility fence | `dsb(DSB_DDR)` |
+
+## 5. MemoryConsistency pass
+
+`pto-memory-consistency` 是一个 Module pass，运行在 shared mainline 上，因此 EmitC 和
+VPTO backend 都会先经过这一步。
+
+这个 pass 的职责是校验显式契约，并处理可自动推导的 pipe drain：
+
+- 识别 signal publish 前是否存在 pending payload write。
+- 识别 signal acquire 后是否存在 cacheable GM payload read。
+- 校验用户或 PyPTO 是否已经插入必要的 CMO 和 fence。
+- 对缺失或顺序错误的场景报编译错误。
+- 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景，仍允许保留自动标注。
+
+这个 pass 不负责分配 event id，也不属于 InsertSync 自动同步流水线。
+
+## 6. 场景规则
+
+### 6.1 MTE3 或 TPUT 写 payload 后发布 signal
+
+适用场景：
+
+- `TStore` 通过 `PIPE_MTE3` 写 GM。
+- `TPUT` macro op 内部通过 MTE3 写 peer GM。
+- 其他 macro op phase 中存在 MTE3 GM write。
+
+需要的顺序：
+
+```mlir
+// payload producer
+pto.barrier #pto.pipe<PIPE_MTE3>
+pto.fence.release #pto.fence_scope<ddr>
+pto.comm.tnotify ...
+```
+
+`pto.barrier #pto.pipe<PIPE_MTE3>` 用来排空 MTE3 pipe。`pto.fence.release` 用来保证
+这些 GM 写入在 signal 发布前进入 DDR visibility domain。
+
+如果只有 `pto.fence.release`，但没有 MTE3 barrier，PTOAS 会报错。因为 fence 不能替代
+pipe drain。
+
+### 6.2 MTE2 工作后发布 signal
+
+适用场景：
+
+- `TLoad` 或其他 `PIPE_MTE2` 工作出现在 `TNotify` 之前。
+
+当前规则：
+
+```mlir
+// PTOAS 可以自动标注并在 EmitC lowering 中生成 PIPE_MTE2 barrier
+pto.comm.tnotify ...
+```
+
+MTE2 是 GM read 方向。这里只需要保证 signal 的发布不会越过前序 MTE2 工作，不需要 DDR
+release fence。PTOAS 当前仍允许自动插入这类纯 pipe drain。
+
+### 6.3 Cacheable scalar GM store 后发布 signal
+
+适用场景：
+
+- `store_scalar` 写 GM，并且该路径可能经过 cache。
+
+需要的顺序：
+
+```mlir
+pto.store_scalar ...
+pto.cmo.clean all #pto.address_space<gm>
+pto.fence.release #pto.fence_scope<ddr>
+pto.comm.tnotify ...
+```
+
+`pto.cmo.clean` 把 dirty cache line 推出。`pto.fence.release` 等待并约束 clean 的结果在
+signal 发布前可见。
+
+如果只插 `pto.fence.release`，PTOAS 会报错。因为 fence 不会替代 cache clean。
+
+### 6.4 TWait 或 TTest 后读取 cacheable GM payload
+
+适用场景：
+
+- `TWait` 返回后执行 `load_scalar` 读取 GM payload。
+- `TTest` 成功观察到 signal 后执行 `load_scalar` 读取 GM payload。
+
+需要的顺序：
+
+```mlir
+pto.comm.twait ...
+pto.cmo.invalidate all #pto.address_space<gm>
+%value = pto.load_scalar ...
+```
+
+invalidate 用来避免读取到本地 stale cache line。
+
+### 6.5 Acquire 前本地可能存在 dirty GM cache
+
+适用场景：
+
+- 同一个执行流中，等待 signal 前已经有 cacheable GM store。
+- 后续又要在 signal acquire 后读取 GM payload。
+
+需要的顺序：
+
+```mlir
+pto.store_scalar ...
+pto.cmo.clean all #pto.address_space<gm>
+pto.fence.release #pto.fence_scope<ddr>
+pto.comm.twait ...
+pto.cmo.invalidate all #pto.address_space<gm>
+%value = pto.load_scalar ...
+```
+
+clean 和 release fence 用来处理本地 dirty cache。invalidate 用来处理 signal 后读取对端
+payload 时可能遇到的 stale cache。
+
+## 7. PyPTO 生成建议
+
+PyPTO 需要在 payload publish 边界显式生成 CMO 和 fence。
+
+### 7.1 TPUT 发布 signal
+
+```mlir
+pto.comm.tput ...
+pto.barrier #pto.pipe<PIPE_MTE3>
+pto.fence.release #pto.fence_scope<ddr>
+pto.comm.tnotify ...
+```
+
+### 7.2 TStore 发布 signal
+
+```mlir
+pto.tstore ...
+pto.barrier #pto.pipe<PIPE_MTE3>
+pto.fence.release #pto.fence_scope<ddr>
+pto.comm.tnotify ...
+```
+
+### 7.3 Scalar store 发布 signal
+
+```mlir
+pto.store_scalar ...
+pto.cmo.clean all #pto.address_space<gm>
+pto.fence.release #pto.fence_scope<ddr>
+pto.comm.tnotify ...
+```
+
+### 7.4 TWait 后读取 scalar payload
+
+```mlir
+pto.comm.twait ...
+pto.cmo.invalidate all #pto.address_space<gm>
+%value = pto.load_scalar ...
+```
+
+### 7.5 TTest polling 后读取 scalar payload
+
+```mlir
+%ready = pto.comm.ttest ...
+scf.if %ready {
+  pto.cmo.invalidate all #pto.address_space<gm>
+  %value = pto.load_scalar ...
+}
+```
+
+如果 PyPTO 使用 `pto.ldg` 或 `pto.stg` 并显式选择 uncache 路径，可以避免部分
+cacheable scalar GM 问题。但这不是 `pto.cmo.clean` 或 `pto.cmo.invalidate` 的替代品。
+如果之前已经存在 dirty 或 stale cache line，仍需要显式 CMO。
+
+## 8. Backend lowering 状态
+
+### 8.1 EmitC
+
+EmitC backend 已经支持真实 lowering：
+
+- `pto.cmo.clean` lower 到 `dcci(..., CACHELINE_OUT)`。
+- `pto.cmo.invalidate` lower 到 `dcci(...)`。
+- `pto.fence.release` lower 到 `dsb(DSB_DDR)`。
+- `pto.fence.acquire` lower 到 `dsb(DSB_DDR)`。
+
+### 8.2 VPTO
+
+VPTO backend 当前没有确认的 DSB 和 DCCI intrinsic ABI。
+
+因此，VPTO lowering 中现在提供的是 fail-fast stub：
+
+- `pto.cmo.clean`
+- `pto.cmo.invalidate`
+- `pto.fence.release`
+- `pto.fence.acquire`
+
+如果这些 op 进入 VPTO LLVM lowering，PTOAS 会报错，提示 VPTO backend 尚不支持这些
+memory-consistency op，需要确认 DSB/DCCI intrinsic ABI 后再接真实 lowering。
+
+这样做的目的不是支持 VPTO 运行，而是避免 unsupported op 静默残留到后端 IR。
+
+## 9. 当前限制
+
+当前实现优先保证正确性，仍有以下限制：
+
+- CMO 是 whole-cache 粒度，不是精确地址范围。
+- `TWait` 和 `TTest` acquire 侧当前只覆盖 `load_scalar`。
+- VPTO 暂不支持 CMO 和 DDR fence 的真实 lowering。
+- 对复杂 CFG 的分析仍是保守近似，不做完整 path-sensitive 数据流。
+- MemoryConsistency pass 校验的是显式内存一致性契约，不替代 InsertSync 的 alias 和 pipe
+  同步分析。
+
+## 10. 后续工作
+
+后续可以分几步推进：
+
+1. 和 VPTO/Bisheng 对齐 DSB 和 DCCI intrinsic ABI，并补齐 VPTO lowering。
+2. 将 whole-cache CMO 优化成精确 GM address range CMO。
+3. 扩展 acquire 侧 consumer 范围，从 `load_scalar` 扩展到更多 cacheable GM read。
+4. 将 macro op phase 的 memory descriptor 做得更精细，减少误报。
+5. 在 PyPTO 和 PTOAS 之间明确 cacheable 与 uncacheable GM 访问的 IR 契约。

From 77035100927a7cf4789f4bfe3c863ea5b492646a Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Tue, 30 Jun 2026 16:20:30 +0800
Subject: [PATCH 4/9] Auto insert MTE3 drain before release fence

---
 .../ptoas-memory-consistency-design.md        | 32 +++++++++++++++----
 lib/PTO/Transforms/PTOMemoryConsistency.cpp   | 28 ++++++++++------
 test/lit/pto/issue711_tnotify_mte_drain.pto   |  1 -
 .../lit/pto/issue872_tput_tnotify_release.pto |  2 --
 test/lit/pto/memory_consistency_invalid.pto   |  2 +-
 5 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md
index 3a8619b262..00b656348d 100644
--- a/docs/designs/ptoas-memory-consistency-design.md
+++ b/docs/designs/ptoas-memory-consistency-design.md
@@ -131,6 +131,7 @@ VPTO backend 都会先经过这一步。
 - 识别 signal publish 前是否存在 pending payload write。
 - 识别 signal acquire 后是否存在 cacheable GM payload read。
 - 校验用户或 PyPTO 是否已经插入必要的 CMO 和 fence。
+- 在显式 release fence 前自动补齐必要的 MTE3 pipe drain。
 - 对缺失或顺序错误的场景报编译错误。
 - 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景，仍允许保留自动标注。
 
@@ -150,16 +151,31 @@ VPTO backend 都会先经过这一步。
 
 ```mlir
 // payload producer
-pto.barrier #pto.pipe<PIPE_MTE3>
 pto.fence.release #pto.fence_scope<ddr>
 pto.comm.tnotify ...
 ```
 
-`pto.barrier #pto.pipe<PIPE_MTE3>` 用来排空 MTE3 pipe。`pto.fence.release` 用来保证
-这些 GM 写入在 signal 发布前进入 DDR visibility domain。
+PyPTO 或用户只需要表达 `pto.fence.release` 这个内存一致性边界。PTOAS 会在
+`pto.fence.release #pto.fence_scope<ddr>` 前检查是否存在 pending MTE3 GM write；如果存在，
+自动插入：
+
+```mlir
+pto.barrier #pto.pipe<PIPE_MTE3>
+```
+
+最终 lowering 的顺序是：
 
-如果只有 `pto.fence.release`，但没有 MTE3 barrier，PTOAS 会报错。因为 fence 不能替代
-pipe drain。
+```cpp
+pipe_barrier(PIPE_MTE3);
+dsb(DSB_DDR);
+pto::comm::TNOTIFY(...);
+```
+
+`pipe_barrier(PIPE_MTE3)` 用来排空 MTE3 pipe。`pto.fence.release` lower 出来的
+`dsb(DSB_DDR)` 用来保证这些 GM 写入在 signal 发布前进入 DDR visibility domain。
+
+如果缺少 `pto.fence.release`，PTOAS 会报错。因为 PTOAS 可以推导 pipe drain，但不会凭空
+猜测 payload publish 的语义边界。
 
 ### 6.2 MTE2 工作后发布 signal
 
@@ -239,11 +255,14 @@ payload 时可能遇到的 stale cache。
 
 PyPTO 需要在 payload publish 边界显式生成 CMO 和 fence。
 
+PyPTO 不需要手动生成 `pto.barrier #pto.pipe<PIPE_MTE3>`。这是低层 pipe drain 细节，
+由 PTOAS 根据 release fence 前的 pending MTE3 work 自动插入。这样可以保证最终顺序是
+`pipe_barrier(PIPE_MTE3)` 先于 `dsb(DSB_DDR)`，不会出现先 fence、后 drain 的错误顺序。
+
 ### 7.1 TPUT 发布 signal
 
 ```mlir
 pto.comm.tput ...
-pto.barrier #pto.pipe<PIPE_MTE3>
 pto.fence.release #pto.fence_scope<ddr>
 pto.comm.tnotify ...
 ```
@@ -252,7 +271,6 @@ pto.comm.tnotify ...
 
 ```mlir
 pto.tstore ...
-pto.barrier #pto.pipe<PIPE_MTE3>
 pto.fence.release #pto.fence_scope<ddr>
 pto.comm.tnotify ...
 ```
diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
index 10854e4ea0..b378b9e66e 100644
--- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp
+++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
@@ -12,6 +12,7 @@
 #include "PTO/Transforms/Passes.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir {
@@ -236,22 +237,27 @@ static void diagnoseTNotifyRelease(pto::TNotifyOp op,
     hasFailure = true;
     return;
   }
-  if (state.drainMte3) {
-    op.emitOpError()
-        << "requires an explicit `pto.barrier <PIPE_MTE3>` before "
-           "`pto.fence.release #pto.fence_scope<ddr>` when publishing a "
-           "signal after MTE3 GM writes";
-    hasFailure = true;
-    return;
-  }
   if (state.needsDsbDdr) {
     op.emitOpError()
         << "requires explicit `pto.fence.release #pto.fence_scope<ddr>` "
-           "before publishing a signal after GM writes or cache clean";
+           "before publishing a signal after GM writes or cache clean; "
+           "PTOAS inserts the required MTE3 pipe drain before the release "
+           "fence when needed";
     hasFailure = true;
   }
 }
 
+static void insertMte3DrainBeforeReleaseFence(pto::FenceReleaseOp fence,
+                                              TNotifyReleaseState &state) {
+  if (fence.getScope().getScope() != pto::FenceScope::DDR || !state.drainMte3)
+    return;
+  OpBuilder builder(fence);
+  builder.create<pto::BarrierOp>(
+      fence.getLoc(), pto::PipeAttr::get(fence.getContext(),
+                                         pto::PIPE::PIPE_MTE3));
+  state.drainMte3 = false;
+}
+
 static void markNestedTNotifyWithState(Operation *op,
                                        const TNotifyReleaseState &state,
                                        bool &hasFailure) {
@@ -307,8 +313,10 @@ annotateTNotifyReleaseForBlock(Block &block,
       pendingState.applyBarrier(barrier.getPipe().getPipe());
     if (auto cmo = dyn_cast<pto::CmoCleanOp>(op))
       pendingState.applyCmoClean(cmo.getSpace().getAddressSpace());
-    if (auto fence = dyn_cast<pto::FenceReleaseOp>(op))
+    if (auto fence = dyn_cast<pto::FenceReleaseOp>(op)) {
+      insertMte3DrainBeforeReleaseFence(fence, pendingState);
       pendingState.applyFenceRelease(fence.getScope().getScope());
+    }
   }
   return pendingState;
 }
diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto
index 4940f88ea5..0cda48da4c 100644
--- a/test/lit/pto/issue711_tnotify_mte_drain.pto
+++ b/test/lit/pto/issue711_tnotify_mte_drain.pto
@@ -50,7 +50,6 @@ module {
     pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
                outs(%dst : !pto.partition_tensor_view<1x32xf32>)
     pto.barrier <PIPE_MTE2>
-    pto.barrier <PIPE_MTE3>
     pto.fence.release #pto.fence_scope<ddr>
 
     %sig_view = pto.make_tensor_view %signal_ptr,
diff --git a/test/lit/pto/issue872_tput_tnotify_release.pto b/test/lit/pto/issue872_tput_tnotify_release.pto
index ecc62b3ecc..17a891a295 100644
--- a/test/lit/pto/issue872_tput_tnotify_release.pto
+++ b/test/lit/pto/issue872_tput_tnotify_release.pto
@@ -45,7 +45,6 @@ module {
       !pto.partition_tensor_view<8x64xf32>,
       !pto.tile_buf<loc=vec, dtype=f32, rows=2, cols=32, v_row=?, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
       {atomicType = #pto<atomic_type atomic_none>}
-    pto.barrier <PIPE_MTE3>
     pto.fence.release #pto.fence_scope<ddr>
 
     %sig_view = pto.make_tensor_view %signal_ptr,
@@ -132,7 +131,6 @@ module {
         !pto.partition_tensor_view<1x32xf32>,
         !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
         !pto.partition_tensor_view<1x32xf32>) {root = 0 : i32}
-    pto.barrier <PIPE_MTE3>
     pto.fence.release #pto.fence_scope<ddr>
 
     %sig_view = pto.make_tensor_view %signal_ptr,
diff --git a/test/lit/pto/memory_consistency_invalid.pto b/test/lit/pto/memory_consistency_invalid.pto
index f84652f82d..e486abe738 100644
--- a/test/lit/pto/memory_consistency_invalid.pto
+++ b/test/lit/pto/memory_consistency_invalid.pto
@@ -83,6 +83,6 @@ module {
   }
 }
 
-// CHECK: requires an explicit `pto.barrier <PIPE_MTE3>`
+// CHECK: requires explicit `pto.fence.release #pto.fence_scope<ddr>`
 // CHECK: requires explicit `pto.cmo.clean all #pto.address_space<gm>`
 // CHECK: requires explicit `pto.cmo.invalidate all #pto.address_space<gm>`

From aa7725d39398e3e65d9daeca95a70c30efb4d438 Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Tue, 30 Jun 2026 16:39:05 +0800
Subject: [PATCH 5/9] Handle FIX GM writes before release fence

---
 .../ptoas-memory-consistency-design.md        | 28 ++++++----
 lib/PTO/Transforms/PTOMemoryConsistency.cpp   | 54 ++++++++++++++-----
 test/lit/pto/issue711_tnotify_mte_drain.pto   | 40 ++++++++++++++
 3 files changed, 101 insertions(+), 21 deletions(-)

diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md
index 00b656348d..0fc7edce4b 100644
--- a/docs/designs/ptoas-memory-consistency-design.md
+++ b/docs/designs/ptoas-memory-consistency-design.md
@@ -131,7 +131,7 @@ VPTO backend 都会先经过这一步。
 - 识别 signal publish 前是否存在 pending payload write。
 - 识别 signal acquire 后是否存在 cacheable GM payload read。
 - 校验用户或 PyPTO 是否已经插入必要的 CMO 和 fence。
-- 在显式 release fence 前自动补齐必要的 MTE3 pipe drain。
+- 在显式 release fence 前自动补齐必要的 MTE3 或 FIX pipe drain。
 - 对缺失或顺序错误的场景报编译错误。
 - 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景，仍允许保留自动标注。
 
@@ -139,11 +139,13 @@ VPTO backend 都会先经过这一步。
 
 ## 6. 场景规则
 
-### 6.1 MTE3 或 TPUT 写 payload 后发布 signal
+### 6.1 MTE3、FIX 或 TPUT 写 payload 后发布 signal
 
 适用场景：
 
 - `TStore` 通过 `PIPE_MTE3` 写 GM。
+- `TStore` 通过 `PIPE_FIX` 写 GM，例如 ACC tile 写回 GM。
+- `TStoreFP` 通过 `PIPE_FIX` 写 GM。
 - `TPUT` macro op 内部通过 MTE3 写 peer GM。
 - 其他 macro op phase 中存在 MTE3 GM write。
 
@@ -156,11 +158,13 @@ pto.comm.tnotify ...
 ```
 
 PyPTO 或用户只需要表达 `pto.fence.release` 这个内存一致性边界。PTOAS 会在
-`pto.fence.release #pto.fence_scope<ddr>` 前检查是否存在 pending MTE3 GM write；如果存在，
-自动插入：
+`pto.fence.release #pto.fence_scope<ddr>` 前检查是否存在 pending MTE3 或 FIX GM write；如果存在，
+自动插入对应 pipe 的 drain：
 
 ```mlir
 pto.barrier #pto.pipe<PIPE_MTE3>
+// or
+pto.barrier #pto.pipe<PIPE_FIX>
 ```
 
 最终 lowering 的顺序是：
@@ -171,8 +175,13 @@ dsb(DSB_DDR);
 pto::comm::TNOTIFY(...);
 ```
 
-`pipe_barrier(PIPE_MTE3)` 用来排空 MTE3 pipe。`pto.fence.release` lower 出来的
-`dsb(DSB_DDR)` 用来保证这些 GM 写入在 signal 发布前进入 DDR visibility domain。
+`pipe_barrier(PIPE_MTE3)` 或 `pipe_barrier(PIPE_FIX)` 用来排空实际执行 GM write 的
+pipe。`pto.fence.release` lower 出来的 `dsb(DSB_DDR)` 用来保证这些 GM 写入在 signal
+发布前进入 DDR visibility domain。
+
+这里不能把所有 `PIPE_FIX` op 都当成 release payload write。很多 FIX op 只是本地
+ACC 到 MAT 或 ACC 到 VEC 的搬运，不需要 DDR release。PTOAS 只对确认写 GM payload 的
+FIX 路径补 release drain。
 
 如果缺少 `pto.fence.release`，PTOAS 会报错。因为 PTOAS 可以推导 pipe drain，但不会凭空
 猜测 payload publish 的语义边界。
@@ -255,9 +264,10 @@ payload 时可能遇到的 stale cache。
 
 PyPTO 需要在 payload publish 边界显式生成 CMO 和 fence。
 
-PyPTO 不需要手动生成 `pto.barrier #pto.pipe<PIPE_MTE3>`。这是低层 pipe drain 细节，
-由 PTOAS 根据 release fence 前的 pending MTE3 work 自动插入。这样可以保证最终顺序是
-`pipe_barrier(PIPE_MTE3)` 先于 `dsb(DSB_DDR)`，不会出现先 fence、后 drain 的错误顺序。
+PyPTO 不需要手动生成 `pto.barrier #pto.pipe<PIPE_MTE3>` 或
+`pto.barrier #pto.pipe<PIPE_FIX>`。这是低层 pipe drain 细节，由 PTOAS 根据 release fence
+前的 pending GM write pipe 自动插入。这样可以保证最终顺序是对应 pipe barrier 先于
+`dsb(DSB_DDR)`，不会出现先 fence、后 drain 的错误顺序。
 
 ### 7.1 TPUT 发布 signal
 
diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
index b378b9e66e..567e4ce425 100644
--- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp
+++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
@@ -34,12 +34,14 @@ static bool isGmAddressSpace(pto::AddressSpace space) {
 struct TNotifyReleaseState {
   bool drainMte2 = false;
   bool drainMte3 = false;
+  bool drainFix = false;
   bool cleanGmCache = false;
   bool needsDsbDdr = false;
 
   void merge(const TNotifyReleaseState &other) {
     drainMte2 |= other.drainMte2;
     drainMte3 |= other.drainMte3;
+    drainFix |= other.drainFix;
     cleanGmCache |= other.cleanGmCache;
     needsDsbDdr |= other.needsDsbDdr;
   }
@@ -47,6 +49,7 @@ struct TNotifyReleaseState {
   void clear() {
     drainMte2 = false;
     drainMte3 = false;
+    drainFix = false;
     cleanGmCache = false;
     needsDsbDdr = false;
   }
@@ -59,9 +62,13 @@ struct TNotifyReleaseState {
     case pto::PIPE::PIPE_MTE3:
       drainMte3 = false;
       break;
+    case pto::PIPE::PIPE_FIX:
+      drainFix = false;
+      break;
     case pto::PIPE::PIPE_ALL:
       drainMte2 = false;
       drainMte3 = false;
+      drainFix = false;
       break;
     default:
       break;
@@ -76,7 +83,7 @@ struct TNotifyReleaseState {
   void applyFenceRelease(pto::FenceScope scope) {
     if (scope != pto::FenceScope::DDR)
       return;
-    if (drainMte3 || cleanGmCache)
+    if (drainMte3 || drainFix || cleanGmCache)
       return;
     needsDsbDdr = false;
   }
@@ -154,6 +161,13 @@ static TNotifyReleaseState getReleaseStateForPipe(pto::PIPE pipe) {
   return state;
 }
 
+static TNotifyReleaseState getFixGmWriteReleaseState() { // Pending-release state for a GM write issued on PIPE_FIX.
+  TNotifyReleaseState state;
+  state.drainFix = true; // Cleared by a PIPE_FIX (or PIPE_ALL) barrier.
+  state.needsDsbDdr = true; // Cleared by a DDR-scope release fence once no drain or cache clean is pending.
+  return state;
+}
+
 static TNotifyReleaseState getReleaseStateForMacroModel(Operation *op) {
   TNotifyReleaseState state;
   auto model = getSyncMacroModel(op);
@@ -185,8 +199,16 @@ static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) {
     }
   }
 
+  if (auto tstore = dyn_cast<pto::TStoreOp>(op);
+      tstore && tstore.getPipe() == pto::PIPE::PIPE_FIX)
+    return getFixGmWriteReleaseState();
+
+  if (isa<pto::TStoreFPOp>(op))
+    return getFixGmWriteReleaseState();
+
   TNotifyReleaseState macroState = getReleaseStateForMacroModel(op);
-  if (macroState.drainMte3 || macroState.cleanGmCache ||
+  if (macroState.drainMte3 || macroState.drainFix ||
+      macroState.cleanGmCache ||
       macroState.needsDsbDdr)
     return macroState;
 
@@ -241,21 +263,29 @@ static void diagnoseTNotifyRelease(pto::TNotifyOp op,
     op.emitOpError()
         << "requires explicit `pto.fence.release #pto.fence_scope<ddr>` "
            "before publishing a signal after GM writes or cache clean; "
-           "PTOAS inserts the required MTE3 pipe drain before the release "
-           "fence when needed";
+           "PTOAS inserts the required MTE3/FIX pipe drain before the "
+           "release fence when needed";
     hasFailure = true;
   }
 }
 
-static void insertMte3DrainBeforeReleaseFence(pto::FenceReleaseOp fence,
-                                              TNotifyReleaseState &state) {
-  if (fence.getScope().getScope() != pto::FenceScope::DDR || !state.drainMte3)
+static void insertDrainsBeforeReleaseFence(pto::FenceReleaseOp fence,
+                                           TNotifyReleaseState &state) {
+  if (fence.getScope().getScope() != pto::FenceScope::DDR)
     return;
   OpBuilder builder(fence);
-  builder.create<pto::BarrierOp>(
-      fence.getLoc(), pto::PipeAttr::get(fence.getContext(),
-                                         pto::PIPE::PIPE_MTE3));
-  state.drainMte3 = false;
+  auto insertBarrier = [&](pto::PIPE pipe) {
+    builder.create<pto::BarrierOp>(
+        fence.getLoc(), pto::PipeAttr::get(fence.getContext(), pipe));
+  };
+  if (state.drainMte3) {
+    insertBarrier(pto::PIPE::PIPE_MTE3);
+    state.drainMte3 = false;
+  }
+  if (state.drainFix) {
+    insertBarrier(pto::PIPE::PIPE_FIX);
+    state.drainFix = false;
+  }
 }
 
 static void markNestedTNotifyWithState(Operation *op,
@@ -314,7 +344,7 @@ annotateTNotifyReleaseForBlock(Block &block,
     if (auto cmo = dyn_cast<pto::CmoCleanOp>(op))
       pendingState.applyCmoClean(cmo.getSpace().getAddressSpace());
     if (auto fence = dyn_cast<pto::FenceReleaseOp>(op)) {
-      insertMte3DrainBeforeReleaseFence(fence, pendingState);
+      insertDrainsBeforeReleaseFence(fence, pendingState);
       pendingState.applyFenceRelease(fence.getScope().getScope());
     }
   }
diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto
index 0cda48da4c..01c0f48637 100644
--- a/test/lit/pto/issue711_tnotify_mte_drain.pto
+++ b/test/lit/pto/issue711_tnotify_mte_drain.pto
@@ -62,6 +62,40 @@ module {
     return
   }
 
+  // acc tstore -> tnotify: ACC->GM stores use the FIX pipe, so the release
+  // fence must be preceded by a FIX pipe drain.
+  func.func @tnotify_drain_after_acc_tstore(
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %acc = pto.alloc_tile :
+      !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c16, %c16]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<16x16xf32>
+
+    pto.tstore ins(%acc : !pto.tile_buf<loc=acc, dtype=f32, rows=16, cols=16, v_row=16, v_col=16, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<16x16xf32>)
+    pto.fence.release #pto.fence_scope<ddr>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
   // tload -> tnotify: the input-consumed case (notify must follow the load
   // so the producer can reuse the source buffer once TWAIT returns).
   func.func @tnotify_drain_after_tload(
@@ -257,6 +291,12 @@ module {
 // CHECK-NEXT: dsb(DSB_DDR);
 // CHECK-NEXT: pto::comm::TNOTIFY(
 
+// CHECK-LABEL: AICORE void tnotify_drain_after_acc_tstore(
+// CHECK:      TSTORE
+// CHECK:      pipe_barrier(PIPE_FIX);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(
+
 // CHECK-LABEL: AICORE void tnotify_drain_after_tload(
 // CHECK:      pto::comm::NotifyOp{{.*}}= pto::comm::NotifyOp::AtomicAdd;
 // CHECK:      TLOAD(

From 4c60b7e93105aa8bf931db2e322eaa2e2debf353 Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Tue, 30 Jun 2026 16:58:26 +0800
Subject: [PATCH 6/9] Tighten TNotify release payload detection

---
 .../ptoas-memory-consistency-design.md        |  4 +
 lib/PTO/Transforms/PTOMemoryConsistency.cpp   | 41 +++++-----
 test/lit/pto/tnotify_release_local_ops.pto    | 79 +++++++++++++++++++
 3 files changed, 102 insertions(+), 22 deletions(-)
 create mode 100644 test/lit/pto/tnotify_release_local_ops.pto

diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md
index 0fc7edce4b..2bbe1ed6c7 100644
--- a/docs/designs/ptoas-memory-consistency-design.md
+++ b/docs/designs/ptoas-memory-consistency-design.md
@@ -183,6 +183,10 @@ pipe。`pto.fence.release` lower 出来的 `dsb(DSB_DDR)` 用来保证这些 GM
 ACC 到 MAT 或 ACC 到 VEC 的搬运，不需要 DDR release。PTOAS 只对确认写 GM payload 的
 FIX 路径补 release drain。
 
+同理，也不能把所有 `PIPE_MTE3` op 都当成 release payload write。例如 A5 的
+Vec 到 Mat `TInsert` 是本地 UB 到 L1 的搬运，不发布 GM payload。PTOAS 只对
+`TStore`、comm macro MTE3 phase 等确认写 GM payload 的路径补 release drain。
+
 如果缺少 `pto.fence.release`，PTOAS 会报错。因为 PTOAS 可以推导 pipe drain，但不会凭空
 猜测 payload publish 的语义边界。
 
diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
index 567e4ce425..3937892727 100644
--- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp
+++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
@@ -140,24 +140,16 @@ static bool isGmScalarMemory(Type type) {
   return false;
 }
 
-static TNotifyReleaseState getReleaseStateForPipe(pto::PIPE pipe) {
+static TNotifyReleaseState getMte2PayloadReadReleaseState() {
   TNotifyReleaseState state;
-  switch (pipe) {
-  case pto::PIPE::PIPE_MTE2:
-    state.drainMte2 = true;
-    break;
-  case pto::PIPE::PIPE_MTE3:
-    state.drainMte3 = true;
-    state.needsDsbDdr = true;
-    break;
-  case pto::PIPE::PIPE_ALL:
-    state.drainMte2 = true;
-    state.drainMte3 = true;
-    state.needsDsbDdr = true;
-    break;
-  default:
-    break;
-  }
+  state.drainMte2 = true;
+  return state;
+}
+
+static TNotifyReleaseState getMte3GmWriteReleaseState() {
+  TNotifyReleaseState state;
+  state.drainMte3 = true;
+  state.needsDsbDdr = true;
   return state;
 }
 
@@ -199,9 +191,16 @@ static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) {
     }
   }
 
-  if (auto tstore = dyn_cast<pto::TStoreOp>(op);
-      tstore && tstore.getPipe() == pto::PIPE::PIPE_FIX)
-    return getFixGmWriteReleaseState();
+  if (isa<pto::TLoadOp, pto::TPrefetchOp>(op))
+    return getMte2PayloadReadReleaseState();
+
+  if (auto tstore = dyn_cast<pto::TStoreOp>(op)) {
+    if (tstore.getPipe() == pto::PIPE::PIPE_MTE3)
+      return getMte3GmWriteReleaseState();
+    if (tstore.getPipe() == pto::PIPE::PIPE_FIX)
+      return getFixGmWriteReleaseState();
+    return {};
+  }
 
   if (isa<pto::TStoreFPOp>(op))
     return getFixGmWriteReleaseState();
@@ -212,8 +211,6 @@ static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) {
       macroState.needsDsbDdr)
     return macroState;
 
-  if (auto pipeOp = dyn_cast<pto::OpPipeInterface>(op))
-    return getReleaseStateForPipe(pipeOp.getPipe());
   return {};
 }
 
diff --git a/test/lit/pto/tnotify_release_local_ops.pto b/test/lit/pto/tnotify_release_local_ops.pto
new file mode 100644
index 0000000000..6b8940f21f
--- /dev/null
+++ b/test/lit/pto/tnotify_release_local_ops.pto
@@ -0,0 +1,79 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// TNotify release analysis must be payload-specific. Some local tile ops run on
+// MTE3 or FIX internally, but they do not publish GM payloads and must not
+// require a DDR release fence or emit a release drain before TNotify.
+
+// RUN: ptoas --pto-arch=a5 %s -o - 2>&1 | FileCheck %s
+
+module attributes {"pto.target_arch" = "a5"} {
+  func.func @local_tinsert_mte3_then_tnotify(%signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %src = pto.alloc_tile :
+      !pto.tile_buf<loc=vec, dtype=f16, rows=64, cols=64, v_row=64, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    %dst = pto.alloc_tile :
+      !pto.tile_buf<loc=mat, dtype=f16, rows=64, cols=64, v_row=64, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tinsert ins(%src, %c0, %c0 :
+        !pto.tile_buf<loc=vec, dtype=f16, rows=64, cols=64, v_row=64, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>,
+        index, index)
+      outs(%dst :
+        !pto.tile_buf<loc=mat, dtype=f16, rows=64, cols=64, v_row=64, v_col=64, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+      {tinsertMode = #pto.tinsert_mode<split2>}
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+
+  func.func @local_tmov_fix_then_tnotify(%signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %src = pto.alloc_tile :
+      !pto.tile_buf<loc=acc, dtype=f32, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=1024, pad=0>
+    %dst = pto.alloc_tile :
+      !pto.tile_buf<loc=mat, dtype=f16, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=512, pad=0>
+    pto.tmov ins(%src :
+        !pto.tile_buf<loc=acc, dtype=f32, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=1024, pad=0>)
+      outs(%dst :
+        !pto.tile_buf<loc=mat, dtype=f16, rows=32, cols=32, v_row=32, v_col=32, blayout=col_major, slayout=row_major, fractal=512, pad=0>)
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+}
+
+// CHECK-LABEL: AICORE void local_tinsert_mte3_then_tnotify(
+// CHECK:      TINSERT
+// CHECK-NOT:  pipe_barrier(
+// CHECK-NOT:  dsb(
+// CHECK:      pto::comm::TNOTIFY(
+
+// CHECK-LABEL: AICORE void local_tmov_fix_then_tnotify(
+// CHECK:      TMOV
+// CHECK-NOT:  pipe_barrier(
+// CHECK-NOT:  dsb(
+// CHECK:      pto::comm::TNOTIFY(

From 4c6e22c678c1670085ed9f7bb68c4aa256eadd8f Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Tue, 30 Jun 2026 17:13:10 +0800
Subject: [PATCH 7/9] Refine memory consistency region analysis

---
 .../ptoas-memory-consistency-design.md        |  4 ++
 lib/PTO/Transforms/PTOMemoryConsistency.cpp   | 58 +++++++++++++++----
 .../pto/memory_consistency_external_func.pto  | 37 ++++++++++++
 3 files changed, 89 insertions(+), 10 deletions(-)
 create mode 100644 test/lit/pto/memory_consistency_external_func.pto

diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md
index 2bbe1ed6c7..ef2fd58e38 100644
--- a/docs/designs/ptoas-memory-consistency-design.md
+++ b/docs/designs/ptoas-memory-consistency-design.md
@@ -135,6 +135,10 @@ VPTO backend 都会先经过这一步。
 - 对缺失或顺序错误的场景报编译错误。
 - 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景，仍允许保留自动标注。
 
+遍历策略是 region-scoped 的保守分析：单 block region 按顺序递归分析；复杂 CFG
+region 暂不做 path-sensitive 数据流，但只在当前 region 内收集 pending state，不把同一个
+parent op 的其他 sibling region 状态混入。外部函数声明没有函数体，pass 会直接跳过。
+
 这个 pass 不负责分配 event id，也不属于 InsertSync 自动同步流水线。
 
 ## 6. 场景规则
diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
index 3937892727..e4eaaf4e00 100644
--- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp
+++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
@@ -223,6 +223,14 @@ static TNotifyReleaseState collectTNotifyReleaseState(Operation *op) {
   return state;
 }
 
+static TNotifyReleaseState collectTNotifyReleaseState(Region &region) {
+  TNotifyReleaseState state;
+  for (Block &block : region)
+    for (Operation &nested : block)
+      state.merge(collectTNotifyReleaseState(&nested));
+  return state;
+}
+
 static bool isLoopLikeOp(Operation *op) {
   return isa<scf::ForOp, scf::WhileOp, scf::ParallelOp, scf::ForallOp>(op);
 }
@@ -294,6 +302,15 @@ static void markNestedTNotifyWithState(Operation *op,
   });
 }
 
+static void markNestedTNotifyWithState(Region &region,
+                                       const TNotifyReleaseState &state,
+                                       bool &hasFailure) {
+  for (Block &block : region) {
+    for (Operation &nested : block)
+      markNestedTNotifyWithState(&nested, state, hasFailure);
+  }
+}
+
 static TNotifyReleaseState
 annotateTNotifyReleaseForBlock(Block &block,
                                TNotifyReleaseState entryPendingState,
@@ -323,11 +340,11 @@ annotateTNotifyReleaseForBlock(Block &block,
             region.front(), regionEntryState, nestedLoopCarriedState,
             hasFailure));
       } else {
-        TNotifyReleaseState regionState = collectTNotifyReleaseState(&op);
+        TNotifyReleaseState regionState = collectTNotifyReleaseState(region);
         TNotifyReleaseState nestedNotifyState = regionEntryState;
         nestedNotifyState.merge(nestedLoopCarriedState);
         nestedNotifyState.merge(regionState);
-        markNestedTNotifyWithState(&op, nestedNotifyState, hasFailure);
+        markNestedTNotifyWithState(region, nestedNotifyState, hasFailure);
 
         TNotifyReleaseState regionExitState = regionEntryState;
         regionExitState.merge(regionState);
@@ -351,6 +368,9 @@ annotateTNotifyReleaseForBlock(Block &block,
 static bool annotateTNotifyRelease(ModuleOp module) {
   bool hasFailure = false;
   for (auto func : module.getOps<func::FuncOp>()) {
+    if (func.isExternal())
+      continue;
+
     if (func.getBody().hasOneBlock()) {
       (void)annotateTNotifyReleaseForBlock(func.getBody().front(),
                                            TNotifyReleaseState{},
@@ -362,9 +382,8 @@ static bool annotateTNotifyRelease(ModuleOp module) {
     // Be conservative for pre-existing CFG: without a path-sensitive CFG data
     // flow here, every TNotify may observe any release-relevant work in the
     // function.
-    TNotifyReleaseState funcState =
-        collectTNotifyReleaseState(func.getOperation());
-    markNestedTNotifyWithState(func.getOperation(), funcState, hasFailure);
+    TNotifyReleaseState funcState = collectTNotifyReleaseState(func.getBody());
+    markNestedTNotifyWithState(func.getBody(), funcState, hasFailure);
   }
   return hasFailure;
 }
@@ -434,6 +453,14 @@ static SignalAcquireState collectSignalAcquireState(Operation *op) {
   return state;
 }
 
+static SignalAcquireState collectSignalAcquireState(Region &region) {
+  SignalAcquireState state;
+  for (Block &block : region)
+    for (Operation &nested : block)
+      state.merge(collectSignalAcquireState(&nested));
+  return state;
+}
+
 static void markNestedAcquireLoadsWithState(Operation *op,
                                             SignalAcquireState state,
                                             bool &hasFailure) {
@@ -444,6 +471,15 @@ static void markNestedAcquireLoadsWithState(Operation *op,
   });
 }
 
+static void markNestedAcquireLoadsWithState(Region &region,
+                                            SignalAcquireState state,
+                                            bool &hasFailure) {
+  for (Block &block : region) {
+    for (Operation &nested : block)
+      markNestedAcquireLoadsWithState(&nested, state, hasFailure);
+  }
+}
+
 static SignalAcquireState
 annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState,
                               bool &hasFailure) {
@@ -478,8 +514,8 @@ annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState,
         combinedRegionExitState.merge(
             annotateSignalAcquireForBlock(region.front(), state, hasFailure));
       } else {
-        markNestedAcquireLoadsWithState(&op, state, hasFailure);
-        SignalAcquireState regionState = collectSignalAcquireState(&op);
+        markNestedAcquireLoadsWithState(region, state, hasFailure);
+        SignalAcquireState regionState = collectSignalAcquireState(region);
         SignalAcquireState regionExitState = state;
         regionExitState.merge(regionState);
         combinedRegionExitState.merge(regionExitState);
@@ -496,15 +532,17 @@ annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState,
 static bool annotateSignalAcquire(ModuleOp module) {
   bool hasFailure = false;
   for (auto func : module.getOps<func::FuncOp>()) {
+    if (func.isExternal())
+      continue;
+
     if (func.getBody().hasOneBlock()) {
       (void)annotateSignalAcquireForBlock(func.getBody().front(),
                                           SignalAcquireState{}, hasFailure);
       continue;
     }
 
-    SignalAcquireState funcState =
-        collectSignalAcquireState(func.getOperation());
-    markNestedAcquireLoadsWithState(func.getOperation(), funcState, hasFailure);
+    SignalAcquireState funcState = collectSignalAcquireState(func.getBody());
+    markNestedAcquireLoadsWithState(func.getBody(), funcState, hasFailure);
   }
   return hasFailure;
 }
diff --git a/test/lit/pto/memory_consistency_external_func.pto b/test/lit/pto/memory_consistency_external_func.pto
new file mode 100644
index 0000000000..e24951f0cd
--- /dev/null
+++ b/test/lit/pto/memory_consistency_external_func.pto
@@ -0,0 +1,37 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// MemoryConsistency must skip external func declarations. They have no body to
+// scan and should not affect release/acquire state in real kernels.
+
+// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s
+
+module {
+  func.func private @external_consumer(!pto.ptr<i32>)
+
+  func.func @external_func_decl_is_skipped(%signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+}
+
+// CHECK-LABEL: AICORE void external_func_decl_is_skipped(
+// CHECK-NOT:  pipe_barrier(
+// CHECK-NOT:  dsb(
+// CHECK:      pto::comm::TNOTIFY(

From f0a49646793d59817a73211e18ee5f4a805c9384 Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Wed, 1 Jul 2026 09:20:38 +0800
Subject: [PATCH 8/9] Fix loop release summary in memory consistency

---
 lib/PTO/Transforms/PTOMemoryConsistency.cpp   | 58 ++++++++++++++++++-
 .../pto/memory_consistency_loop_release.pto   | 56 ++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 test/lit/pto/memory_consistency_loop_release.pto

diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
index e4eaaf4e00..ebd27fac53 100644
--- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp
+++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
@@ -231,6 +231,62 @@ static TNotifyReleaseState collectTNotifyReleaseState(Region &region) {
   return state;
 }
 
+static void applyFenceReleaseForSummary(pto::FenceReleaseOp fence,
+                                        TNotifyReleaseState &state) {
+  if (fence.getScope().getScope() != pto::FenceScope::DDR)
+    return;
+
+  // The real annotation pass inserts the pending GM-write pipe drain before a
+  // release fence.  Loop summaries must model that transfer without mutating
+  // IR; otherwise already-released loop-carried writes are reported again at
+  // the next iteration's TNotify.
+  state.drainMte3 = false;
+  state.drainFix = false;
+  state.applyFenceRelease(fence.getScope().getScope());
+}
+
+static TNotifyReleaseState getTNotifyReleaseExitStateForBlock(
+    Block &block, TNotifyReleaseState pendingState);
+
+static TNotifyReleaseState
+getTNotifyReleaseExitState(Operation *op,
+                           TNotifyReleaseState pendingState = {}) {
+  if (isa<pto::TNotifyOp>(op))
+    pendingState.clear();
+
+  pendingState.merge(getDirectTNotifyReleaseState(op));
+
+  TNotifyReleaseState regionEntryState = pendingState;
+  TNotifyReleaseState combinedRegionExitState;
+  for (Region &region : op->getRegions()) {
+    if (region.hasOneBlock()) {
+      combinedRegionExitState.merge(
+          getTNotifyReleaseExitStateForBlock(region.front(), regionEntryState));
+      continue;
+    }
+
+    TNotifyReleaseState regionExitState = regionEntryState;
+    regionExitState.merge(collectTNotifyReleaseState(region));
+    combinedRegionExitState.merge(regionExitState);
+  }
+  pendingState.merge(combinedRegionExitState);
+
+  if (auto barrier = dyn_cast<pto::BarrierOp>(op))
+    pendingState.applyBarrier(barrier.getPipe().getPipe());
+  if (auto cmo = dyn_cast<pto::CmoCleanOp>(op))
+    pendingState.applyCmoClean(cmo.getSpace().getAddressSpace());
+  if (auto fence = dyn_cast<pto::FenceReleaseOp>(op))
+    applyFenceReleaseForSummary(fence, pendingState);
+  return pendingState;
+}
+
+static TNotifyReleaseState getTNotifyReleaseExitStateForBlock(
+    Block &block, TNotifyReleaseState pendingState) {
+  for (Operation &op : block)
+    pendingState = getTNotifyReleaseExitState(&op, pendingState);
+  return pendingState;
+}
+
 static bool isLoopLikeOp(Operation *op) {
   return isa<scf::ForOp, scf::WhileOp, scf::ParallelOp, scf::ForallOp>(op);
 }
@@ -333,7 +389,7 @@ annotateTNotifyReleaseForBlock(Block &block,
     for (Region &region : op.getRegions()) {
       TNotifyReleaseState nestedLoopCarriedState = loopCarriedState;
       if (isLoopLikeOp(&op))
-        nestedLoopCarriedState.merge(collectTNotifyReleaseState(&op));
+        nestedLoopCarriedState.merge(getTNotifyReleaseExitState(&op));
 
       if (region.hasOneBlock()) {
         combinedRegionExitState.merge(annotateTNotifyReleaseForBlock(
diff --git a/test/lit/pto/memory_consistency_loop_release.pto b/test/lit/pto/memory_consistency_loop_release.pto
new file mode 100644
index 0000000000..fd11820c47
--- /dev/null
+++ b/test/lit/pto/memory_consistency_loop_release.pto
@@ -0,0 +1,56 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s
+
+// A loop-local release fence must clear the loop-carried GM-write pending state.
+// Otherwise the next iteration's TNotify is falsely diagnosed as missing an
+// explicit release fence even though each iteration already has one.
+
+module {
+  func.func @loop_tstore_release_tnotify(
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c32 = arith.constant 32 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %tile = pto.alloc_tile :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+
+    scf.for %i = %c0 to %c2 step %c1 {
+      pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+                 outs(%dst : !pto.partition_tensor_view<1x32xf32>)
+      pto.fence.release #pto.fence_scope<ddr>
+      pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+          {notifyOp = #pto<notify_op set>}
+    }
+    return
+  }
+}
+
+// CHECK-LABEL: AICORE void loop_tstore_release_tnotify(
+// CHECK:      TSTORE(
+// CHECK:      pipe_barrier(PIPE_MTE3);
+// CHECK-NEXT: dsb(DSB_DDR);
+// CHECK-NEXT: pto::comm::TNOTIFY(

From ec771dc312dc856c9140c768a3091de0777d3ff2 Mon Sep 17 00:00:00 2001
From: TaoTao-real <TaoTao-real@users.noreply.github.com>
Date: Wed, 1 Jul 2026 09:21:59 +0800
Subject: [PATCH 9/9] Reject non-inlined memory consistency calls

---
 .../ptoas-memory-consistency-design.md        |  6 ++
 lib/PTO/Transforms/PTOMemoryConsistency.cpp   | 82 ++++++++++++++++++-
 ...ory_consistency_noninline_call_invalid.pto | 59 +++++++++++++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 test/lit/pto/memory_consistency_noninline_call_invalid.pto

diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md
index ef2fd58e38..f7428a75f6 100644
--- a/docs/designs/ptoas-memory-consistency-design.md
+++ b/docs/designs/ptoas-memory-consistency-design.md
@@ -139,6 +139,12 @@ VPTO backend 都会先经过这一步。
 region 暂不做 path-sensitive 数据流，但只在当前 region 内收集 pending state，不把同一个
 parent op 的其他 sibling region 状态混入。外部函数声明没有函数体，pass 会直接跳过。
 
+`func.call` 边界不做上下文敏感的数据流传播。若 same-module 非内联 callee 的传递调用闭包
+中包含 payload 访问、CMO、fence 或 signal 相关 PTO op，pass 会报错并要求在
+`pto-memory-consistency` 前完成 inline。这样可以避免 caller 在 `TNotify` 前看不到 callee
+内部 pending payload write，或者 callee 内部 cacheable payload read 看不到 caller 侧
+`TWait` acquire state。
+
 这个 pass 不负责分配 event id，也不属于 InsertSync 自动同步流水线。
 
 ## 6. 场景规则
diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
index ebd27fac53..541d8412fc 100644
--- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp
+++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp
@@ -13,7 +13,9 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/SymbolTable.h"
 #include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseSet.h"
 
 namespace mlir {
 namespace pto {
@@ -291,6 +293,83 @@ static bool isLoopLikeOp(Operation *op) {
   return isa<scf::ForOp, scf::WhileOp, scf::ParallelOp, scf::ForallOp>(op);
 }
 
+static func::FuncOp lookupCallee(func::CallOp call) {
+  return SymbolTable::lookupNearestSymbolFrom<func::FuncOp>(
+      call.getOperation(), call.getCalleeAttr());
+}
+
+static bool isMemoryConsistencyRelevantDirectOp(Operation *op) {
+  if (isa<pto::BarrierOp, pto::CmoCleanOp, pto::CmoInvalidateOp,
+          pto::FenceReleaseOp, pto::FenceAcquireOp, pto::TNotifyOp,
+          pto::TWaitOp, pto::TTestOp, pto::TLoadOp, pto::TPrefetchOp,
+          pto::TStoreOp, pto::TStoreFPOp>(op))
+    return true;
+
+  if (auto load = dyn_cast<pto::LoadScalarOp>(op))
+    return isGmScalarMemory(load.getPtr().getType());
+  if (auto store = dyn_cast<pto::StoreScalarOp>(op))
+    return isGmScalarMemory(store.getPtr().getType());
+
+  TNotifyReleaseState macroState = getReleaseStateForMacroModel(op);
+  return macroState.drainMte2 || macroState.drainMte3 ||
+         macroState.drainFix || macroState.cleanGmCache ||
+         macroState.needsDsbDdr;
+}
+
+static bool calleeContainsMemoryConsistencyRelevantOps(
+    func::FuncOp callee, llvm::DenseSet<Operation *> &activeCallees) {
+  if (!callee || callee.isExternal())
+    return false;
+  if (!activeCallees.insert(callee.getOperation()).second)
+    return false;
+
+  WalkResult result = callee.walk([&](Operation *op) -> WalkResult {
+    if (op == callee.getOperation())
+      return WalkResult::advance();
+
+    if (auto nestedCall = dyn_cast<func::CallOp>(op)) {
+      func::FuncOp nestedCallee = lookupCallee(nestedCall);
+      if (calleeContainsMemoryConsistencyRelevantOps(nestedCallee,
+                                                     activeCallees))
+        return WalkResult::interrupt();
+      return WalkResult::advance();
+    }
+
+    if (isMemoryConsistencyRelevantDirectOp(op))
+      return WalkResult::interrupt();
+    return WalkResult::advance();
+  });
+
+  activeCallees.erase(callee.getOperation());
+  return result.wasInterrupted();
+}
+
+static bool diagnoseNonInlinedMemoryConsistencyCalls(ModuleOp module) {
+  bool hasFailure = false;
+  for (auto func : module.getOps<func::FuncOp>()) {
+    if (func.isExternal())
+      continue;
+
+    func.walk([&](func::CallOp call) {
+      func::FuncOp callee = lookupCallee(call);
+      if (!callee || callee.isExternal())
+        return;
+
+      llvm::DenseSet<Operation *> activeCallees;
+      if (!calleeContainsMemoryConsistencyRelevantOps(callee, activeCallees))
+        return;
+
+      call.emitOpError()
+          << "calls @" << callee.getSymName()
+          << ", which contains PTO memory consistency relevant operations; "
+             "inline the callee before `pto-memory-consistency` or keep "
+             "payload, CMO, fence, and signal operations in the caller";
+      hasFailure = true;
+    });
+  }
+  return hasFailure;
+}
+
 static void setTNotifyReleaseAttrs(pto::TNotifyOp op,
                                    const TNotifyReleaseState &state) {
   op->removeAttr(kTNotifyDrainMte2AttrName);
@@ -608,9 +687,10 @@ struct PTOMemoryConsistencyPass
           PTOMemoryConsistencyPass> {
   void runOnOperation() override {
     ModuleOp module = getOperation();
+    bool callFailed = diagnoseNonInlinedMemoryConsistencyCalls(module);
     bool releaseFailed = annotateTNotifyRelease(module);
     bool acquireFailed = annotateSignalAcquire(module);
-    if (releaseFailed || acquireFailed)
+    if (callFailed || releaseFailed || acquireFailed)
       signalPassFailure();
   }
 };
diff --git a/test/lit/pto/memory_consistency_noninline_call_invalid.pto b/test/lit/pto/memory_consistency_noninline_call_invalid.pto
new file mode 100644
index 0000000000..88f003023b
--- /dev/null
+++ b/test/lit/pto/memory_consistency_noninline_call_invalid.pto
@@ -0,0 +1,59 @@
+// Copyright (c) 2026 Huawei Technologies Co., Ltd.
+// This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+// CANN Open Software License Agreement Version 2.0 (the "License").
+// Please refer to the License for details. You may not use this file except in compliance with the License.
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+// See LICENSE in the root of the software repository for the full text of the License.
+
+// RUN: not ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s
+
+// The memory consistency pass does not look through non-inlined calls, so a
+// caller-side TNotify cannot account for release-relevant payload writes hidden
+// in a callee body.  Such callees must be inlined before the pass runs.
+
+module {
+  func.func private @producer(
+      %tile: !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+      %dst: !pto.partition_tensor_view<1x32xf32>) {
+    pto.tstore ins(%tile : !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
+               outs(%dst : !pto.partition_tensor_view<1x32xf32>)
+    return
+  }
+
+  func.func @call_hidden_payload_write(
+      %dst_ptr: !pto.ptr<f32>,
+      %signal_ptr: !pto.ptr<i32>)
+      attributes {pto.kernel_kind = #pto.kernel_kind<vector>} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c32 = arith.constant 32 : index
+    %v_i32 = arith.constant 1 : i32
+
+    %tile = pto.alloc_tile :
+      !pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>
+
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view<?x?xf32>
+    %dst = pto.partition_view %dst_view,
+      offsets = [%c0, %c0], sizes = [%c1, %c32]
+      : !pto.tensor_view<?x?xf32> -> !pto.partition_tensor_view<1x32xf32>
+    // The pto.tstore payload write is reachable only through this call.
+    call @producer(%tile, %dst) :
+      (!pto.tile_buf<loc=vec, dtype=f32, rows=1, cols=32, v_row=1, v_col=32, blayout=row_major, slayout=none_box, fractal=512, pad=0>,
+       !pto.partition_tensor_view<1x32xf32>) -> ()
+    pto.fence.release #pto.fence_scope<ddr>
+    // Signal setup and notify that follow the release fence in the caller.
+    %sig_view = pto.make_tensor_view %signal_ptr,
+      shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32>
+    %sig = pto.partition_view %sig_view,
+      offsets = [%c0], sizes = [%c1]
+      : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32>
+    pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32)
+        {notifyOp = #pto<notify_op set>}
+    return
+  }
+}
+
+// CHECK: calls @producer, which contains PTO memory consistency relevant operations
+// CHECK: inline the callee before `pto-memory-consistency`