From f96297fd258d747e304de757df6a395704650fa4 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Sat, 27 Jun 2026 14:52:56 +0800 Subject: [PATCH 1/9] Fix TPut release fence before TNotify --- include/PTO/IR/PTOAttrs.td | 15 + include/PTO/IR/PTOOps.td | 46 ++ .../PTO/Transforms/MemoryConsistencyAttrs.h | 35 ++ include/PTO/Transforms/Passes.h | 1 + include/PTO/Transforms/Passes.td | 19 + lib/PTO/Transforms/CMakeLists.txt | 1 + lib/PTO/Transforms/PTOMemoryConsistency.cpp | 493 ++++++++++++++++++ lib/PTO/Transforms/PTOToEmitC.cpp | 234 ++++----- test/lit/pto/issue711_tnotify_mte_drain.pto | 95 ++++ .../lit/pto/issue872_tput_tnotify_release.pto | 169 ++++++ test/lit/pto/memory_consistency_invalid.pto | 88 ++++ .../pto/signal_payload_cache_consistency.pto | 197 +++++++ tools/ptoas/ptoas.cpp | 1 + 13 files changed, 1259 insertions(+), 135 deletions(-) create mode 100644 include/PTO/Transforms/MemoryConsistencyAttrs.h create mode 100644 lib/PTO/Transforms/PTOMemoryConsistency.cpp create mode 100644 test/lit/pto/issue872_tput_tnotify_release.pto create mode 100644 test/lit/pto/memory_consistency_invalid.pto create mode 100644 test/lit/pto/signal_payload_cache_consistency.pto diff --git a/include/PTO/IR/PTOAttrs.td b/include/PTO/IR/PTOAttrs.td index 0675f71085..85ec3b6a0f 100644 --- a/include/PTO/IR/PTOAttrs.td +++ b/include/PTO/IR/PTOAttrs.td @@ -302,6 +302,21 @@ def PTO_MemBarAttr : PTO_Attr<"MemBar", "membar"> { }]; } +def PTO_FENCE_SCOPE_DDR : I32EnumAttrCase<"DDR", 0, "ddr">; + +def PTO_FenceScopeEnum : PTO_I32Enum< + "FenceScope", "PTO memory fence scope", [ + PTO_FENCE_SCOPE_DDR + ]>; + +def PTO_FenceScopeAttr : PTO_Attr<"FenceScope", "fence_scope"> { + let parameters = (ins EnumParameter:$scope); + let assemblyFormat = "`<` params `>`"; + let description = [{ + Memory visibility scope for PTO fence operations. 
+ }]; +} + //===----------------------------------------------------------------------===// // Sync Op Type (High Level Abstraction) //===----------------------------------------------------------------------===// diff --git a/include/PTO/IR/PTOOps.td b/include/PTO/IR/PTOOps.td index 9fb753b04e..d662bc0ddd 100644 --- a/include/PTO/IR/PTOOps.td +++ b/include/PTO/IR/PTOOps.td @@ -2676,6 +2676,52 @@ def BarrierOp : PTO_Op<"barrier"> { let assemblyFormat = "$pipe attr-dict"; } +def CmoCleanOp : PTO_Op<"cmo.clean"> { + let summary = "Clean cache lines for a memory space"; + let description = [{ + Cache maintenance operation that writes dirty cache lines back to the + specified memory space. The first version supports whole-cache GM clean and + lowers to `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT)`. + }]; + + let arguments = (ins PTO_AddressSpaceAttr:$space); + let assemblyFormat = "`all` $space attr-dict"; +} + +def CmoInvalidateOp : PTO_Op<"cmo.invalidate"> { + let summary = "Invalidate cache lines for a memory space"; + let description = [{ + Cache maintenance operation that invalidates cache lines for the specified + memory space. The first version supports whole-cache GM invalidate and + lowers to `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE)`. + }]; + + let arguments = (ins PTO_AddressSpaceAttr:$space); + let assemblyFormat = "`all` $space attr-dict"; +} + +def FenceReleaseOp : PTO_Op<"fence.release"> { + let summary = "Release memory fence"; + let description = [{ + Release fence for publishing payload writes before a following signal + operation. `scope = ddr` lowers to `dsb(DSB_DDR)`. + }]; + + let arguments = (ins PTO_FenceScopeAttr:$scope); + let assemblyFormat = "$scope attr-dict"; +} + +def FenceAcquireOp : PTO_Op<"fence.acquire"> { + let summary = "Acquire memory fence"; + let description = [{ + Acquire fence for ordering signal observation before following payload + reads. `scope = ddr` lowers to `dsb(DSB_DDR)`. 
+ }]; + + let arguments = (ins PTO_FenceScopeAttr:$scope); + let assemblyFormat = "$scope attr-dict"; +} + def TSyncOp : PTO_TOp<"tsync"> { let summary = "Direct TSYNC mapping (variadic operands)."; let description = [{ diff --git a/include/PTO/Transforms/MemoryConsistencyAttrs.h b/include/PTO/Transforms/MemoryConsistencyAttrs.h new file mode 100644 index 0000000000..74279c5614 --- /dev/null +++ b/include/PTO/Transforms/MemoryConsistencyAttrs.h @@ -0,0 +1,35 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. 
+ +#ifndef MLIR_DIALECT_PTO_TRANSFORMS_MEMORYCONSISTENCYATTRS_H +#define MLIR_DIALECT_PTO_TRANSFORMS_MEMORYCONSISTENCYATTRS_H + +#include "llvm/ADT/StringRef.h" + +namespace mlir { +namespace pto { + +inline constexpr llvm::StringLiteral kTNotifyDrainMte2AttrName = + "__pto.emitc.tnotify_drain_mte2"; +inline constexpr llvm::StringLiteral kTNotifyDrainMte3AttrName = + "__pto.emitc.tnotify_drain_mte3"; +inline constexpr llvm::StringLiteral kTNotifyDsbDdrAttrName = + "__pto.emitc.tnotify_dsb_ddr"; +inline constexpr llvm::StringLiteral kTNotifyCleanGmCacheAttrName = + "__pto.emitc.tnotify_clean_gm_cache"; +inline constexpr llvm::StringLiteral kAcquireCleanGmCacheAttrName = + "__pto.emitc.acquire_clean_gm_cache"; +inline constexpr llvm::StringLiteral kAcquireDsbDdrAttrName = + "__pto.emitc.acquire_dsb_ddr"; +inline constexpr llvm::StringLiteral kAcquireInvalidateGmCacheAttrName = + "__pto.emitc.acquire_invalidate_gm_cache"; + +} // namespace pto +} // namespace mlir + +#endif // MLIR_DIALECT_PTO_TRANSFORMS_MEMORYCONSISTENCYATTRS_H diff --git a/include/PTO/Transforms/Passes.h b/include/PTO/Transforms/Passes.h index 85970756c5..a9eec25035 100644 --- a/include/PTO/Transforms/Passes.h +++ b/include/PTO/Transforms/Passes.h @@ -75,6 +75,7 @@ std::unique_ptr createPTORemoveRedundantBarrierPass(); std::unique_ptr createPTOViewToMemrefPass(); std::unique_ptr createPTOValidateIntToPtrUsesPass(); std::unique_ptr createPTOMaterializeTileHandlesPass(); +std::unique_ptr createPTOMemoryConsistencyPass(); std::unique_ptr createInferPTOLayoutPass(); std::unique_ptr createPTOA5NormalizeTMovPass(); std::unique_ptr createPreFusionAnalysisPass(); diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td index bcc165674a..e895b06c20 100644 --- a/include/PTO/Transforms/Passes.td +++ b/include/PTO/Transforms/Passes.td @@ -667,6 +667,25 @@ def PTOMaterializeTileHandles : Pass<"pto-materialize-tile-handles", "ModuleOp"> ]; } +def PTOMemoryConsistency : 
Pass<"pto-memory-consistency", "ModuleOp"> { + let summary = "Annotate PTO memory consistency actions before backend lowering"; + let description = [{ + Analyzes signal/payload ordering requirements and annotates communication + signal ops and scalar GM consumers with release/acquire actions consumed by + backend lowering. It covers TNotify release actions for direct MTE + operations, macro-op MTE3 phases, cacheable scalar GM stores, and + conservative TWait/TTest acquire invalidation for scalar GM loads. + }]; + + let constructor = "mlir::pto::createPTOMemoryConsistencyPass()"; + + let dependentDialects = [ + "mlir::pto::PTODialect", + "mlir::func::FuncDialect", + "mlir::scf::SCFDialect" + ]; +} + def PTOUnrollSIMTFor : Pass<"pto-unroll-simt-for", "func::FuncOp"> { let summary = "Unroll small constant-trip-count scf.for loops in pto.simt_entry functions"; diff --git a/lib/PTO/Transforms/CMakeLists.txt b/lib/PTO/Transforms/CMakeLists.txt index a7059674df..1e3f8d74dc 100644 --- a/lib/PTO/Transforms/CMakeLists.txt +++ b/lib/PTO/Transforms/CMakeLists.txt @@ -58,6 +58,7 @@ add_mlir_dialect_library(PTOTransforms PTOA5NormalizeTMovPass.cpp PTOCanonicalizeIR.cpp PTOMaterializeTileHandles.cpp + PTOMemoryConsistency.cpp BufferizableOpInterfaceImpl.cpp ConvertToPTOOp.cpp PTOAssignDefaultFrontendPipeIdPass.cpp diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp new file mode 100644 index 0000000000..10854e4ea0 --- /dev/null +++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp @@ -0,0 +1,493 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +#include "PTO/IR/PTO.h" +#include "PTO/Transforms/InsertSync/SyncMacroModel.h" +#include "PTO/Transforms/MemoryConsistencyAttrs.h" +#include "PTO/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace pto { +#define GEN_PASS_DEF_PTOMEMORYCONSISTENCY +#include "PTO/Transforms/Passes.h.inc" +} // namespace pto +} // namespace mlir + +using namespace mlir; +using namespace mlir::pto; + +namespace { + +static bool isGmAddressSpace(pto::AddressSpace space) { + return space == pto::AddressSpace::GM || space == pto::AddressSpace::Zero; +} + +struct TNotifyReleaseState { + bool drainMte2 = false; + bool drainMte3 = false; + bool cleanGmCache = false; + bool needsDsbDdr = false; + + void merge(const TNotifyReleaseState &other) { + drainMte2 |= other.drainMte2; + drainMte3 |= other.drainMte3; + cleanGmCache |= other.cleanGmCache; + needsDsbDdr |= other.needsDsbDdr; + } + + void clear() { + drainMte2 = false; + drainMte3 = false; + cleanGmCache = false; + needsDsbDdr = false; + } + + void applyBarrier(pto::PIPE pipe) { + switch (pipe) { + case pto::PIPE::PIPE_MTE2: + drainMte2 = false; + break; + case pto::PIPE::PIPE_MTE3: + drainMte3 = false; + break; + case pto::PIPE::PIPE_ALL: + drainMte2 = false; + drainMte3 = false; + break; + default: + break; + } + } + + void applyCmoClean(pto::AddressSpace space) { + if (isGmAddressSpace(space)) + cleanGmCache = false; + } + + void applyFenceRelease(pto::FenceScope scope) { + if (scope != pto::FenceScope::DDR) + return; + if (drainMte3 || cleanGmCache) + return; + needsDsbDdr = false; + } +}; + +struct SignalAcquireState { + 
bool pendingInvalidateGmCache = false; + bool dirtyGmCache = false; + bool cleanNeedsFence = false; + + void merge(const SignalAcquireState &other) { + pendingInvalidateGmCache |= other.pendingInvalidateGmCache; + dirtyGmCache |= other.dirtyGmCache; + cleanNeedsFence |= other.cleanNeedsFence; + } + + void consumeAcquire() { + pendingInvalidateGmCache = false; + dirtyGmCache = false; + cleanNeedsFence = false; + } + + void applyCmoClean(pto::AddressSpace space) { + if (!isGmAddressSpace(space)) + return; + if (dirtyGmCache) + cleanNeedsFence = true; + dirtyGmCache = false; + } + + void applyFenceRelease(pto::FenceScope scope) { + if (scope == pto::FenceScope::DDR && !dirtyGmCache) + cleanNeedsFence = false; + } + + void applyCmoInvalidate(pto::AddressSpace space) { + if (!isGmAddressSpace(space) || dirtyGmCache || cleanNeedsFence) + return; + pendingInvalidateGmCache = false; + } +}; + +static bool isGmScalarMemory(Type type) { + if (auto ptrTy = dyn_cast(type)) { + pto::AddressSpace space = ptrTy.getMemorySpace().getAddressSpace(); + return isGmAddressSpace(space); + } + + if (auto memTy = dyn_cast(type)) { + auto spaceAttr = dyn_cast_or_null(memTy.getMemorySpace()); + return !spaceAttr || isGmAddressSpace(spaceAttr.getAddressSpace()); + } + + return false; +} + +static TNotifyReleaseState getReleaseStateForPipe(pto::PIPE pipe) { + TNotifyReleaseState state; + switch (pipe) { + case pto::PIPE::PIPE_MTE2: + state.drainMte2 = true; + break; + case pto::PIPE::PIPE_MTE3: + state.drainMte3 = true; + state.needsDsbDdr = true; + break; + case pto::PIPE::PIPE_ALL: + state.drainMte2 = true; + state.drainMte3 = true; + state.needsDsbDdr = true; + break; + default: + break; + } + return state; +} + +static TNotifyReleaseState getReleaseStateForMacroModel(Operation *op) { + TNotifyReleaseState state; + auto model = getSyncMacroModel(op); + if (!model) + return state; + + for (const SyncMacroPhase &phase : model->phases) { + // Macro MTE3 phases write GM payloads internally. 
A following TNotify must + // publish its signal only after those stores are drained and DDR-visible. + if (phase.pipe == PipelineType::PIPE_MTE3) { + state.drainMte3 = true; + state.needsDsbDdr = true; + } + } + return state; +} + +static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) { + if (isa(op)) + return {}; + + if (auto store = dyn_cast(op)) { + if (isGmScalarMemory(store.getPtr().getType())) { + TNotifyReleaseState state; + state.cleanGmCache = true; + state.needsDsbDdr = true; + return state; + } + } + + TNotifyReleaseState macroState = getReleaseStateForMacroModel(op); + if (macroState.drainMte3 || macroState.cleanGmCache || + macroState.needsDsbDdr) + return macroState; + + if (auto pipeOp = dyn_cast(op)) + return getReleaseStateForPipe(pipeOp.getPipe()); + return {}; +} + +static TNotifyReleaseState collectTNotifyReleaseState(Operation *op) { + TNotifyReleaseState state = getDirectTNotifyReleaseState(op); + for (Region &region : op->getRegions()) + for (Block &block : region) + for (Operation &nested : block) + state.merge(collectTNotifyReleaseState(&nested)); + return state; +} + +static bool isLoopLikeOp(Operation *op) { + return isa(op); +} + +static void setTNotifyReleaseAttrs(pto::TNotifyOp op, + const TNotifyReleaseState &state) { + op->removeAttr(kTNotifyDrainMte2AttrName); + op->removeAttr(kTNotifyDrainMte3AttrName); + op->removeAttr(kTNotifyDsbDdrAttrName); + op->removeAttr(kTNotifyCleanGmCacheAttrName); + if (state.drainMte2) + op->setAttr(kTNotifyDrainMte2AttrName, UnitAttr::get(op.getContext())); + if (state.drainMte3) + op->setAttr(kTNotifyDrainMte3AttrName, UnitAttr::get(op.getContext())); +} + +static void setTNotifyPipeDrainAttrs(pto::TNotifyOp op, + const TNotifyReleaseState &state) { + TNotifyReleaseState emitState; + emitState.drainMte2 = state.drainMte2; + setTNotifyReleaseAttrs(op, emitState); +} + +static void diagnoseTNotifyRelease(pto::TNotifyOp op, + const TNotifyReleaseState &state, + bool &hasFailure) { + if 
(state.cleanGmCache) { + op.emitOpError() + << "requires explicit `pto.cmo.clean all #pto.address_space` " + "before publishing a signal after cacheable GM stores"; + hasFailure = true; + return; + } + if (state.drainMte3) { + op.emitOpError() + << "requires an explicit `pto.barrier ` before " + "`pto.fence.release #pto.fence_scope` when publishing a " + "signal after MTE3 GM writes"; + hasFailure = true; + return; + } + if (state.needsDsbDdr) { + op.emitOpError() + << "requires explicit `pto.fence.release #pto.fence_scope` " + "before publishing a signal after GM writes or cache clean"; + hasFailure = true; + } +} + +static void markNestedTNotifyWithState(Operation *op, + const TNotifyReleaseState &state, + bool &hasFailure) { + op->walk([&](pto::TNotifyOp notify) { + diagnoseTNotifyRelease(notify, state, hasFailure); + setTNotifyPipeDrainAttrs(notify, state); + }); +} + +static TNotifyReleaseState +annotateTNotifyReleaseForBlock(Block &block, + TNotifyReleaseState entryPendingState, + TNotifyReleaseState loopCarriedState, + bool &hasFailure) { + TNotifyReleaseState pendingState = entryPendingState; + for (Operation &op : block) { + if (auto notify = dyn_cast(op)) { + TNotifyReleaseState notifyState = pendingState; + notifyState.merge(loopCarriedState); + diagnoseTNotifyRelease(notify, notifyState, hasFailure); + setTNotifyPipeDrainAttrs(notify, notifyState); + pendingState.clear(); + } + + pendingState.merge(getDirectTNotifyReleaseState(&op)); + + TNotifyReleaseState regionEntryState = pendingState; + TNotifyReleaseState combinedRegionExitState; + for (Region &region : op.getRegions()) { + TNotifyReleaseState nestedLoopCarriedState = loopCarriedState; + if (isLoopLikeOp(&op)) + nestedLoopCarriedState.merge(collectTNotifyReleaseState(&op)); + + if (region.hasOneBlock()) { + combinedRegionExitState.merge(annotateTNotifyReleaseForBlock( + region.front(), regionEntryState, nestedLoopCarriedState, + hasFailure)); + } else { + TNotifyReleaseState regionState = 
collectTNotifyReleaseState(&op); + TNotifyReleaseState nestedNotifyState = regionEntryState; + nestedNotifyState.merge(nestedLoopCarriedState); + nestedNotifyState.merge(regionState); + markNestedTNotifyWithState(&op, nestedNotifyState, hasFailure); + + TNotifyReleaseState regionExitState = regionEntryState; + regionExitState.merge(regionState); + combinedRegionExitState.merge(regionExitState); + } + } + pendingState.merge(combinedRegionExitState); + + if (auto barrier = dyn_cast(op)) + pendingState.applyBarrier(barrier.getPipe().getPipe()); + if (auto cmo = dyn_cast(op)) + pendingState.applyCmoClean(cmo.getSpace().getAddressSpace()); + if (auto fence = dyn_cast(op)) + pendingState.applyFenceRelease(fence.getScope().getScope()); + } + return pendingState; +} + +static bool annotateTNotifyRelease(ModuleOp module) { + bool hasFailure = false; + for (auto func : module.getOps()) { + if (func.getBody().hasOneBlock()) { + (void)annotateTNotifyReleaseForBlock(func.getBody().front(), + TNotifyReleaseState{}, + TNotifyReleaseState{}, + hasFailure); + continue; + } + + // Be conservative for pre-existing CFG: without a path-sensitive CFG data + // flow here, every TNotify may observe any release-relevant work in the + // function. 
+ TNotifyReleaseState funcState = + collectTNotifyReleaseState(func.getOperation()); + markNestedTNotifyWithState(func.getOperation(), funcState, hasFailure); + } + return hasFailure; +} + +static void clearAcquireAttrs(pto::LoadScalarOp op) { + op->removeAttr(kAcquireCleanGmCacheAttrName); + op->removeAttr(kAcquireDsbDdrAttrName); + op->removeAttr(kAcquireInvalidateGmCacheAttrName); +} + +static void diagnoseAcquireLoad(pto::LoadScalarOp op, + const SignalAcquireState &state, + bool &hasFailure) { + if (!state.pendingInvalidateGmCache || + !isGmScalarMemory(op.getPtr().getType())) + return; + if (state.dirtyGmCache) { + op.emitOpError() + << "requires explicit `pto.cmo.clean all #pto.address_space`, " + "`pto.fence.release #pto.fence_scope`, and " + "`pto.cmo.invalidate all #pto.address_space` before a " + "cacheable GM load after signal acquire when dirty GM cache may " + "exist"; + hasFailure = true; + return; + } + if (state.cleanNeedsFence) { + op.emitOpError() + << "requires explicit `pto.fence.release #pto.fence_scope` " + "after GM cache clean and before acquire invalidate"; + hasFailure = true; + return; + } + op.emitOpError() + << "requires explicit `pto.cmo.invalidate all #pto.address_space` " + "before a cacheable GM load after `pto.comm.twait` or successful " + "`pto.comm.ttest`"; + hasFailure = true; +} + +static void consumeAcquireAfterDiagnostic(SignalAcquireState &state) { + if (state.pendingInvalidateGmCache) + state.consumeAcquire(); +} + +static SignalAcquireState collectSignalAcquireState(Operation *op) { + SignalAcquireState state; + if (isa(op)) + state.pendingInvalidateGmCache = true; + if (auto store = dyn_cast(op); + store && isGmScalarMemory(store.getPtr().getType())) + state.dirtyGmCache = true; + if (auto notify = dyn_cast(op); + notify && notify->hasAttr(kTNotifyCleanGmCacheAttrName)) + state.dirtyGmCache = false; + if (auto cmo = dyn_cast(op)) + state.applyCmoClean(cmo.getSpace().getAddressSpace()); + if (auto fence = dyn_cast(op)) + 
state.applyFenceRelease(fence.getScope().getScope()); + if (auto cmo = dyn_cast(op)) + state.applyCmoInvalidate(cmo.getSpace().getAddressSpace()); + + for (Region &region : op->getRegions()) + for (Block &block : region) + for (Operation &nested : block) + state.merge(collectSignalAcquireState(&nested)); + return state; +} + +static void markNestedAcquireLoadsWithState(Operation *op, + SignalAcquireState state, + bool &hasFailure) { + op->walk([&](pto::LoadScalarOp load) { + clearAcquireAttrs(load); + diagnoseAcquireLoad(load, state, hasFailure); + consumeAcquireAfterDiagnostic(state); + }); +} + +static SignalAcquireState +annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState, + bool &hasFailure) { + SignalAcquireState state = entryState; + for (Operation &op : block) { + if (auto load = dyn_cast(op)) { + clearAcquireAttrs(load); + diagnoseAcquireLoad(load, state, hasFailure); + consumeAcquireAfterDiagnostic(state); + } + + if (auto store = dyn_cast(op); + store && isGmScalarMemory(store.getPtr().getType())) + state.dirtyGmCache = true; + + if (isa(op)) + state.pendingInvalidateGmCache = true; + + if (auto notify = dyn_cast(op); + notify && notify->hasAttr(kTNotifyCleanGmCacheAttrName)) + state.dirtyGmCache = false; + if (auto cmo = dyn_cast(op)) + state.applyCmoClean(cmo.getSpace().getAddressSpace()); + if (auto fence = dyn_cast(op)) + state.applyFenceRelease(fence.getScope().getScope()); + if (auto cmo = dyn_cast(op)) + state.applyCmoInvalidate(cmo.getSpace().getAddressSpace()); + + SignalAcquireState combinedRegionExitState; + for (Region &region : op.getRegions()) { + if (region.hasOneBlock()) { + combinedRegionExitState.merge( + annotateSignalAcquireForBlock(region.front(), state, hasFailure)); + } else { + markNestedAcquireLoadsWithState(&op, state, hasFailure); + SignalAcquireState regionState = collectSignalAcquireState(&op); + SignalAcquireState regionExitState = state; + regionExitState.merge(regionState); + 
combinedRegionExitState.merge(regionExitState); + } + } + + if (isLoopLikeOp(&op)) + combinedRegionExitState.merge(state); + state.merge(combinedRegionExitState); + } + return state; +} + +static bool annotateSignalAcquire(ModuleOp module) { + bool hasFailure = false; + for (auto func : module.getOps()) { + if (func.getBody().hasOneBlock()) { + (void)annotateSignalAcquireForBlock(func.getBody().front(), + SignalAcquireState{}, hasFailure); + continue; + } + + SignalAcquireState funcState = + collectSignalAcquireState(func.getOperation()); + markNestedAcquireLoadsWithState(func.getOperation(), funcState, hasFailure); + } + return hasFailure; +} + +struct PTOMemoryConsistencyPass + : public mlir::pto::impl::PTOMemoryConsistencyBase< + PTOMemoryConsistencyPass> { + void runOnOperation() override { + ModuleOp module = getOperation(); + bool releaseFailed = annotateTNotifyRelease(module); + bool acquireFailed = annotateSignalAcquire(module); + if (releaseFailed || acquireFailed) + signalPassFailure(); + } +}; + +} // namespace + +std::unique_ptr mlir::pto::createPTOMemoryConsistencyPass() { + return std::make_unique(); +} diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index 963b01c89c..0b23317ee4 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -18,6 +18,7 @@ #include "PTO/IR/PTO.h" #include "PTO/IR/PTOTypeUtils.h" #include "PTO/IR/PTOSyncUtils.h" +#include "PTO/Transforms/MemoryConsistencyAttrs.h" #include "PTO/Transforms/Passes.h" #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" @@ -131,15 +132,6 @@ static constexpr llvm::StringLiteral kForceDynamicValidShapeAttrName = "__pto.force_dynamic_valid_shape"; static constexpr llvm::StringLiteral kGlobalTensorStridesAttrName = "__pto.globaltensor_strides"; -static constexpr llvm::StringLiteral kTNotifyDrainMte2AttrName = - "__pto.emitc.tnotify_drain_mte2"; -static constexpr llvm::StringLiteral kTNotifyDrainMte3AttrName = - 
"__pto.emitc.tnotify_drain_mte3"; - -enum TNotifyMteDrainMask : unsigned { - kDrainMte2 = 1U << 0, - kDrainMte3 = 1U << 1, -}; static constexpr llvm::StringLiteral kLastUseAttrName = "pto.last_use"; static constexpr llvm::StringLiteral kLastUseMarkerPrefix = "PTOAS__LAST_USE__"; @@ -272,104 +264,6 @@ static Value peelUnrealized(Value v) { return v; } -static unsigned getMteDrainMaskForPipe(pto::PIPE pipe) { - switch (pipe) { - case pto::PIPE::PIPE_MTE2: - return kDrainMte2; - case pto::PIPE::PIPE_MTE3: - return kDrainMte3; - case pto::PIPE::PIPE_ALL: - return kDrainMte2 | kDrainMte3; - default: - return 0; - } -} - -static unsigned getDirectMteDrainMask(Operation *op) { - if (auto pipeOp = dyn_cast(op)) - return getMteDrainMaskForPipe(pipeOp.getPipe()); - return 0; -} - -static unsigned collectMteDrainMask(Operation *op) { - unsigned mask = getDirectMteDrainMask(op); - for (Region &region : op->getRegions()) - for (Block &block : region) - for (Operation &nested : block) - mask |= collectMteDrainMask(&nested); - return mask; -} - -static bool isLoopLikeOp(Operation *op) { - return isa(op); -} - -static void setTNotifyDrainAttrs(pto::TNotifyOp op, unsigned mask) { - op->removeAttr(kTNotifyDrainMte2AttrName); - op->removeAttr(kTNotifyDrainMte3AttrName); - if (mask & kDrainMte2) - op->setAttr(kTNotifyDrainMte2AttrName, UnitAttr::get(op.getContext())); - if (mask & kDrainMte3) - op->setAttr(kTNotifyDrainMte3AttrName, UnitAttr::get(op.getContext())); -} - -static void markNestedTNotifyWithMask(Operation *op, unsigned mask) { - op->walk([&](pto::TNotifyOp notify) { setTNotifyDrainAttrs(notify, mask); }); -} - -static unsigned annotateTNotifyMteDrainForBlock(Block &block, - unsigned entryPendingMask, - unsigned loopCarriedMask) { - unsigned pendingMask = entryPendingMask; - for (Operation &op : block) { - if (auto notify = dyn_cast(op)) { - setTNotifyDrainAttrs(notify, pendingMask | loopCarriedMask); - pendingMask = 0; - } - - pendingMask |= getDirectMteDrainMask(&op); - - 
unsigned regionEntryMask = pendingMask; - unsigned combinedRegionExitMask = 0; - for (Region &region : op.getRegions()) { - unsigned nestedLoopCarriedMask = loopCarriedMask; - if (isLoopLikeOp(&op)) - nestedLoopCarriedMask |= collectMteDrainMask(&op); - - if (region.hasOneBlock()) { - combinedRegionExitMask |= annotateTNotifyMteDrainForBlock( - region.front(), regionEntryMask, nestedLoopCarriedMask); - } else { - unsigned regionMask = collectMteDrainMask(&op); - markNestedTNotifyWithMask(&op, regionEntryMask | nestedLoopCarriedMask | - regionMask); - combinedRegionExitMask |= regionEntryMask | regionMask; - } - } - pendingMask |= combinedRegionExitMask; - - if (auto barrier = dyn_cast(op)) - pendingMask &= ~getMteDrainMaskForPipe(barrier.getPipe().getPipe()); - } - return pendingMask; -} - -static void annotateTNotifyMteDrain(ModuleOp module) { - for (auto func : module.getOps()) { - if (func.getBody().hasOneBlock()) { - (void)annotateTNotifyMteDrainForBlock(func.getBody().front(), - /*entryPendingMask=*/0, - /*loopCarriedMask=*/0); - continue; - } - - // Be conservative for pre-existing CFG: without a path-sensitive CFG data - // flow here, every TNotify may observe any MTE work in the function. - unsigned funcMask = collectMteDrainMask(func.getOperation()); - markNestedTNotifyWithMask(func.getOperation(), funcMask); - } -} - static Value buildGlobalTensorFromMemref(ConversionPatternRewriter &rewriter, Location loc, Value basePtr, MemRefType mrTy, Operation *anchor, @@ -5405,6 +5299,13 @@ static std::string getAutoSyncTailModeToken(Operation *op) { //===----------------------------------------------------------------------===// // pto.barrier lowering -> pipe_barrier(...) 
//===----------------------------------------------------------------------===// +static void emitDsbDdr(ConversionPatternRewriter &rewriter, Location loc) { + auto *ctx = rewriter.getContext(); + auto args = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "DSB_DDR")}); + rewriter.create(loc, TypeRange{}, "dsb", args, + ArrayAttr{}, ValueRange{}); +} + struct PTOBarrierToEmitC : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -5446,6 +5347,22 @@ struct PTOBarrierToEmitC : public OpConversionPattern { } }; +template +struct PTOFenceToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(FenceOp op, typename FenceOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + if (op.getScope().getScope() != pto::FenceScope::DDR) + return rewriter.notifyMatchFailure(op, "unsupported fence scope"); + + emitDsbDdr(rewriter, op.getLoc()); + rewriter.eraseOp(op); + return success(); + } +}; + //===----------------------------------------------------------------------===// // Sync lowering (robust for bracket form pto.set_flag[...] / pto.wait_flag[...]) // Replace your PTOSyncToRuntimeCall with the code below. 
@@ -6589,6 +6506,62 @@ struct PTOTAssignToEmitC : public OpConversionPattern { // pto.load_scalar / pto.store_scalar lowering -> ptr[offset] //===----------------------------------------------------------------------===// +static void emitCleanGmCache(ConversionPatternRewriter &rewriter, + Location loc) { + auto *ctx = rewriter.getContext(); + auto args = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, "(__gm__ void*)0"), + emitc::OpaqueAttr::get(ctx, "ENTIRE_DATA_CACHE"), + emitc::OpaqueAttr::get(ctx, "CACHELINE_OUT"), + }); + rewriter.create(loc, TypeRange{}, "dcci", args, + ArrayAttr{}, ValueRange{}); +} + +static void emitInvalidateGmCache(ConversionPatternRewriter &rewriter, + Location loc) { + auto *ctx = rewriter.getContext(); + auto args = rewriter.getArrayAttr({ + emitc::OpaqueAttr::get(ctx, "(__gm__ void*)0"), + emitc::OpaqueAttr::get(ctx, "ENTIRE_DATA_CACHE"), + }); + rewriter.create(loc, TypeRange{}, "dcci", args, + ArrayAttr{}, ValueRange{}); +} + +static bool isGmCmoSpace(pto::AddressSpace space) { + return space == pto::AddressSpace::GM || space == pto::AddressSpace::Zero; +} + +struct PTOCmoCleanToEmitC : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::CmoCleanOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + if (!isGmCmoSpace(op.getSpace().getAddressSpace())) + return rewriter.notifyMatchFailure(op, "unsupported CMO clean space"); + emitCleanGmCache(rewriter, op.getLoc()); + rewriter.eraseOp(op); + return success(); + } +}; + +struct PTOCmoInvalidateToEmitC + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite(pto::CmoInvalidateOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + if (!isGmCmoSpace(op.getSpace().getAddressSpace())) + return rewriter.notifyMatchFailure(op, "unsupported CMO invalidate space"); + 
emitInvalidateGmCache(rewriter, op.getLoc()); + rewriter.eraseOp(op); + return success(); + } +}; + static Type getPointerLikeElementType(Type type) { if (auto ptrTy = dyn_cast(type)) return ptrTy.getElementType(); @@ -7070,28 +7043,20 @@ static void emitPipeBarrier(ConversionPatternRewriter &rewriter, Location loc, ArrayAttr{}, ValueRange{}); } -static void emitDsbDdr(ConversionPatternRewriter &rewriter, Location loc) { - auto *ctx = rewriter.getContext(); - auto args = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "DSB_DDR")}); - rewriter.create(loc, TypeRange{}, "dsb", args, - ArrayAttr{}, ValueRange{}); -} - // Issue #711: TNOTIFY writes its signal on the scalar pipe, and // TNOTIFY_IMPL's trailing pipe_barrier(PIPE_ALL) runs *after* that store. -// If prior pto.tload / pto.tstore work is still in flight on an MTE pipe when -// the signal lands, the receiver's matching TWAIT can return before the data -// is visible. Emit only the MTE pipe drains that the pre-lowering analysis -// proved may be needed before this TNotify. Issue #744: prior MTE3 stores also -// need a DDR-domain release fence before publishing the notification signal. -static void emitTNotifyMteDrain(ConversionPatternRewriter &rewriter, - Location loc, unsigned mask) { - if (mask & kDrainMte2) +// If prior MTE work is still in flight when the signal lands, the receiver's +// matching TWAIT can return before the producer-side payload operation is +// complete. MemoryConsistency now validates explicit CMO/fence operations for +// DDR visibility; lowering only keeps the pipe-drain actions that the pass may +// still annotate automatically. 
+static void emitTNotifyReleaseActions(ConversionPatternRewriter &rewriter, + Location loc, bool drainMte2, + bool drainMte3) { + if (drainMte2) emitPipeBarrier(rewriter, loc, "PIPE_MTE2"); - if (mask & kDrainMte3) { + if (drainMte3) emitPipeBarrier(rewriter, loc, "PIPE_MTE3"); - emitDsbDdr(rewriter, loc); - } } static std::string waitCmpTok(pto::WaitCmp cmp) { @@ -7348,14 +7313,11 @@ struct PTOSignalCommToEmitC : public OpConversionPattern { rewriter, op.getLoc(), notifyTy, notifyOpTok(op.getNotifyOp())); SmallVector operands{*signalGT, peelUnrealized(adaptor.getValue()), notifyOp}; - // See emitTNotifyMteDrain comment: drain in-flight MTE work before the + // See emitTNotifyReleaseActions comment: drain in-flight MTE work before the // scalar-pipe signal store so the notify/wait handshake is honored. - unsigned drainMask = 0; - if (op->hasAttr(kTNotifyDrainMte2AttrName)) - drainMask |= kDrainMte2; - if (op->hasAttr(kTNotifyDrainMte3AttrName)) - drainMask |= kDrainMte3; - emitTNotifyMteDrain(rewriter, op.getLoc(), drainMask); + bool drainMte2 = op->hasAttr(kTNotifyDrainMte2AttrName); + bool drainMte3 = op->hasAttr(kTNotifyDrainMte3AttrName); + emitTNotifyReleaseActions(rewriter, op.getLoc(), drainMte2, drainMte3); rewriter.create(op.getLoc(), TypeRange{}, callee, ArrayAttr{}, ArrayAttr{}, operands); rewriter.eraseOp(op); @@ -13746,7 +13708,11 @@ static void populatePTOToEmitCPatterns(RewritePatternSet &patterns, PTOTGemvMXToTGEMV_MX, PTOTGemvMXAccToTGEMV_MX, PTOTGemvMXBiasToTGEMV_MX, - PTOBarrierToEmitC + PTOBarrierToEmitC, + PTOFenceToEmitC, + PTOFenceToEmitC, + PTOCmoCleanToEmitC, + PTOCmoInvalidateToEmitC >(typeConverter, ctx); patterns.add(typeConverter, ctx); @@ -14007,8 +13973,6 @@ static AICORE inline void ptoas_auto_sync_tail( } } - annotateTNotifyMteDrain(mop); - // 3. 
配置转换目标 ConversionTarget target(*ctx); diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto index 1985744777..4940f88ea5 100644 --- a/test/lit/pto/issue711_tnotify_mte_drain.pto +++ b/test/lit/pto/issue711_tnotify_mte_drain.pto @@ -49,6 +49,9 @@ module { outs(%tile : !pto.tile_buf) pto.tstore ins(%tile : !pto.tile_buf) outs(%dst : !pto.partition_tensor_view<1x32xf32>) + pto.barrier + pto.barrier + pto.fence.release #pto.fence_scope %sig_view = pto.make_tensor_view %signal_ptr, shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> @@ -127,6 +130,86 @@ module { return } + // A user/pass-provided MTE3 barrier drains the pending store; the explicit + // DDR release fence completes the publish sequence. + func.func @tnotify_after_existing_mte3_barrier_and_release( + %src_ptr: !pto.ptr, + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %src = pto.partition_view %src_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + pto.tload ins(%src : !pto.partition_tensor_view<1x32xf32>) + outs(%tile : !pto.tile_buf) + pto.barrier + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<1x32xf32>) + pto.barrier + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig 
= pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + // If the user already writes the complete release sequence, the pass should + // not emit a second DDR fence before TNotify. + func.func @tnotify_after_existing_mte3_barrier_and_fence( + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<1x32xf32>) + pto.barrier + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + // tnotify without prior MTE-side work does not need a release drain. 
func.func @tnotify_no_mte_drain( %signal_ptr: !pto.ptr) @@ -188,6 +271,18 @@ module { // CHECK-NOT: pipe_barrier( // CHECK: pto::comm::TNOTIFY( +// CHECK-LABEL: AICORE void tnotify_after_existing_mte3_barrier_and_release( +// CHECK: TSTORE( +// CHECK: pipe_barrier(PIPE_MTE3); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void tnotify_after_existing_mte3_barrier_and_fence( +// CHECK: TSTORE( +// CHECK: pipe_barrier(PIPE_MTE3); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( + // CHECK-LABEL: AICORE void tnotify_no_mte_drain( // CHECK-NOT: pipe_barrier( // CHECK: pto::comm::TNOTIFY( diff --git a/test/lit/pto/issue872_tput_tnotify_release.pto b/test/lit/pto/issue872_tput_tnotify_release.pto new file mode 100644 index 0000000000..ecc62b3ecc --- /dev/null +++ b/test/lit/pto/issue872_tput_tnotify_release.pto @@ -0,0 +1,169 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Regression for issue #872: pto.comm.tput is a macro that can issue MTE3 GM +// stores internally. A following TNotify publishes a cross-rank signal, so those +// TPUT payload stores must be drained and made DDR-visible before the signal. 
+ +// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +module { + func.func @tput_tnotify_release( + %src_ptr: !pto.ptr, + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %v_i32 = arith.constant 1 : i32 + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view + %src = pto.partition_view %src_view, + offsets = [%c0, %c0], sizes = [%c8, %c64] + : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c8, %c64] + : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + + %stage = pto.alloc_tile valid_row = %c2 valid_col = %c32 : + !pto.tile_buf + pto.comm.tput(%dst, %src, buf(%stage) : + !pto.partition_tensor_view<8x64xf32>, + !pto.partition_tensor_view<8x64xf32>, + !pto.tile_buf) + {atomicType = #pto} + pto.barrier + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + func.func @tput_existing_pipe_all_still_dsb( + %src_ptr: !pto.ptr, + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %v_i32 = 
arith.constant 1 : i32 + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view + %src = pto.partition_view %src_view, + offsets = [%c0, %c0], sizes = [%c8, %c64] + : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c8, %c64], strides = [%c64, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c8, %c64] + : !pto.tensor_view -> !pto.partition_tensor_view<8x64xf32> + + %stage = pto.alloc_tile valid_row = %c2 valid_col = %c32 : + !pto.tile_buf + pto.comm.tput(%dst, %src, buf(%stage) : + !pto.partition_tensor_view<8x64xf32>, + !pto.partition_tensor_view<8x64xf32>, + !pto.tile_buf) + {atomicType = #pto} + pto.barrier + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + func.func @tbroadcast_tnotify_release( + %src_ptr: !pto.ptr, + %peer_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %ping = pto.alloc_tile : + !pto.tile_buf + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %src = pto.partition_view %src_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + %peer_view = pto.make_tensor_view %peer_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %peer = pto.partition_view %peer_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> 
!pto.partition_tensor_view<1x32xf32> + + pto.comm.tbroadcast(%src, recv(%ping), group(%peer) : + !pto.partition_tensor_view<1x32xf32>, + !pto.tile_buf, + !pto.partition_tensor_view<1x32xf32>) {root = 0 : i32} + pto.barrier + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } +} + +// CHECK-LABEL: AICORE void tput_tnotify_release( +// CHECK: pto::comm::TPUT( +// CHECK-NOT: pipe_barrier(PIPE_MTE2); +// CHECK: pipe_barrier(PIPE_MTE3); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void tput_existing_pipe_all_still_dsb( +// CHECK: pto::comm::TPUT( +// CHECK-NEXT: pipe_barrier(PIPE_ALL); +// CHECK-NOT: pipe_barrier(PIPE_MTE2); +// CHECK-NOT: pipe_barrier(PIPE_MTE3); +// CHECK: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void tbroadcast_tnotify_release( +// CHECK: pto::comm::TBROADCAST( +// CHECK-NOT: pipe_barrier(PIPE_MTE2); +// CHECK: pipe_barrier(PIPE_MTE3); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( diff --git a/test/lit/pto/memory_consistency_invalid.pto b/test/lit/pto/memory_consistency_invalid.pto new file mode 100644 index 0000000000..f84652f82d --- /dev/null +++ b/test/lit/pto/memory_consistency_invalid.pto @@ -0,0 +1,88 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. 
+// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// RUN: not ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +module { + func.func @missing_mte3_release( + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<1x32xf32>) + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + func.func @missing_scalar_clean( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 7 : i32 + + pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr, i32 + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + 
pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + func.func @missing_acquire_invalidate( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {cmp = #pto} + + %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr -> i32 + pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr, i32 + return + } +} + +// CHECK: requires an explicit `pto.barrier ` +// CHECK: requires explicit `pto.cmo.clean all #pto.address_space` +// CHECK: requires explicit `pto.cmo.invalidate all #pto.address_space` diff --git a/test/lit/pto/signal_payload_cache_consistency.pto b/test/lit/pto/signal_payload_cache_consistency.pto new file mode 100644 index 0000000000..9056e739f3 --- /dev/null +++ b/test/lit/pto/signal_payload_cache_consistency.pto @@ -0,0 +1,197 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Signal/payload memory-consistency regressions for cacheable scalar GM paths. 
+// These are correctness checks, not precise range-clean performance tests. + +// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +module { + func.func @scalar_store_tnotify_release( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 7 : i32 + + pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr, i32 + pto.cmo.clean all #pto.address_space + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + func.func @scalar_store_clean_and_fence_suppress_release( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 7 : i32 + + pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr, i32 + pto.cmo.clean all #pto.address_space + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + func.func @twait_load_scalar_acquire( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + 
%sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {cmp = #pto} + + pto.cmo.invalidate all #pto.address_space + %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr -> i32 + pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr, i32 + return + } + + func.func @twait_user_invalidate_suppresses_acquire( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {cmp = #pto} + + pto.cmo.invalidate all #pto.address_space + %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr -> i32 + pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr, i32 + return + } + + func.func @explicit_fence_acquire( + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + pto.fence.acquire #pto.fence_scope + return + } + + func.func @ttest_load_scalar_conservative_acquire( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + %ready = pto.comm.ttest(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {cmp = #pto} -> i1 + + pto.cmo.invalidate all #pto.address_space + %val = 
pto.load_scalar %payload_ptr[%c0] : !pto.ptr -> i32 + scf.if %ready { + pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr, i32 + } + return + } + + func.func @dirty_store_before_acquire_is_cleaned( + %payload_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 3 : i32 + + pto.store_scalar %v_i32, %payload_ptr[%c0] : !pto.ptr, i32 + pto.cmo.clean all #pto.address_space + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {cmp = #pto} + + pto.cmo.invalidate all #pto.address_space + %val = pto.load_scalar %payload_ptr[%c0] : !pto.ptr -> i32 + pto.store_scalar %val, %payload_ptr[%c0] : !pto.ptr, i32 + return + } +} + +// CHECK-LABEL: AICORE void scalar_store_tnotify_release( +// CHECK: {{.*}}[{{.*}}] = +// CHECK-NEXT: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void scalar_store_clean_and_fence_suppress_release( +// CHECK: {{.*}}[{{.*}}] = +// CHECK-NEXT: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void twait_load_scalar_acquire( +// CHECK: pto::comm::TWAIT( +// CHECK: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE); +// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}]; + +// CHECK-LABEL: AICORE void twait_user_invalidate_suppresses_acquire( +// CHECK: pto::comm::TWAIT( +// CHECK-NEXT: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE); +// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}]; + +// CHECK-LABEL: AICORE void explicit_fence_acquire( +// CHECK: dsb(DSB_DDR); + +// 
CHECK-LABEL: AICORE void ttest_load_scalar_conservative_acquire( +// CHECK: pto::comm::TTEST( +// CHECK: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE); +// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}]; + +// CHECK-LABEL: AICORE void dirty_store_before_acquire_is_cleaned( +// CHECK: {{.*}}[{{.*}}] = +// CHECK: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK: pto::comm::TWAIT( +// CHECK: dcci((__gm__ void*)0, ENTIRE_DATA_CACHE); +// CHECK-NEXT: {{.*}} = {{.*}}[{{.*}}]; diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp index 6443f1ca72..3b6f0f08d2 100644 --- a/tools/ptoas/ptoas.cpp +++ b/tools/ptoas/ptoas.cpp @@ -1915,6 +1915,7 @@ int mlir::pto::compilePTOASModule( pm.addPass(pto::createPTOInlineBackendHelpersPass()); pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); + pm.addPass(pto::createPTOMemoryConsistencyPass()); if (failed(applyConfiguredPassManagerCLOptions(pm, "main PTOAS pipeline"))) return 1; From dc31dc39ddc4aca83cfdedd14bfb703fce14e48b Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Tue, 30 Jun 2026 16:01:20 +0800 Subject: [PATCH 2/9] Guard unsupported VPTO memory consistency lowering --- lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp | 31 +++++++++++++++++++ lib/PTO/Transforms/VPTOLLVMEmitter.cpp | 31 +++++++++++++++++++ .../memory_consistency_cmo_unsupported.pto | 21 +++++++++++++ .../memory_consistency_fence_unsupported.pto | 21 +++++++++++++ 4 files changed, 104 insertions(+) create mode 100644 test/lit/vpto/memory_consistency_cmo_unsupported.pto create mode 100644 test/lit/vpto/memory_consistency_fence_unsupported.pto diff --git a/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp b/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp index 8362aea64b..46409520de 100644 --- a/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp +++ b/lib/PTO/Transforms/VPTOCANN900LLVMEmitter.cpp @@ -8616,6 +8616,31 @@ class LowerMemBarOpPattern final : public OpConversionPattern { LoweringState &state; }; +template +class 
LowerUnsupportedMemoryConsistencyOpPattern final + : public OpConversionPattern { +public: + explicit LowerUnsupportedMemoryConsistencyOpPattern( + TypeConverter &typeConverter, MLIRContext *context, + LoweringState &state) + : OpConversionPattern(typeConverter, context) { + (void)state; + } + + LogicalResult + matchAndRewrite(MemoryConsistencyOp op, + typename MemoryConsistencyOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + (void)rewriter; + op.emitOpError() + << "is not supported by the VPTO backend yet; PTOAS validates the " + "memory-consistency contract, but VPTO lowering still needs a " + "confirmed DSB/DCCI intrinsic ABI"; + return failure(); + } +}; + template class LowerBufSyncOpPattern final : public OpConversionPattern { public: @@ -9981,6 +10006,10 @@ static void populateVPTOOpLoweringPatterns(VPTOTypeConverter &typeConverter, LowerPipeEventDynSyncOpPattern, LowerPipeEventDynSyncOpPattern, LowerBarrierOpPattern, LowerMemBarOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, LowerBufSyncOpPattern, LowerBufSyncOpPattern, LowerRuntimeQueryOpPattern, @@ -10042,6 +10071,8 @@ static void configureVPTOOpLoweringTarget(ConversionTarget &target, target.addLegalOp(); target.addIllegalOp(); target.addIllegalOp { LoweringState &state; }; +template +class LowerUnsupportedMemoryConsistencyOpPattern final + : public OpConversionPattern { +public: + explicit LowerUnsupportedMemoryConsistencyOpPattern( + TypeConverter &typeConverter, MLIRContext *context, + LoweringState &state) + : OpConversionPattern(typeConverter, context) { + (void)state; + } + + LogicalResult + matchAndRewrite(MemoryConsistencyOp op, + typename MemoryConsistencyOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + (void)adaptor; + (void)rewriter; + op.emitOpError() + << "is not 
supported by the VPTO backend yet; PTOAS validates the " + "memory-consistency contract, but VPTO lowering still needs a " + "confirmed DSB/DCCI intrinsic ABI"; + return failure(); + } +}; + template class LowerBufSyncOpPattern final : public OpConversionPattern { public: @@ -9927,6 +9952,10 @@ static void populateVPTOOpLoweringPatterns(VPTOTypeConverter &typeConverter, LowerPipeEventDynSyncOpPattern, LowerPipeEventDynSyncOpPattern, LowerBarrierOpPattern, LowerMemBarOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, + LowerUnsupportedMemoryConsistencyOpPattern, LowerBufSyncOpPattern, LowerBufSyncOpPattern, LowerRuntimeQueryOpPattern, @@ -9988,6 +10017,8 @@ static void configureVPTOOpLoweringTarget(ConversionTarget &target, target.addLegalOp(); target.addIllegalOp(); target.addIllegalOp&1 | FileCheck %s +// RUN: not ptoas --cann-output-version=9.0.0 --pto-arch=a5 --pto-backend=vpto --emit-vpto-llvm-ir %s -o - 2>&1 | FileCheck %s + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @unsupported_cmo_clean() attributes {pto.kernel} { + pto.cmo.clean all #pto.address_space + return + } +} + +// CHECK: pto.cmo.clean +// CHECK-SAME: is not supported by the VPTO backend yet +// CHECK: VPTO lowering still needs a confirmed DSB/DCCI intrinsic ABI diff --git a/test/lit/vpto/memory_consistency_fence_unsupported.pto b/test/lit/vpto/memory_consistency_fence_unsupported.pto new file mode 100644 index 0000000000..b8e1bce069 --- /dev/null +++ b/test/lit/vpto/memory_consistency_fence_unsupported.pto @@ -0,0 +1,21 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// RUN: not ptoas --pto-arch=a5 --pto-backend=vpto --emit-vpto-llvm-ir %s -o - 2>&1 | FileCheck %s +// RUN: not ptoas --cann-output-version=9.0.0 --pto-arch=a5 --pto-backend=vpto --emit-vpto-llvm-ir %s -o - 2>&1 | FileCheck %s + +module attributes {pto.target_arch = "a5", pto.kernel_kind = #pto.kernel_kind} { + func.func @unsupported_fence_release() attributes {pto.kernel} { + pto.fence.release #pto.fence_scope + return + } +} + +// CHECK: pto.fence.release +// CHECK-SAME: is not supported by the VPTO backend yet +// CHECK: VPTO lowering still needs a confirmed DSB/DCCI intrinsic ABI From ead6b4f73a33e0422f38c42696aa62fcf1437541 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Tue, 30 Jun 2026 16:06:54 +0800 Subject: [PATCH 3/9] Document PTOAS memory consistency design --- .../ptoas-memory-consistency-design.md | 337 ++++++++++++++++++ 1 file changed, 337 insertions(+) create mode 100644 docs/designs/ptoas-memory-consistency-design.md diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md new file mode 100644 index 0000000000..3a8619b262 --- /dev/null +++ b/docs/designs/ptoas-memory-consistency-design.md @@ -0,0 +1,337 @@ +# PTOAS 内存一致性设计 + +本文说明 PTOAS 如何建模并校验 GM payload 与 signal 之间的内存一致性要求。 + +这里讨论的是内存一致性,不是自动同步。自动同步负责 pipe 之间的执行顺序,例如 +`set_flag`、`wait_flag` 和 `pipe_barrier`。内存一致性负责回答另一个问题:当 signal +已经被对端观察到时,signal 之前发布的 payload 是否已经对正确的观察方可见。 + +## 1. 
背景 + +`pto.comm.tnotify` 用来发布一个 signal。对端通过 `pto.comm.twait` 或 +`pto.comm.ttest` 观察这个 signal,然后读取对应的 payload。 + +一个容易误解的点是:signal ready 不等价于 payload 一定已经可见。原因是 signal +和 payload 可能走不同的硬件路径: + +- signal 通常是一个较小的通信同步标记。 +- payload 通常是更大的 GM 数据,可能由 MTE3、TPUT 或 cacheable scalar store 写出。 +- 不同路径之间只靠源码顺序不一定形成完整的可见性关系。 + +因此,PTOAS 需要在发布 signal 前校验 release 侧动作,在消费 signal 后校验 +acquire 侧动作。 + +## 2. 关键概念 + +### 2.1 Payload + +payload 是真正要被对端或后续代码读取的数据。例如: + +- `TStore` 写出的 GM 数据。 +- `TPUT` 内部写出的 peer GM 数据。 +- `store_scalar` 写出的 GM 数据。 + +### 2.2 Signal + +signal 是通知对端 payload 已经准备好的标记。例如: + +- `TNotify` 发布 signal。 +- `TWait` 等待 signal。 +- `TTest` 轮询 signal 是否 ready。 + +signal 只表达“通知发生了”。如果 signal 前没有正确的 release 动作,signal 可能先被 +对端观察到,而 payload 仍然没有进入对端能够正确读取的可见性状态。 + +### 2.3 Pipe drain + +pipe drain 用来保证某条 pipe 上已经发出的工作完成到该 pipe 的边界。典型指令是: + +```mlir +pto.barrier #pto.pipe +``` + +它解决的是 pipe 内工作排空问题。它不等价于 cache clean,也不等价于 DDR-domain +visibility fence。 + +### 2.4 Cache maintenance operation + +cache maintenance operation 用来处理 cacheable GM 访问造成的 cache line 状态。 +当前 PTOAS 暴露两个语义 op: + +```mlir +pto.cmo.clean all #pto.address_space +pto.cmo.invalidate all #pto.address_space +``` + +第一阶段采用 whole-cache 形式。也就是说,它不指定精确地址范围,而是对整个 GM +相关 data cache 做保守处理。这样优先保证正确性,后续再优化成精确 range。 + +### 2.5 DDR fence + +DDR fence 用来把已经完成的 GM 写入或 cache maintenance 操作推进到 DDR visibility +domain,并约束它们发生在后续 signal publish 之前。当前 PTOAS 暴露两个语义 op: + +```mlir +pto.fence.release #pto.fence_scope +pto.fence.acquire #pto.fence_scope +``` + +当前 release 和 acquire 都使用同一个 `ddr` scope。语义上,release 侧用于发布 +payload,acquire 侧用于约束观察 signal 后的 payload 读取。 + +## 3. 
整体模型 + +生产端的正确顺序是: + +```mermaid +flowchart LR + A["payload write"] --> B["pipe drain or cache clean"] + B --> C["DDR release fence"] + C --> D["TNotify publishes signal"] +``` + +消费端的正确顺序是: + +```mermaid +flowchart LR + A["TWait or successful TTest observes signal"] --> B["cache invalidate if needed"] + B --> C["payload read"] +``` + +这两个方向配合起来,才能保证 signal 和 payload 的顺序关系对观察方成立。 + +## 4. 显式 IR 接口 + +PTOAS 选择把 cache maintenance 和 DDR fence 暴露成显式 PTO IR,而不是在 lowering +阶段偷偷插入 `dcci` 和 `dsb`。 + +原因如下: + +- 这类动作有实际运行时成本,尤其 whole-cache CMO 成本较高。 +- 用户或 PyPTO 更清楚 payload 的发布边界。 +- PTOAS 可以负责校验契约,避免漏插或乱序,而不是猜测所有场景。 +- VPTO 后端当前还没有确认的 DSB 和 DCCI intrinsic ABI,显式 IR 可以先稳定上层契约。 + +当前新增的语义 op 是: + +| PTO IR | 语义 | EmitC lowering | +| --- | --- | --- | +| `pto.cmo.clean all #pto.address_space` | 清理 GM 相关 dirty cache line | `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE, CACHELINE_OUT)` | +| `pto.cmo.invalidate all #pto.address_space` | 失效 GM 相关 stale cache line | `dcci((__gm__ void*)0, ENTIRE_DATA_CACHE)` | +| `pto.fence.release #pto.fence_scope` | release 侧 DDR visibility fence | `dsb(DSB_DDR)` | +| `pto.fence.acquire #pto.fence_scope` | acquire 侧 DDR visibility fence | `dsb(DSB_DDR)` | + +## 5. MemoryConsistency pass + +`pto-memory-consistency` 是一个 Module pass,运行在 shared mainline 上,因此 EmitC 和 +VPTO backend 都会先经过这一步。 + +这个 pass 的职责是校验显式契约: + +- 识别 signal publish 前是否存在 pending payload write。 +- 识别 signal acquire 后是否存在 cacheable GM payload read。 +- 校验用户或 PyPTO 是否已经插入必要的 CMO 和 fence。 +- 对缺失或顺序错误的场景报编译错误。 +- 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景,仍允许保留自动标注。 + +这个 pass 不负责分配 event id,也不属于 InsertSync 自动同步流水线。 + +## 6. 场景规则 + +### 6.1 MTE3 或 TPUT 写 payload 后发布 signal + +适用场景: + +- `TStore` 通过 `PIPE_MTE3` 写 GM。 +- `TPUT` macro op 内部通过 MTE3 写 peer GM。 +- 其他 macro op phase 中存在 MTE3 GM write。 + +需要的顺序: + +```mlir +// payload producer +pto.barrier #pto.pipe +pto.fence.release #pto.fence_scope +pto.comm.tnotify ... 
+``` + +`pto.barrier #pto.pipe` 用来排空 MTE3 pipe。`pto.fence.release` 用来保证 +这些 GM 写入在 signal 发布前进入 DDR visibility domain。 + +如果只有 `pto.fence.release`,但没有 MTE3 barrier,PTOAS 会报错。因为 fence 不能替代 +pipe drain。 + +### 6.2 MTE2 工作后发布 signal + +适用场景: + +- `TLoad` 或其他 `PIPE_MTE2` 工作出现在 `TNotify` 之前。 + +当前规则: + +```mlir +// PTOAS 可以自动标注并在 EmitC lowering 中生成 PIPE_MTE2 barrier +pto.comm.tnotify ... +``` + +MTE2 是 GM read 方向。它需要的是 signal 前不要越过前序 MTE2 工作,但不需要 DDR +release fence。PTOAS 当前仍允许自动插入这类纯 pipe drain。 + +### 6.3 Cacheable scalar GM store 后发布 signal + +适用场景: + +- `store_scalar` 写 GM,并且该路径可能经过 cache。 + +需要的顺序: + +```mlir +pto.store_scalar ... +pto.cmo.clean all #pto.address_space +pto.fence.release #pto.fence_scope +pto.comm.tnotify ... +``` + +`pto.cmo.clean` 把 dirty cache line 推出。`pto.fence.release` 等待并约束 clean 的结果在 +signal 发布前可见。 + +如果只插 `pto.fence.release`,PTOAS 会报错。因为 fence 不会替代 cache clean。 + +### 6.4 TWait 或 TTest 后读取 cacheable GM payload + +适用场景: + +- `TWait` 返回后执行 `load_scalar` 读取 GM payload。 +- `TTest` 成功观察到 signal 后执行 `load_scalar` 读取 GM payload。 + +需要的顺序: + +```mlir +pto.comm.twait ... +pto.cmo.invalidate all #pto.address_space +%value = pto.load_scalar ... +``` + +invalidate 用来避免读取到本地 stale cache line。 + +### 6.5 Acquire 前本地可能存在 dirty GM cache + +适用场景: + +- 同一个执行流中,等待 signal 前已经有 cacheable GM store。 +- 后续又要在 signal acquire 后读取 GM payload。 + +需要的顺序: + +```mlir +pto.store_scalar ... +pto.cmo.clean all #pto.address_space +pto.fence.release #pto.fence_scope +pto.comm.twait ... +pto.cmo.invalidate all #pto.address_space +%value = pto.load_scalar ... +``` + +clean 和 release fence 用来处理本地 dirty cache。invalidate 用来处理 signal 后读取对端 +payload 时可能遇到的 stale cache。 + +## 7. PyPTO 生成建议 + +PyPTO 需要在 payload publish 边界显式生成 CMO 和 fence。 + +### 7.1 TPUT 发布 signal + +```mlir +pto.comm.tput ... +pto.barrier #pto.pipe +pto.fence.release #pto.fence_scope +pto.comm.tnotify ... +``` + +### 7.2 TStore 发布 signal + +```mlir +pto.tstore ... 
+pto.barrier #pto.pipe +pto.fence.release #pto.fence_scope +pto.comm.tnotify ... +``` + +### 7.3 Scalar store 发布 signal + +```mlir +pto.store_scalar ... +pto.cmo.clean all #pto.address_space +pto.fence.release #pto.fence_scope +pto.comm.tnotify ... +``` + +### 7.4 TWait 后读取 scalar payload + +```mlir +pto.comm.twait ... +pto.cmo.invalidate all #pto.address_space +%value = pto.load_scalar ... +``` + +### 7.5 TTest polling 后读取 scalar payload + +```mlir +%ready = pto.comm.ttest ... +scf.if %ready { + pto.cmo.invalidate all #pto.address_space + %value = pto.load_scalar ... +} +``` + +如果 PyPTO 使用 `pto.ldg` 或 `pto.stg` 并显式选择 uncache 路径,可以避免部分 +cacheable scalar GM 问题。但这不是 `pto.cmo.clean` 或 `pto.cmo.invalidate` 的替代品。 +如果之前已经存在 dirty 或 stale cache line,仍需要显式 CMO。 + +## 8. Backend lowering 状态 + +### 8.1 EmitC + +EmitC backend 已经支持真实 lowering: + +- `pto.cmo.clean` lower 到 `dcci(..., CACHELINE_OUT)`。 +- `pto.cmo.invalidate` lower 到 `dcci(...)`。 +- `pto.fence.release` lower 到 `dsb(DSB_DDR)`。 +- `pto.fence.acquire` lower 到 `dsb(DSB_DDR)`。 + +### 8.2 VPTO + +VPTO backend 当前没有确认的 DSB 和 DCCI intrinsic ABI。 + +因此,VPTO lowering 中现在提供的是 fail-fast stub: + +- `pto.cmo.clean` +- `pto.cmo.invalidate` +- `pto.fence.release` +- `pto.fence.acquire` + +如果这些 op 进入 VPTO LLVM lowering,PTOAS 会报错,提示 VPTO backend 尚不支持这些 +memory-consistency op,需要确认 DSB/DCCI intrinsic ABI 后再接真实 lowering。 + +这样做的目的不是支持 VPTO 运行,而是避免 unsupported op 静默残留到后端 IR。 + +## 9. 当前限制 + +当前实现优先保证正确性,仍有以下限制: + +- CMO 是 whole-cache 粒度,不是精确地址范围。 +- `TWait` 和 `TTest` acquire 侧当前只覆盖 `load_scalar`。 +- VPTO 暂不支持 CMO 和 DDR fence 的真实 lowering。 +- 对复杂 CFG 的分析仍是保守近似,不做完整 path-sensitive 数据流。 +- MemoryConsistency pass 校验的是显式内存一致性契约,不替代 InsertSync 的 alias 和 pipe + 同步分析。 + +## 10. 后续工作 + +后续可以分几步推进: + +1. 和 VPTO/Bisheng 对齐 DSB 和 DCCI intrinsic ABI,并补齐 VPTO lowering。 +2. 将 whole-cache CMO 优化成精确 GM address range CMO。 +3. 扩展 acquire 侧 consumer 范围,从 `load_scalar` 扩展到更多 cacheable GM read。 +4. 将 macro op phase 的 memory descriptor 做得更精细,减少误报。 +5. 
在 PyPTO 和 PTOAS 之间明确 cacheable 与 uncacheable GM 访问的 IR 契约。 From 77035100927a7cf4789f4bfe3c863ea5b492646a Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Tue, 30 Jun 2026 16:20:30 +0800 Subject: [PATCH 4/9] Auto insert MTE3 drain before release fence --- .../ptoas-memory-consistency-design.md | 32 +++++++++++++++---- lib/PTO/Transforms/PTOMemoryConsistency.cpp | 28 ++++++++++------ test/lit/pto/issue711_tnotify_mte_drain.pto | 1 - .../lit/pto/issue872_tput_tnotify_release.pto | 2 -- test/lit/pto/memory_consistency_invalid.pto | 2 +- 5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md index 3a8619b262..00b656348d 100644 --- a/docs/designs/ptoas-memory-consistency-design.md +++ b/docs/designs/ptoas-memory-consistency-design.md @@ -131,6 +131,7 @@ VPTO backend 都会先经过这一步。 - 识别 signal publish 前是否存在 pending payload write。 - 识别 signal acquire 后是否存在 cacheable GM payload read。 - 校验用户或 PyPTO 是否已经插入必要的 CMO 和 fence。 +- 在显式 release fence 前自动补齐必要的 MTE3 pipe drain。 - 对缺失或顺序错误的场景报编译错误。 - 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景,仍允许保留自动标注。 @@ -150,16 +151,31 @@ VPTO backend 都会先经过这一步。 ```mlir // payload producer -pto.barrier #pto.pipe pto.fence.release #pto.fence_scope pto.comm.tnotify ... 
``` -`pto.barrier #pto.pipe` 用来排空 MTE3 pipe。`pto.fence.release` 用来保证 -这些 GM 写入在 signal 发布前进入 DDR visibility domain。 +PyPTO 或用户只需要表达 `pto.fence.release` 这个内存一致性边界。PTOAS 会在 +`pto.fence.release #pto.fence_scope` 前检查是否存在 pending MTE3 GM write;如果存在, +自动插入: + +```mlir +pto.barrier #pto.pipe +``` + +最终 lowering 的顺序是: -如果只有 `pto.fence.release`,但没有 MTE3 barrier,PTOAS 会报错。因为 fence 不能替代 -pipe drain。 +```cpp +pipe_barrier(PIPE_MTE3); +dsb(DSB_DDR); +pto::comm::TNOTIFY(...); +``` + +`pipe_barrier(PIPE_MTE3)` 用来排空 MTE3 pipe。`pto.fence.release` lower 出来的 +`dsb(DSB_DDR)` 用来保证这些 GM 写入在 signal 发布前进入 DDR visibility domain。 + +如果缺少 `pto.fence.release`,PTOAS 会报错。因为 PTOAS 可以推导 pipe drain,但不会凭空 +猜测 payload publish 的语义边界。 ### 6.2 MTE2 工作后发布 signal @@ -239,11 +255,14 @@ payload 时可能遇到的 stale cache。 PyPTO 需要在 payload publish 边界显式生成 CMO 和 fence。 +PyPTO 不需要手动生成 `pto.barrier #pto.pipe`。这是低层 pipe drain 细节, +由 PTOAS 根据 release fence 前的 pending MTE3 work 自动插入。这样可以保证最终顺序是 +`pipe_barrier(PIPE_MTE3)` 先于 `dsb(DSB_DDR)`,不会出现先 fence、后 drain 的错误顺序。 + ### 7.1 TPUT 发布 signal ```mlir pto.comm.tput ... -pto.barrier #pto.pipe pto.fence.release #pto.fence_scope pto.comm.tnotify ... ``` @@ -252,7 +271,6 @@ pto.comm.tnotify ... ```mlir pto.tstore ... -pto.barrier #pto.pipe pto.fence.release #pto.fence_scope pto.comm.tnotify ... 
``` diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp index 10854e4ea0..b378b9e66e 100644 --- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp +++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp @@ -12,6 +12,7 @@ #include "PTO/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" namespace mlir { @@ -236,22 +237,27 @@ static void diagnoseTNotifyRelease(pto::TNotifyOp op, hasFailure = true; return; } - if (state.drainMte3) { - op.emitOpError() - << "requires an explicit `pto.barrier ` before " - "`pto.fence.release #pto.fence_scope` when publishing a " - "signal after MTE3 GM writes"; - hasFailure = true; - return; - } if (state.needsDsbDdr) { op.emitOpError() << "requires explicit `pto.fence.release #pto.fence_scope` " - "before publishing a signal after GM writes or cache clean"; + "before publishing a signal after GM writes or cache clean; " + "PTOAS inserts the required MTE3 pipe drain before the release " + "fence when needed"; hasFailure = true; } } +static void insertMte3DrainBeforeReleaseFence(pto::FenceReleaseOp fence, + TNotifyReleaseState &state) { + if (fence.getScope().getScope() != pto::FenceScope::DDR || !state.drainMte3) + return; + OpBuilder builder(fence); + builder.create( + fence.getLoc(), pto::PipeAttr::get(fence.getContext(), + pto::PIPE::PIPE_MTE3)); + state.drainMte3 = false; +} + static void markNestedTNotifyWithState(Operation *op, const TNotifyReleaseState &state, bool &hasFailure) { @@ -307,8 +313,10 @@ annotateTNotifyReleaseForBlock(Block &block, pendingState.applyBarrier(barrier.getPipe().getPipe()); if (auto cmo = dyn_cast(op)) pendingState.applyCmoClean(cmo.getSpace().getAddressSpace()); - if (auto fence = dyn_cast(op)) + if (auto fence = dyn_cast(op)) { + insertMte3DrainBeforeReleaseFence(fence, pendingState); pendingState.applyFenceRelease(fence.getScope().getScope()); + } } return 
pendingState; } diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto index 4940f88ea5..0cda48da4c 100644 --- a/test/lit/pto/issue711_tnotify_mte_drain.pto +++ b/test/lit/pto/issue711_tnotify_mte_drain.pto @@ -50,7 +50,6 @@ module { pto.tstore ins(%tile : !pto.tile_buf) outs(%dst : !pto.partition_tensor_view<1x32xf32>) pto.barrier - pto.barrier pto.fence.release #pto.fence_scope %sig_view = pto.make_tensor_view %signal_ptr, diff --git a/test/lit/pto/issue872_tput_tnotify_release.pto b/test/lit/pto/issue872_tput_tnotify_release.pto index ecc62b3ecc..17a891a295 100644 --- a/test/lit/pto/issue872_tput_tnotify_release.pto +++ b/test/lit/pto/issue872_tput_tnotify_release.pto @@ -45,7 +45,6 @@ module { !pto.partition_tensor_view<8x64xf32>, !pto.tile_buf) {atomicType = #pto} - pto.barrier pto.fence.release #pto.fence_scope %sig_view = pto.make_tensor_view %signal_ptr, @@ -132,7 +131,6 @@ module { !pto.partition_tensor_view<1x32xf32>, !pto.tile_buf, !pto.partition_tensor_view<1x32xf32>) {root = 0 : i32} - pto.barrier pto.fence.release #pto.fence_scope %sig_view = pto.make_tensor_view %signal_ptr, diff --git a/test/lit/pto/memory_consistency_invalid.pto b/test/lit/pto/memory_consistency_invalid.pto index f84652f82d..e486abe738 100644 --- a/test/lit/pto/memory_consistency_invalid.pto +++ b/test/lit/pto/memory_consistency_invalid.pto @@ -83,6 +83,6 @@ module { } } -// CHECK: requires an explicit `pto.barrier ` +// CHECK: requires explicit `pto.fence.release #pto.fence_scope` // CHECK: requires explicit `pto.cmo.clean all #pto.address_space` // CHECK: requires explicit `pto.cmo.invalidate all #pto.address_space` From aa7725d39398e3e65d9daeca95a70c30efb4d438 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Tue, 30 Jun 2026 16:39:05 +0800 Subject: [PATCH 5/9] Handle FIX GM writes before release fence --- .../ptoas-memory-consistency-design.md | 28 ++++++---- lib/PTO/Transforms/PTOMemoryConsistency.cpp | 54 ++++++++++++++----- 
test/lit/pto/issue711_tnotify_mte_drain.pto | 40 ++++++++++++++ 3 files changed, 101 insertions(+), 21 deletions(-) diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md index 00b656348d..0fc7edce4b 100644 --- a/docs/designs/ptoas-memory-consistency-design.md +++ b/docs/designs/ptoas-memory-consistency-design.md @@ -131,7 +131,7 @@ VPTO backend 都会先经过这一步。 - 识别 signal publish 前是否存在 pending payload write。 - 识别 signal acquire 后是否存在 cacheable GM payload read。 - 校验用户或 PyPTO 是否已经插入必要的 CMO 和 fence。 -- 在显式 release fence 前自动补齐必要的 MTE3 pipe drain。 +- 在显式 release fence 前自动补齐必要的 MTE3 或 FIX pipe drain。 - 对缺失或顺序错误的场景报编译错误。 - 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景,仍允许保留自动标注。 @@ -139,11 +139,13 @@ VPTO backend 都会先经过这一步。 ## 6. 场景规则 -### 6.1 MTE3 或 TPUT 写 payload 后发布 signal +### 6.1 MTE3、FIX 或 TPUT 写 payload 后发布 signal 适用场景: - `TStore` 通过 `PIPE_MTE3` 写 GM。 +- `TStore` 通过 `PIPE_FIX` 写 GM,例如 ACC tile 写回 GM。 +- `TStoreFP` 通过 `PIPE_FIX` 写 GM。 - `TPUT` macro op 内部通过 MTE3 写 peer GM。 - 其他 macro op phase 中存在 MTE3 GM write。 @@ -156,11 +158,13 @@ pto.comm.tnotify ... 
``` PyPTO 或用户只需要表达 `pto.fence.release` 这个内存一致性边界。PTOAS 会在 -`pto.fence.release #pto.fence_scope` 前检查是否存在 pending MTE3 GM write;如果存在, -自动插入: +`pto.fence.release #pto.fence_scope` 前检查是否存在 pending MTE3 或 FIX GM write;如果存在, +自动插入对应 pipe 的 drain: ```mlir pto.barrier #pto.pipe +// or +pto.barrier #pto.pipe ``` 最终 lowering 的顺序是: @@ -171,8 +175,13 @@ dsb(DSB_DDR); pto::comm::TNOTIFY(...); ``` -`pipe_barrier(PIPE_MTE3)` 用来排空 MTE3 pipe。`pto.fence.release` lower 出来的 -`dsb(DSB_DDR)` 用来保证这些 GM 写入在 signal 发布前进入 DDR visibility domain。 +`pipe_barrier(PIPE_MTE3)` 或 `pipe_barrier(PIPE_FIX)` 用来排空实际执行 GM write 的 +pipe。`pto.fence.release` lower 出来的 `dsb(DSB_DDR)` 用来保证这些 GM 写入在 signal +发布前进入 DDR visibility domain。 + +这里不能把所有 `PIPE_FIX` op 都当成 release payload write。很多 FIX op 只是本地 +ACC 到 MAT 或 ACC 到 VEC 的搬运,不需要 DDR release。PTOAS 只对确认写 GM payload 的 +FIX 路径补 release drain。 如果缺少 `pto.fence.release`,PTOAS 会报错。因为 PTOAS 可以推导 pipe drain,但不会凭空 猜测 payload publish 的语义边界。 @@ -255,9 +264,10 @@ payload 时可能遇到的 stale cache。 PyPTO 需要在 payload publish 边界显式生成 CMO 和 fence。 -PyPTO 不需要手动生成 `pto.barrier #pto.pipe`。这是低层 pipe drain 细节, -由 PTOAS 根据 release fence 前的 pending MTE3 work 自动插入。这样可以保证最终顺序是 -`pipe_barrier(PIPE_MTE3)` 先于 `dsb(DSB_DDR)`,不会出现先 fence、后 drain 的错误顺序。 +PyPTO 不需要手动生成 `pto.barrier #pto.pipe` 或 +`pto.barrier #pto.pipe`。这是低层 pipe drain 细节,由 PTOAS 根据 release fence +前的 pending GM write pipe 自动插入。这样可以保证最终顺序是对应 pipe barrier 先于 +`dsb(DSB_DDR)`,不会出现先 fence、后 drain 的错误顺序。 ### 7.1 TPUT 发布 signal diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp index b378b9e66e..567e4ce425 100644 --- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp +++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp @@ -34,12 +34,14 @@ static bool isGmAddressSpace(pto::AddressSpace space) { struct TNotifyReleaseState { bool drainMte2 = false; bool drainMte3 = false; + bool drainFix = false; bool cleanGmCache = false; bool needsDsbDdr = false; void merge(const TNotifyReleaseState &other) { 
drainMte2 |= other.drainMte2; drainMte3 |= other.drainMte3; + drainFix |= other.drainFix; cleanGmCache |= other.cleanGmCache; needsDsbDdr |= other.needsDsbDdr; } @@ -47,6 +49,7 @@ struct TNotifyReleaseState { void clear() { drainMte2 = false; drainMte3 = false; + drainFix = false; cleanGmCache = false; needsDsbDdr = false; } @@ -59,9 +62,13 @@ struct TNotifyReleaseState { case pto::PIPE::PIPE_MTE3: drainMte3 = false; break; + case pto::PIPE::PIPE_FIX: + drainFix = false; + break; case pto::PIPE::PIPE_ALL: drainMte2 = false; drainMte3 = false; + drainFix = false; break; default: break; @@ -76,7 +83,7 @@ struct TNotifyReleaseState { void applyFenceRelease(pto::FenceScope scope) { if (scope != pto::FenceScope::DDR) return; - if (drainMte3 || cleanGmCache) + if (drainMte3 || drainFix || cleanGmCache) return; needsDsbDdr = false; } @@ -154,6 +161,13 @@ static TNotifyReleaseState getReleaseStateForPipe(pto::PIPE pipe) { return state; } +static TNotifyReleaseState getFixGmWriteReleaseState() { + TNotifyReleaseState state; + state.drainFix = true; + state.needsDsbDdr = true; + return state; +} + static TNotifyReleaseState getReleaseStateForMacroModel(Operation *op) { TNotifyReleaseState state; auto model = getSyncMacroModel(op); @@ -185,8 +199,16 @@ static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) { } } + if (auto tstore = dyn_cast(op); + tstore && tstore.getPipe() == pto::PIPE::PIPE_FIX) + return getFixGmWriteReleaseState(); + + if (isa(op)) + return getFixGmWriteReleaseState(); + TNotifyReleaseState macroState = getReleaseStateForMacroModel(op); - if (macroState.drainMte3 || macroState.cleanGmCache || + if (macroState.drainMte3 || macroState.drainFix || + macroState.cleanGmCache || macroState.needsDsbDdr) return macroState; @@ -241,21 +263,29 @@ static void diagnoseTNotifyRelease(pto::TNotifyOp op, op.emitOpError() << "requires explicit `pto.fence.release #pto.fence_scope` " "before publishing a signal after GM writes or cache clean; " - "PTOAS 
inserts the required MTE3 pipe drain before the release " - "fence when needed"; + "PTOAS inserts the required MTE3/FIX pipe drain before the " + "release fence when needed"; hasFailure = true; } } -static void insertMte3DrainBeforeReleaseFence(pto::FenceReleaseOp fence, - TNotifyReleaseState &state) { - if (fence.getScope().getScope() != pto::FenceScope::DDR || !state.drainMte3) +static void insertDrainsBeforeReleaseFence(pto::FenceReleaseOp fence, + TNotifyReleaseState &state) { + if (fence.getScope().getScope() != pto::FenceScope::DDR) return; OpBuilder builder(fence); - builder.create( - fence.getLoc(), pto::PipeAttr::get(fence.getContext(), - pto::PIPE::PIPE_MTE3)); - state.drainMte3 = false; + auto insertBarrier = [&](pto::PIPE pipe) { + builder.create( + fence.getLoc(), pto::PipeAttr::get(fence.getContext(), pipe)); + }; + if (state.drainMte3) { + insertBarrier(pto::PIPE::PIPE_MTE3); + state.drainMte3 = false; + } + if (state.drainFix) { + insertBarrier(pto::PIPE::PIPE_FIX); + state.drainFix = false; + } } static void markNestedTNotifyWithState(Operation *op, @@ -314,7 +344,7 @@ annotateTNotifyReleaseForBlock(Block &block, if (auto cmo = dyn_cast(op)) pendingState.applyCmoClean(cmo.getSpace().getAddressSpace()); if (auto fence = dyn_cast(op)) { - insertMte3DrainBeforeReleaseFence(fence, pendingState); + insertDrainsBeforeReleaseFence(fence, pendingState); pendingState.applyFenceRelease(fence.getScope().getScope()); } } diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto index 0cda48da4c..01c0f48637 100644 --- a/test/lit/pto/issue711_tnotify_mte_drain.pto +++ b/test/lit/pto/issue711_tnotify_mte_drain.pto @@ -62,6 +62,40 @@ module { return } + // acc tstore -> tnotify: ACC->GM stores use the FIX pipe, so the release + // fence must be preceded by a FIX pipe drain. 
+ func.func @tnotify_drain_after_acc_tstore( + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %v_i32 = arith.constant 1 : i32 + + %acc = pto.alloc_tile : + !pto.tile_buf + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c16, %c16], strides = [%c16, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view -> !pto.partition_tensor_view<16x16xf32> + + pto.tstore ins(%acc : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<16x16xf32>) + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + // tload -> tnotify: the input-consumed case (notify must follow the load // so the producer can reuse the source buffer once TWAIT returns). 
func.func @tnotify_drain_after_tload( @@ -257,6 +291,12 @@ module { // CHECK-NEXT: dsb(DSB_DDR); // CHECK-NEXT: pto::comm::TNOTIFY( +// CHECK-LABEL: AICORE void tnotify_drain_after_acc_tstore( +// CHECK: TSTORE +// CHECK: pipe_barrier(PIPE_FIX); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( + // CHECK-LABEL: AICORE void tnotify_drain_after_tload( // CHECK: pto::comm::NotifyOp{{.*}}= pto::comm::NotifyOp::AtomicAdd; // CHECK: TLOAD( From 4c60b7e93105aa8bf931db2e322eaa2e2debf353 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Tue, 30 Jun 2026 16:58:26 +0800 Subject: [PATCH 6/9] Tighten TNotify release payload detection --- .../ptoas-memory-consistency-design.md | 4 + lib/PTO/Transforms/PTOMemoryConsistency.cpp | 41 +++++----- test/lit/pto/tnotify_release_local_ops.pto | 79 +++++++++++++++++++ 3 files changed, 102 insertions(+), 22 deletions(-) create mode 100644 test/lit/pto/tnotify_release_local_ops.pto diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md index 0fc7edce4b..2bbe1ed6c7 100644 --- a/docs/designs/ptoas-memory-consistency-design.md +++ b/docs/designs/ptoas-memory-consistency-design.md @@ -183,6 +183,10 @@ pipe。`pto.fence.release` lower 出来的 `dsb(DSB_DDR)` 用来保证这些 GM ACC 到 MAT 或 ACC 到 VEC 的搬运,不需要 DDR release。PTOAS 只对确认写 GM payload 的 FIX 路径补 release drain。 +同理,也不能把所有 `PIPE_MTE3` op 都当成 release payload write。例如 A5 的 +Vec 到 Mat `TInsert` 是本地 UB 到 L1 的搬运,不发布 GM payload。PTOAS 只对 +`TStore`、comm macro MTE3 phase 等确认写 GM payload 的路径补 release drain。 + 如果缺少 `pto.fence.release`,PTOAS 会报错。因为 PTOAS 可以推导 pipe drain,但不会凭空 猜测 payload publish 的语义边界。 diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp index 567e4ce425..3937892727 100644 --- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp +++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp @@ -140,24 +140,16 @@ static bool isGmScalarMemory(Type type) { return false; } -static TNotifyReleaseState 
getReleaseStateForPipe(pto::PIPE pipe) { +static TNotifyReleaseState getMte2PayloadReadReleaseState() { TNotifyReleaseState state; - switch (pipe) { - case pto::PIPE::PIPE_MTE2: - state.drainMte2 = true; - break; - case pto::PIPE::PIPE_MTE3: - state.drainMte3 = true; - state.needsDsbDdr = true; - break; - case pto::PIPE::PIPE_ALL: - state.drainMte2 = true; - state.drainMte3 = true; - state.needsDsbDdr = true; - break; - default: - break; - } + state.drainMte2 = true; + return state; +} + +static TNotifyReleaseState getMte3GmWriteReleaseState() { + TNotifyReleaseState state; + state.drainMte3 = true; + state.needsDsbDdr = true; return state; } @@ -199,9 +191,16 @@ static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) { } } - if (auto tstore = dyn_cast(op); - tstore && tstore.getPipe() == pto::PIPE::PIPE_FIX) - return getFixGmWriteReleaseState(); + if (isa(op)) + return getMte2PayloadReadReleaseState(); + + if (auto tstore = dyn_cast(op)) { + if (tstore.getPipe() == pto::PIPE::PIPE_MTE3) + return getMte3GmWriteReleaseState(); + if (tstore.getPipe() == pto::PIPE::PIPE_FIX) + return getFixGmWriteReleaseState(); + return {}; + } if (isa(op)) return getFixGmWriteReleaseState(); @@ -212,8 +211,6 @@ static TNotifyReleaseState getDirectTNotifyReleaseState(Operation *op) { macroState.needsDsbDdr) return macroState; - if (auto pipeOp = dyn_cast(op)) - return getReleaseStateForPipe(pipeOp.getPipe()); return {}; } diff --git a/test/lit/pto/tnotify_release_local_ops.pto b/test/lit/pto/tnotify_release_local_ops.pto new file mode 100644 index 0000000000..6b8940f21f --- /dev/null +++ b/test/lit/pto/tnotify_release_local_ops.pto @@ -0,0 +1,79 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. 
You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// TNotify release analysis must be payload-specific. Some local tile ops run on +// MTE3 or FIX internally, but they do not publish GM payloads and must not +// require a DDR release fence or emit a release drain before TNotify. + +// RUN: ptoas --pto-arch=a5 %s -o - 2>&1 | FileCheck %s + +module attributes {"pto.target_arch" = "a5"} { + func.func @local_tinsert_mte3_then_tnotify(%signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %src = pto.alloc_tile : + !pto.tile_buf + %dst = pto.alloc_tile : + !pto.tile_buf + pto.tinsert ins(%src, %c0, %c0 : + !pto.tile_buf, + index, index) + outs(%dst : + !pto.tile_buf) + {tinsertMode = #pto.tinsert_mode} + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + func.func @local_tmov_fix_then_tnotify(%signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %src = pto.alloc_tile : + !pto.tile_buf + %dst = pto.alloc_tile : + !pto.tile_buf + pto.tmov ins(%src : + !pto.tile_buf) + outs(%dst : + !pto.tile_buf) + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = 
pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } +} + +// CHECK-LABEL: AICORE void local_tinsert_mte3_then_tnotify( +// CHECK: TINSERT +// CHECK-NOT: pipe_barrier( +// CHECK-NOT: dsb( +// CHECK: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void local_tmov_fix_then_tnotify( +// CHECK: TMOV +// CHECK-NOT: pipe_barrier( +// CHECK-NOT: dsb( +// CHECK: pto::comm::TNOTIFY( From 4c6e22c678c1670085ed9f7bb68c4aa256eadd8f Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Tue, 30 Jun 2026 17:13:10 +0800 Subject: [PATCH 7/9] Refine memory consistency region analysis --- .../ptoas-memory-consistency-design.md | 4 ++ lib/PTO/Transforms/PTOMemoryConsistency.cpp | 58 +++++++++++++++---- .../pto/memory_consistency_external_func.pto | 37 ++++++++++++ 3 files changed, 89 insertions(+), 10 deletions(-) create mode 100644 test/lit/pto/memory_consistency_external_func.pto diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md index 2bbe1ed6c7..ef2fd58e38 100644 --- a/docs/designs/ptoas-memory-consistency-design.md +++ b/docs/designs/ptoas-memory-consistency-design.md @@ -135,6 +135,10 @@ VPTO backend 都会先经过这一步。 - 对缺失或顺序错误的场景报编译错误。 - 对不需要 `dcci` 和 `dsb` 的纯 pipe drain 场景,仍允许保留自动标注。 +遍历策略是 region-scoped 的保守分析:单 block region 按顺序递归分析;复杂 CFG +region 暂不做 path-sensitive 数据流,但只在当前 region 内收集 pending state,不把同一个 +parent op 的其他 sibling region 状态混入。外部函数声明没有函数体,pass 会直接跳过。 + 这个 pass 不负责分配 event id,也不属于 InsertSync 自动同步流水线。 ## 6. 
场景规则 diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp index 3937892727..e4eaaf4e00 100644 --- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp +++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp @@ -223,6 +223,14 @@ static TNotifyReleaseState collectTNotifyReleaseState(Operation *op) { return state; } +static TNotifyReleaseState collectTNotifyReleaseState(Region ®ion) { + TNotifyReleaseState state; + for (Block &block : region) + for (Operation &nested : block) + state.merge(collectTNotifyReleaseState(&nested)); + return state; +} + static bool isLoopLikeOp(Operation *op) { return isa(op); } @@ -294,6 +302,15 @@ static void markNestedTNotifyWithState(Operation *op, }); } +static void markNestedTNotifyWithState(Region ®ion, + const TNotifyReleaseState &state, + bool &hasFailure) { + for (Block &block : region) { + for (Operation &nested : block) + markNestedTNotifyWithState(&nested, state, hasFailure); + } +} + static TNotifyReleaseState annotateTNotifyReleaseForBlock(Block &block, TNotifyReleaseState entryPendingState, @@ -323,11 +340,11 @@ annotateTNotifyReleaseForBlock(Block &block, region.front(), regionEntryState, nestedLoopCarriedState, hasFailure)); } else { - TNotifyReleaseState regionState = collectTNotifyReleaseState(&op); + TNotifyReleaseState regionState = collectTNotifyReleaseState(region); TNotifyReleaseState nestedNotifyState = regionEntryState; nestedNotifyState.merge(nestedLoopCarriedState); nestedNotifyState.merge(regionState); - markNestedTNotifyWithState(&op, nestedNotifyState, hasFailure); + markNestedTNotifyWithState(region, nestedNotifyState, hasFailure); TNotifyReleaseState regionExitState = regionEntryState; regionExitState.merge(regionState); @@ -351,6 +368,9 @@ annotateTNotifyReleaseForBlock(Block &block, static bool annotateTNotifyRelease(ModuleOp module) { bool hasFailure = false; for (auto func : module.getOps()) { + if (func.isExternal()) + continue; + if (func.getBody().hasOneBlock()) { 
(void)annotateTNotifyReleaseForBlock(func.getBody().front(), TNotifyReleaseState{}, @@ -362,9 +382,8 @@ static bool annotateTNotifyRelease(ModuleOp module) { // Be conservative for pre-existing CFG: without a path-sensitive CFG data // flow here, every TNotify may observe any release-relevant work in the // function. - TNotifyReleaseState funcState = - collectTNotifyReleaseState(func.getOperation()); - markNestedTNotifyWithState(func.getOperation(), funcState, hasFailure); + TNotifyReleaseState funcState = collectTNotifyReleaseState(func.getBody()); + markNestedTNotifyWithState(func.getBody(), funcState, hasFailure); } return hasFailure; } @@ -434,6 +453,14 @@ static SignalAcquireState collectSignalAcquireState(Operation *op) { return state; } +static SignalAcquireState collectSignalAcquireState(Region ®ion) { + SignalAcquireState state; + for (Block &block : region) + for (Operation &nested : block) + state.merge(collectSignalAcquireState(&nested)); + return state; +} + static void markNestedAcquireLoadsWithState(Operation *op, SignalAcquireState state, bool &hasFailure) { @@ -444,6 +471,15 @@ static void markNestedAcquireLoadsWithState(Operation *op, }); } +static void markNestedAcquireLoadsWithState(Region ®ion, + SignalAcquireState state, + bool &hasFailure) { + for (Block &block : region) { + for (Operation &nested : block) + markNestedAcquireLoadsWithState(&nested, state, hasFailure); + } +} + static SignalAcquireState annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState, bool &hasFailure) { @@ -478,8 +514,8 @@ annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState, combinedRegionExitState.merge( annotateSignalAcquireForBlock(region.front(), state, hasFailure)); } else { - markNestedAcquireLoadsWithState(&op, state, hasFailure); - SignalAcquireState regionState = collectSignalAcquireState(&op); + markNestedAcquireLoadsWithState(region, state, hasFailure); + SignalAcquireState regionState = 
collectSignalAcquireState(region); SignalAcquireState regionExitState = state; regionExitState.merge(regionState); combinedRegionExitState.merge(regionExitState); @@ -496,15 +532,17 @@ annotateSignalAcquireForBlock(Block &block, SignalAcquireState entryState, static bool annotateSignalAcquire(ModuleOp module) { bool hasFailure = false; for (auto func : module.getOps()) { + if (func.isExternal()) + continue; + if (func.getBody().hasOneBlock()) { (void)annotateSignalAcquireForBlock(func.getBody().front(), SignalAcquireState{}, hasFailure); continue; } - SignalAcquireState funcState = - collectSignalAcquireState(func.getOperation()); - markNestedAcquireLoadsWithState(func.getOperation(), funcState, hasFailure); + SignalAcquireState funcState = collectSignalAcquireState(func.getBody()); + markNestedAcquireLoadsWithState(func.getBody(), funcState, hasFailure); } return hasFailure; } diff --git a/test/lit/pto/memory_consistency_external_func.pto b/test/lit/pto/memory_consistency_external_func.pto new file mode 100644 index 0000000000..e24951f0cd --- /dev/null +++ b/test/lit/pto/memory_consistency_external_func.pto @@ -0,0 +1,37 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// MemoryConsistency must skip external func declarations. They have no body to +// scan and should not affect release/acquire state in real kernels. 
+ +// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +module { + func.func private @external_consumer(!pto.ptr) + + func.func @external_func_decl_is_skipped(%signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } +} + +// CHECK-LABEL: AICORE void external_func_decl_is_skipped( +// CHECK-NOT: pipe_barrier( +// CHECK-NOT: dsb( +// CHECK: pto::comm::TNOTIFY( From f0a49646793d59817a73211e18ee5f4a805c9384 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Wed, 1 Jul 2026 09:20:38 +0800 Subject: [PATCH 8/9] Fix loop release summary in memory consistency --- lib/PTO/Transforms/PTOMemoryConsistency.cpp | 58 ++++++++++++++++++- .../pto/memory_consistency_loop_release.pto | 56 ++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 test/lit/pto/memory_consistency_loop_release.pto diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp index e4eaaf4e00..ebd27fac53 100644 --- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp +++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp @@ -231,6 +231,62 @@ static TNotifyReleaseState collectTNotifyReleaseState(Region ®ion) { return state; } +static void applyFenceReleaseForSummary(pto::FenceReleaseOp fence, + TNotifyReleaseState &state) { + if (fence.getScope().getScope() != pto::FenceScope::DDR) + return; + + // The real annotation pass inserts the pending GM-write pipe drain before a + // release fence. 
Loop summaries must model that transfer without mutating IR, + // otherwise already-released loop-carried writes are reported again at the + // next iteration's TNotify. + state.drainMte3 = false; + state.drainFix = false; + state.applyFenceRelease(fence.getScope().getScope()); +} + +static TNotifyReleaseState getTNotifyReleaseExitStateForBlock( + Block &block, TNotifyReleaseState pendingState); + +static TNotifyReleaseState +getTNotifyReleaseExitState(Operation *op, + TNotifyReleaseState pendingState = {}) { + if (isa(op)) + pendingState.clear(); + + pendingState.merge(getDirectTNotifyReleaseState(op)); + + TNotifyReleaseState regionEntryState = pendingState; + TNotifyReleaseState combinedRegionExitState; + for (Region ®ion : op->getRegions()) { + if (region.hasOneBlock()) { + combinedRegionExitState.merge( + getTNotifyReleaseExitStateForBlock(region.front(), regionEntryState)); + continue; + } + + TNotifyReleaseState regionExitState = regionEntryState; + regionExitState.merge(collectTNotifyReleaseState(region)); + combinedRegionExitState.merge(regionExitState); + } + pendingState.merge(combinedRegionExitState); + + if (auto barrier = dyn_cast(op)) + pendingState.applyBarrier(barrier.getPipe().getPipe()); + if (auto cmo = dyn_cast(op)) + pendingState.applyCmoClean(cmo.getSpace().getAddressSpace()); + if (auto fence = dyn_cast(op)) + applyFenceReleaseForSummary(fence, pendingState); + return pendingState; +} + +static TNotifyReleaseState getTNotifyReleaseExitStateForBlock( + Block &block, TNotifyReleaseState pendingState) { + for (Operation &op : block) + pendingState = getTNotifyReleaseExitState(&op, pendingState); + return pendingState; +} + static bool isLoopLikeOp(Operation *op) { return isa(op); } @@ -333,7 +389,7 @@ annotateTNotifyReleaseForBlock(Block &block, for (Region ®ion : op.getRegions()) { TNotifyReleaseState nestedLoopCarriedState = loopCarriedState; if (isLoopLikeOp(&op)) - nestedLoopCarriedState.merge(collectTNotifyReleaseState(&op)); + 
nestedLoopCarriedState.merge(getTNotifyReleaseExitState(&op)); if (region.hasOneBlock()) { combinedRegionExitState.merge(annotateTNotifyReleaseForBlock( diff --git a/test/lit/pto/memory_consistency_loop_release.pto b/test/lit/pto/memory_consistency_loop_release.pto new file mode 100644 index 0000000000..fd11820c47 --- /dev/null +++ b/test/lit/pto/memory_consistency_loop_release.pto @@ -0,0 +1,56 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +// A loop-local release fence must clear the loop-carried GM-write pending state. +// Otherwise the next iteration's TNotify is falsely diagnosed as missing an +// explicit release fence even though each iteration already has one. 
+ +module { + func.func @loop_tstore_release_tnotify( + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + + scf.for %i = %c0 to %c2 step %c1 { + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<1x32xf32>) + pto.fence.release #pto.fence_scope + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + } + return + } +} + +// CHECK-LABEL: AICORE void loop_tstore_release_tnotify( +// CHECK: TSTORE( +// CHECK: pipe_barrier(PIPE_MTE3); +// CHECK-NEXT: dsb(DSB_DDR); +// CHECK-NEXT: pto::comm::TNOTIFY( From ec771dc312dc856c9140c768a3091de0777d3ff2 Mon Sep 17 00:00:00 2001 From: TaoTao-real Date: Wed, 1 Jul 2026 09:21:59 +0800 Subject: [PATCH 9/9] Reject non-inlined memory consistency calls --- .../ptoas-memory-consistency-design.md | 6 ++ lib/PTO/Transforms/PTOMemoryConsistency.cpp | 82 ++++++++++++++++++- ...ory_consistency_noninline_call_invalid.pto | 59 +++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 test/lit/pto/memory_consistency_noninline_call_invalid.pto diff --git a/docs/designs/ptoas-memory-consistency-design.md b/docs/designs/ptoas-memory-consistency-design.md index ef2fd58e38..f7428a75f6 100644 --- 
a/docs/designs/ptoas-memory-consistency-design.md +++ b/docs/designs/ptoas-memory-consistency-design.md @@ -139,6 +139,12 @@ VPTO backend 都会先经过这一步。 region 暂不做 path-sensitive 数据流,但只在当前 region 内收集 pending state,不把同一个 parent op 的其他 sibling region 状态混入。外部函数声明没有函数体,pass 会直接跳过。 +`func.call` 边界不做上下文敏感的数据流传播。若 same-module 非内联 callee 的传递调用闭包 +中包含 payload 访问、CMO、fence 或 signal 相关 PTO op,pass 会报错并要求在 +`pto-memory-consistency` 前完成 inline。这样可以避免 caller 在 `TNotify` 前看不到 callee +内部 pending payload write,或者 callee 内部 cacheable payload read 看不到 caller 侧 +`TWait` acquire state。 + 这个 pass 不负责分配 event id,也不属于 InsertSync 自动同步流水线。 ## 6. 场景规则 diff --git a/lib/PTO/Transforms/PTOMemoryConsistency.cpp b/lib/PTO/Transforms/PTOMemoryConsistency.cpp index ebd27fac53..541d8412fc 100644 --- a/lib/PTO/Transforms/PTOMemoryConsistency.cpp +++ b/lib/PTO/Transforms/PTOMemoryConsistency.cpp @@ -13,7 +13,9 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/SymbolTable.h" #include "mlir/Pass/Pass.h" +#include "llvm/ADT/DenseSet.h" namespace mlir { namespace pto { @@ -291,6 +293,83 @@ static bool isLoopLikeOp(Operation *op) { return isa(op); } +static func::FuncOp lookupCallee(func::CallOp call) { + return SymbolTable::lookupNearestSymbolFrom( + call.getOperation(), call.getCalleeAttr()); +} + +static bool isMemoryConsistencyRelevantDirectOp(Operation *op) { + if (isa(op)) + return true; + + if (auto load = dyn_cast(op)) + return isGmScalarMemory(load.getPtr().getType()); + if (auto store = dyn_cast(op)) + return isGmScalarMemory(store.getPtr().getType()); + + TNotifyReleaseState macroState = getReleaseStateForMacroModel(op); + return macroState.drainMte2 || macroState.drainMte3 || + macroState.drainFix || macroState.cleanGmCache || + macroState.needsDsbDdr; +} + +static bool calleeContainsMemoryConsistencyRelevantOps( + func::FuncOp callee, llvm::DenseSet &activeCallees) { + if (!callee || callee.isExternal()) + return 
false; + if (!activeCallees.insert(callee.getOperation()).second) + return false; + + WalkResult result = callee.walk([&](Operation *op) -> WalkResult { + if (op == callee.getOperation()) + return WalkResult::advance(); + + if (auto nestedCall = dyn_cast(op)) { + func::FuncOp nestedCallee = lookupCallee(nestedCall); + if (calleeContainsMemoryConsistencyRelevantOps(nestedCallee, + activeCallees)) + return WalkResult::interrupt(); + return WalkResult::advance(); + } + + if (isMemoryConsistencyRelevantDirectOp(op)) + return WalkResult::interrupt(); + return WalkResult::advance(); + }); + + activeCallees.erase(callee.getOperation()); + return result.wasInterrupted(); +} + +static bool diagnoseNonInlinedMemoryConsistencyCalls(ModuleOp module) { + bool hasFailure = false; + for (auto func : module.getOps()) { + if (func.isExternal()) + continue; + + func.walk([&](func::CallOp call) { + func::FuncOp callee = lookupCallee(call); + if (!callee || callee.isExternal()) + return; + + llvm::DenseSet activeCallees; + if (!calleeContainsMemoryConsistencyRelevantOps(callee, activeCallees)) + return; + + call.emitOpError() + << "calls @" << callee.getSymName() + << ", which contains PTO memory consistency relevant operations; " + "inline the callee before `pto-memory-consistency` or keep " + "payload, CMO, fence, and signal operations in the caller"; + hasFailure = true; + }); + } + return hasFailure; +} + static void setTNotifyReleaseAttrs(pto::TNotifyOp op, const TNotifyReleaseState &state) { op->removeAttr(kTNotifyDrainMte2AttrName); @@ -608,9 +687,10 @@ struct PTOMemoryConsistencyPass PTOMemoryConsistencyPass> { void runOnOperation() override { ModuleOp module = getOperation(); + bool callFailed = diagnoseNonInlinedMemoryConsistencyCalls(module); bool releaseFailed = annotateTNotifyRelease(module); bool acquireFailed = annotateSignalAcquire(module); - if (releaseFailed || acquireFailed) + if (callFailed || releaseFailed || acquireFailed) signalPassFailure(); } }; diff --git 
a/test/lit/pto/memory_consistency_noninline_call_invalid.pto b/test/lit/pto/memory_consistency_noninline_call_invalid.pto new file mode 100644 index 0000000000..88f003023b --- /dev/null +++ b/test/lit/pto/memory_consistency_noninline_call_invalid.pto @@ -0,0 +1,59 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// RUN: not ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +// Non-inlined calls are not context-sensitive: a caller-side TNotify cannot +// safely observe release-relevant payload writes hidden in a callee body. Such +// callees must be inlined before the memory consistency pass. 
+ +module { + func.func private @producer( + %tile: !pto.tile_buf, + %dst: !pto.partition_tensor_view<1x32xf32>) { + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<1x32xf32>) + return + } + + func.func @call_hidden_payload_write( + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + call @producer(%tile, %dst) : + (!pto.tile_buf, + !pto.partition_tensor_view<1x32xf32>) -> () + pto.fence.release #pto.fence_scope + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } +} + +// CHECK: calls @producer, which contains PTO memory consistency relevant operations +// CHECK: inline the callee before `pto-memory-consistency`