Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/PTO/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,12 @@ def PlanMemory : Pass<"pto-plan-memory", "ModuleOp"> {
Option<"restrictInplaceAsISA", "restrict-inplace-as-isa", "bool",
/*default=*/"false",
"restrict memory inplace as isa, default : false">,
Option<"orderBySize", "order-by-size", "bool",
/*default=*/"false",
"Process buffers largest-first (first-fit-decreasing order) during "
"local memory planning instead of the default DMA-first order. "
"Decreasing-size order packs heterogeneous-size buffers tighter "
"(matches XLA/TVM/SOMAS). default : false">,
];
}
def PTOLoweringSyncToPipe : Pass<"pto-lowering-sync-to-pipe", "func::FuncOp"> {
Expand Down
55 changes: 54 additions & 1 deletion lib/PTO/Transforms/PTOPlanMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1334,6 +1334,13 @@ bool MemPlan::IsEnoughForBuffersNoReuse(StorageEntry *rootStorageEntry,
if (iter == bufferScope2RequiredSize.end())
llvm::report_fatal_error("missing required-size entry for buffer scope");
if (iter->second < restBufferSize) {
// Even when the scope fits without reuse (no peak to save), honor
// largest-first placement so the option means the same thing on both paths:
// a deterministic decreasing-size layout regardless of whether reuse kicks
// in. Stable sort keeps uniform-size scopes byte-identical to the default.
if (orderBySize) {
rootStorageEntry = GetSizeOrderedRootStorageEntry(rootStorageEntry);
}
PlanBuffersWithoutReuse(rootStorageEntry, alignUnit);
return true;
}
Expand Down Expand Up @@ -1548,6 +1555,9 @@ void MemPlan::ReportCurEntryDebugInfo(const StorageEntry *curEntry) {

StorageEntry *
MemPlan::GetReorderRootStorageEntry(StorageEntry *rootStorageEntry) {
if (orderBySize) {
return GetSizeOrderedRootStorageEntry(rootStorageEntry);
}
if (rootStorageEntry->bufInfo->bufferScope != pto::AddressSpace::VEC) {
return rootStorageEntry;
}
Expand Down Expand Up @@ -1616,6 +1626,49 @@ void MemPlan::ReorderContinuousPingPongEntry(
reorderedStorageEntryVec.swap(storageEntryVec);
}

StorageEntry *
MemPlan::GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry) {
// First-fit-decreasing: place the largest buffers first. For the heterogeneous
// buffer sizes that real kernels produce, decreasing-size order packs tighter
// than an arbitrary/DMA-first order (this is the ordering XLA, TVM and SOMAS
// all use). Applies to every memory space, unlike the DMA-first reorder which
// is VEC-only.
SmallVector<StorageEntry *> entries = {rootStorageEntry};
entries.insert(entries.end(), rootStorageEntry->mergedChildren.begin(),
rootStorageEntry->mergedChildren.end());

// Stable sort by decreasing buffer size. Stable keeps the original order among
// equal-size buffers, so uniform-size instances (e.g. the plan_memory_* tests)
// are left untouched.
std::stable_sort(entries.begin(), entries.end(),
[](const StorageEntry *a, const StorageEntry *b) {
return a->bufInfo->constBits > b->bufInfo->constBits;
});

// Keep ping-pong (double-buffer) pairs contiguous so double-buffering is
// preserved (same post-processing the DMA-first path applies).
ReorderContinuousPingPongEntry(entries);

// Rebuild the flat root -> children structure around the new (largest) root.
// Clear every entry's child list first: when the root changes, the previous
// root would otherwise keep its stale child list (forming a cycle), and only
// the new root should carry the flat list of the others.
StorageEntry *reorderedRootStorageEntry = entries[0];
for (StorageEntry *entry : entries) {
entry->mergedChildren.clear();
}
for (size_t j = 1; j < entries.size(); ++j) {
reorderedRootStorageEntry->mergedChildren.push_back(entries[j]);
}
// Keep the scope -> root map consistent so later consumers (RecordOverflowIfAny,
// PrintSuccessfulAllocatedMaxBits) read the new root and its full child list.
// This must accompany the clear above: clearing children without updating the
// map would leave the stale root pointing at an empty child list.
memscope2rootStorageEntry[reorderedRootStorageEntry->bufInfo->bufferScope] =

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non-blocking (latent fragility): this writes memscope2rootStorageEntry[scope] while the same map is being range-iterated in PlanMemAddressOfWholeLocalBuffer (for (auto &it : memscope2rootStorageEntry)), reached through both IsEnoughForBuffersNoReuse and PlanReusableLocalBuffer / PlanMemAddressForLevel0.

It is safe today only because scope is always the key already being visited (every entry in a scope shares that scope, and the key was inserted in MergeSameScopeSE), so DenseMap::operator[] hits an existing bucket and never inserts or reallocates -- the active iterator stays valid. That is an implicit invariant. If a future change ever routes a scope not yet in the map through this path, operator[] would insert, possibly rehash, and invalidate the in-flight range-for iterator (UB). Cheap hardening: assign through the iterator the caller already holds, or assert memscope2rootStorageEntry.count(scope) here before writing.

reorderedRootStorageEntry;
return reorderedRootStorageEntry;
Comment on lines +1656 to +1669

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

When the root storage entry of a scope changes after sorting, the old root becomes a child of the new root. If we do not clear the mergedChildren of the old root, it will still contain references to the other entries (including the new root), creating potential cycles or stale child references in the tree. Clearing mergedChildren for all entries in entries before rebuilding the new root's children prevents this.

Additionally, the memscope2rootStorageEntry map is used by other methods (such as RecordOverflowIfAny and PrintSuccessfulAllocatedMaxBits) to find the root storage entry of a scope. If we do not update this map when the root changes, those methods will continue to operate on the old root, leading to incorrect or incomplete analysis.

  for (auto *entry : entries) {
    entry->mergedChildren.clear();
  }
  StorageEntry *reorderedRootStorageEntry = entries[0];
  for (size_t j = 1; j < entries.size(); ++j) {
    reorderedRootStorageEntry->mergedChildren.push_back(entries[j]);
  }
  memscope2rootStorageEntry[reorderedRootStorageEntry->bufInfo->bufferScope] = reorderedRootStorageEntry;
  return reorderedRootStorageEntry;

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks — applied. The new root now clears every entry's mergedChildren before rebuilding the flat child list, and updates memscope2rootStorageEntry to point at the new root.

Worth noting the two changes must go together: clearing the old root's children without updating the map would leave RecordOverflowIfAny reading the stale root with an empty child list (under-reporting the peak). With both applied, the scope's root + children stay consistent. (In the prior version the old root kept its complete child list, so overflow detection was still correct — but the tree had a stale back-reference; this makes it well-formed.)

}

std::pair<size_t, size_t>
MemPlan::GetBufferSpaceInfo(pto::AddressSpace &space) const {
switch (space) {
Expand Down Expand Up @@ -2347,7 +2400,7 @@ void PlanMemoryPass::runOnOperation() {

MemPlan memPlan(this->memMode, this->enableGlobalReuse,
this->enablePrintMemoryAllocatedSize,
this->restrictInplaceAsISA);
this->restrictInplaceAsISA, this->orderBySize);
if (failed(memPlan.InitMemSpecsFromModule(funcOp))) {
return signalPassFailure();
}
Expand Down
12 changes: 10 additions & 2 deletions lib/PTO/Transforms/PTOPlanMemory.h
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,10 @@ using StorageEntryPair = std::pair<const StorageEntry *, const StorageEntry *>;
class MemPlan {
public:
MemPlan(MemPlanMode planMode, bool enableGlobalReuse, bool enablePrintMemoryAllocatedSize,
bool restrictInplaceAsISA)
bool restrictInplaceAsISA, bool orderBySize)
: planMode(planMode), enableGlobalReuse(enableGlobalReuse),
enablePrintMemoryAllocatedSize(enablePrintMemoryAllocatedSize),
restrictInplaceAsISA(restrictInplaceAsISA) {}
restrictInplaceAsISA(restrictInplaceAsISA), orderBySize(orderBySize) {}

LogicalResult plan();

Expand Down Expand Up @@ -480,6 +480,9 @@ class MemPlan {
/// enable PTO op plan memory inplace
bool restrictInplaceAsISA;

/// Process buffers largest-first (first-fit-decreasing) instead of DMA-first.
bool orderBySize;

/// StorageEntry generate.
void GenerateStorageEntry();

Expand Down Expand Up @@ -538,6 +541,11 @@ class MemPlan {
/// of buffers corresponding to DMA.
StorageEntry *GetReorderRootStorageEntry(StorageEntry *rootStorageEntry);

/// Reorder rootStorageEntry's children largest-first (first-fit-decreasing)
/// across every memory space, keeping ping-pong pairs contiguous. Used when
/// the order-by-size option is enabled.
StorageEntry *GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry);

/// Assign addresses without reuse.
void PlanBuffersWithoutReuse(StorageEntry *rootStorageEntry,
size_t alignUnit);
Expand Down
69 changes: 69 additions & 0 deletions test/lit/pto/plan_memory_order_by_size_noreuse.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Heterogeneous NO-reuse case: three UB tiles of mixed sizes (src 8KB,
// idx 8KB, dst 32KB) whose total (48KB) fits the UB budget, so PlanMemory takes
// the fast no-reuse path (PlanBuffersWithoutReuse) rather than the reuse path.
// The peak is the same either way here, but --plan-memory-order-by-size must
// still give a deterministic largest-first layout on this path too: without it
// the first-generated (small) input tile lands at offset 0; with it the largest
// tile (dst) is placed first at offset 0. This is the regression guard for the
// contract that the option means the same thing whether or not reuse kicks in.
//
// RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT
// RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE

module {
// Kernel under test: loads the src and idx tiles, runs tsort32 into the dst
// tile, and stores dst back out. The three alloc_tile results below are the
// buffers whose assigned offsets the FileCheck lines in this file inspect.
func.func @order_by_size_noreuse(%src_ptr: !pto.ptr<f32>, %idx_ptr: !pto.ptr<ui32>, %dst_ptr: !pto.ptr<f32>) attributes {pto.kernel} {
// Index constants reused for view shapes, strides, offsets and sizes.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2048 = arith.constant 2048 : index
%c8192 = arith.constant 8192 : index

// Wrap each kernel pointer in a 5-D tensor view. Only the innermost dimension
// is larger than 1: 2048 elements for src/idx, 8192 elements for dst.
%src_view = pto.make_tensor_view %src_ptr,
shape = [%c1, %c1, %c1, %c1, %c2048],
strides = [%c2048, %c2048, %c2048, %c2048, %c1]
: !pto.tensor_view<1x1x1x1x2048xf32>
%idx_view = pto.make_tensor_view %idx_ptr,
shape = [%c1, %c1, %c1, %c1, %c2048],
strides = [%c2048, %c2048, %c2048, %c2048, %c1]
: !pto.tensor_view<1x1x1x1x2048xui32>
%dst_view = pto.make_tensor_view %dst_ptr,
shape = [%c1, %c1, %c1, %c1, %c8192],
strides = [%c8192, %c8192, %c8192, %c8192, %c1]
: !pto.tensor_view<1x1x1x1x8192xf32>

// Partition each view starting at offset 0 with the same extent as the view,
// i.e. each partition spans the whole view.
%src_part = pto.partition_view %src_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c2048]
: !pto.tensor_view<1x1x1x1x2048xf32> -> !pto.partition_tensor_view<1x1x1x1x2048xf32>
%idx_part = pto.partition_view %idx_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c2048]
: !pto.tensor_view<1x1x1x1x2048xui32> -> !pto.partition_tensor_view<1x1x1x1x2048xui32>
%dst_part = pto.partition_view %dst_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c8192]
: !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32>

// Three vec-space tiles, created in the order src, idx, dst. The dst tile
// (1x8192xf32) is the largest and is created last, which is what makes the
// default layout and the largest-first layout distinguishable.
%src_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x2048xf32>
%idx_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x2048xui32>
%dst_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xf32>

// Load the src and idx partitions into their tiles.
pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x2048xf32>)
outs(%src_tile : !pto.tile_buf<vec, 1x2048xf32>)
pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x2048xui32>)
outs(%idx_tile : !pto.tile_buf<vec, 1x2048xui32>)

// tsort32 consumes the src and idx tiles and produces the dst tile.
pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf<vec, 1x2048xf32>,
!pto.tile_buf<vec, 1x2048xui32>)
outs(%dst_tile : !pto.tile_buf<vec, 1x8192xf32>)

// Write the dst tile back through the dst partition.
pto.tstore ins(%dst_tile : !pto.tile_buf<vec, 1x8192xf32>)
outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>)
return
}
}

// On the no-reuse fast path the largest tile (dst, 1x8192xf32) gets offset 0
// ONLY with order-by-size; the default order leaves offset 0 to the smaller
// first-generated input tile.
// BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x8192xf32
// DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x8192xf32
66 changes: 66 additions & 0 deletions test/lit/pto/plan_memory_order_by_size_reuse.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Heterogeneous forced-reuse case (extracted from the TileLang tsort32 suite):
// three UB tiles of mixed sizes (src 32KB, idx 32KB, dst 128KB) whose total
// exceeds the UB budget, so PlanMemory takes the reuse path where allocation
// ORDER matters. With the default (DMA/gen) order the largest tile (dst) is
// allocated last and lands at a high offset. With --plan-memory-order-by-size
// (first-fit-decreasing) the largest tile is allocated first and gets offset 0.
//
// RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT
// RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE

module {
// Kernel under test: loads the src and idx tiles, runs tsort32 into the dst
// tile, and stores dst back out. The three alloc_tile results below are the
// buffers whose assigned offsets the FileCheck lines in this file inspect.
func.func @order_by_size_reuse(%src_ptr: !pto.ptr<f32>, %idx_ptr: !pto.ptr<ui32>, %dst_ptr: !pto.ptr<f32>) attributes {pto.kernel} {
// Index constants reused for view shapes, strides, offsets and sizes.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c8192 = arith.constant 8192 : index
%c32768 = arith.constant 32768 : index

// Wrap each kernel pointer in a 5-D tensor view. Only the innermost dimension
// is larger than 1: 8192 elements for src/idx, 32768 elements for dst.
%src_view = pto.make_tensor_view %src_ptr,
shape = [%c1, %c1, %c1, %c1, %c8192],
strides = [%c8192, %c8192, %c8192, %c8192, %c1]
: !pto.tensor_view<1x1x1x1x8192xf32>
%idx_view = pto.make_tensor_view %idx_ptr,
shape = [%c1, %c1, %c1, %c1, %c8192],
strides = [%c8192, %c8192, %c8192, %c8192, %c1]
: !pto.tensor_view<1x1x1x1x8192xui32>
%dst_view = pto.make_tensor_view %dst_ptr,
shape = [%c1, %c1, %c1, %c1, %c32768],
strides = [%c32768, %c32768, %c32768, %c32768, %c1]
: !pto.tensor_view<1x1x1x1x32768xf32>

// Partition each view starting at offset 0 with the same extent as the view,
// i.e. each partition spans the whole view.
%src_part = pto.partition_view %src_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c8192]
: !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32>
%idx_part = pto.partition_view %idx_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c8192]
: !pto.tensor_view<1x1x1x1x8192xui32> -> !pto.partition_tensor_view<1x1x1x1x8192xui32>
%dst_part = pto.partition_view %dst_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c32768]
: !pto.tensor_view<1x1x1x1x32768xf32> -> !pto.partition_tensor_view<1x1x1x1x32768xf32>

// Three vec-space tiles, created in the order src, idx, dst. The dst tile
// (1x32768xf32) is the largest and is created last, which is what makes the
// default layout and the largest-first layout distinguishable.
%src_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xf32>
%idx_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xui32>
%dst_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x32768xf32>

// Load the src and idx partitions into their tiles.
pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>)
outs(%src_tile : !pto.tile_buf<vec, 1x8192xf32>)
pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x8192xui32>)
outs(%idx_tile : !pto.tile_buf<vec, 1x8192xui32>)

// tsort32 consumes the src and idx tiles and produces the dst tile.
pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf<vec, 1x8192xf32>,
!pto.tile_buf<vec, 1x8192xui32>)
outs(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>)

// Write the dst tile back through the dst partition.
pto.tstore ins(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>)
outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x32768xf32>)
return
}
}

// The largest tile (dst, 1x32768xf32) is placed at offset 0 ONLY with
// order-by-size; the default order leaves offset 0 to a smaller input tile.
// BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
// DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
8 changes: 8 additions & 0 deletions tools/ptoas/ptoas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,13 @@ static llvm::cl::opt<bool> enableInsertSync("enable-insert-sync",
llvm::cl::desc("Enable automatic synchronization insertion pass"),
llvm::cl::init(false));

// Command-line switch `--plan-memory-order-by-size` (off by default). Its value
// is copied into the PlanMemory pass options (`orderBySize`) when the pass
// pipeline is assembled in compilePTOASModule.
static llvm::cl::opt<bool> planMemoryOrderBySize(
"plan-memory-order-by-size",
llvm::cl::desc("PlanMemory: allocate buffers largest-first "
"(first-fit-decreasing) instead of the default DMA-first "
"order"),
llvm::cl::init(false));

static llvm::cl::opt<bool> enableBufidSync(
"enable-bufid_sync",
llvm::cl::desc("Enable A5 buffer-id synchronization insertion pass"),
Expand Down Expand Up @@ -1864,6 +1871,7 @@ int mlir::pto::compilePTOASModule(
planMemoryOption.memMode = MemPlanMode::LOCAL_MEM_PLAN;
planMemoryOption.enableGlobalReuse = false;
planMemoryOption.enablePrintMemoryAllocatedSize = false;
planMemoryOption.orderBySize = planMemoryOrderBySize;
pm.addPass(pto::createPlanMemoryPass(planMemoryOption));
}
pm.addPass(pto::createPTOResolveReservedBuffersPass());
Expand Down