Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/PTO/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,12 @@ def PlanMemory : Pass<"pto-plan-memory", "ModuleOp"> {
Option<"restrictInplaceAsISA", "restrict-inplace-as-isa", "bool",
/*default=*/"false",
"restrict memory inplace as isa, default : false">,
Option<"orderBySize", "order-by-size", "bool",
/*default=*/"false",
"Process buffers largest-first (first-fit-decreasing order) during "
"local memory planning instead of the default DMA-first order. "
"Decreasing-size order packs heterogeneous-size buffers tighter "
"(matches XLA/TVM/SOMAS). default : false">,
];
}
def PTOLoweringSyncToPipe : Pass<"pto-lowering-sync-to-pipe", "func::FuncOp"> {
Expand Down
36 changes: 35 additions & 1 deletion lib/PTO/Transforms/PTOPlanMemory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1548,6 +1548,9 @@ void MemPlan::ReportCurEntryDebugInfo(const StorageEntry *curEntry) {

StorageEntry *
MemPlan::GetReorderRootStorageEntry(StorageEntry *rootStorageEntry) {
if (orderBySize) {
return GetSizeOrderedRootStorageEntry(rootStorageEntry);
}
if (rootStorageEntry->bufInfo->bufferScope != pto::AddressSpace::VEC) {
return rootStorageEntry;
}
Expand Down Expand Up @@ -1616,6 +1619,37 @@ void MemPlan::ReorderContinuousPingPongEntry(
reorderedStorageEntryVec.swap(storageEntryVec);
}

StorageEntry *
MemPlan::GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry) {
  // Largest-first (first-fit-decreasing) ordering of a merged group.
  //
  // The root and all of its merged children are treated as one flat list, the
  // list is ordered by decreasing `bufInfo->constBits`, and whichever entry
  // ends up first becomes the root of the group. No address-space check is
  // made here, so the reordering applies regardless of the buffer scope.
  //
  // Returns the entry that heads the reordered list; this may be a different
  // entry than `rootStorageEntry`.
  SmallVector<StorageEntry *> ordered;
  ordered.push_back(rootStorageEntry);
  ordered.append(rootStorageEntry->mergedChildren.begin(),
                 rootStorageEntry->mergedChildren.end());

  // The sort must be stable: entries of equal size keep their incoming
  // relative order, so a group of uniformly sized buffers is not permuted by
  // this step.
  const auto isLarger = [](const StorageEntry *lhs, const StorageEntry *rhs) {
    return lhs->bufInfo->constBits > rhs->bufInfo->constBits;
  };
  std::stable_sort(ordered.begin(), ordered.end(), isLarger);

  // Post-process the sorted list so ping-pong (double-buffer) partners stay
  // next to each other.
  ReorderContinuousPingPongEntry(ordered);

  // Promote the head of the list to root and hang every other entry below it.
  // NOTE(review): when the head differs from `rootStorageEntry`, the old
  // root's own `mergedChildren` list is left as it was — confirm that no
  // later consumer walks children of non-root entries.
  StorageEntry *newRoot = ordered.front();
  auto &children = newRoot->mergedChildren;
  children.clear();
  for (auto it = ordered.begin() + 1, last = ordered.end(); it != last; ++it) {
    children.push_back(*it);
  }
  return newRoot;
}

std::pair<size_t, size_t>
MemPlan::GetBufferSpaceInfo(pto::AddressSpace &space) const {
switch (space) {
Expand Down Expand Up @@ -2347,7 +2381,7 @@ void PlanMemoryPass::runOnOperation() {

MemPlan memPlan(this->memMode, this->enableGlobalReuse,
this->enablePrintMemoryAllocatedSize,
this->restrictInplaceAsISA);
this->restrictInplaceAsISA, this->orderBySize);
if (failed(memPlan.InitMemSpecsFromModule(funcOp))) {
return signalPassFailure();
}
Expand Down
12 changes: 10 additions & 2 deletions lib/PTO/Transforms/PTOPlanMemory.h
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,10 @@ using StorageEntryPair = std::pair<const StorageEntry *, const StorageEntry *>;
class MemPlan {
public:
MemPlan(MemPlanMode planMode, bool enableGlobalReuse, bool enablePrintMemoryAllocatedSize,
bool restrictInplaceAsISA)
bool restrictInplaceAsISA, bool orderBySize)
: planMode(planMode), enableGlobalReuse(enableGlobalReuse),
enablePrintMemoryAllocatedSize(enablePrintMemoryAllocatedSize),
restrictInplaceAsISA(restrictInplaceAsISA) {}
restrictInplaceAsISA(restrictInplaceAsISA), orderBySize(orderBySize) {}

LogicalResult plan();

Expand Down Expand Up @@ -480,6 +480,9 @@ class MemPlan {
/// enable PTO op plan memory inplace
bool restrictInplaceAsISA;

/// Process buffers largest-first (first-fit-decreasing) instead of DMA-first.
bool orderBySize;

/// StorageEntry generate.
void GenerateStorageEntry();

Expand Down Expand Up @@ -538,6 +541,11 @@ class MemPlan {
/// of buffers corresponding to DMA.
StorageEntry *GetReorderRootStorageEntry(StorageEntry *rootStorageEntry);

/// Reorder rootStorageEntry together with its merged children largest-first
/// (first-fit-decreasing, stable among equal sizes), then keep ping-pong
/// pairs contiguous. Unlike the DMA-first reorder, no address-space check is
/// made, so every memory space is reordered. Used when the order-by-size
/// option is enabled.
///
/// \param rootStorageEntry current root of the merged group.
/// \returns the entry that heads the reordered list; its mergedChildren is
///          rebuilt to hold all remaining entries. This may be a different
///          entry than \p rootStorageEntry.
StorageEntry *GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry);

/// Assign addresses without reuse.
void PlanBuffersWithoutReuse(StorageEntry *rootStorageEntry,
size_t alignUnit);
Expand Down
66 changes: 66 additions & 0 deletions test/lit/pto/plan_memory_order_by_size_reuse.pto
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Heterogeneous forced-reuse case (extracted from the TileLang tsort32 suite):
// three UB tiles of different sizes (src 32KB, idx 32KB, dst 128KB) whose total
// exceeds the UB budget, so PlanMemory takes the reuse path where allocation
// ORDER matters. With the default (DMA/gen) order the largest tile (dst) is
// allocated last and lands at a high offset. With --plan-memory-order-by-size
// (first-fit-decreasing) the largest tile is allocated first and gets offset 0.
//
// RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT
// RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE

module {
// Kernel with two small input tiles and one large output tile, all in the vec
// address space, used to observe the order in which PlanMemory places them.
func.func @order_by_size_reuse(%src_ptr: !pto.ptr<f32>, %idx_ptr: !pto.ptr<ui32>, %dst_ptr: !pto.ptr<f32>) attributes {pto.kernel} {
// Index constants shared by the shapes, strides and offsets below.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c8192 = arith.constant 8192 : index
%c32768 = arith.constant 32768 : index

// Wrap each pointer argument as a 5-D tensor view; only the innermost
// dimension is non-unit (8192 elements for src/idx, 32768 for dst).
%src_view = pto.make_tensor_view %src_ptr,
shape = [%c1, %c1, %c1, %c1, %c8192],
strides = [%c8192, %c8192, %c8192, %c8192, %c1]
: !pto.tensor_view<1x1x1x1x8192xf32>
%idx_view = pto.make_tensor_view %idx_ptr,
shape = [%c1, %c1, %c1, %c1, %c8192],
strides = [%c8192, %c8192, %c8192, %c8192, %c1]
: !pto.tensor_view<1x1x1x1x8192xui32>
%dst_view = pto.make_tensor_view %dst_ptr,
shape = [%c1, %c1, %c1, %c1, %c32768],
strides = [%c32768, %c32768, %c32768, %c32768, %c1]
: !pto.tensor_view<1x1x1x1x32768xf32>

// Take a partition of each view starting at offset 0 and spanning the whole
// view.
%src_part = pto.partition_view %src_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c8192]
: !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32>
%idx_part = pto.partition_view %idx_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c8192]
: !pto.tensor_view<1x1x1x1x8192xui32> -> !pto.partition_tensor_view<1x1x1x1x8192xui32>
%dst_part = pto.partition_view %dst_view,
offsets = [%c0, %c0, %c0, %c0, %c0],
sizes = [%c1, %c1, %c1, %c1, %c32768]
: !pto.tensor_view<1x1x1x1x32768xf32> -> !pto.partition_tensor_view<1x1x1x1x32768xf32>

// The three vec tiles whose placement is under test. Declaration order is
// small, small, large: 8192 x f32, 8192 x ui32, then 32768 x f32.
%src_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xf32>
%idx_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xui32>
%dst_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x32768xf32>

// Load both input partitions into their tiles.
pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>)
outs(%src_tile : !pto.tile_buf<vec, 1x8192xf32>)
pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x8192xui32>)
outs(%idx_tile : !pto.tile_buf<vec, 1x8192xui32>)

// tsort32 reads both input tiles and writes the large output tile, so all
// three tiles are in use by this op.
pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf<vec, 1x8192xf32>,
!pto.tile_buf<vec, 1x8192xui32>)
outs(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>)

// Store the output tile back through the dst partition.
pto.tstore ins(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>)
outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x32768xf32>)
return
}
}

// The largest tile (dst, 1x32768xf32) is placed at offset 0 ONLY with
// order-by-size; the default order leaves offset 0 to a smaller input tile.
// BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
// DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
8 changes: 8 additions & 0 deletions tools/ptoas/ptoas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,13 @@ static llvm::cl::opt<bool> enableInsertSync("enable-insert-sync",
llvm::cl::desc("Enable automatic synchronization insertion pass"),
llvm::cl::init(false));

// --plan-memory-order-by-size: opt-in switch (off by default) whose value is
// copied into the PlanMemory pass option `orderBySize` when the pass pipeline
// is assembled.
static llvm::cl::opt<bool> planMemoryOrderBySize(
    "plan-memory-order-by-size",
    llvm::cl::desc(
        "PlanMemory: allocate buffers largest-first (first-fit-decreasing) "
        "instead of the default DMA-first order"),
    llvm::cl::init(false));

static llvm::cl::opt<bool> enableBufidSync(
"enable-bufid_sync",
llvm::cl::desc("Enable A5 buffer-id synchronization insertion pass"),
Expand Down Expand Up @@ -1864,6 +1871,7 @@ int mlir::pto::compilePTOASModule(
planMemoryOption.memMode = MemPlanMode::LOCAL_MEM_PLAN;
planMemoryOption.enableGlobalReuse = false;
planMemoryOption.enablePrintMemoryAllocatedSize = false;
planMemoryOption.orderBySize = planMemoryOrderBySize;
pm.addPass(pto::createPlanMemoryPass(planMemoryOption));
}
pm.addPass(pto::createPTOResolveReservedBuffersPass());
Expand Down