diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td
index 6256c9de03..6c79c9c8d2 100644
--- a/include/PTO/Transforms/Passes.td
+++ b/include/PTO/Transforms/Passes.td
@@ -178,6 +178,12 @@ def PlanMemory : Pass<"pto-plan-memory", "ModuleOp"> {
     Option<"restrictInplaceAsISA", "restrict-inplace-as-isa", "bool",
            /*default=*/"false",
            "restrict memory inplace as isa, default : false">,
+    Option<"orderBySize", "order-by-size", "bool",
+           /*default=*/"false",
+           "Process buffers largest-first (first-fit-decreasing order) during "
+           "local memory planning instead of the default DMA-first order. "
+           "Decreasing-size order packs heterogeneous-size buffers tighter "
+           "(matches XLA/TVM/SOMAS). default : false">,
   ];
 }
 def PTOLoweringSyncToPipe : Pass<"pto-lowering-sync-to-pipe", "func::FuncOp"> {
diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp
index aa3196a672..15c627a002 100644
--- a/lib/PTO/Transforms/PTOPlanMemory.cpp
+++ b/lib/PTO/Transforms/PTOPlanMemory.cpp
@@ -1548,6 +1548,9 @@ void MemPlan::ReportCurEntryDebugInfo(const StorageEntry *curEntry) {
 
 StorageEntry *
 MemPlan::GetReorderRootStorageEntry(StorageEntry *rootStorageEntry) {
+  if (orderBySize) {
+    return GetSizeOrderedRootStorageEntry(rootStorageEntry);
+  }
   if (rootStorageEntry->bufInfo->bufferScope != pto::AddressSpace::VEC) {
     return rootStorageEntry;
   }
@@ -1616,6 +1619,37 @@ void MemPlan::ReorderContinuousPingPongEntry(
   reorderedStorageEntryVec.swap(storageEntryVec);
 }
 
+StorageEntry *
+MemPlan::GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry) {
+  // First-fit-decreasing: visit the largest buffers first. The stated
+  // rationale is that decreasing-size order packs heterogeneous buffer sizes
+  // tighter than the DMA-first order (the ordering XLA, TVM and SOMAS use).
+  // Unlike the DMA-first reorder, this path is not restricted to VEC.
+  SmallVector<StorageEntry *> entries = {rootStorageEntry};
+  entries.insert(entries.end(), rootStorageEntry->mergedChildren.begin(),
+                 rootStorageEntry->mergedChildren.end());
+
+  // Stable sort by decreasing constBits: entries of equal size keep their
+  // original relative order.
+  std::stable_sort(entries.begin(), entries.end(),
+                   [](const StorageEntry *a, const StorageEntry *b) {
+                     return a->bufInfo->constBits > b->bufInfo->constBits;
+                   });
+
+  // Keep ping-pong (double-buffer) pairs adjacent, as the DMA-first path does.
+  ReorderContinuousPingPongEntry(entries);
+
+  // The first entry becomes the root. Drop the old root's child list first:
+  // if it was demoted it must not keep referencing its former siblings.
+  rootStorageEntry->mergedChildren.clear();
+  StorageEntry *reorderedRootStorageEntry = entries.front();
+  reorderedRootStorageEntry->mergedChildren.clear();
+  for (size_t j = 1; j < entries.size(); ++j) {
+    reorderedRootStorageEntry->mergedChildren.push_back(entries[j]);
+  }
+  return reorderedRootStorageEntry;
+}
+
 std::pair
 MemPlan::GetBufferSpaceInfo(pto::AddressSpace &space) const {
   switch (space) {
@@ -2347,7 +2381,7 @@ void PlanMemoryPass::runOnOperation() {
 
     MemPlan memPlan(this->memMode, this->enableGlobalReuse,
                     this->enablePrintMemoryAllocatedSize,
-                    this->restrictInplaceAsISA);
+                    this->restrictInplaceAsISA, this->orderBySize);
     if (failed(memPlan.InitMemSpecsFromModule(funcOp))) {
       return signalPassFailure();
     }
diff --git a/lib/PTO/Transforms/PTOPlanMemory.h b/lib/PTO/Transforms/PTOPlanMemory.h
index ede0d557ae..7bc2bcaa03 100644
--- a/lib/PTO/Transforms/PTOPlanMemory.h
+++ b/lib/PTO/Transforms/PTOPlanMemory.h
@@ -415,10 +415,10 @@ using StorageEntryPair = std::pair;
 class MemPlan {
 public:
   MemPlan(MemPlanMode planMode, bool enableGlobalReuse, bool enablePrintMemoryAllocatedSize,
-          bool restrictInplaceAsISA)
+          bool restrictInplaceAsISA, bool orderBySize)
       : planMode(planMode), enableGlobalReuse(enableGlobalReuse),
         enablePrintMemoryAllocatedSize(enablePrintMemoryAllocatedSize),
-        restrictInplaceAsISA(restrictInplaceAsISA) {}
+        restrictInplaceAsISA(restrictInplaceAsISA), orderBySize(orderBySize) {}
 
   LogicalResult plan();
 
@@ -480,6 +480,9 @@ class MemPlan {
   /// enable PTO op plan memory inplace
   bool restrictInplaceAsISA;
 
+  /// Process buffers largest-first (first-fit-decreasing) instead of DMA-first.
+  bool orderBySize;
+
   /// StorageEntry generate.
   void GenerateStorageEntry();
 
@@ -538,6 +541,11 @@ class MemPlan {
   /// of buffers corresponding to DMA.
   StorageEntry *GetReorderRootStorageEntry(StorageEntry *rootStorageEntry);
 
+  /// Reorder rootStorageEntry and its merged children largest-first (by
+  /// constBits) in every memory space, keeping ping-pong pairs contiguous.
+  /// Returns the entry that becomes the new root. Used with order-by-size.
+  StorageEntry *GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry);
+
   /// Assign addresses without reuse.
   void PlanBuffersWithoutReuse(StorageEntry *rootStorageEntry,
                                size_t alignUnit);
diff --git a/test/lit/pto/plan_memory_order_by_size_reuse.pto b/test/lit/pto/plan_memory_order_by_size_reuse.pto
new file mode 100644
index 0000000000..05541cfd07
--- /dev/null
+++ b/test/lit/pto/plan_memory_order_by_size_reuse.pto
@@ -0,0 +1,66 @@
+// Heterogeneous forced-reuse case (extracted from the TileLang tsort32 suite):
+// three UB tiles of different sizes (src 32KB, idx 32KB, dst 128KB) whose total
+// exceeds the UB budget, so PlanMemory takes the reuse path where allocation
+// ORDER matters. With the default (DMA/gen) order the largest tile (dst) is
+// allocated last and lands at a high offset. With --plan-memory-order-by-size
+// (first-fit-decreasing) the largest tile is allocated first and gets offset 0.
+//
+// RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT
+// RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE
+
+module {
+  func.func @order_by_size_reuse(%src_ptr: !pto.ptr, %idx_ptr: !pto.ptr, %dst_ptr: !pto.ptr) attributes {pto.kernel} {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8192 = arith.constant 8192 : index
+    %c32768 = arith.constant 32768 : index
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c8192],
+      strides = [%c8192, %c8192, %c8192, %c8192, %c1]
+      : !pto.tensor_view<1x1x1x1x8192xf32>
+    %idx_view = pto.make_tensor_view %idx_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c8192],
+      strides = [%c8192, %c8192, %c8192, %c8192, %c1]
+      : !pto.tensor_view<1x1x1x1x8192xui32>
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c32768],
+      strides = [%c32768, %c32768, %c32768, %c32768, %c1]
+      : !pto.tensor_view<1x1x1x1x32768xf32>
+
+    %src_part = pto.partition_view %src_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c8192]
+      : !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32>
+    %idx_part = pto.partition_view %idx_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c8192]
+      : !pto.tensor_view<1x1x1x1x8192xui32> -> !pto.partition_tensor_view<1x1x1x1x8192xui32>
+    %dst_part = pto.partition_view %dst_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c32768]
+      : !pto.tensor_view<1x1x1x1x32768xf32> -> !pto.partition_tensor_view<1x1x1x1x32768xf32>
+
+    %src_tile = pto.alloc_tile : !pto.tile_buf
+    %idx_tile = pto.alloc_tile : !pto.tile_buf
+    %dst_tile = pto.alloc_tile : !pto.tile_buf
+
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>)
+              outs(%src_tile : !pto.tile_buf)
+    pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x8192xui32>)
+              outs(%idx_tile : !pto.tile_buf)
+
+    pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf,
+                    !pto.tile_buf)
+                outs(%dst_tile : !pto.tile_buf)
+
+    pto.tstore ins(%dst_tile : !pto.tile_buf)
+               outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x32768xf32>)
+    return
+  }
+}
+
+// The largest tile (dst, 1x32768xf32) is placed at offset 0 ONLY with
+// order-by-size; the default order leaves offset 0 to a smaller input tile.
+// BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
+// DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp
index b29a7f9f5a..cbca576957 100644
--- a/tools/ptoas/ptoas.cpp
+++ b/tools/ptoas/ptoas.cpp
@@ -317,6 +317,13 @@ static llvm::cl::opt<bool> enableInsertSync("enable-insert-sync",
     llvm::cl::desc("Enable automatic synchronization insertion pass"),
     llvm::cl::init(false));
 
+static llvm::cl::opt<bool> planMemoryOrderBySize(
+    "plan-memory-order-by-size",
+    llvm::cl::desc("PlanMemory: allocate buffers largest-first "
+                   "(first-fit-decreasing) instead of the default DMA-first "
+                   "order"),
+    llvm::cl::init(false));
+
 static llvm::cl::opt<bool> enableBufidSync(
     "enable-bufid_sync",
     llvm::cl::desc("Enable A5 buffer-id synchronization insertion pass"),
@@ -1864,6 +1871,7 @@ int mlir::pto::compilePTOASModule(
     planMemoryOption.memMode = MemPlanMode::LOCAL_MEM_PLAN;
     planMemoryOption.enableGlobalReuse = false;
     planMemoryOption.enablePrintMemoryAllocatedSize = false;
+    planMemoryOption.orderBySize = planMemoryOrderBySize;
     pm.addPass(pto::createPlanMemoryPass(planMemoryOption));
   }
   pm.addPass(pto::createPTOResolveReservedBuffersPass());