hw-native-sys · zhangstevenunity · Jul 2, 2026 · Jun 30, 2026 · Jul 1, 2026 · zhangstevenunity
diff --git a/include/PTO/Transforms/Passes.td b/include/PTO/Transforms/Passes.td
@@ -178,6 +178,12 @@ def PlanMemory : Pass<"pto-plan-memory", "ModuleOp"> {
     Option<"restrictInplaceAsISA", "restrict-inplace-as-isa", "bool",
            /*default=*/"false",
            "restrict memory inplace as isa, default : false">,
+    Option<"orderBySize", "order-by-size", "bool",
+           /*default=*/"false",
+           "Process buffers largest-first (first-fit-decreasing order) during "
+           "local memory planning instead of the default DMA-first order. "
+           "Decreasing-size order packs heterogeneous-size buffers tighter "
+           "(matches XLA/TVM/SOMAS). default : false">,
   ];
 }
 def PTOLoweringSyncToPipe : Pass<"pto-lowering-sync-to-pipe", "func::FuncOp"> {

diff --git a/lib/PTO/Transforms/PTOPlanMemory.cpp b/lib/PTO/Transforms/PTOPlanMemory.cpp
@@ -1334,6 +1334,13 @@ bool MemPlan::IsEnoughForBuffersNoReuse(StorageEntry *rootStorageEntry,
   if (iter == bufferScope2RequiredSize.end())
     llvm::report_fatal_error("missing required-size entry for buffer scope");
   if (iter->second < restBufferSize) {
+    // Even when the scope fits without reuse (no peak to save), honor
+    // largest-first placement so the option means the same thing on both paths:
+    // a deterministic decreasing-size layout regardless of whether reuse kicks
+    // in. Stable sort keeps uniform-size scopes byte-identical to the default.
+    if (orderBySize) {
+      rootStorageEntry = GetSizeOrderedRootStorageEntry(rootStorageEntry);
+    }
     PlanBuffersWithoutReuse(rootStorageEntry, alignUnit);
     return true;
   }
@@ -1548,6 +1555,9 @@ void MemPlan::ReportCurEntryDebugInfo(const StorageEntry *curEntry) {
 
 StorageEntry *
 MemPlan::GetReorderRootStorageEntry(StorageEntry *rootStorageEntry) {
+  if (orderBySize) {
+    return GetSizeOrderedRootStorageEntry(rootStorageEntry);
+  }
   if (rootStorageEntry->bufInfo->bufferScope != pto::AddressSpace::VEC) {
     return rootStorageEntry;
   }
@@ -1616,6 +1626,49 @@ void MemPlan::ReorderContinuousPingPongEntry(
   reorderedStorageEntryVec.swap(storageEntryVec);
 }
 
+StorageEntry *
+MemPlan::GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry) {
+  // First-fit-decreasing: gather the root and its mergedChildren into one flat
+  // list and place the largest buffers first. For heterogeneous buffer sizes,
+  // decreasing-size order is expected to pack tighter than the generation or
+  // DMA-first order (the ordering XLA, TVM and SOMAS are cited as using). No
+  // bufferScope check is made, so this covers every space, not only VEC.
+  SmallVector<StorageEntry *> entries = {rootStorageEntry};
+  entries.insert(entries.end(), rootStorageEntry->mergedChildren.begin(),
+                 rootStorageEntry->mergedChildren.end());
+
+  // Sort by decreasing bufInfo->constBits. std::stable_sort preserves the
+  // incoming order among equal-size entries, so a scope whose buffers all have
+  // the same size keeps its order at this step (e.g. the plan_memory_* tests).
+  std::stable_sort(entries.begin(), entries.end(),
+                   [](const StorageEntry *a, const StorageEntry *b) {
+                     return a->bufInfo->constBits > b->bufInfo->constBits;
+                   });
+
+  // Keep ping-pong (double-buffer) pairs contiguous, as the DMA-first path
+  // does. The call may permute `entries`, so the root is picked afterwards.
+  ReorderContinuousPingPongEntry(entries);
+
+  // Rebuild the flat root -> children structure around the new first entry.
+  // Clear every child list first: if the root changed, the old root would
+  // otherwise still list the new root as its child (a cycle). NOTE(review):
+  // assumes children carry no nested mergedChildren of their own -- confirm.
+  StorageEntry *reorderedRootStorageEntry = entries[0];
+  for (StorageEntry *entry : entries) {
+    entry->mergedChildren.clear();
+  }
+  for (size_t j = 1; j < entries.size(); ++j) {
+    reorderedRootStorageEntry->mergedChildren.push_back(entries[j]);
+  }
+  // Publish the new root in memscope2rootStorageEntry for its buffer scope so
+  // later readers of the map (presumably RecordOverflowIfAny and
+  // PrintSuccessfulAllocatedMaxBits) reach the full child list. The old root
+  // was just emptied above, so leaving it in the map would hide the children.
+  memscope2rootStorageEntry[reorderedRootStorageEntry->bufInfo->bufferScope] =
+      reorderedRootStorageEntry;
+  return reorderedRootStorageEntry;
+}
+
 std::pair<size_t, size_t>
 MemPlan::GetBufferSpaceInfo(pto::AddressSpace &space) const {
   switch (space) {
@@ -2347,7 +2400,7 @@ void PlanMemoryPass::runOnOperation() {
 
     MemPlan memPlan(this->memMode, this->enableGlobalReuse,
                     this->enablePrintMemoryAllocatedSize,
-                    this->restrictInplaceAsISA);
+                    this->restrictInplaceAsISA, this->orderBySize);
     if (failed(memPlan.InitMemSpecsFromModule(funcOp))) {
       return signalPassFailure();
     }

diff --git a/lib/PTO/Transforms/PTOPlanMemory.h b/lib/PTO/Transforms/PTOPlanMemory.h
@@ -415,10 +415,10 @@ using StorageEntryPair = std::pair<const StorageEntry *, const StorageEntry *>;
 class MemPlan {
 public:
   MemPlan(MemPlanMode planMode, bool enableGlobalReuse, bool enablePrintMemoryAllocatedSize,
-          bool restrictInplaceAsISA)
+          bool restrictInplaceAsISA, bool orderBySize)
       : planMode(planMode), enableGlobalReuse(enableGlobalReuse),
         enablePrintMemoryAllocatedSize(enablePrintMemoryAllocatedSize),
-        restrictInplaceAsISA(restrictInplaceAsISA) {}
+        restrictInplaceAsISA(restrictInplaceAsISA), orderBySize(orderBySize) {}
 
   LogicalResult plan();
 
@@ -480,6 +480,9 @@ class MemPlan {
   /// enable PTO op plan memory inplace
   bool restrictInplaceAsISA;
 
+  /// Process buffers largest-first (first-fit-decreasing) instead of DMA-first.
+  bool orderBySize;
+
   /// StorageEntry generate.
   void GenerateStorageEntry();
 
@@ -538,6 +541,11 @@ class MemPlan {
   /// of buffers corresponding to DMA.
   StorageEntry *GetReorderRootStorageEntry(StorageEntry *rootStorageEntry);
 
+  /// With order-by-size: sort rootStorageEntry and its merged children
+  /// largest-first (first-fit-decreasing) in any memory space, keeping
+  /// ping-pong pairs contiguous. Returns the new root, which may differ.
+  StorageEntry *GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry);
+
   /// Assign addresses without reuse.
   void PlanBuffersWithoutReuse(StorageEntry *rootStorageEntry,
                                size_t alignUnit);

diff --git a/test/lit/pto/plan_memory_order_by_size_noreuse.pto b/test/lit/pto/plan_memory_order_by_size_noreuse.pto
@@ -0,0 +1,69 @@
+// Heterogeneous NO-reuse case: three UB tiles of different sizes (src 8KB,
+// idx 8KB, dst 32KB) whose total (48KB) fits the UB budget, so PlanMemory takes
+// the fast no-reuse path (PlanBuffersWithoutReuse) rather than the reuse path.
+// The peak is the same either way here, but --plan-memory-order-by-size must
+// still give a deterministic largest-first layout on this path too: without it
+// the first-generated (small) input tile lands at offset 0; with it the largest
+// tile (dst) is placed first at offset 0. This is the regression guard for the
+// contract that the option means the same thing whether or not reuse kicks in.
+//
+// RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT
+// RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE
+
+module {
+  func.func @order_by_size_noreuse(%src_ptr: !pto.ptr<f32>, %idx_ptr: !pto.ptr<ui32>, %dst_ptr: !pto.ptr<f32>) attributes {pto.kernel} {
+    %c0    = arith.constant 0     : index
+    %c1    = arith.constant 1     : index
+    %c2048 = arith.constant 2048  : index
+    %c8192 = arith.constant 8192  : index
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c2048],
+      strides = [%c2048, %c2048, %c2048, %c2048, %c1]
+      : !pto.tensor_view<1x1x1x1x2048xf32>
+    %idx_view = pto.make_tensor_view %idx_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c2048],
+      strides = [%c2048, %c2048, %c2048, %c2048, %c1]
+      : !pto.tensor_view<1x1x1x1x2048xui32>
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c8192],
+      strides = [%c8192, %c8192, %c8192, %c8192, %c1]
+      : !pto.tensor_view<1x1x1x1x8192xf32>
+
+    %src_part = pto.partition_view %src_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c2048]
+      : !pto.tensor_view<1x1x1x1x2048xf32> -> !pto.partition_tensor_view<1x1x1x1x2048xf32>
+    %idx_part = pto.partition_view %idx_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c2048]
+      : !pto.tensor_view<1x1x1x1x2048xui32> -> !pto.partition_tensor_view<1x1x1x1x2048xui32>
+    %dst_part = pto.partition_view %dst_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c8192]
+      : !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32>
+
+    %src_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x2048xf32>
+    %idx_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x2048xui32>
+    %dst_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xf32>
+
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x2048xf32>)
+              outs(%src_tile : !pto.tile_buf<vec, 1x2048xf32>)
+    pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x2048xui32>)
+              outs(%idx_tile : !pto.tile_buf<vec, 1x2048xui32>)
+
+    pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf<vec, 1x2048xf32>,
+                          !pto.tile_buf<vec, 1x2048xui32>)
+             outs(%dst_tile : !pto.tile_buf<vec, 1x8192xf32>)
+
+    pto.tstore ins(%dst_tile : !pto.tile_buf<vec, 1x8192xf32>)
+               outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>)
+    return
+  }
+}
+
+// On the no-reuse fast path the largest tile (dst, 1x8192xf32) gets offset 0
+// ONLY with order-by-size; the default order leaves offset 0 to the smaller
+// first-generated input tile.
+// BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x8192xf32
+// DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x8192xf32
diff --git a/test/lit/pto/plan_memory_order_by_size_reuse.pto b/test/lit/pto/plan_memory_order_by_size_reuse.pto
@@ -0,0 +1,66 @@
+// Heterogeneous forced-reuse case (extracted from the TileLang tsort32 suite):
+// three UB tiles (src 32KB, idx 32KB, dst 128KB; 192KB total) too large for the
+// no-reuse fast path, so PlanMemory takes the reuse path where allocation ORDER
+// matters. With the default (DMA/generation) order the largest tile (dst) is
+// allocated last and lands at a high offset. With --plan-memory-order-by-size
+// (first-fit-decreasing) the largest tile is allocated first and gets offset 0.
+//
+// RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT
+// RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE
+
+module {
+  func.func @order_by_size_reuse(%src_ptr: !pto.ptr<f32>, %idx_ptr: !pto.ptr<ui32>, %dst_ptr: !pto.ptr<f32>) attributes {pto.kernel} {
+    %c0    = arith.constant 0     : index
+    %c1    = arith.constant 1     : index
+    %c8192 = arith.constant 8192  : index
+    %c32768 = arith.constant 32768 : index
+
+    %src_view = pto.make_tensor_view %src_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c8192],
+      strides = [%c8192, %c8192, %c8192, %c8192, %c1]
+      : !pto.tensor_view<1x1x1x1x8192xf32>
+    %idx_view = pto.make_tensor_view %idx_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c8192],
+      strides = [%c8192, %c8192, %c8192, %c8192, %c1]
+      : !pto.tensor_view<1x1x1x1x8192xui32>
+    %dst_view = pto.make_tensor_view %dst_ptr,
+      shape = [%c1, %c1, %c1, %c1, %c32768],
+      strides = [%c32768, %c32768, %c32768, %c32768, %c1]
+      : !pto.tensor_view<1x1x1x1x32768xf32>
+
+    %src_part = pto.partition_view %src_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c8192]
+      : !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32>
+    %idx_part = pto.partition_view %idx_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c8192]
+      : !pto.tensor_view<1x1x1x1x8192xui32> -> !pto.partition_tensor_view<1x1x1x1x8192xui32>
+    %dst_part = pto.partition_view %dst_view,
+      offsets = [%c0, %c0, %c0, %c0, %c0],
+      sizes = [%c1, %c1, %c1, %c1, %c32768]
+      : !pto.tensor_view<1x1x1x1x32768xf32> -> !pto.partition_tensor_view<1x1x1x1x32768xf32>
+
+    %src_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xf32>
+    %idx_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xui32>
+    %dst_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x32768xf32>
+
+    pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>)
+              outs(%src_tile : !pto.tile_buf<vec, 1x8192xf32>)
+    pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x8192xui32>)
+              outs(%idx_tile : !pto.tile_buf<vec, 1x8192xui32>)
+
+    pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf<vec, 1x8192xf32>,
+                          !pto.tile_buf<vec, 1x8192xui32>)
+             outs(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>)
+
+    pto.tstore ins(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>)
+               outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x32768xf32>)
+    return
+  }
+}
+
+// The largest tile (dst, 1x32768xf32) is placed at offset 0 ONLY with
+// order-by-size; the default order leaves offset 0 to a smaller input tile.
+// BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
+// DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32
diff --git a/tools/ptoas/ptoas.cpp b/tools/ptoas/ptoas.cpp
@@ -317,6 +317,13 @@ static llvm::cl::opt<bool> enableInsertSync("enable-insert-sync",
                                             llvm::cl::desc("Enable automatic synchronization insertion pass"),
                                             llvm::cl::init(false));
 
+static llvm::cl::opt<bool> planMemoryOrderBySize(
+    "plan-memory-order-by-size",
+    llvm::cl::desc("PlanMemory: allocate buffers largest-first "
+                   "(first-fit-decreasing) instead of the default DMA-first "
+                   "order"),
+    llvm::cl::init(false));
+
 static llvm::cl::opt<bool> enableBufidSync(
     "enable-bufid_sync",
     llvm::cl::desc("Enable A5 buffer-id synchronization insertion pass"),
@@ -1864,6 +1871,7 @@ int mlir::pto::compilePTOASModule(
     planMemoryOption.memMode = MemPlanMode::LOCAL_MEM_PLAN;
     planMemoryOption.enableGlobalReuse = false;
     planMemoryOption.enablePrintMemoryAllocatedSize = false;
+    planMemoryOption.orderBySize = planMemoryOrderBySize;
     pm.addPass(pto::createPlanMemoryPass(planMemoryOption));
   }
   pm.addPass(pto::createPTOResolveReservedBuffersPass());