-
Notifications
You must be signed in to change notification settings - Fork 68
feat(planmemory): first-fit-decreasing buffer ordering (opt-in) #885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1334,6 +1334,13 @@ bool MemPlan::IsEnoughForBuffersNoReuse(StorageEntry *rootStorageEntry, | |
| if (iter == bufferScope2RequiredSize.end()) | ||
| llvm::report_fatal_error("missing required-size entry for buffer scope"); | ||
| if (iter->second < restBufferSize) { | ||
| // Even when the scope fits without reuse (no peak to save), honor | ||
| // largest-first placement so the option means the same thing on both paths: | ||
| // a deterministic decreasing-size layout regardless of whether reuse kicks | ||
| // in. Stable sort keeps uniform-size scopes byte-identical to the default. | ||
| if (orderBySize) { | ||
| rootStorageEntry = GetSizeOrderedRootStorageEntry(rootStorageEntry); | ||
| } | ||
| PlanBuffersWithoutReuse(rootStorageEntry, alignUnit); | ||
| return true; | ||
| } | ||
|
|
@@ -1548,6 +1555,9 @@ void MemPlan::ReportCurEntryDebugInfo(const StorageEntry *curEntry) { | |
|
|
||
| StorageEntry * | ||
| MemPlan::GetReorderRootStorageEntry(StorageEntry *rootStorageEntry) { | ||
| if (orderBySize) { | ||
| return GetSizeOrderedRootStorageEntry(rootStorageEntry); | ||
| } | ||
| if (rootStorageEntry->bufInfo->bufferScope != pto::AddressSpace::VEC) { | ||
| return rootStorageEntry; | ||
| } | ||
|
|
@@ -1616,6 +1626,49 @@ void MemPlan::ReorderContinuousPingPongEntry( | |
| reorderedStorageEntryVec.swap(storageEntryVec); | ||
| } | ||
|
|
||
| StorageEntry * | ||
| MemPlan::GetSizeOrderedRootStorageEntry(StorageEntry *rootStorageEntry) { | ||
| // First-fit-decreasing: place the largest buffers first. For the heterogeneous | ||
| // buffer sizes that real kernels produce, decreasing-size order packs tighter | ||
| // than an arbitrary/DMA-first order (this is the ordering XLA, TVM and SOMAS | ||
| // all use). Applies to every memory space, unlike the DMA-first reorder which | ||
| // is VEC-only. | ||
| SmallVector<StorageEntry *> entries = {rootStorageEntry}; | ||
| entries.insert(entries.end(), rootStorageEntry->mergedChildren.begin(), | ||
| rootStorageEntry->mergedChildren.end()); | ||
|
|
||
| // Stable sort by decreasing buffer size. Stable keeps the original order among | ||
| // equal-size buffers, so uniform-size instances (e.g. the plan_memory_* tests) | ||
| // are left untouched. | ||
| std::stable_sort(entries.begin(), entries.end(), | ||
| [](const StorageEntry *a, const StorageEntry *b) { | ||
| return a->bufInfo->constBits > b->bufInfo->constBits; | ||
| }); | ||
|
|
||
| // Keep ping-pong (double-buffer) pairs contiguous so double-buffering is | ||
| // preserved (same post-processing the DMA-first path applies). | ||
| ReorderContinuousPingPongEntry(entries); | ||
|
|
||
| // Rebuild the flat root -> children structure around the new (largest) root. | ||
| // Clear every entry's child list first: when the root changes, the previous | ||
| // root would otherwise keep its stale child list (forming a cycle), and only | ||
| // the new root should carry the flat list of the others. | ||
| StorageEntry *reorderedRootStorageEntry = entries[0]; | ||
| for (StorageEntry *entry : entries) { | ||
| entry->mergedChildren.clear(); | ||
| } | ||
| for (size_t j = 1; j < entries.size(); ++j) { | ||
| reorderedRootStorageEntry->mergedChildren.push_back(entries[j]); | ||
| } | ||
| // Keep the scope -> root map consistent so later consumers (RecordOverflowIfAny, | ||
| // PrintSuccessfulAllocatedMaxBits) read the new root and its full child list. | ||
| // This must accompany the clear above: clearing children without updating the | ||
| // map would leave the stale root pointing at an empty child list. | ||
| memscope2rootStorageEntry[reorderedRootStorageEntry->bufInfo->bufferScope] = | ||
| reorderedRootStorageEntry; | ||
| return reorderedRootStorageEntry; | ||
|
Comment on lines
+1656
to
+1669
Contributor
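There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
When the root storage entry of a scope changes after sorting, the old root becomes a child of the new root. If we do not clear the `mergedChildren` list of every entry before rebuilding, the old root keeps its stale child list, which forms a cycle. Additionally, the `memscope2rootStorageEntry` map should be updated so that it points at the new root:
for (auto *entry : entries) {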
entry->mergedChildren.clear();
}
StorageEntry *reorderedRootStorageEntry = entries[0];
for (size_t j = 1; j < entries.size(); ++j) {
reorderedRootStorageEntry->mergedChildren.push_back(entries[j]);
}
memscope2rootStorageEntry[reorderedRootStorageEntry->bufInfo->bufferScope] = reorderedRootStorageEntry;
return reorderedRootStorageEntry;
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Thanks — applied. The rebuild now clears every entry's `mergedChildren` and updates `memscope2rootStorageEntry` to the new root. Worth noting the two changes must go together: clearing the old root's children without updating the map would leave the stale root pointing at an empty child list.
|
||
| } | ||
|
|
||
| std::pair<size_t, size_t> | ||
| MemPlan::GetBufferSpaceInfo(pto::AddressSpace &space) const { | ||
| switch (space) { | ||
|
|
@@ -2347,7 +2400,7 @@ void PlanMemoryPass::runOnOperation() { | |
|
|
||
| MemPlan memPlan(this->memMode, this->enableGlobalReuse, | ||
| this->enablePrintMemoryAllocatedSize, | ||
| this->restrictInplaceAsISA); | ||
| this->restrictInplaceAsISA, this->orderBySize); | ||
| if (failed(memPlan.InitMemSpecsFromModule(funcOp))) { | ||
| return signalPassFailure(); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| // Heterogeneous NO-reuse case: three UB tiles of different sizes (src 8KB, | ||
| // idx 8KB, dst 32KB) whose total (48KB) fits the UB budget, so PlanMemory takes | ||
| // the fast no-reuse path (PlanBuffersWithoutReuse) rather than the reuse path. | ||
| // The peak is the same either way here, but --plan-memory-order-by-size must | ||
| // still give a deterministic largest-first layout on this path too: without it | ||
| // the first-generated (small) input tile lands at offset 0; with it the largest | ||
| // tile (dst) is placed first at offset 0. This is the regression guard for the | ||
| // contract that the option means the same thing whether or not reuse kicks in. | ||
| // | ||
| // RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT | ||
| // RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE | ||
|
|
||
| module { | ||
| func.func @order_by_size_noreuse(%src_ptr: !pto.ptr<f32>, %idx_ptr: !pto.ptr<ui32>, %dst_ptr: !pto.ptr<f32>) attributes {pto.kernel} { | ||
| %c0 = arith.constant 0 : index | ||
| %c1 = arith.constant 1 : index | ||
| %c2048 = arith.constant 2048 : index | ||
| %c8192 = arith.constant 8192 : index | ||
|
|
||
| %src_view = pto.make_tensor_view %src_ptr, | ||
| shape = [%c1, %c1, %c1, %c1, %c2048], | ||
| strides = [%c2048, %c2048, %c2048, %c2048, %c1] | ||
| : !pto.tensor_view<1x1x1x1x2048xf32> | ||
| %idx_view = pto.make_tensor_view %idx_ptr, | ||
| shape = [%c1, %c1, %c1, %c1, %c2048], | ||
| strides = [%c2048, %c2048, %c2048, %c2048, %c1] | ||
| : !pto.tensor_view<1x1x1x1x2048xui32> | ||
| %dst_view = pto.make_tensor_view %dst_ptr, | ||
| shape = [%c1, %c1, %c1, %c1, %c8192], | ||
| strides = [%c8192, %c8192, %c8192, %c8192, %c1] | ||
| : !pto.tensor_view<1x1x1x1x8192xf32> | ||
|
|
||
| %src_part = pto.partition_view %src_view, | ||
| offsets = [%c0, %c0, %c0, %c0, %c0], | ||
| sizes = [%c1, %c1, %c1, %c1, %c2048] | ||
| : !pto.tensor_view<1x1x1x1x2048xf32> -> !pto.partition_tensor_view<1x1x1x1x2048xf32> | ||
| %idx_part = pto.partition_view %idx_view, | ||
| offsets = [%c0, %c0, %c0, %c0, %c0], | ||
| sizes = [%c1, %c1, %c1, %c1, %c2048] | ||
| : !pto.tensor_view<1x1x1x1x2048xui32> -> !pto.partition_tensor_view<1x1x1x1x2048xui32> | ||
| %dst_part = pto.partition_view %dst_view, | ||
| offsets = [%c0, %c0, %c0, %c0, %c0], | ||
| sizes = [%c1, %c1, %c1, %c1, %c8192] | ||
| : !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32> | ||
|
|
||
| %src_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x2048xf32> | ||
| %idx_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x2048xui32> | ||
| %dst_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xf32> | ||
|
|
||
| pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x2048xf32>) | ||
| outs(%src_tile : !pto.tile_buf<vec, 1x2048xf32>) | ||
| pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x2048xui32>) | ||
| outs(%idx_tile : !pto.tile_buf<vec, 1x2048xui32>) | ||
|
|
||
| pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf<vec, 1x2048xf32>, | ||
| !pto.tile_buf<vec, 1x2048xui32>) | ||
| outs(%dst_tile : !pto.tile_buf<vec, 1x8192xf32>) | ||
|
|
||
| pto.tstore ins(%dst_tile : !pto.tile_buf<vec, 1x8192xf32>) | ||
| outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>) | ||
| return | ||
| } | ||
| } | ||
|
|
||
| // On the no-reuse fast path the largest tile (dst, 1x8192xf32) gets offset 0 | ||
| // ONLY with order-by-size; the default order leaves offset 0 to the smaller | ||
| // first-generated input tile. | ||
| // BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x8192xf32 | ||
| // DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x8192xf32 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| // Heterogeneous forced-reuse case (extracted from the TileLang tsort32 suite): | ||
| // three UB tiles of different sizes (src 32KB, idx 32KB, dst 128KB) whose total | ||
| // exceeds the UB budget, so PlanMemory takes the reuse path where allocation | ||
| // ORDER matters. With the default (DMA/gen) order the largest tile (dst) is | ||
| // allocated last and lands at a high offset. With --plan-memory-order-by-size | ||
| // (first-fit-decreasing) the largest tile is allocated first and gets offset 0. | ||
| // | ||
| // RUN: ptoas --pto-arch=a3 --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=DEFAULT | ||
| // RUN: ptoas --pto-arch=a3 --plan-memory-order-by-size --mlir-print-ir-after=pto-plan-memory %s 2>&1 1>/dev/null | FileCheck %s --check-prefix=BYSIZE | ||
|
|
||
| module { | ||
| func.func @order_by_size_reuse(%src_ptr: !pto.ptr<f32>, %idx_ptr: !pto.ptr<ui32>, %dst_ptr: !pto.ptr<f32>) attributes {pto.kernel} { | ||
| %c0 = arith.constant 0 : index | ||
| %c1 = arith.constant 1 : index | ||
| %c8192 = arith.constant 8192 : index | ||
| %c32768 = arith.constant 32768 : index | ||
|
|
||
| %src_view = pto.make_tensor_view %src_ptr, | ||
| shape = [%c1, %c1, %c1, %c1, %c8192], | ||
| strides = [%c8192, %c8192, %c8192, %c8192, %c1] | ||
| : !pto.tensor_view<1x1x1x1x8192xf32> | ||
| %idx_view = pto.make_tensor_view %idx_ptr, | ||
| shape = [%c1, %c1, %c1, %c1, %c8192], | ||
| strides = [%c8192, %c8192, %c8192, %c8192, %c1] | ||
| : !pto.tensor_view<1x1x1x1x8192xui32> | ||
| %dst_view = pto.make_tensor_view %dst_ptr, | ||
| shape = [%c1, %c1, %c1, %c1, %c32768], | ||
| strides = [%c32768, %c32768, %c32768, %c32768, %c1] | ||
| : !pto.tensor_view<1x1x1x1x32768xf32> | ||
|
|
||
| %src_part = pto.partition_view %src_view, | ||
| offsets = [%c0, %c0, %c0, %c0, %c0], | ||
| sizes = [%c1, %c1, %c1, %c1, %c8192] | ||
| : !pto.tensor_view<1x1x1x1x8192xf32> -> !pto.partition_tensor_view<1x1x1x1x8192xf32> | ||
| %idx_part = pto.partition_view %idx_view, | ||
| offsets = [%c0, %c0, %c0, %c0, %c0], | ||
| sizes = [%c1, %c1, %c1, %c1, %c8192] | ||
| : !pto.tensor_view<1x1x1x1x8192xui32> -> !pto.partition_tensor_view<1x1x1x1x8192xui32> | ||
| %dst_part = pto.partition_view %dst_view, | ||
| offsets = [%c0, %c0, %c0, %c0, %c0], | ||
| sizes = [%c1, %c1, %c1, %c1, %c32768] | ||
| : !pto.tensor_view<1x1x1x1x32768xf32> -> !pto.partition_tensor_view<1x1x1x1x32768xf32> | ||
|
|
||
| %src_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xf32> | ||
| %idx_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x8192xui32> | ||
| %dst_tile = pto.alloc_tile : !pto.tile_buf<vec, 1x32768xf32> | ||
|
|
||
| pto.tload ins(%src_part : !pto.partition_tensor_view<1x1x1x1x8192xf32>) | ||
| outs(%src_tile : !pto.tile_buf<vec, 1x8192xf32>) | ||
| pto.tload ins(%idx_part : !pto.partition_tensor_view<1x1x1x1x8192xui32>) | ||
| outs(%idx_tile : !pto.tile_buf<vec, 1x8192xui32>) | ||
|
|
||
| pto.tsort32 ins(%src_tile, %idx_tile : !pto.tile_buf<vec, 1x8192xf32>, | ||
| !pto.tile_buf<vec, 1x8192xui32>) | ||
| outs(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>) | ||
|
|
||
| pto.tstore ins(%dst_tile : !pto.tile_buf<vec, 1x32768xf32>) | ||
| outs(%dst_part : !pto.partition_tensor_view<1x1x1x1x32768xf32>) | ||
| return | ||
| } | ||
| } | ||
|
|
||
| // The largest tile (dst, 1x32768xf32) is placed at offset 0 ONLY with | ||
| // order-by-size; the default order leaves offset 0 to a smaller input tile. | ||
| // BYSIZE: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32 | ||
| // DEFAULT-NOT: pto.pointer_cast(%c0_i64){{.*}} : memref<1x32768xf32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Non-blocking (latent fragility): this writes `memscope2rootStorageEntry[scope]` while the same map is being range-iterated in `PlanMemAddressOfWholeLocalBuffer` (`for (auto &it : memscope2rootStorageEntry)`), reached through both `IsEnoughForBuffersNoReuse` and `PlanReusableLocalBuffer` / `PlanMemAddressForLevel0`.

It is safe today only because `scope` is always the key already being visited (every entry in a scope shares that scope, and the key was inserted in `MergeSameScopeSE`), so `DenseMap::operator[]` hits an existing bucket and never inserts or reallocates -- the active iterator stays valid. That is an implicit invariant. If a future change ever routes a scope not yet in the map through this path, `operator[]` would insert, possibly rehash, and invalidate the in-flight range-for iterator (UB). Cheap hardening: assign through the iterator the caller already holds, or assert `memscope2rootStorageEntry.count(scope)` here before writing.