diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt index 9d6bd8335..620fa4ef3 100644 --- a/build_tools/llvm_version.txt +++ b/build_tools/llvm_version.txt @@ -1 +1 @@ -1728a56d0e66c9e64a2e62fa6c5508580ccd28a0 +89946bda5e1c7ceaf6d26634cc8c8c9498d9f7be diff --git a/build_tools/patches/0006-fix-segment-fault-in-applySignatureConversion.patch b/build_tools/patches/0006-fix-segment-fault-in-applySignatureConversion.patch deleted file mode 100644 index 6460a548b..000000000 --- a/build_tools/patches/0006-fix-segment-fault-in-applySignatureConversion.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 283951f026428a3c34b8a2b8f2498d55faf590f5 Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Fri, 26 Apr 2024 20:58:37 +0000 -Subject: [PATCH 6/7] fix-segment-fault-in-applySignatureConversion - ---- - mlir/lib/Transforms/Utils/DialectConversion.cpp | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp -index d407d60334c7..a5fa9660be15 100644 ---- a/mlir/lib/Transforms/Utils/DialectConversion.cpp -+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp -@@ -1460,8 +1460,10 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( - - // Legalize the argument output type. 
- Type outputType = origOutputType; -- if (Type legalOutputType = converter->convertType(outputType)) -- outputType = legalOutputType; -+ if (converter) { -+ if (Type legalOutputType = converter->convertType(outputType)) -+ outputType = legalOutputType; -+ } - - newArg = buildUnresolvedArgumentMaterialization( - newBlock, origArg.getLoc(), replArgs, origOutputType, outputType, --- -2.34.1 diff --git a/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h b/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h index 4712704b1..0f5919bc0 100644 --- a/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h +++ b/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h @@ -67,7 +67,7 @@ class XeGPUOneToNPatterRewriter : public mlir::PatternRewriter, } mlir::Block * - applySignatureConversion(mlir::Region *region, + applySignatureConversion(mlir::Block *block, mlir::TypeConverter::SignatureConversion &conversion, const mlir::TypeConverter *converter = nullptr); diff --git a/include/imex/Utils/GPUSerialize.h b/include/imex/Utils/GPUSerialize.h new file mode 100644 index 000000000..c6eef0c9a --- /dev/null +++ b/include/imex/Utils/GPUSerialize.h @@ -0,0 +1,18 @@ +//===- GPUSerialize.h - Pass Utility Functions --------------------*- C++ +//-*-===// +// +// Copyright 2024 Intel Corporation +// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _IMEX_GPUSERIALIZE_H_ +#define _IMEX_GPUSERIALIZE_H_ + +namespace imex { +static constexpr const char *gpuBinaryAttrName = "gpu.binary"; +} // namespace imex + +#endif // _IMEX_GPUSERIALIZE_H_ diff --git a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp index 25e445c20..d4055c2be 100644 --- a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp +++ b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp @@ -18,6 +18,7 @@ #include "imex/Dialect/GPUX/IR/GPUXOps.h" #include "imex/Utils/FuncUtils.hpp" +#include "imex/Utils/GPUSerialize.h" #include "imex/Utils/TypeConversion.hpp" #include "../PassDetail.h" @@ -517,6 +518,34 @@ class ConvertLaunchFuncOpToGpuRuntimeCallPattern } }; +class RemoveGPUModulePattern + : public mlir::ConvertOpToLLVMPattern { +public: + RemoveGPUModulePattern(mlir::LLVMTypeConverter &converter) + : mlir::ConvertOpToLLVMPattern(converter) {} + mlir::LogicalResult + matchAndRewrite(mlir::gpu::GPUModuleOp op, + mlir::gpu::GPUModuleOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + rewriter.eraseOp(op); + return mlir::success(); + } +}; + +class RemoveGPUFuncPattern + : public mlir::ConvertOpToLLVMPattern { +public: + RemoveGPUFuncPattern(mlir::LLVMTypeConverter &converter) + : mlir::ConvertOpToLLVMPattern(converter) {} + mlir::LogicalResult + matchAndRewrite(mlir::gpu::GPUFuncOp op, + mlir::gpu::GPUFuncOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + rewriter.eraseOp(op); + return mlir::success(); + } +}; + /// A rewrite pattern to convert gpux.create_stream operations into a GPU /// runtime call. 
class ConvertGpuStreamCreatePattern @@ -583,19 +612,18 @@ void GPUXToLLVMPass::runOnOperation() { mlir::RewritePatternSet patterns(&context); mlir::LLVMConversionTarget target(context); - mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns); - mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); - mlir::populateVectorToLLVMConversionPatterns(converter, patterns); - mlir::populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); - mlir::populateFuncToLLVMConversionPatterns(converter, patterns); - mlir::populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, - target); +// mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns); +// mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); +// mlir::populateVectorToLLVMConversionPatterns(converter, patterns); +// mlir::populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); +// mlir::populateFuncToLLVMConversionPatterns(converter, patterns); +// mlir::populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, +// target); - mlir::populateGpuToLLVMConversionPatterns( - converter, patterns, mlir::gpu::getDefaultGpuBinaryAnnotation()); + mlir::populateGpuToLLVMConversionPatterns(converter, patterns); - imex::populateControlFlowTypeConversionRewritesAndTarget(converter, patterns, - target); +// imex::populateControlFlowTypeConversionRewritesAndTarget(converter, patterns, +// target); imex::populateGpuxToLLVMPatternsAndLegality(converter, patterns, target); @@ -631,12 +659,13 @@ void imex::populateGpuxToLLVMPatternsAndLegality( ConvertGpuStreamCreatePattern, ConvertGpuStreamDestroyPattern, ConvertAllocOpToGpuRuntimeCallPattern, - ConvertDeallocOpToGpuRuntimeCallPattern + ConvertDeallocOpToGpuRuntimeCallPattern, + RemoveGPUModulePattern // clang-format on >(converter); patterns.add( - converter, mlir::gpu::getDefaultGpuBinaryAnnotation()); + converter, imex::gpuBinaryAttrName); 
target.addIllegalDialect(); target.addIllegalDialect(); diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp index eb347a5fd..b06e27720 100644 --- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp +++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp @@ -783,7 +783,7 @@ class GatherScatterToRawSend : public OpConversionPattern { payLoad = rewriter.create(loc, base, payLoad, 0); SmallVector indices(16, 0); payLoad = rewriter.create( - loc, payLoad, payLoad, rewriter.getI64ArrayAttr(indices)); + loc, payLoad, payLoad, indices); auto createDescOp = op.getTensorDesc().template getDefiningOp(); auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets()); @@ -886,7 +886,7 @@ class AtomicToLsc : public OpConversionPattern<::mlir::xegpu::AtomicRMWOp> { SmallVector indices(16, 0); payLoad = rewriter.create( - loc, payLoad, payLoad, rewriter.getI64ArrayAttr(indices)); + loc, payLoad, payLoad, indices); auto createDescOp = op.getTensorDesc().getDefiningOp(); auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets()); payLoad = rewriter.create(loc, payLoad, offsets); @@ -1232,7 +1232,7 @@ struct VectorExtractVC final llvm::SmallVector indices(size); std::iota(indices.begin(), indices.end(), linearizedOffset); rewriter.replaceOpWithNewOp( - extractOp, vec, vec, rewriter.getI64ArrayAttr(indices)); + extractOp, vec, vec, indices); } else { // use CompositExtract for scalar result rewriter.replaceOpWithNewOp(extractOp, vec, linearizedOffset); @@ -1338,7 +1338,7 @@ struct VectorExtractStridedSliceVC final } // perform a shuffle to extract the kD vector rewriter.replaceOpWithNewOp( - extractOp, srcVector, srcVector, rewriter.getI64ArrayAttr(indices)); + extractOp, srcVector, srcVector, indices); return success(); } @@ -1392,16 +1392,16 @@ struct VectorShuffleVC final SmallVector indices(totalSize); for (auto [i, value] : - llvm::enumerate(mask.getAsValueRange())) { + llvm::enumerate(mask)) { - int32_t v = value.getZExtValue(); + int32_t v = 
value; std::iota(indices.begin() + shuffleSliceLen * i, indices.begin() + shuffleSliceLen * (i + 1), shuffleSliceLen * v); } rewriter.replaceOpWithNewOp( - shuffleOp, vec1, vec2, rewriter.getI64ArrayAttr(indices)); + shuffleOp, vec1, vec2, indices); return success(); } @@ -1431,7 +1431,8 @@ struct SCFForOpBlockVCPattern final newOp.getRegion().getArgument(i).getType()); } - rewriter.applySignatureConversion(&op.getRegion(), signatureConverter); + rewriter.applySignatureConversion(&op.getRegion().getBlocks().front(), + signatureConverter); rewriter.eraseBlock(newOp.getBody()); rewriter.inlineRegionBefore(op.getRegion(), newOp.getRegion(), diff --git a/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp index ebd1fc2bf..5d3c61efd 100644 --- a/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp @@ -59,7 +59,8 @@ struct SgSCFForOpBlockPattern // apply the signature convertion for SCFFor body arguments, an // UnrealizedConversionCastOp will be inserted by typeConverter - rewriter.applySignatureConversion(&op.getRegion(), argumentMapping); + rewriter.applySignatureConversion(&op.getRegion().getBlocks().front(), + argumentMapping); if (newOp.getBody()) rewriter.eraseBlock(newOp.getBody()); diff --git a/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp index 30c036b23..84cf6eb47 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp @@ -227,9 +227,9 @@ XeGPUTypeConverter::computeTypeMapping(mlir::ValueRange original, } mlir::Block *XeGPUOneToNPatterRewriter::applySignatureConversion( - mlir::Region *region, mlir::TypeConverter::SignatureConversion &conversion, + mlir::Block *block, mlir::TypeConverter::SignatureConversion &conversion, const mlir::TypeConverter *converter) { - return rewriter.applySignatureConversion(region, 
conversion, converter); + return rewriter.applySignatureConversion(block, conversion, converter); } void XeGPUOneToNPatterRewriter::replaceOp(mlir::Operation *op, diff --git a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp index 34a37541f..5ed587d90 100644 --- a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp @@ -304,7 +304,7 @@ class XeTileBlockAligningPass : public imex::impl::XeTileBlockAligningBase< // Use TopDown traversal order, and only look at existing ops // to simpliy the code logic and speedup the pass mlir::GreedyRewriteConfig config; - config.enableRegionSimplification = false; + config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled; config.useTopDownTraversal = true; config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps; if (failed( diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp index 28ecf4506..00ba3096b 100644 --- a/lib/Dialect/XeTile/Transforms/Blocking.cpp +++ b/lib/Dialect/XeTile/Transforms/Blocking.cpp @@ -1124,7 +1124,7 @@ class XeTileBlockingPass // Use TopDown traversal order, and only look at existing ops // to simpliy the code logic and speedup the pass mlir::GreedyRewriteConfig config; - config.enableRegionSimplification = false; + config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled; config.useTopDownTraversal = true; config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps; { // initialize the inner block size per op. 
diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp index 893be344f..c5decccb0 100644 --- a/lib/Transforms/InsertGPUAllocs.cpp +++ b/lib/Transforms/InsertGPUAllocs.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -158,6 +159,9 @@ class InsertGPUAllocsPass final } else if (auto init_tile = mlir::dyn_cast(op)) { return {{init_tile.getSource()}}; + } else if (auto init_xedesc = + mlir::dyn_cast(op)) { + return {{init_xedesc.getSource()}}; } else { op->emitError("Uhhandled mem op in gpu region"); return std::nullopt; @@ -187,6 +191,9 @@ class InsertGPUAllocsPass final // Only handle the case where the tile source is a memref return init_tile.isSourceMemRef(); } + if (auto init_xedesc = mlir::dyn_cast(op)) { + return true; + } return false; }; @@ -259,6 +266,36 @@ class InsertGPUAllocsPass final return; } + // walk over the users and find xegpu.load/store ops + std::function findXeGPULoadStore; + findXeGPULoadStore = [&](mlir::Operation *use, bool onDevice, AccessType& ret) { + if (auto tile_update = mlir::dyn_cast(use)) { + auto res = tile_update->getResult(0); + for (auto u : res.getUsers()) { + findXeGPULoadStore(u, onDevice, ret); + } + } + if (auto tile_for = mlir::dyn_cast<::mlir::scf::ForOp>(use)) { + for (size_t idx=0; idx(use)) { + (onDevice ? ret.deviceRead : ret.hostRead) = true; + } + else if (auto tile_prefetch = + mlir::dyn_cast(use)) { + (onDevice ? ret.deviceRead : ret.hostRead) = true; + } else if (auto tile_store = + mlir::dyn_cast(use)) { + (onDevice ? ret.deviceWrite : ret.hostWrite) = true; + } + }; + // Checks the access type of the OP under consideration. 
auto getAccessType = [&](mlir::Value memref) { AccessType ret; @@ -298,6 +335,15 @@ class InsertGPUAllocsPass final continue; } + if (auto init_xedesc = mlir::dyn_cast(user)) { + bool onDevice = user->getParentOfType(); + auto res = init_xedesc->getResult(0); + for (auto use : res.getUsers()) { + findXeGPULoadStore(use, onDevice, ret); + } + continue; + } + if (mlir::isa(user)) { ret.hostRead = true; ret.hostWrite = true; @@ -365,6 +411,15 @@ class InsertGPUAllocsPass final use.set(newAlloc.getResult()); } } + + // remove 'memref.dealloc' (it's later replaced with gpu.dealloc) + auto memory = alloc->getResult(0); + for (auto u : memory.getUsers()) { + if (auto dealloc = mlir::dyn_cast(u)) { + dealloc.erase(); + } + } + alloc.replaceAllUsesWith(allocResult); builder.create(loc, std::nullopt, allocResult); alloc.erase(); diff --git a/lib/Transforms/PropagatePackedLayout.cpp b/lib/Transforms/PropagatePackedLayout.cpp index bd79e4d29..3d30d4c89 100644 --- a/lib/Transforms/PropagatePackedLayout.cpp +++ b/lib/Transforms/PropagatePackedLayout.cpp @@ -330,7 +330,7 @@ makeCast(mlir::OpBuilder &builder, mlir::Value src, mlir::Type srcType, tmp = builder.create( loc, tmp, tmp, - builder.getI64ArrayAttr(getVNNIShuffleIndices(srcVecType, dstVecType))); + getVNNIShuffleIndices(srcVecType, dstVecType)); return {builder.create(loc, dstVecType, tmp), root}; diff --git a/lib/Transforms/SerializeSPIRV.cpp b/lib/Transforms/SerializeSPIRV.cpp index 56f3e535f..ada0f1285 100644 --- a/lib/Transforms/SerializeSPIRV.cpp +++ b/lib/Transforms/SerializeSPIRV.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" +#include "imex/Utils/GPUSerialize.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" @@ -54,13 +55,12 @@ struct SerializeSPIRVPass : public SerializeSPIRVPassBase { signalPassFailure(); return; } - // attach the spv binary 
to the gpu module auto spvData = llvm::StringRef(reinterpret_cast(spvBinary.data()), spvBinary.size() * sizeof(uint32_t)); auto spvAttr = mlir::StringAttr::get(&getContext(), spvData); - gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr); + gpuMod->setAttr(imex::gpuBinaryAttrName, spvAttr); spvMod->erase(); } } diff --git a/lib/Transforms/VectorLinearize.cpp b/lib/Transforms/VectorLinearize.cpp index dd07a3a2c..cdeb18348 100644 --- a/lib/Transforms/VectorLinearize.cpp +++ b/lib/Transforms/VectorLinearize.cpp @@ -140,7 +140,7 @@ struct VectorExtractStridedSliceConversion final // perform a shuffle to extract the kD vector rewriter.replaceOpWithNewOp( extractOp, dstType, srcVector, srcVector, - rewriter.getI64ArrayAttr(indices)); + indices); } return mlir::success(); } @@ -180,16 +180,16 @@ struct VectorShffleOpConversion final llvm::SmallVector indices(totalSize); for (auto [i, value] : - llvm::enumerate(mask.getAsValueRange())) { + llvm::enumerate(mask)) { - int64_t v = value.getZExtValue(); + int64_t v = value; std::iota(indices.begin() + shuffleSliceLen * i, indices.begin() + shuffleSliceLen * (i + 1), shuffleSliceLen * v); } rewriter.replaceOpWithNewOp( - shuffleOp, dstType, vec1, vec2, rewriter.getI64ArrayAttr(indices)); + shuffleOp, dstType, vec1, vec2, indices); return mlir::success(); } @@ -232,7 +232,7 @@ struct VectorExtractOpConversion final std::iota(indices.begin(), indices.end(), linearizedOffset); rewriter.replaceOpWithNewOp( extractOp, dstTy, adaptor.getVector(), adaptor.getVector(), - rewriter.getI64ArrayAttr(indices)); + indices); } return mlir::success(); @@ -298,11 +298,11 @@ struct VectorInsertOpConversion final 0); auto modifiedSource = rewriter.create( insertOp.getLoc(), dstTy, adaptor.getSource(), adaptor.getSource(), - rewriter.getI64ArrayAttr(modifiedSrcIndices)); + modifiedSrcIndices); rewriter.replaceOpWithNewOp( insertOp, dstTy, adaptor.getDest(), modifiedSource, - rewriter.getI64ArrayAttr(indices)); + indices); return 
mlir::success(); } diff --git a/test/Gen/NDArray/ndarray-gpu.pp b/test/Gen/NDArray/ndarray-gpu.pp index 013823308..00aa0d7f8 100644 --- a/test/Gen/NDArray/ndarray-gpu.pp +++ b/test/Gen/NDArray/ndarray-gpu.pp @@ -9,16 +9,9 @@ linalg-fuse-elementwise-ops arith-expand memref-expand - arith-bufferize - func-bufferize func.func(empty-tensor-to-alloc-tensor) - func.func(scf-bufferize) - func.func(tensor-bufferize) - func.func(bufferization-bufferize) - func.func(linalg-bufferize) func.func(linalg-detensorize) - func.func(tensor-bufferize) - func.func(finalizing-bufferize) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} imex-remove-temporaries func.func(convert-linalg-to-parallel-loops) func.func(scf-parallel-loop-fusion) diff --git a/test/Gen/NDArray/ndarray.pp b/test/Gen/NDArray/ndarray.pp index 3e197ca87..abd4a7ce8 100644 --- a/test/Gen/NDArray/ndarray.pp +++ b/test/Gen/NDArray/ndarray.pp @@ -8,18 +8,12 @@ linalg-fuse-elementwise-ops convert-shape-to-std arith-expand - arith-bufferize - func-bufferize func.func( empty-tensor-to-alloc-tensor - scf-bufferize - tensor-bufferize - linalg-bufferize - bufferization-bufferize - linalg-detensorize - tensor-bufferize - finalizing-bufferize - convert-linalg-to-parallel-loops) + linalg-detensorize) + one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" + buffer-deallocation-pipeline + func.func(convert-linalg-to-parallel-loops) drop-regions canonicalize fold-memref-alias-ops diff --git a/test/Gen/PlaidML/linalg-to-cpu.pp b/test/Gen/PlaidML/linalg-to-cpu.pp index 537ea2ea8..69fdde3ce 100644 --- a/test/Gen/PlaidML/linalg-to-cpu.pp +++ b/test/Gen/PlaidML/linalg-to-cpu.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - 
//eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) func.func(llvm-request-c-wrappers) convert-scf-to-cf convert-cf-to-llvm diff --git a/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp b/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp index 8081e02a4..e40f60848 100644 --- a/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp +++ b/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops gpu-map-parallel-loops convert-parallel-loops-to-gpu) // insert-gpu-allocs pass can have client-api = opencl or vulkan args diff --git a/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp b/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp index bb778eb4b..f1822f4d2 100644 --- a/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp +++ b/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp @@ -2,18 +2,9 @@ // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module( convert-tensor-to-linalg - arith-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} func.func( - empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func( - finalizing-bufferize convert-linalg-to-parallel-loops gpu-map-parallel-loops convert-parallel-loops-to-gpu diff --git a/test/Jax/gordon/linalg-to-cpu.pp b/test/Jax/gordon/linalg-to-cpu.pp index a11fe09ee..3d739b66a 100644 --- a/test/Jax/gordon/linalg-to-cpu.pp +++ b/test/Jax/gordon/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Jax/gordon/linalg-to-llvm.pp b/test/Jax/gordon/linalg-to-llvm.pp index cfd0ea77b..992e98278 100644 --- a/test/Jax/gordon/linalg-to-llvm.pp +++ b/test/Jax/gordon/linalg-to-llvm.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/Jax/janet/linalg-to-cpu.pp b/test/Jax/janet/linalg-to-cpu.pp index a11fe09ee..3d739b66a 100644 --- a/test/Jax/janet/linalg-to-cpu.pp +++ b/test/Jax/janet/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Jax/janet/linalg-to-llvm.pp b/test/Jax/janet/linalg-to-llvm.pp index cfd0ea77b..992e98278 100644 --- a/test/Jax/janet/linalg-to-llvm.pp +++ b/test/Jax/janet/linalg-to-llvm.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/Jax/jax_qmc/linalg-to-cpu.pp b/test/Jax/jax_qmc/linalg-to-cpu.pp index 4aa276de3..ba5485ef9 100644 --- a/test/Jax/jax_qmc/linalg-to-cpu.pp +++ b/test/Jax/jax_qmc/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg -arith-bufferize -func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) -func-bufferize -func.func(finalizing-bufferize - convert-linalg-to-loops) +func.func(empty-tensor-to-alloc-tensor) +one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} +buffer-deallocation-pipeline +func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Jax/jax_qmc/linalg-to-llvm.pp b/test/Jax/jax_qmc/linalg-to-llvm.pp index 352b5babf..140f112d4 100644 --- a/test/Jax/jax_qmc/linalg-to-llvm.pp +++ b/test/Jax/jax_qmc/linalg-to-llvm.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/Jax/qoc/linalg-to-cpu.pp b/test/Jax/qoc/linalg-to-cpu.pp index 8a606cf24..60d715679 100644 --- a/test/Jax/qoc/linalg-to-cpu.pp +++ b/test/Jax/qoc/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Jax/qoc/linalg-to-llvm.pp b/test/Jax/qoc/linalg-to-llvm.pp index cfd0ea77b..51e748124 100644 --- a/test/Jax/qoc/linalg-to-llvm.pp +++ b/test/Jax/qoc/linalg-to-llvm.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops diff --git a/test/Models/Mobilenet-v3/linalg-to-cpu.pp b/test/Models/Mobilenet-v3/linalg-to-cpu.pp index 26e956ceb..769dcdffb 100644 --- a/test/Models/Mobilenet-v3/linalg-to-cpu.pp +++ b/test/Models/Mobilenet-v3/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Models/Mobilenet-v3/linalg-to-llvm.pp b/test/Models/Mobilenet-v3/linalg-to-llvm.pp index cfd0ea77b..51e748124 100644 --- a/test/Models/Mobilenet-v3/linalg-to-llvm.pp +++ b/test/Models/Mobilenet-v3/linalg-to-llvm.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops diff --git a/test/Models/Resnet-50/linalg-to-cpu.pp b/test/Models/Resnet-50/linalg-to-cpu.pp index f3209077f..d938beeac 100644 --- a/test/Models/Resnet-50/linalg-to-cpu.pp +++ b/test/Models/Resnet-50/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Models/Resnet-50/linalg-to-llvm.pp b/test/Models/Resnet-50/linalg-to-llvm.pp index cfd0ea77b..51e748124 100644 --- a/test/Models/Resnet-50/linalg-to-llvm.pp +++ b/test/Models/Resnet-50/linalg-to-llvm.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops diff --git a/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir b/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir index c5b183546..8915bdbb0 100644 --- a/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir +++ b/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir @@ -1,6 +1,6 @@ -//imex-opt OpTest.EltwiseAdd.dynamic.mlir --pass-pipeline="builtin.module(convert-tensor-to-linalg,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,eliminate-empty-tensors,scf-bufferize,shape-bufferize,linalg-bufferize,bufferization-bufferize,tensor-bufferize),func-bufferize,func.func(finalizing-bufferize,convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu))" +//imex-opt OpTest.EltwiseAdd.dynamic.mlir --pass-pipeline='builtin.module(convert-tensor-to-linalg,func.func(empty-tensor-to-alloc-tensor,eliminate-empty-tensors),one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries},func.func(convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu))' // RUN: %python_executable %imex_runner -i %s --pass-pipeline-file=%p/linalg-to-cpu.pp \ // RUN: --runner mlir-cpu-runner -e main \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/test/PlaidML/linalg-to-cpu.pp b/test/PlaidML/linalg-to-cpu.pp index
19d5fe506..dcbddd5d3 100644 --- a/test/PlaidML/linalg-to-cpu.pp +++ b/test/PlaidML/linalg-to-cpu.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/PlaidML/linalg-to-llvm-caching.pp b/test/PlaidML/linalg-to-llvm-caching.pp index d351a38b9..0cb0a6835 100644 --- a/test/PlaidML/linalg-to-llvm-caching.pp +++ b/test/PlaidML/linalg-to-llvm-caching.pp @@ -1,17 +1,11 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor + func.func(empty-tensor-to-alloc-tensor) //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops + imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) // insert-gpu-allocs pass can have client-api = opencl or vulkan args diff --git a/test/PlaidML/linalg-to-llvm.pp b/test/PlaidML/linalg-to-llvm.pp index cfd0ea77b..c250df591 100644 --- a/test/PlaidML/linalg-to-llvm.pp +++ b/test/PlaidML/linalg-to-llvm.pp @@ -1,17 +1,10 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor + func.func(empty-tensor-to-alloc-tensor) //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/imex-runner/fullgpu.pp b/test/imex-runner/fullgpu.pp index 45f997df8..075bdece0 100644 --- a/test/imex-runner/fullgpu.pp +++ b/test/imex-runner/fullgpu.pp @@ -17,18 +17,10 @@ linalg-fuse-elementwise-ops, arith-expand, memref-expand, - arith-bufferize, - func-bufferize, func.func(empty-tensor-to-alloc-tensor), - func.func(scf-bufferize), - func.func(tensor-bufferize), - func.func(bufferization-bufferize), - func.func(linalg-bufferize), - func.func(linalg-detensorize), - func.func(tensor-bufferize), + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} region-bufferize, canonicalize, - func.func(finalizing-bufferize), imex-remove-temporaries, func.func(convert-linalg-to-parallel-loops), func.func(scf-parallel-loop-fusion), diff --git a/test/imex-runner/ndarray.pp b/test/imex-runner/ndarray.pp index 494db314e..4aacc84c1 100644 --- a/test/imex-runner/ndarray.pp +++ b/test/imex-runner/ndarray.pp @@ -8,16 +8,10 @@ linalg-fuse-elementwise-ops arith-expand memref-expand - arith-bufferize - func-bufferize func.func(empty-tensor-to-alloc-tensor) - func.func(scf-bufferize) - func.func(tensor-bufferize) - func.func(bufferization-bufferize) - func.func(linalg-bufferize) func.func(linalg-detensorize) - func.func(tensor-bufferize) - 
func.func(finalizing-bufferize) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline imex-remove-temporaries func.func(convert-linalg-to-parallel-loops) func.func(scf-parallel-loop-fusion)