From bdb7cb5f7b8a5ff6e9ed083e378fbd2314827e47 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Wed, 3 Jul 2024 09:28:53 +0000 Subject: [PATCH 1/9] lower --- build_tools/llvm_version.txt | 2 +- .../XeTileToXeGPU/XeTileToXeGPUConversion.h | 2 +- include/imex/Utils/GPUSerialize.h | 18 ++++++++++++++++++ lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp | 6 +++--- lib/Conversion/XeGPUToVC/XeGPUToVC.cpp | 3 ++- .../XeTileToXeGPU/SCFOpConversion.cpp | 3 ++- .../XeTileToXeGPU/XeTileToXeGPUConversion.cpp | 4 ++-- .../XeTile/Transforms/BlockAligning.cpp | 2 +- lib/Dialect/XeTile/Transforms/Blocking.cpp | 2 +- lib/Transforms/SerializeSPIRV.cpp | 17 +++++++++++++++-- lib/Transforms/SetSPIRVCapabilities.cpp | 4 ++-- 11 files changed, 48 insertions(+), 15 deletions(-) create mode 100644 include/imex/Utils/GPUSerialize.h diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt index 9d6bd8335..7b14b14ba 100644 --- a/build_tools/llvm_version.txt +++ b/build_tools/llvm_version.txt @@ -1 +1 @@ -1728a56d0e66c9e64a2e62fa6c5508580ccd28a0 +37661a17e26d9002ae9ade8c0de3932c22f16360 diff --git a/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h b/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h index 4712704b1..0f5919bc0 100644 --- a/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h +++ b/include/imex/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.h @@ -67,7 +67,7 @@ class XeGPUOneToNPatterRewriter : public mlir::PatternRewriter, } mlir::Block * - applySignatureConversion(mlir::Region *region, + applySignatureConversion(mlir::Block *block, mlir::TypeConverter::SignatureConversion &conversion, const mlir::TypeConverter *converter = nullptr); diff --git a/include/imex/Utils/GPUSerialize.h b/include/imex/Utils/GPUSerialize.h new file mode 100644 index 000000000..c6eef0c9a --- /dev/null +++ b/include/imex/Utils/GPUSerialize.h @@ -0,0 +1,18 @@ +//===- GPUSerialize.h - Pass Utility Functions --------------------*- C++ +//-*-===// +// +// 
Copyright 2024 Intel Corporation +// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _IMEX_GPUSERIALIZE_H_ +#define _IMEX_GPUSERIALIZE_H_ + +namespace imex { +static constexpr const char *gpuBinaryAttrName = "gpu.binary"; +} // namespace imex + +#endif // _IMEX_GPUSERIALIZE_H_ diff --git a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp index 25e445c20..a99caac62 100644 --- a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp +++ b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp @@ -19,6 +19,7 @@ #include "imex/Utils/FuncUtils.hpp" #include "imex/Utils/TypeConversion.hpp" +#include "imex/Utils/GPUSerialize.h" #include "../PassDetail.h" @@ -591,8 +592,7 @@ void GPUXToLLVMPass::runOnOperation() { mlir::populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, target); - mlir::populateGpuToLLVMConversionPatterns( - converter, patterns, mlir::gpu::getDefaultGpuBinaryAnnotation()); + mlir::populateGpuToLLVMConversionPatterns(converter, patterns); imex::populateControlFlowTypeConversionRewritesAndTarget(converter, patterns, target); @@ -636,7 +636,7 @@ void imex::populateGpuxToLLVMPatternsAndLegality( >(converter); patterns.add( - converter, mlir::gpu::getDefaultGpuBinaryAnnotation()); + converter, imex::gpuBinaryAttrName); target.addIllegalDialect(); target.addIllegalDialect(); diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp index eb347a5fd..a1dacd840 100644 --- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp +++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp @@ -1431,7 +1431,8 @@ struct SCFForOpBlockVCPattern final newOp.getRegion().getArgument(i).getType()); } - rewriter.applySignatureConversion(&op.getRegion(), signatureConverter); + 
rewriter.applySignatureConversion(&op.getRegion().getBlocks().front(), + signatureConverter); rewriter.eraseBlock(newOp.getBody()); rewriter.inlineRegionBefore(op.getRegion(), newOp.getRegion(), diff --git a/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp index ebd1fc2bf..5d3c61efd 100644 --- a/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp @@ -59,7 +59,8 @@ struct SgSCFForOpBlockPattern // apply the signature convertion for SCFFor body arguments, an // UnrealizedConversionCastOp will be inserted by typeConverter - rewriter.applySignatureConversion(&op.getRegion(), argumentMapping); + rewriter.applySignatureConversion(&op.getRegion().getBlocks().front(), + argumentMapping); if (newOp.getBody()) rewriter.eraseBlock(newOp.getBody()); diff --git a/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp index 30c036b23..84cf6eb47 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp @@ -227,9 +227,9 @@ XeGPUTypeConverter::computeTypeMapping(mlir::ValueRange original, } mlir::Block *XeGPUOneToNPatterRewriter::applySignatureConversion( - mlir::Region *region, mlir::TypeConverter::SignatureConversion &conversion, + mlir::Block *block, mlir::TypeConverter::SignatureConversion &conversion, const mlir::TypeConverter *converter) { - return rewriter.applySignatureConversion(region, conversion, converter); + return rewriter.applySignatureConversion(block, conversion, converter); } void XeGPUOneToNPatterRewriter::replaceOp(mlir::Operation *op, diff --git a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp index 34a37541f..5ed587d90 100644 --- a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp @@ -304,7 +304,7 @@ class 
XeTileBlockAligningPass : public imex::impl::XeTileBlockAligningBase< // Use TopDown traversal order, and only look at existing ops // to simpliy the code logic and speedup the pass mlir::GreedyRewriteConfig config; - config.enableRegionSimplification = false; + config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled; config.useTopDownTraversal = true; config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps; if (failed( diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp index 28ecf4506..00ba3096b 100644 --- a/lib/Dialect/XeTile/Transforms/Blocking.cpp +++ b/lib/Dialect/XeTile/Transforms/Blocking.cpp @@ -1124,7 +1124,7 @@ class XeTileBlockingPass // Use TopDown traversal order, and only look at existing ops // to simpliy the code logic and speedup the pass mlir::GreedyRewriteConfig config; - config.enableRegionSimplification = false; + config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled; config.useTopDownTraversal = true; config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps; { // initialize the inner block size per op. 
diff --git a/lib/Transforms/SerializeSPIRV.cpp b/lib/Transforms/SerializeSPIRV.cpp index 56f3e535f..5c8aa41a8 100644 --- a/lib/Transforms/SerializeSPIRV.cpp +++ b/lib/Transforms/SerializeSPIRV.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "PassDetail.h" +#include "imex/Utils/GPUSerialize.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Transforms/Passes.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" @@ -54,13 +55,25 @@ struct SerializeSPIRVPass : public SerializeSPIRVPassBase { signalPassFailure(); return; } - // attach the spv binary to the gpu module auto spvData = llvm::StringRef(reinterpret_cast(spvBinary.data()), spvBinary.size() * sizeof(uint32_t)); + if(auto targets = gpuMod.getTargets()) { + auto& tgts = *targets; + if(tgts.size()!=1) { + spvMod.emitError() << "Failed to serialize SPIR-V module"; + signalPassFailure(); + return; + } + auto target = tgts.begin(); + } else { + spvMod.emitError() << "Failed to serialize SPIR-V module"; + signalPassFailure(); + return; + } auto spvAttr = mlir::StringAttr::get(&getContext(), spvData); - gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr); + gpuMod->setAttr(imex::gpuBinaryAttrName, spvAttr); spvMod->erase(); } } diff --git a/lib/Transforms/SetSPIRVCapabilities.cpp b/lib/Transforms/SetSPIRVCapabilities.cpp index b7b3787a6..37b20eb3e 100644 --- a/lib/Transforms/SetSPIRVCapabilities.cpp +++ b/lib/Transforms/SetSPIRVCapabilities.cpp @@ -92,7 +92,7 @@ struct SetSPIRVCapabilitiesPass spirv::DeviceType::Unknown, spirv::TargetEnvAttr::kUnknownDeviceID); auto op = getOperation(); op->walk([&](mlir::gpu::GPUModuleOp op) { - op->setAttr(spirv::getTargetEnvAttrName(), attr); + op.setTargetsAttr(mlir::ArrayAttr::get(op.getContext(), {attr})); }); } else if (m_clientAPI == "vulkan") { auto triple = spirv::VerCapExtAttr::get( @@ -103,7 +103,7 @@ struct SetSPIRVCapabilitiesPass spirv::DeviceType::Unknown, 
spirv::TargetEnvAttr::kUnknownDeviceID); auto op = getOperation(); op->walk([&](mlir::gpu::GPUModuleOp op) { - op->setAttr(spirv::getTargetEnvAttrName(), attr); + op.setTargetsAttr(mlir::ArrayAttr::get(op.getContext(), {attr})); }); } } From 04c797cb0d0922cff15dfe4eaaa6337aff0017e5 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 4 Jul 2024 06:16:35 +0000 Subject: [PATCH 2/9] fix pass --- lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp | 37 ++++++++++++++------ lib/Transforms/SerializeSPIRV.cpp | 13 ------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp index a99caac62..7058a17e0 100644 --- a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp +++ b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp @@ -18,8 +18,8 @@ #include "imex/Dialect/GPUX/IR/GPUXOps.h" #include "imex/Utils/FuncUtils.hpp" -#include "imex/Utils/TypeConversion.hpp" #include "imex/Utils/GPUSerialize.h" +#include "imex/Utils/TypeConversion.hpp" #include "../PassDetail.h" @@ -518,6 +518,20 @@ class ConvertLaunchFuncOpToGpuRuntimeCallPattern } }; +class RemoveGPUModulePattern + : public mlir::ConvertOpToLLVMPattern { +public: + RemoveGPUModulePattern(mlir::LLVMTypeConverter &converter) + : mlir::ConvertOpToLLVMPattern(converter) {} + mlir::LogicalResult + matchAndRewrite(mlir::gpu::GPUModuleOp op, + mlir::gpu::GPUModuleOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + rewriter.eraseOp(op); + return mlir::success(); + } +}; + /// A rewrite pattern to convert gpux.create_stream operations into a GPU /// runtime call. 
class ConvertGpuStreamCreatePattern @@ -584,18 +598,18 @@ void GPUXToLLVMPass::runOnOperation() { mlir::RewritePatternSet patterns(&context); mlir::LLVMConversionTarget target(context); - mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns); - mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); - mlir::populateVectorToLLVMConversionPatterns(converter, patterns); - mlir::populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); - mlir::populateFuncToLLVMConversionPatterns(converter, patterns); - mlir::populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, - target); +// mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns); +// mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); +// mlir::populateVectorToLLVMConversionPatterns(converter, patterns); +// mlir::populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns); +// mlir::populateFuncToLLVMConversionPatterns(converter, patterns); +// mlir::populateAsyncStructuralTypeConversionsAndLegality(converter, patterns, +// target); mlir::populateGpuToLLVMConversionPatterns(converter, patterns); - imex::populateControlFlowTypeConversionRewritesAndTarget(converter, patterns, - target); +// imex::populateControlFlowTypeConversionRewritesAndTarget(converter, patterns, +// target); imex::populateGpuxToLLVMPatternsAndLegality(converter, patterns, target); @@ -631,7 +645,8 @@ void imex::populateGpuxToLLVMPatternsAndLegality( ConvertGpuStreamCreatePattern, ConvertGpuStreamDestroyPattern, ConvertAllocOpToGpuRuntimeCallPattern, - ConvertDeallocOpToGpuRuntimeCallPattern + ConvertDeallocOpToGpuRuntimeCallPattern, + RemoveGPUModulePattern // clang-format on >(converter); diff --git a/lib/Transforms/SerializeSPIRV.cpp b/lib/Transforms/SerializeSPIRV.cpp index 5c8aa41a8..ada0f1285 100644 --- a/lib/Transforms/SerializeSPIRV.cpp +++ b/lib/Transforms/SerializeSPIRV.cpp @@ -59,19 +59,6 @@ struct SerializeSPIRVPass : 
public SerializeSPIRVPassBase { auto spvData = llvm::StringRef(reinterpret_cast(spvBinary.data()), spvBinary.size() * sizeof(uint32_t)); - if(auto targets = gpuMod.getTargets()) { - auto& tgts = *targets; - if(tgts.size()!=1) { - spvMod.emitError() << "Failed to serialize SPIR-V module"; - signalPassFailure(); - return; - } - auto target = tgts.begin(); - } else { - spvMod.emitError() << "Failed to serialize SPIR-V module"; - signalPassFailure(); - return; - } auto spvAttr = mlir::StringAttr::get(&getContext(), spvData); gpuMod->setAttr(imex::gpuBinaryAttrName, spvAttr); spvMod->erase(); From 84d9b7d1b6ef962c9644d1518b16c3612d7c5efa Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 4 Jul 2024 06:32:11 +0000 Subject: [PATCH 3/9] stash --- lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp | 17 ++++++++++++++++- test/Gen/NDArray/ndarray-gpu.pp | 10 ++-------- test/Gen/NDArray/ndarray.pp | 14 ++++---------- test/Jax/jax_qmc/linalg-to-cpu.pp | 14 ++++---------- test/PlaidML/linalg-to-cpu.pp | 15 ++++----------- test/PlaidML/linalg-to-llvm-caching.pp | 15 +++++---------- test/PlaidML/linalg-to-llvm.pp | 14 ++++---------- test/imex-runner/ndarray.pp | 10 ++-------- 8 files changed, 41 insertions(+), 68 deletions(-) diff --git a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp index 7058a17e0..c7bf0de1d 100644 --- a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp +++ b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp @@ -532,6 +532,20 @@ class RemoveGPUModulePattern } }; +class RemoveGPUFuncPattern + : public mlir::ConvertOpToLLVMPattern { +public: + RemoveGPUFuncPattern(mlir::LLVMTypeConverter &converter) + : mlir::ConvertOpToLLVMPattern(converter) {} + mlir::LogicalResult + matchAndRewrite(mlir::gpu::GPUFuncOp op, + mlir::gpu::GPUFuncOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + rewriter.eraseOp(op); + return mlir::success(); + } +}; + /// A rewrite pattern to convert gpux.create_stream operations 
into a GPU /// runtime call. class ConvertGpuStreamCreatePattern @@ -646,7 +660,8 @@ void imex::populateGpuxToLLVMPatternsAndLegality( ConvertGpuStreamDestroyPattern, ConvertAllocOpToGpuRuntimeCallPattern, ConvertDeallocOpToGpuRuntimeCallPattern, - RemoveGPUModulePattern + RemoveGPUModulePattern, + RemoveGPUFuncPattern // clang-format on >(converter); diff --git a/test/Gen/NDArray/ndarray-gpu.pp b/test/Gen/NDArray/ndarray-gpu.pp index 013823308..d6e00f890 100644 --- a/test/Gen/NDArray/ndarray-gpu.pp +++ b/test/Gen/NDArray/ndarray-gpu.pp @@ -9,16 +9,10 @@ linalg-fuse-elementwise-ops arith-expand memref-expand - arith-bufferize - func-bufferize func.func(empty-tensor-to-alloc-tensor) - func.func(scf-bufferize) - func.func(tensor-bufferize) - func.func(bufferization-bufferize) - func.func(linalg-bufferize) func.func(linalg-detensorize) - func.func(tensor-bufferize) - func.func(finalizing-bufferize) + one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" + buffer-deallocation-pipeline imex-remove-temporaries func.func(convert-linalg-to-parallel-loops) func.func(scf-parallel-loop-fusion) diff --git a/test/Gen/NDArray/ndarray.pp b/test/Gen/NDArray/ndarray.pp index 3e197ca87..abd4a7ce8 100644 --- a/test/Gen/NDArray/ndarray.pp +++ b/test/Gen/NDArray/ndarray.pp @@ -8,18 +8,12 @@ linalg-fuse-elementwise-ops convert-shape-to-std arith-expand - arith-bufferize - func-bufferize func.func( empty-tensor-to-alloc-tensor - scf-bufferize - tensor-bufferize - linalg-bufferize - bufferization-bufferize - linalg-detensorize - tensor-bufferize - finalizing-bufferize - convert-linalg-to-parallel-loops) + linalg-detensorize) + one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" + buffer-deallocation-pipeline + func.func(convert-linalg-to-parallel-loops) drop-regions canonicalize fold-memref-alias-ops 
diff --git a/test/Jax/jax_qmc/linalg-to-cpu.pp b/test/Jax/jax_qmc/linalg-to-cpu.pp index 4aa276de3..9ad496301 100644 --- a/test/Jax/jax_qmc/linalg-to-cpu.pp +++ b/test/Jax/jax_qmc/linalg-to-cpu.pp @@ -3,16 +3,10 @@ convert-tensor-to-linalg convert-elementwise-to-linalg arith-bufferize -func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) -func-bufferize -func.func(finalizing-bufferize - convert-linalg-to-loops) +func.func(empty-tensor-to-alloc-tensor) +one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} +buffer-deallocation-pipeline +func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/PlaidML/linalg-to-cpu.pp b/test/PlaidML/linalg-to-cpu.pp index 19d5fe506..dcbddd5d3 100644 --- a/test/PlaidML/linalg-to-cpu.pp +++ b/test/PlaidML/linalg-to-cpu.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/PlaidML/linalg-to-llvm-caching.pp b/test/PlaidML/linalg-to-llvm-caching.pp index d351a38b9..6f002526f 100644 --- a/test/PlaidML/linalg-to-llvm-caching.pp +++ b/test/PlaidML/linalg-to-llvm-caching.pp @@ -1,17 +1,12 @@ // linalg dialect to gpu dialect lowering pipeline 
// Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor + func.func(empty-tensor-to-alloc-tensor) //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-parallel-loops + imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) // insert-gpu-allocs pass can have client-api = opencl or vulkan args diff --git a/test/PlaidML/linalg-to-llvm.pp b/test/PlaidML/linalg-to-llvm.pp index cfd0ea77b..f4b3775ce 100644 --- a/test/PlaidML/linalg-to-llvm.pp +++ b/test/PlaidML/linalg-to-llvm.pp @@ -1,17 +1,11 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor + func.func(empty-tensor-to-alloc-tensor) //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/imex-runner/ndarray.pp b/test/imex-runner/ndarray.pp index 494db314e..4aacc84c1 100644 --- a/test/imex-runner/ndarray.pp +++ b/test/imex-runner/ndarray.pp @@ -8,16 +8,10 @@ linalg-fuse-elementwise-ops arith-expand memref-expand - arith-bufferize - func-bufferize func.func(empty-tensor-to-alloc-tensor) - func.func(scf-bufferize) - func.func(tensor-bufferize) - func.func(bufferization-bufferize) - func.func(linalg-bufferize) func.func(linalg-detensorize) - func.func(tensor-bufferize) - func.func(finalizing-bufferize) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline imex-remove-temporaries func.func(convert-linalg-to-parallel-loops) func.func(scf-parallel-loop-fusion) From 06a4726dae36f58d141471464dfbabc7f2614106 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Thu, 4 Jul 2024 08:43:35 +0000 Subject: [PATCH 4/9] lower ok --- lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp | 3 +-- lib/Transforms/SetSPIRVCapabilities.cpp | 4 ++-- test/Gen/NDArray/ndarray-gpu.pp | 3 +-- test/PlaidML/linalg-to-llvm-caching.pp | 1 - test/PlaidML/linalg-to-llvm.pp | 1 - 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp 
b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp index c7bf0de1d..d4055c2be 100644 --- a/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp +++ b/lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp @@ -660,8 +660,7 @@ void imex::populateGpuxToLLVMPatternsAndLegality( ConvertGpuStreamDestroyPattern, ConvertAllocOpToGpuRuntimeCallPattern, ConvertDeallocOpToGpuRuntimeCallPattern, - RemoveGPUModulePattern, - RemoveGPUFuncPattern + RemoveGPUModulePattern // clang-format on >(converter); diff --git a/lib/Transforms/SetSPIRVCapabilities.cpp b/lib/Transforms/SetSPIRVCapabilities.cpp index 37b20eb3e..b7b3787a6 100644 --- a/lib/Transforms/SetSPIRVCapabilities.cpp +++ b/lib/Transforms/SetSPIRVCapabilities.cpp @@ -92,7 +92,7 @@ struct SetSPIRVCapabilitiesPass spirv::DeviceType::Unknown, spirv::TargetEnvAttr::kUnknownDeviceID); auto op = getOperation(); op->walk([&](mlir::gpu::GPUModuleOp op) { - op.setTargetsAttr(mlir::ArrayAttr::get(op.getContext(), {attr})); + op->setAttr(spirv::getTargetEnvAttrName(), attr); }); } else if (m_clientAPI == "vulkan") { auto triple = spirv::VerCapExtAttr::get( @@ -103,7 +103,7 @@ struct SetSPIRVCapabilitiesPass spirv::DeviceType::Unknown, spirv::TargetEnvAttr::kUnknownDeviceID); auto op = getOperation(); op->walk([&](mlir::gpu::GPUModuleOp op) { - op.setTargetsAttr(mlir::ArrayAttr::get(op.getContext(), {attr})); + op->setAttr(spirv::getTargetEnvAttrName(), attr); }); } } diff --git a/test/Gen/NDArray/ndarray-gpu.pp b/test/Gen/NDArray/ndarray-gpu.pp index d6e00f890..00aa0d7f8 100644 --- a/test/Gen/NDArray/ndarray-gpu.pp +++ b/test/Gen/NDArray/ndarray-gpu.pp @@ -11,8 +11,7 @@ memref-expand func.func(empty-tensor-to-alloc-tensor) func.func(linalg-detensorize) - one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" - buffer-deallocation-pipeline + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map 
bufferize-function-boundaries} imex-remove-temporaries func.func(convert-linalg-to-parallel-loops) func.func(scf-parallel-loop-fusion) diff --git a/test/PlaidML/linalg-to-llvm-caching.pp b/test/PlaidML/linalg-to-llvm-caching.pp index 6f002526f..0cb0a6835 100644 --- a/test/PlaidML/linalg-to-llvm-caching.pp +++ b/test/PlaidML/linalg-to-llvm-caching.pp @@ -4,7 +4,6 @@ func.func(empty-tensor-to-alloc-tensor) //eliminate-empty-tensors one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} - buffer-deallocation-pipeline func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops diff --git a/test/PlaidML/linalg-to-llvm.pp b/test/PlaidML/linalg-to-llvm.pp index f4b3775ce..c250df591 100644 --- a/test/PlaidML/linalg-to-llvm.pp +++ b/test/PlaidML/linalg-to-llvm.pp @@ -4,7 +4,6 @@ func.func(empty-tensor-to-alloc-tensor) //eliminate-empty-tensors one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} - buffer-deallocation-pipeline func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops From 496b240093b5e132b60c5ee69878300fe69be300 Mon Sep 17 00:00:00 2001 From: "Mei, Yijie" Date: Fri, 5 Jul 2024 04:06:39 +0000 Subject: [PATCH 5/9] fix --- test/Gen/PlaidML/linalg-to-cpu.pp | 15 ++++----------- .../Linalg/OpenCL/linalg-to-gpux-opencl.pp | 14 +++----------- .../Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp | 13 ++----------- test/Jax/gordon/linalg-to-cpu.pp | 15 ++++----------- test/Jax/gordon/linalg-to-llvm.pp | 14 +++----------- test/Jax/janet/linalg-to-cpu.pp | 15 ++++----------- test/Jax/janet/linalg-to-llvm.pp | 14 +++----------- test/Jax/jax_qmc/linalg-to-cpu.pp | 1 - test/Jax/jax_qmc/linalg-to-llvm.pp | 14 +++----------- test/Jax/qoc/linalg-to-cpu.pp | 15 ++++----------- test/Jax/qoc/linalg-to-llvm.pp | 13 
+++---------- test/Models/Mobilenet-v3/linalg-to-cpu.pp | 15 ++++----------- test/Models/Mobilenet-v3/linalg-to-llvm.pp | 13 +++---------- test/Models/Resnet-50/linalg-to-cpu.pp | 15 ++++----------- test/Models/Resnet-50/linalg-to-llvm.pp | 13 +++---------- test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir | 2 +- test/imex-runner/fullgpu.pp | 10 +--------- 17 files changed, 49 insertions(+), 162 deletions(-) diff --git a/test/Gen/PlaidML/linalg-to-cpu.pp b/test/Gen/PlaidML/linalg-to-cpu.pp index 537ea2ea8..69fdde3ce 100644 --- a/test/Gen/PlaidML/linalg-to-cpu.pp +++ b/test/Gen/PlaidML/linalg-to-cpu.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) func.func(llvm-request-c-wrappers) convert-scf-to-cf convert-cf-to-llvm diff --git a/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp b/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp index 8081e02a4..e40f60848 100644 --- a/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp +++ b/test/Integration/Dialect/Linalg/OpenCL/linalg-to-gpux-opencl.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops gpu-map-parallel-loops convert-parallel-loops-to-gpu) // insert-gpu-allocs pass can have client-api = opencl or vulkan args diff --git a/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp b/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp index bb778eb4b..f1822f4d2 100644 --- a/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp +++ b/test/Integration/Dialect/Linalg/Vulkan/linalg-to-gpu-vulkan.pp @@ -2,18 +2,9 @@ // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module( convert-tensor-to-linalg - arith-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} func.func( - empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func( - finalizing-bufferize convert-linalg-to-parallel-loops gpu-map-parallel-loops convert-parallel-loops-to-gpu diff --git a/test/Jax/gordon/linalg-to-cpu.pp b/test/Jax/gordon/linalg-to-cpu.pp index a11fe09ee..3d739b66a 100644 --- a/test/Jax/gordon/linalg-to-cpu.pp +++ b/test/Jax/gordon/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Jax/gordon/linalg-to-llvm.pp b/test/Jax/gordon/linalg-to-llvm.pp index cfd0ea77b..992e98278 100644 --- a/test/Jax/gordon/linalg-to-llvm.pp +++ b/test/Jax/gordon/linalg-to-llvm.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/Jax/janet/linalg-to-cpu.pp b/test/Jax/janet/linalg-to-cpu.pp index a11fe09ee..3d739b66a 100644 --- a/test/Jax/janet/linalg-to-cpu.pp +++ b/test/Jax/janet/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Jax/janet/linalg-to-llvm.pp b/test/Jax/janet/linalg-to-llvm.pp index cfd0ea77b..992e98278 100644 --- a/test/Jax/janet/linalg-to-llvm.pp +++ b/test/Jax/janet/linalg-to-llvm.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/Jax/jax_qmc/linalg-to-cpu.pp b/test/Jax/jax_qmc/linalg-to-cpu.pp index 9ad496301..ba5485ef9 100644 --- a/test/Jax/jax_qmc/linalg-to-cpu.pp +++ b/test/Jax/jax_qmc/linalg-to-cpu.pp @@ -2,7 +2,6 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg -arith-bufferize func.func(empty-tensor-to-alloc-tensor) one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} buffer-deallocation-pipeline diff --git a/test/Jax/jax_qmc/linalg-to-llvm.pp b/test/Jax/jax_qmc/linalg-to-llvm.pp index 352b5babf..140f112d4 100644 --- a/test/Jax/jax_qmc/linalg-to-llvm.pp +++ b/test/Jax/jax_qmc/linalg-to-llvm.pp @@ -1,17 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-parallel-loops + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops convert-parallel-loops-to-gpu) diff --git a/test/Jax/qoc/linalg-to-cpu.pp b/test/Jax/qoc/linalg-to-cpu.pp index 8a606cf24..60d715679 100644 --- a/test/Jax/qoc/linalg-to-cpu.pp +++ b/test/Jax/qoc/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Jax/qoc/linalg-to-llvm.pp b/test/Jax/qoc/linalg-to-llvm.pp index cfd0ea77b..51e748124 100644 --- a/test/Jax/qoc/linalg-to-llvm.pp +++ b/test/Jax/qoc/linalg-to-llvm.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops diff --git a/test/Models/Mobilenet-v3/linalg-to-cpu.pp b/test/Models/Mobilenet-v3/linalg-to-cpu.pp index 26e956ceb..769dcdffb 100644 --- a/test/Models/Mobilenet-v3/linalg-to-cpu.pp +++ b/test/Models/Mobilenet-v3/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Models/Mobilenet-v3/linalg-to-llvm.pp b/test/Models/Mobilenet-v3/linalg-to-llvm.pp index cfd0ea77b..51e748124 100644 --- a/test/Models/Mobilenet-v3/linalg-to-llvm.pp +++ b/test/Models/Mobilenet-v3/linalg-to-llvm.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops diff --git a/test/Models/Resnet-50/linalg-to-cpu.pp b/test/Models/Resnet-50/linalg-to-cpu.pp index f3209077f..d938beeac 100644 --- a/test/Models/Resnet-50/linalg-to-cpu.pp +++ b/test/Models/Resnet-50/linalg-to-cpu.pp @@ -2,17 +2,10 @@ builtin.module(inline convert-tensor-to-linalg convert-elementwise-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize - convert-linalg-to-loops) + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + buffer-deallocation-pipeline + func.func(convert-linalg-to-loops) convert-scf-to-cf convert-cf-to-llvm convert-arith-to-llvm diff --git a/test/Models/Resnet-50/linalg-to-llvm.pp b/test/Models/Resnet-50/linalg-to-llvm.pp index cfd0ea77b..51e748124 100644 --- a/test/Models/Resnet-50/linalg-to-llvm.pp +++ b/test/Models/Resnet-50/linalg-to-llvm.pp @@ -1,16 +1,9 @@ // linalg dialect to gpu dialect lowering pipeline // Ready for vulkan runner or narrow scope l0/sycl runner starting from GPU dialect. 
builtin.module(convert-tensor-to-linalg - arith-bufferize - func.func(empty-tensor-to-alloc-tensor - //eliminate-empty-tensors - scf-bufferize - shape-bufferize - linalg-bufferize - bufferization-bufferize - tensor-bufferize) - func-bufferize - func.func(finalizing-bufferize + func.func(empty-tensor-to-alloc-tensor) + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} + func.func(convert-linalg-to-parallel-loops convert-linalg-to-parallel-loops imex-add-outer-parallel-loop gpu-map-parallel-loops diff --git a/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir b/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir index c5b183546..8915bdbb0 100644 --- a/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir +++ b/test/PlaidML/OpTest.EltwiseAdd.dynamic.mlir @@ -1,6 +1,6 @@ -//imex-opt OpTest.EltwiseAdd.dynamic.mlir --pass-pipeline="builtin.module(convert-tensor-to-linalg,arith-bufferize,func.func(empty-tensor-to-alloc-tensor,eliminate-empty-tensors,scf-bufferize,shape-bufferize,linalg-bufferize,bufferization-bufferize,tensor-bufferize),func-bufferize,func.func(finalizing-bufferize,convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu))" +//imex-opt OpTest.EltwiseAdd.dynamic.mlir --pass-pipeline='builtin.module(convert-tensor-to-linalg,func.func(empty-tensor-to-alloc-tensor,eliminate-empty-tensors),one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries},func.func(convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu))' // RUN: %python_executable %imex_runner -i %s --pass-pipeline-file=%p/linalg-to-cpu.pp \ // RUN: --runner mlir-cpu-runner -e main \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/test/imex-runner/fullgpu.pp b/test/imex-runner/fullgpu.pp index 45f997df8..075bdece0 
100644 --- a/test/imex-runner/fullgpu.pp +++ b/test/imex-runner/fullgpu.pp @@ -17,18 +17,10 @@ linalg-fuse-elementwise-ops, arith-expand, memref-expand, - arith-bufferize, - func-bufferize, func.func(empty-tensor-to-alloc-tensor), - func.func(scf-bufferize), - func.func(tensor-bufferize), - func.func(bufferization-bufferize), - func.func(linalg-bufferize), - func.func(linalg-detensorize), - func.func(tensor-bufferize), + one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries} region-bufferize, canonicalize, - func.func(finalizing-bufferize), imex-remove-temporaries, func.func(convert-linalg-to-parallel-loops), func.func(scf-parallel-loop-fusion), From 6876a47bee8fdee06a17ec3e6c06562cb31e35f6 Mon Sep 17 00:00:00 2001 From: dchigarev Date: Wed, 31 Jul 2024 10:09:52 +0000 Subject: [PATCH 6/9] Add gpu allocs for xegpu dialect Signed-off-by: dchigarev --- lib/Transforms/InsertGPUAllocs.cpp | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp index 893be344f..988f4c1d8 100644 --- a/lib/Transforms/InsertGPUAllocs.cpp +++ b/lib/Transforms/InsertGPUAllocs.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -158,6 +159,9 @@ class InsertGPUAllocsPass final } else if (auto init_tile = mlir::dyn_cast(op)) { return {{init_tile.getSource()}}; + } else if (auto init_xedesc = + mlir::dyn_cast(op)) { + return {{init_xedesc.getSource()}}; } else { op->emitError("Uhhandled mem op in gpu region"); return std::nullopt; @@ -187,6 +191,9 @@ class InsertGPUAllocsPass final // Only handle the case where the tile source is a memref return init_tile.isSourceMemRef(); } + if (auto init_xedesc = mlir::dyn_cast(op)) { + return true; + } return false; }; @@ -259,6 +266,36 @@ class InsertGPUAllocsPass final return; } + // walk over the users and find xegpu.load/store ops + 
std::function findXeGPULoadStore; + findXeGPULoadStore = [&](mlir::Operation *use, bool onDevice, AccessType& ret) { + if (auto tile_update = mlir::dyn_cast(use)) { + auto res = tile_update->getResult(0); + for (auto u : res.getUsers()) { + findXeGPULoadStore(u, onDevice, ret); + } + } + if (auto tile_for = mlir::dyn_cast<::mlir::scf::ForOp>(use)) { + for (size_t idx=0; idx(use)) { + (onDevice ? ret.deviceRead : ret.hostRead) = true; + } + else if (auto tile_prefetch = + mlir::dyn_cast(use)) { + (onDevice ? ret.deviceRead : ret.hostRead) = true; + } else if (auto tile_store = + mlir::dyn_cast(use)) { + (onDevice ? ret.deviceWrite : ret.hostWrite) = true; + } + }; + // Checks the access type of the OP under consideration. auto getAccessType = [&](mlir::Value memref) { AccessType ret; @@ -298,6 +335,15 @@ class InsertGPUAllocsPass final continue; } + if (auto init_xedesc = mlir::dyn_cast(user)) { + bool onDevice = user->getParentOfType(); + auto res = init_xedesc->getResult(0); + for (auto use : res.getUsers()) { + findXeGPULoadStore(use, onDevice, ret); + } + continue; + } + if (mlir::isa(user)) { ret.hostRead = true; ret.hostWrite = true; From 42e6f28f48f4801b5169306e9d6e018ee4a20525 Mon Sep 17 00:00:00 2001 From: dchigarev Date: Thu, 1 Aug 2024 12:28:53 +0000 Subject: [PATCH 7/9] Update llvm version Signed-off-by: dchigarev --- build_tools/llvm_version.txt | 2 +- lib/Conversion/XeGPUToVC/XeGPUToVC.cpp | 14 +++++++------- lib/Transforms/PropagatePackedLayout.cpp | 2 +- lib/Transforms/VectorLinearize.cpp | 14 +++++++------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt index 7b14b14ba..620fa4ef3 100644 --- a/build_tools/llvm_version.txt +++ b/build_tools/llvm_version.txt @@ -1 +1 @@ -37661a17e26d9002ae9ade8c0de3932c22f16360 +89946bda5e1c7ceaf6d26634cc8c8c9498d9f7be diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp index a1dacd840..b06e27720 100644 
--- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp +++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp @@ -783,7 +783,7 @@ class GatherScatterToRawSend : public OpConversionPattern { payLoad = rewriter.create(loc, base, payLoad, 0); SmallVector indices(16, 0); payLoad = rewriter.create( - loc, payLoad, payLoad, rewriter.getI64ArrayAttr(indices)); + loc, payLoad, payLoad, indices); auto createDescOp = op.getTensorDesc().template getDefiningOp(); auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets()); @@ -886,7 +886,7 @@ class AtomicToLsc : public OpConversionPattern<::mlir::xegpu::AtomicRMWOp> { SmallVector indices(16, 0); payLoad = rewriter.create( - loc, payLoad, payLoad, rewriter.getI64ArrayAttr(indices)); + loc, payLoad, payLoad, indices); auto createDescOp = op.getTensorDesc().getDefiningOp(); auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets()); payLoad = rewriter.create(loc, payLoad, offsets); @@ -1232,7 +1232,7 @@ struct VectorExtractVC final llvm::SmallVector indices(size); std::iota(indices.begin(), indices.end(), linearizedOffset); rewriter.replaceOpWithNewOp( - extractOp, vec, vec, rewriter.getI64ArrayAttr(indices)); + extractOp, vec, vec, indices); } else { // use CompositExtract for scalar result rewriter.replaceOpWithNewOp(extractOp, vec, linearizedOffset); @@ -1338,7 +1338,7 @@ struct VectorExtractStridedSliceVC final } // perform a shuffle to extract the kD vector rewriter.replaceOpWithNewOp( - extractOp, srcVector, srcVector, rewriter.getI64ArrayAttr(indices)); + extractOp, srcVector, srcVector, indices); return success(); } @@ -1392,16 +1392,16 @@ struct VectorShuffleVC final SmallVector indices(totalSize); for (auto [i, value] : - llvm::enumerate(mask.getAsValueRange())) { + llvm::enumerate(mask)) { - int32_t v = value.getZExtValue(); + int32_t v = value; std::iota(indices.begin() + shuffleSliceLen * i, indices.begin() + shuffleSliceLen * (i + 1), shuffleSliceLen * v); } rewriter.replaceOpWithNewOp( - shuffleOp, vec1, vec2, 
rewriter.getI64ArrayAttr(indices)); + shuffleOp, vec1, vec2, indices); return success(); } diff --git a/lib/Transforms/PropagatePackedLayout.cpp b/lib/Transforms/PropagatePackedLayout.cpp index bd79e4d29..3d30d4c89 100644 --- a/lib/Transforms/PropagatePackedLayout.cpp +++ b/lib/Transforms/PropagatePackedLayout.cpp @@ -330,7 +330,7 @@ makeCast(mlir::OpBuilder &builder, mlir::Value src, mlir::Type srcType, tmp = builder.create( loc, tmp, tmp, - builder.getI64ArrayAttr(getVNNIShuffleIndices(srcVecType, dstVecType))); + getVNNIShuffleIndices(srcVecType, dstVecType)); return {builder.create(loc, dstVecType, tmp), root}; diff --git a/lib/Transforms/VectorLinearize.cpp b/lib/Transforms/VectorLinearize.cpp index dd07a3a2c..cdeb18348 100644 --- a/lib/Transforms/VectorLinearize.cpp +++ b/lib/Transforms/VectorLinearize.cpp @@ -140,7 +140,7 @@ struct VectorExtractStridedSliceConversion final // perform a shuffle to extract the kD vector rewriter.replaceOpWithNewOp( extractOp, dstType, srcVector, srcVector, - rewriter.getI64ArrayAttr(indices)); + indices); } return mlir::success(); } @@ -180,16 +180,16 @@ struct VectorShffleOpConversion final llvm::SmallVector indices(totalSize); for (auto [i, value] : - llvm::enumerate(mask.getAsValueRange())) { + llvm::enumerate(mask)) { - int64_t v = value.getZExtValue(); + int64_t v = value; std::iota(indices.begin() + shuffleSliceLen * i, indices.begin() + shuffleSliceLen * (i + 1), shuffleSliceLen * v); } rewriter.replaceOpWithNewOp( - shuffleOp, dstType, vec1, vec2, rewriter.getI64ArrayAttr(indices)); + shuffleOp, dstType, vec1, vec2, indices); return mlir::success(); } @@ -232,7 +232,7 @@ struct VectorExtractOpConversion final std::iota(indices.begin(), indices.end(), linearizedOffset); rewriter.replaceOpWithNewOp( extractOp, dstTy, adaptor.getVector(), adaptor.getVector(), - rewriter.getI64ArrayAttr(indices)); + indices); } return mlir::success(); @@ -298,11 +298,11 @@ struct VectorInsertOpConversion final 0); auto modifiedSource = 
rewriter.create( insertOp.getLoc(), dstTy, adaptor.getSource(), adaptor.getSource(), - rewriter.getI64ArrayAttr(modifiedSrcIndices)); + modifiedSrcIndices); rewriter.replaceOpWithNewOp( insertOp, dstTy, adaptor.getDest(), modifiedSource, - rewriter.getI64ArrayAttr(indices)); + indices); return mlir::success(); } From 48c4c2c842a48801807a01c0ba49a3752665ea7b Mon Sep 17 00:00:00 2001 From: dchigarev Date: Thu, 1 Aug 2024 12:31:20 +0000 Subject: [PATCH 8/9] remove invalid patch Signed-off-by: dchigarev --- ...nt-fault-in-applySignatureConversion.patch | 28 ------------------- 1 file changed, 28 deletions(-) delete mode 100644 build_tools/patches/0006-fix-segment-fault-in-applySignatureConversion.patch diff --git a/build_tools/patches/0006-fix-segment-fault-in-applySignatureConversion.patch b/build_tools/patches/0006-fix-segment-fault-in-applySignatureConversion.patch deleted file mode 100644 index 6460a548b..000000000 --- a/build_tools/patches/0006-fix-segment-fault-in-applySignatureConversion.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 283951f026428a3c34b8a2b8f2498d55faf590f5 Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Fri, 26 Apr 2024 20:58:37 +0000 -Subject: [PATCH 6/7] fix-segment-fault-in-applySignatureConversion - ---- - mlir/lib/Transforms/Utils/DialectConversion.cpp | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp -index d407d60334c7..a5fa9660be15 100644 ---- a/mlir/lib/Transforms/Utils/DialectConversion.cpp -+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp -@@ -1460,8 +1460,10 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( - - // Legalize the argument output type. 
- Type outputType = origOutputType; -- if (Type legalOutputType = converter->convertType(outputType)) -- outputType = legalOutputType; -+ if (converter) { -+ if (Type legalOutputType = converter->convertType(outputType)) -+ outputType = legalOutputType; -+ } - - newArg = buildUnresolvedArgumentMaterialization( - newBlock, origArg.getLoc(), replArgs, origOutputType, outputType, --- -2.34.1 From d6101400c695411dec11e4ddb052181452b5afe9 Mon Sep 17 00:00:00 2001 From: dchigarev Date: Wed, 7 Aug 2024 11:21:40 +0000 Subject: [PATCH 9/9] Fix double deallocs Signed-off-by: dchigarev --- lib/Transforms/InsertGPUAllocs.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp index 988f4c1d8..c5decccb0 100644 --- a/lib/Transforms/InsertGPUAllocs.cpp +++ b/lib/Transforms/InsertGPUAllocs.cpp @@ -411,6 +411,15 @@ class InsertGPUAllocsPass final use.set(newAlloc.getResult()); } } + + // remove 'memref.dealloc' (it's later replaced with gpu.dealloc); + // early-inc range: erasing a user invalidates the use-list iterator + auto memory = alloc->getResult(0); + for (auto u : llvm::make_early_inc_range(memory.getUsers())) { + if (auto dealloc = mlir::dyn_cast<mlir::memref::DeallocOp>(u)) + dealloc.erase(); + } + alloc.replaceAllUsesWith(allocResult); builder.create(loc, std::nullopt, allocResult); alloc.erase();