Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build_tools/llvm_version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1728a56d0e66c9e64a2e62fa6c5508580ccd28a0
89946bda5e1c7ceaf6d26634cc8c8c9498d9f7be

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class XeGPUOneToNPatterRewriter : public mlir::PatternRewriter,
}

mlir::Block *
applySignatureConversion(mlir::Region *region,
applySignatureConversion(mlir::Block *block,
mlir::TypeConverter::SignatureConversion &conversion,
const mlir::TypeConverter *converter = nullptr);

Expand Down
18 changes: 18 additions & 0 deletions include/imex/Utils/GPUSerialize.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//===- GPUSerialize.h - Pass Utility Functions ------------------*- C++ -*-===//
//
// Copyright 2024 Intel Corporation
// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _IMEX_GPUSERIALIZE_H_
#define _IMEX_GPUSERIALIZE_H_

namespace imex {
/// Name of the attribute under which the serialized GPU binary is attached to
/// a GPU module ("gpu.binary").
///
/// Declared `inline` so that every translation unit including this header
/// refers to one single definition instead of a private per-TU copy.
inline constexpr const char *gpuBinaryAttrName = "gpu.binary";
} // namespace imex

#endif // _IMEX_GPUSERIALIZE_H_
55 changes: 42 additions & 13 deletions lib/Conversion/GPUXToLLVM/GPUXToLLVMPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "imex/Dialect/GPUX/IR/GPUXOps.h"

#include "imex/Utils/FuncUtils.hpp"
#include "imex/Utils/GPUSerialize.h"
#include "imex/Utils/TypeConversion.hpp"

#include "../PassDetail.h"
Expand Down Expand Up @@ -517,6 +518,34 @@ class ConvertLaunchFuncOpToGpuRuntimeCallPattern
}
};

/// Conversion pattern that deletes `gpu.module` operations.
///
/// Every matched `gpu.module` is erased unconditionally and the rewrite
/// reports success; no replacement operation is created.
class RemoveGPUModulePattern
    : public mlir::ConvertOpToLLVMPattern<mlir::gpu::GPUModuleOp> {
  using BasePattern = mlir::ConvertOpToLLVMPattern<mlir::gpu::GPUModuleOp>;

public:
  RemoveGPUModulePattern(mlir::LLVMTypeConverter &converter)
      : BasePattern(converter) {}

  /// Erases `gpuModule`. The adaptor (converted operands) is not consulted.
  mlir::LogicalResult
  matchAndRewrite(mlir::gpu::GPUModuleOp gpuModule, OpAdaptor /*adaptor*/,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    rewriter.eraseOp(gpuModule);
    return mlir::success();
  }
};

/// Conversion pattern that deletes `gpu.func` operations.
///
/// Every matched `gpu.func` is erased unconditionally and the rewrite reports
/// success; no replacement operation is created.
///
/// NOTE(review): this pattern does not appear in the pattern list visible in
/// populateGpuxToLLVMPatternsAndLegality -- confirm whether it is registered
/// elsewhere or is intentionally unused.
class RemoveGPUFuncPattern
    : public mlir::ConvertOpToLLVMPattern<mlir::gpu::GPUFuncOp> {
  using BasePattern = mlir::ConvertOpToLLVMPattern<mlir::gpu::GPUFuncOp>;

public:
  RemoveGPUFuncPattern(mlir::LLVMTypeConverter &converter)
      : BasePattern(converter) {}

  /// Erases `gpuFunc`. The adaptor (converted operands) is not consulted.
  mlir::LogicalResult
  matchAndRewrite(mlir::gpu::GPUFuncOp gpuFunc, OpAdaptor /*adaptor*/,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    rewriter.eraseOp(gpuFunc);
    return mlir::success();
  }
};

/// A rewrite pattern to convert gpux.create_stream operations into a GPU
/// runtime call.
class ConvertGpuStreamCreatePattern
Expand Down Expand Up @@ -583,19 +612,18 @@ void GPUXToLLVMPass::runOnOperation() {
mlir::RewritePatternSet patterns(&context);
mlir::LLVMConversionTarget target(context);

mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns);
mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns);
mlir::populateVectorToLLVMConversionPatterns(converter, patterns);
mlir::populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns);
mlir::populateFuncToLLVMConversionPatterns(converter, patterns);
mlir::populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
target);
// mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns);
// mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns);
// mlir::populateVectorToLLVMConversionPatterns(converter, patterns);
// mlir::populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns);
// mlir::populateFuncToLLVMConversionPatterns(converter, patterns);
// mlir::populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
// target);

mlir::populateGpuToLLVMConversionPatterns(
converter, patterns, mlir::gpu::getDefaultGpuBinaryAnnotation());
mlir::populateGpuToLLVMConversionPatterns(converter, patterns);

imex::populateControlFlowTypeConversionRewritesAndTarget(converter, patterns,
target);
// imex::populateControlFlowTypeConversionRewritesAndTarget(converter, patterns,
// target);

imex::populateGpuxToLLVMPatternsAndLegality(converter, patterns, target);

Expand Down Expand Up @@ -631,12 +659,13 @@ void imex::populateGpuxToLLVMPatternsAndLegality(
ConvertGpuStreamCreatePattern,
ConvertGpuStreamDestroyPattern,
ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern
ConvertDeallocOpToGpuRuntimeCallPattern,
RemoveGPUModulePattern
// clang-format on
>(converter);

patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, mlir::gpu::getDefaultGpuBinaryAnnotation());
converter, imex::gpuBinaryAttrName);

target.addIllegalDialect<mlir::gpu::GPUDialect>();
target.addIllegalDialect<imex::gpux::GPUXDialect>();
Expand Down
17 changes: 9 additions & 8 deletions lib/Conversion/XeGPUToVC/XeGPUToVC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ class GatherScatterToRawSend : public OpConversionPattern<OpType> {
payLoad = rewriter.create<vector::InsertOp>(loc, base, payLoad, 0);
SmallVector<int64_t, 16> indices(16, 0);
payLoad = rewriter.create<mlir::vector::ShuffleOp>(
loc, payLoad, payLoad, rewriter.getI64ArrayAttr(indices));
loc, payLoad, payLoad, indices);
auto createDescOp =
op.getTensorDesc().template getDefiningOp<xegpu::CreateDescOp>();
auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets());
Expand Down Expand Up @@ -886,7 +886,7 @@ class AtomicToLsc : public OpConversionPattern<::mlir::xegpu::AtomicRMWOp> {

SmallVector<int64_t, 16> indices(16, 0);
payLoad = rewriter.create<mlir::vector::ShuffleOp>(
loc, payLoad, payLoad, rewriter.getI64ArrayAttr(indices));
loc, payLoad, payLoad, indices);
auto createDescOp = op.getTensorDesc().getDefiningOp<xegpu::CreateDescOp>();
auto offsets = rewriter.getRemappedValue(createDescOp.getOffsets());
payLoad = rewriter.create<arith::AddIOp>(loc, payLoad, offsets);
Expand Down Expand Up @@ -1232,7 +1232,7 @@ struct VectorExtractVC final
llvm::SmallVector<int64_t, 2> indices(size);
std::iota(indices.begin(), indices.end(), linearizedOffset);
rewriter.replaceOpWithNewOp<mlir::vector::ShuffleOp>(
extractOp, vec, vec, rewriter.getI64ArrayAttr(indices));
extractOp, vec, vec, indices);
} else { // use CompositExtract for scalar result
rewriter.replaceOpWithNewOp<mlir::vector::ExtractOp>(extractOp, vec,
linearizedOffset);
Expand Down Expand Up @@ -1338,7 +1338,7 @@ struct VectorExtractStridedSliceVC final
}
// perform a shuffle to extract the kD vector
rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
extractOp, srcVector, srcVector, rewriter.getI64ArrayAttr(indices));
extractOp, srcVector, srcVector, indices);

return success();
}
Expand Down Expand Up @@ -1392,16 +1392,16 @@ struct VectorShuffleVC final

SmallVector<int64_t, 2> indices(totalSize);
for (auto [i, value] :
llvm::enumerate(mask.getAsValueRange<IntegerAttr>())) {
llvm::enumerate(mask)) {

int32_t v = value.getZExtValue();
int32_t v = value;
std::iota(indices.begin() + shuffleSliceLen * i,
indices.begin() + shuffleSliceLen * (i + 1),
shuffleSliceLen * v);
}

rewriter.replaceOpWithNewOp<vector::ShuffleOp>(
shuffleOp, vec1, vec2, rewriter.getI64ArrayAttr(indices));
shuffleOp, vec1, vec2, indices);

return success();
}
Expand Down Expand Up @@ -1431,7 +1431,8 @@ struct SCFForOpBlockVCPattern final
newOp.getRegion().getArgument(i).getType());
}

rewriter.applySignatureConversion(&op.getRegion(), signatureConverter);
rewriter.applySignatureConversion(&op.getRegion().getBlocks().front(),
signatureConverter);

rewriter.eraseBlock(newOp.getBody());
rewriter.inlineRegionBefore(op.getRegion(), newOp.getRegion(),
Expand Down
3 changes: 2 additions & 1 deletion lib/Conversion/XeTileToXeGPU/SCFOpConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ struct SgSCFForOpBlockPattern

// Apply the signature conversion to the SCFFor body arguments; an
// UnrealizedConversionCastOp will be inserted by the typeConverter.
rewriter.applySignatureConversion(&op.getRegion(), argumentMapping);
rewriter.applySignatureConversion(&op.getRegion().getBlocks().front(),
argumentMapping);

if (newOp.getBody())
rewriter.eraseBlock(newOp.getBody());
Expand Down
4 changes: 2 additions & 2 deletions lib/Conversion/XeTileToXeGPU/XeTileToXeGPUConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,9 @@ XeGPUTypeConverter::computeTypeMapping(mlir::ValueRange original,
}

mlir::Block *XeGPUOneToNPatterRewriter::applySignatureConversion(
mlir::Region *region, mlir::TypeConverter::SignatureConversion &conversion,
mlir::Block *block, mlir::TypeConverter::SignatureConversion &conversion,
const mlir::TypeConverter *converter) {
return rewriter.applySignatureConversion(region, conversion, converter);
return rewriter.applySignatureConversion(block, conversion, converter);
}

void XeGPUOneToNPatterRewriter::replaceOp(mlir::Operation *op,
Expand Down
2 changes: 1 addition & 1 deletion lib/Dialect/XeTile/Transforms/BlockAligning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ class XeTileBlockAligningPass : public imex::impl::XeTileBlockAligningBase<
// Use TopDown traversal order, and only look at existing ops
// to simplify the code logic and speed up the pass
mlir::GreedyRewriteConfig config;
config.enableRegionSimplification = false;
config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
config.useTopDownTraversal = true;
config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
if (failed(
Expand Down
2 changes: 1 addition & 1 deletion lib/Dialect/XeTile/Transforms/Blocking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1124,7 +1124,7 @@ class XeTileBlockingPass
// Use TopDown traversal order, and only look at existing ops
// to simplify the code logic and speed up the pass
mlir::GreedyRewriteConfig config;
config.enableRegionSimplification = false;
config.enableRegionSimplification = GreedySimplifyRegionLevel::Disabled;
config.useTopDownTraversal = true;
config.strictMode = GreedyRewriteStrictness::ExistingAndNewOps;
{ // initialize the inner block size per op.
Expand Down
55 changes: 55 additions & 0 deletions lib/Transforms/InsertGPUAllocs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Dialect/GPU/Transforms/Passes.h>
#include <mlir/Dialect/MemRef/IR/MemRef.h>
#include <mlir/Dialect/SCF/IR/SCF.h>
#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
#include <mlir/Pass/Pass.h>

#include <llvm/ADT/STLExtras.h>

Expand Down Expand Up @@ -158,6 +159,9 @@ class InsertGPUAllocsPass final
} else if (auto init_tile =
mlir::dyn_cast<imex::xetile::InitTileOp>(op)) {
return {{init_tile.getSource()}};
} else if (auto init_xedesc =
mlir::dyn_cast<mlir::xegpu::CreateNdDescOp>(op)) {
return {{init_xedesc.getSource()}};
} else {
op->emitError("Uhhandled mem op in gpu region");
return std::nullopt;
Expand Down Expand Up @@ -187,6 +191,9 @@ class InsertGPUAllocsPass final
// Only handle the case where the tile source is a memref
return init_tile.isSourceMemRef();
}
if (auto init_xedesc = mlir::dyn_cast<mlir::xegpu::CreateNdDescOp>(op)) {
return true;
}
return false;
};

Expand Down Expand Up @@ -259,6 +266,36 @@ class InsertGPUAllocsPass final
return;
}

// Recursively follows the users of an xegpu tensor descriptor and records in
// `ret` which kinds of access are found:
//   * xegpu.load_nd / xegpu.prefetch_nd -> read
//   * xegpu.store_nd                    -> write
// `onDevice` selects whether the device or the host flags of `ret` are set.
// The walk continues through the result of xegpu.update_nd_offset and through
// the region iter_args of scf.for.
// NOTE(review): for scf.for, every region iter_arg is followed, not only the
// one initialised from the descriptor being traced -- confirm this
// over-approximation is intended.
std::function<void(mlir::Operation *, bool, AccessType &)> findXeGPULoadStore;
findXeGPULoadStore = [&](mlir::Operation *use, bool onDevice,
                         AccessType &ret) {
  // Continue the search through every user of `value`.
  const auto visitUsersOf = [&](mlir::Value value) {
    for (mlir::Operation *nextUse : value.getUsers())
      findXeGPULoadStore(nextUse, onDevice, ret);
  };

  if (auto offsetUpdate =
          mlir::dyn_cast<mlir::xegpu::UpdateNdOffsetOp>(use)) {
    visitUsersOf(offsetUpdate->getResult(0));
    return;
  }

  if (auto forOp = mlir::dyn_cast<::mlir::scf::ForOp>(use)) {
    for (mlir::Value iterArg : forOp.getRegionIterArgs())
      visitUsersOf(iterArg);
    return;
  }

  // Loads and prefetches both count as reads of the underlying memory.
  if (mlir::isa<mlir::xegpu::LoadNdOp, mlir::xegpu::PrefetchNdOp>(use)) {
    (onDevice ? ret.deviceRead : ret.hostRead) = true;
    return;
  }

  if (mlir::isa<mlir::xegpu::StoreNdOp>(use))
    (onDevice ? ret.deviceWrite : ret.hostWrite) = true;
};

// Checks the access type of the OP under consideration.
auto getAccessType = [&](mlir::Value memref) {
AccessType ret;
Expand Down Expand Up @@ -298,6 +335,15 @@ class InsertGPUAllocsPass final
continue;
}

if (auto init_xedesc = mlir::dyn_cast<mlir::xegpu::CreateNdDescOp>(user)) {
bool onDevice = user->getParentOfType<mlir::gpu::LaunchOp>();
auto res = init_xedesc->getResult(0);
for (auto use : res.getUsers()) {
findXeGPULoadStore(use, onDevice, ret);
}
continue;
}

if (mlir::isa<mlir::func::ReturnOp>(user)) {
ret.hostRead = true;
ret.hostWrite = true;
Expand Down Expand Up @@ -365,6 +411,15 @@ class InsertGPUAllocsPass final
use.set(newAlloc.getResult());
}
}

// Remove every 'memref.dealloc' of the original allocation (a gpu.dealloc is
// created for the replacement allocation further down).
// Erasing an op unlinks its operands from the value's use list, which would
// invalidate a plain iterator over getUsers(). The early-increment range
// advances the iterator before the current user is erased.
auto memory = alloc->getResult(0);
for (mlir::Operation *user : llvm::make_early_inc_range(memory.getUsers())) {
  if (auto dealloc = mlir::dyn_cast<mlir::memref::DeallocOp>(user))
    dealloc.erase();
}

alloc.replaceAllUsesWith(allocResult);
builder.create<mlir::gpu::DeallocOp>(loc, std::nullopt, allocResult);
alloc.erase();
Expand Down
2 changes: 1 addition & 1 deletion lib/Transforms/PropagatePackedLayout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ makeCast(mlir::OpBuilder &builder, mlir::Value src, mlir::Type srcType,

tmp = builder.create<mlir::vector::ShuffleOp>(
loc, tmp, tmp,
builder.getI64ArrayAttr(getVNNIShuffleIndices(srcVecType, dstVecType)));
getVNNIShuffleIndices(srcVecType, dstVecType));

return {builder.create<mlir::vector::ShapeCastOp>(loc, dstVecType, tmp),
root};
Expand Down
4 changes: 2 additions & 2 deletions lib/Transforms/SerializeSPIRV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//

#include "PassDetail.h"
#include "imex/Utils/GPUSerialize.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
Expand Down Expand Up @@ -54,13 +55,12 @@ struct SerializeSPIRVPass : public SerializeSPIRVPassBase<SerializeSPIRVPass> {
signalPassFailure();
return;
}

// attach the spv binary to the gpu module
auto spvData =
llvm::StringRef(reinterpret_cast<const char *>(spvBinary.data()),
spvBinary.size() * sizeof(uint32_t));
auto spvAttr = mlir::StringAttr::get(&getContext(), spvData);
gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr);
gpuMod->setAttr(imex::gpuBinaryAttrName, spvAttr);
spvMod->erase();
}
}
Expand Down
Loading