
Conversation

@vitalybuka
Collaborator

Reverts #162780

Breaks buildbots, see #162780.

@llvmbot
Member

llvmbot commented Oct 16, 2025

@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-gpu

Author: Vitaly Buka (vitalybuka)

Changes

Reverts llvm/llvm-project#162780

Breaks buildbots, see #162780.


Patch is 57.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163684.diff

13 Files Affected:

  • (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td (+1-5)
  • (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td (+45-20)
  • (modified) mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td (+3-59)
  • (modified) mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt (-1)
  • (modified) mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp (+6-162)
  • (modified) mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp (-146)
  • (modified) mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp (+58-57)
  • (modified) mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp (+2-5)
  • (modified) mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp (+1-2)
  • (modified) mlir/test/Conversion/XeGPUToXeVM/dpas.mlir (+6-2)
  • (removed) mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir (-201)
  • (modified) mlir/test/Dialect/XeGPU/invalid.mlir (+21-15)
  • (modified) mlir/test/Dialect/XeGPU/ops.mlir (+20-40)
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 19a52317956d2..5695d5d515d7f 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -712,14 +712,10 @@ def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> {
       return getAttrs().contains(name);
     }
 
-    ArrayAttr getStrideAttr() {
+    ArrayAttr getStrides() {
       return getAttrs().getAs<ArrayAttr>("stride");
     }
 
-    ArrayAttr getBlockAttr() {
-      return getAttrs().getAs<ArrayAttr>("block");
-    }
-
   }];
 }
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 426377fcf598f..73f9061f5debe 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -1298,14 +1298,14 @@ def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure,
 }
 
 def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
-                        AllElementTypesMatch<["mem_desc", "res"]>]> {
+                        AllElementTypesMatch<["mem_desc", "res"]>,
+                        AllRanksMatch<["mem_desc", "res"]>]> {
   let arguments = (ins XeGPU_MemDesc:$mem_desc,
                        Variadic<Index>: $offsets,
                        DenseI64ArrayAttr: $const_offsets,
-                       OptionalAttr<UnitAttr>:$subgroup_block_io,
                        OptionalAttr<DistributeLayoutAttr>:$layout
   );
-  let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
+  let results = (outs XeGPU_ValueType:$res);
   let assemblyFormat = [{
     $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
     prop-dict attr-dict `` `:` type(operands) `->` type(results)
@@ -1319,9 +1319,6 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     Arguments:
     - `mem_desc`: the memory descriptor identifying the SLM region.
     - `offsets`: the coordinates within the matrix to read from.
-    - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
-      lowered to a subgroup block load. When this attribute is present,
-      the offsets are subgroup-uniform across all lanes.
     - `layout`: [optional] An attribute for guiding distributions among
       subgroups and/or work-items. It currently can accept either LayoutAttr or SliceAttr.
 
@@ -1339,10 +1336,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
     }
 
     ArrayRef<int64_t> getDataShape() {
-      auto resTy = getRes().getType();
-      if (auto vecTy = llvm::dyn_cast<VectorType>(resTy))
-        return vecTy.getShape();
-      return {};
+      return getRes().getType().getShape();
     }
   }];
 
@@ -1350,13 +1344,13 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
 }
 
 def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
-                         AllElementTypesMatch<["mem_desc", "data"]>]> {
+                         AllElementTypesMatch<["mem_desc", "data"]>,
+                         AllRanksMatch<["mem_desc", "data"]>]> {
   let arguments = (ins
-    AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
+    XeGPU_ValueType:$data,
     XeGPU_MemDesc:$mem_desc,
     Variadic<Index>: $offsets,
     DenseI64ArrayAttr: $const_offsets,
-    OptionalAttr<UnitAttr>:$subgroup_block_io,
     OptionalAttr<DistributeLayoutAttr>:$layout
   );
   let assemblyFormat = [{ $data `,` $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
@@ -1370,9 +1364,6 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
     - `mem_desc`: the memory descriptor specifying the SLM region.
     - `offsets`: the coordinates within the matrix where the data will be written.
     - `data`: the values to be stored in the matrix.
-    - `subgroup_block_io`: [optional] An attribute indicating that the operation can be
-      lowered to a subgroup block store. When this attribute is present,
-      the offsets are subgroup-uniform across all lanes.
     - `layout`: [optional] An attribute for guiding distributions among
       subgroups and/or work-items. It currently can accept either LayoutAttr or SliceAttr.
 
@@ -1387,10 +1378,7 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
     }
 
     ArrayRef<int64_t> getDataShape() {
-      auto DataTy = getData().getType();
-      if (auto vecTy = llvm::dyn_cast<VectorType>(DataTy))
-        return vecTy.getShape();
-      return {};
+      return getData().getType().getShape();
     }
   }];
 
@@ -1398,4 +1386,41 @@ def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
   let hasVerifier = 1;
 }
 
+def XeGPU_MemDescSubviewOp: XeGPU_Op<"mem_desc_subview",
+      [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> {
+  let description = [{
+    Creates a subview of a memory descriptor. The resulting memory descriptor can have
+    a lower rank than the source; in this case, the result dimensions correspond to the
+    higher-order dimensions of the source memory descriptor.
+
+    Arguments:
+    - `src` : a memory descriptor.
+    - `offsets` : the coordinates within the matrix the subview will be created from.
+
+    Results:
+    - `res` : a memory descriptor with smaller size.
+
+  }];
+  let arguments = (ins XeGPU_MemDesc:$src,
+                       Variadic<Index>:$offsets,
+                       DenseI64ArrayAttr:$const_offsets);
+  let results = (outs XeGPU_MemDesc:$res);
+  let assemblyFormat = [{$src `` custom<DynamicIndexList>($offsets, $const_offsets) prop-dict
+                         attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}];
+  let builders = [
+    OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef<OpFoldResult>": $offsets)>
+  ];
+
+  let extraClassDeclaration = [{
+    mlir::Value getViewSource() { return getSrc(); }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      return getMixedValues(getConstOffsets(), getOffsets(), getContext());
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index b1196fbe9c66a..84902b2039643 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -237,11 +237,12 @@ def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "m
       return MemDescType::get(getContext(), shape.value_or(getShape()), elementType,
                               getMemLayout());
     }
 
-    ArrayAttr getStrideAttr() {
+    ArrayAttr getStrides() {
      auto layout = getMemLayout();
       if (layout && layout.hasAttr("stride")) {
-        return layout.getStrideAttr();
+        return layout.getStrides();
       }
+      // derive and return default strides
       SmallVector<int64_t> defaultStrides;
       llvm::append_range(defaultStrides, getShape().drop_front());
@@ -249,63 +250,6 @@ def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "m
       Builder builder(getContext());
       return builder.getI64ArrayAttr(defaultStrides);
     }
-
-    ArrayAttr getBlockAttr() {
-      auto layout = getMemLayout();
-      if (layout && layout.hasAttr("block")) {
-        return layout.getBlockAttr();
-      }
-      Builder builder(getContext());
-      return builder.getI64ArrayAttr({});
-    }
-
-    /// Heuristic to determine if the MemDesc uses column-major layout,
-    /// based on the rank and the value of the first stride dimension.
-    bool isColMajor() {
-      auto dim0 = dyn_cast<IntegerAttr>(getStrideAttr()[0]);
-      return getRank() == 2 && dim0.getInt() == 1;
-    }
-
-    // Get the Blocking shape for a MemDescType, Which is represented
-    // as an attribute in MemDescType. By default it is the shape
-    // of the mdescTy
-    SmallVector<int64_t> getBlockShape() {
-      SmallVector<int64_t> size(getShape());
-      ArrayAttr blockAttr = getBlockAttr();
-      if (!blockAttr.empty()) {
-        size.clear();
-        for (auto attr : blockAttr.getValue()) {
-          size.push_back(cast<IntegerAttr>(attr).getInt());
-        }
-      }
-      return size;
-    }
-
-    // Get strides as vector of integer.
-    // If it contains block attribute, the strides are blocked strides.
-    //
-    // The blocking is applied to the base matrix shape derived from the
-    // memory descriptor's stride information. If the matrix described by
-    // the memory descriptor is not contiguous, it is assumed that the base
-    // matrix is contiguous and follows the same memory layout.
-    //
-    // It first computes the original matrix shape using the stride info,
-    // then computes the number of blocks in each dimension of original shape,
-    // then compute the outer block shape and stride,
-    // then combines the inner and outer block shape and stride
-    // e.g. for `mem_desc<32x256xf16, @block=[16, 8], @strides=[1, 32]>`
-    // its memory layout tuple is ([2,32,16,8],[128,256,1,16])
-    // for `mem_desc<256x32xf16, @block=[8, 16]>` with default @stride[32, 1]
-    // its memory layout tuple is ([32,2,8,16],[256,128,16,1])
-    SmallVector<int64_t> getStrideShape();
-
-    /// Generates instructions to compute the linearize offset
-    // if the memory descriptor is blocked, it returns linearize offset based on the blocked layout
-    // the strides of memory descriptor is always considered regardless of blocked or not
-    Value getLinearOffsets(OpBuilder &builder,
-                           Location loc, ArrayRef<OpFoldResult> offsets);
-
   }];
 
   let hasCustomAssemblyFormat = true;
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
index dd9edc43a1657..84b25809f1ed0 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/XeGPUToXeVM/CMakeLists.txt
@@ -21,7 +21,6 @@ add_mlir_conversion_library(MLIRXeGPUToXeVM
   MLIRIndexDialect
   MLIRSCFDialect
   MLIRXeGPUDialect
-  MLIRXeGPUUtils
   MLIRPass
   MLIRTransforms
   MLIRSCFTransforms
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index fcbf66dbe9e45..ddcbc44f2652a 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -22,7 +22,6 @@
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
@@ -64,7 +63,6 @@ static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) {
   case xegpu::MemorySpace::SLM:
     return static_cast<int>(xevm::AddrSpace::SHARED);
   }
-  llvm_unreachable("Unknown XeGPU memory space");
 }
 
 // Get same bitwidth flat vector type of new element type.
@@ -188,7 +186,6 @@ class CreateNdDescToXeVMPattern
     int64_t rank = mixedSizes.size();
     if (rank != 2)
       return rewriter.notifyMatchFailure(op, "Expected 2D shape.");
-
     auto sourceTy = source.getType();
     auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy);
     // If source is a memref, we need to extract the aligned pointer as index.
@@ -367,11 +364,10 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
 
 // Add a builder that creates
 // offset * elemByteSize + baseAddr
-static Value addOffsetToBaseAddr(ConversionPatternRewriter &rewriter,
-                                 Location loc, Value baseAddr, Value offset,
-                                 int64_t elemByteSize) {
+static Value addOffset(ConversionPatternRewriter &rewriter, Location loc,
+                       Value baseAddr, Value offset, int64_t elemByteSize) {
   Value byteSize = arith::ConstantIntOp::create(
-      rewriter, loc, baseAddr.getType(), elemByteSize);
+      rewriter, loc, rewriter.getI64Type(), elemByteSize);
   Value byteOffset = arith::MulIOp::create(rewriter, loc, offset, byteSize);
   Value newAddr = arith::AddIOp::create(rewriter, loc, baseAddr, byteOffset);
   return newAddr;
@@ -447,8 +443,7 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
       // If offset is provided, we add them to the base pointer.
       // Offset is in number of elements, we need to multiply by
       // element byte size.
-      basePtrI64 =
-          addOffsetToBaseAddr(rewriter, loc, basePtrI64, offset, elemByteSize);
+      basePtrI64 = addOffset(rewriter, loc, basePtrI64, offset, elemByteSize);
     }
     // Convert base pointer (i64) to LLVM pointer type.
     Value basePtrLLVM =
@@ -511,147 +506,6 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
   }
 };
 
-// Lower xegpu::CreateMemDescOp to memref::ViewOp. Since SLM access instructions
-// on Xe2 and Xe3 operate on 32-bit or 64-bit units, all data types smaller than
-// 32 bits will be converted to 32 bits.
-class CreateMemDescOpPattern final
-    : public OpConversionPattern<xegpu::CreateMemDescOp> {
-public:
-  using OpConversionPattern<xegpu::CreateMemDescOp>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(xegpu::CreateMemDescOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-
-    auto resTy = op.getMemDesc();
-
-    // Create the result MemRefType with the same shape, element type, and
-    // memory space
-    auto newResTy = getTypeConverter()->convertType<MemRefType>(resTy);
-
-    Value zero = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 0);
-    auto viewOp = memref::ViewOp::create(rewriter, op.getLoc(), newResTy,
-                                         op.getSource(), zero, ValueRange());
-    rewriter.replaceOp(op, viewOp);
-    return success();
-  }
-};
-
-template <typename OpType,
-          typename = std::enable_if_t<llvm::is_one_of<
-              OpType, xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>::value>>
-class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> {
-  using OpConversionPattern<OpType>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-
-    SmallVector<OpFoldResult> offsets = op.getMixedOffsets();
-    if (offsets.empty())
-      return rewriter.notifyMatchFailure(op, "Expected offset to be provided.");
-
-    auto loc = op.getLoc();
-    auto ctxt = rewriter.getContext();
-    Value basePtrStruct = adaptor.getMemDesc();
-    Value mdescVal = op.getMemDesc();
-    // Load result or Store value Type can be vector or scalar.
-    Value data;
-    if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>)
-      data = op.getResult();
-    else
-      data = adaptor.getData();
-    VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType());
-    if (!valOrResVecTy)
-      valOrResVecTy = VectorType::get(1, data.getType());
-
-    int64_t elemBitWidth =
-        valOrResVecTy.getElementType().getIntOrFloatBitWidth();
-    // Element type must be multiple of 8 bits.
-    if (elemBitWidth % 8 != 0)
-      return rewriter.notifyMatchFailure(
-          op, "Expected element type bit width to be multiple of 8.");
-    int64_t elemByteSize = elemBitWidth / 8;
-
-    // Default memory space is SLM.
-    LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get(
-        ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::SLM));
-
-    auto mdescTy = cast<xegpu::MemDescType>(mdescVal.getType());
-
-    Value basePtrLLVM = memref::ExtractAlignedPointerAsIndexOp::create(
-        rewriter, loc, basePtrStruct);
-
-    // Convert base pointer (ptr) to i32
-    Value basePtrI32 = arith::IndexCastUIOp::create(
-        rewriter, loc, rewriter.getI32Type(), basePtrLLVM);
-
-    Value linearOffset = mdescTy.getLinearOffsets(rewriter, loc, offsets);
-    linearOffset = arith::IndexCastUIOp::create(
-        rewriter, loc, rewriter.getI32Type(), linearOffset);
-    basePtrI32 = addOffsetToBaseAddr(rewriter, loc, basePtrI32, linearOffset,
-                                     elemByteSize);
-
-    // convert base pointer (i32) to LLVM pointer type
-    basePtrLLVM =
-        LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI32);
-
-    if (op.getSubgroupBlockIoAttr()) {
-      // if the attribute 'subgroup_block_io' is set to true, it lowers to
-      // xevm.blockload
-
-      Type intElemTy = rewriter.getIntegerType(elemBitWidth);
-      VectorType intVecTy =
-          VectorType::get(valOrResVecTy.getShape(), intElemTy);
-
-      if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
-        Value loadOp =
-            xevm::BlockLoadOp::create(rewriter, loc, intVecTy, basePtrLLVM);
-        if (intVecTy != valOrResVecTy) {
-          loadOp =
-              vector::BitCastOp::create(rewriter, loc, valOrResVecTy, loadOp);
-        }
-        rewriter.replaceOp(op, loadOp);
-      } else {
-        Value dataToStore = adaptor.getData();
-        if (valOrResVecTy != intVecTy) {
-          dataToStore =
-              vector::BitCastOp::create(rewriter, loc, intVecTy, dataToStore);
-        }
-        xevm::BlockStoreOp::create(rewriter, loc, basePtrLLVM, dataToStore,
-                                   nullptr);
-        rewriter.eraseOp(op);
-      }
-      return success();
-    }
-
-    if (valOrResVecTy.getNumElements() >= 1) {
-      auto chipOpt = xegpu::getChipStr(op);
-      if (!chipOpt || (*chipOpt != "pvc" && *chipOpt != "bmg")) {
-        // the lowering for chunk load only works for pvc and bmg
-        return rewriter.notifyMatchFailure(
-            op, "The lowering is specific to pvc or bmg.");
-      }
-    }
-
-    if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
-      // if the size of valOrResVecTy is 1, it lowers to a scalar load/store
-      // operation. LLVM load/store does not support vector of size 1, so we
-      // need to handle this case separately.
-      auto scalarTy = valOrResVecTy.getElementType();
-      LLVM::LoadOp loadOp;
-      if (valOrResVecTy.getNumElements() == 1)
-        loadOp = LLVM::LoadOp::create(rewriter, loc, scalarTy, basePtrLLVM);
-      else
-        loadOp =
-            LLVM::LoadOp::create(rewriter, loc, valOrResVecTy, basePtrLLVM);
-      rewriter.replaceOp(op, loadOp);
-    } else {
-      LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM);
-      rewriter.eraseOp(op);
-    }
-    return success();
-  }
-};
-
 class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
@@ -694,8 +548,8 @@ class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
             op, "Expected element type bit width to be multiple of 8.");
         elemByteSize = elemBitWidth / 8;
       }
-      basePtrI64 = addOffsetToBaseAddr(rewriter, loc, basePtrI64, offsets,
-                                       elemByteSize);
+      basePtrI64 =
+          addOffset(rewriter, loc, basePtrI64, offsets, elemByteSize);
     }
   }
   // Default memory space is global.
@@ -932,13 +786,6 @@ struct ConvertXeGPUToXeVMPass
       auto i32Type = IntegerType::get(&getContext(), 32);
       return VectorType::get(8, i32Type);
     });
-    // Convert MemDescType into flattened MemRefType for SLM
-    typeConverter.addConversion([&](xegpu::MemDescType type) -> Type {
-      Type elemTy = type.getElementType();
-      int numElems = type.getNumElements();
-      return MemRefType::get(numElems, elemTy, AffineMap(), 3);
-    });
-
     typeConverter.addConversion([&](MemRefType type) -> Type {
      // Convert MemRefType to i64 type.
       return IntegerType::get(&getContext(), 64);
@@ -1093,9 +940,6 @@ void mlir::populateXeGPUToXeVMConversionPatterns(
       LoadStoreToXeVMPattern<xegpu::LoadGatherOp>,
       LoadStoreToXeVMPattern<xegpu::StoreScatterOp>>(
       typeConverter, patterns.getContext());
-  patterns.add<LoadStoreMatrixToXeVMPattern<xegpu::LoadMatrixOp>,
-               LoadStoreMat... [truncated]
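For context on what is being reverted: #162780 extended `xegpu.load_matrix` / `xegpu.store_matrix` to accept scalar data and an optional `subgroup_block_io` unit attribute that lowered to `xevm.blockload` / `xevm.blockstore`, and added the now-removed `loadstore_matrix.mlir` test. That test is cut off by the truncation above, so the following is only a sketch of the IR shape implied by the assembly formats in the tablegen diff; the function name, shapes, types, and offsets are illustrative assumptions, not taken from the patch:

```mlir
// Hypothetical IR accepted while #162780 was in tree; shapes/types assumed.
func.func @slm_block_io(%mdesc: !xegpu.mem_desc<32x32xf16>) -> vector<8xf16> {
  // `subgroup_block_io` asserts that the offsets are subgroup-uniform across
  // all lanes, allowing the op to lower to a single subgroup block load/store.
  %v = xegpu.load_matrix %mdesc[0, 0] <{subgroup_block_io}>
      : !xegpu.mem_desc<32x32xf16> -> vector<8xf16>
  xegpu.store_matrix %v, %mdesc[8, 0] <{subgroup_block_io}>
      : vector<8xf16>, !xegpu.mem_desc<32x32xf16>
  return %v : vector<8xf16>
}
```

The revert also restores the `xegpu.mem_desc_subview` op that #162780 had deleted, as shown in the `XeGPU_MemDescSubviewOp` hunk above.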
@vitalybuka added the skip-precommit-approval (PR for CI feedback, not intended for review) label Oct 16, 2025
@vitalybuka enabled auto-merge (squash) October 16, 2025 02:44
@vitalybuka merged commit d43581a into main Oct 16, 2025
14 checks passed
@vitalybuka deleted the revert-162780-users/Jianhui-Li/XeGPU/load-matrix-WI-attributes branch October 16, 2025 03:11