- Notifications
You must be signed in to change notification settings - Fork 15.2k
[mlir][gpu] Allow integer attribute in dynamic_shared_memory_size #71509
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…arameter of `gpu.launch` This PR allows integer attributes as the `dynamic_shared_memory_size` parameter of `gpu.launch`. See the example IR below; `200` doesn't have to be an SSA value anymore. ``` gpu.launch blocks(..) threads(...) dynamic_shared_memory_size 200 ```
dynamic_shared_memory_size p…dynamic_shared_memory_size | @llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Guray Ozen (grypp) Changes…arameter of This PR allows integer attributes as Full diff: https://github.com/llvm/llvm-project/pull/71509.diff 5 Files Affected:
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h index 14a1fac5fd255f3..06b1ea95d20339d 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h @@ -17,6 +17,7 @@ #include "mlir/Bytecode/BytecodeOpInterface.h" #include "mlir/Dialect/DLTI/Traits.h" #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Dialect.h" diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 6375d35f4311295..5bf5cbc5efe628f 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -587,7 +587,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [ Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies, Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, - Optional<I32>:$dynamicSharedMemorySize)>, + Optional<I32>:$dynamicSharedMemorySize, + OptionalAttr<SI32Attr>:$dynamicSharedMemorySizeConstant)>, Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> { let summary = "GPU kernel launch operation"; @@ -693,7 +694,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [ CArg<"Type", "nullptr">:$asyncTokenType, CArg<"ValueRange", "{}">:$asyncDependencies, CArg<"TypeRange", "{}">:$workgroupAttributions, - CArg<"TypeRange", "{}">:$privateAttributions)> + CArg<"TypeRange", "{}">:$privateAttributions, + CArg<"IntegerAttr", "IntegerAttr()">:$dynamicSharedMemorySizeConstant)> ]; let extraClassDeclaration = [{ @@ -728,6 +730,24 @@ def GPU_LaunchOp : GPU_Op<"launch", [ /// Returns the keywords used in the custom syntax for this Op. 
static StringRef getWorkgroupKeyword() { return "workgroup"; } static StringRef getPrivateKeyword() { return "private"; } + static StringRef getDynamicSharedMemorySizeConstantKeyword() { + return "dynamicSharedMemorySizeConstant"; + } + + static int getDynamicSharedMemorySizeDynamicValue() { + return std::numeric_limits<int32_t>::min(); + } + /// Returns a value of the dynamic shared memory size. + /// If it is a constant, it builds one + mlir::Value getDynamicSharedMemorySizeValue(OpBuilder &b) { + int32_t kDynamic = getDynamicSharedMemorySizeDynamicValue(); + if (getDynamicSharedMemorySizeConstant().value_or(kDynamic) == kDynamic) + return getDynamicSharedMemorySize(); + return b.create<mlir::arith::ConstantOp>( + getLoc(), b.getIntegerType(32), + b.getI32IntegerAttr( + getDynamicSharedMemorySizeConstant().value())); + } /// Returns the number of buffers located in the workgroup memory. unsigned getNumWorkgroupAttributions() { diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 5eb2cadc884e151..269ee7dcaec0e71 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -618,7 +618,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, Value getBlockSizeZ, Value dynamicSharedMemorySize, Type asyncTokenType, ValueRange asyncDependencies, TypeRange workgroupAttributions, - TypeRange privateAttributions) { + TypeRange privateAttributions, + IntegerAttr dynamicSharedMemorySizeAttr) { // Add a WorkGroup attribution attribute. This attribute is required to // identify private attributions in the list of block argguments. 
result.addAttribute(getNumWorkgroupAttributionsAttrName(), @@ -634,7 +635,9 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, getBlockSizeY, getBlockSizeZ}); if (dynamicSharedMemorySize) result.addOperands(dynamicSharedMemorySize); - + if (dynamicSharedMemorySizeAttr) + result.addAttribute(getDynamicSharedMemorySizeConstantKeyword(), + dynamicSharedMemorySizeAttr); // Create a kernel body region with kNumConfigRegionAttributes + N memory // attributions, where the first kNumConfigRegionAttributes arguments have // `index` type and the rest have the same types as the data operands. @@ -759,6 +762,10 @@ void LaunchOp::print(OpAsmPrinter &p) { if (getDynamicSharedMemorySize()) p << ' ' << getDynamicSharedMemorySizeKeyword() << ' ' << getDynamicSharedMemorySize(); + else if (getDynamicSharedMemorySizeConstantAttr()) { + p << ' ' << getDynamicSharedMemorySizeKeyword() << ' ' + << getDynamicSharedMemorySizeConstantAttr().getSInt(); + } printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions()); printAttributions(p, getPrivateKeyword(), getPrivateAttributions()); @@ -768,7 +775,8 @@ void LaunchOp::print(OpAsmPrinter &p) { p.printRegion(getBody(), /*printEntryBlockArgs=*/false); p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{ LaunchOp::getOperandSegmentSizeAttr(), - getNumWorkgroupAttributionsAttrName()}); + getNumWorkgroupAttributionsAttrName(), + getDynamicSharedMemorySizeConstantKeyword()}); } // Parse the size assignment blocks for blocks and threads. 
These have the form @@ -854,12 +862,20 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { bool hasDynamicSharedMemorySize = false; if (!parser.parseOptionalKeyword( LaunchOp::getDynamicSharedMemorySizeKeyword())) { - hasDynamicSharedMemorySize = true; - if (parser.parseOperand(dynamicSharedMemorySize) || - parser.resolveOperand(dynamicSharedMemorySize, - parser.getBuilder().getI32Type(), - result.operands)) - return failure(); + IntegerAttr shmemAttr; + OptionalParseResult shmemAttrResult = parser.parseOptionalAttribute( + shmemAttr, parser.getBuilder().getIntegerType(32, true)); + if (!shmemAttrResult.has_value()) { + hasDynamicSharedMemorySize = true; + shmemAttr = parser.getBuilder().getSI32IntegerAttr( + getDynamicSharedMemorySizeDynamicValue()); + if (parser.parseOperand(dynamicSharedMemorySize) || + parser.resolveOperand(dynamicSharedMemorySize, + parser.getBuilder().getI32Type(), + result.operands)) + return failure(); + } + result.addAttribute(getDynamicSharedMemorySizeConstantKeyword(), shmemAttr); } // Create the region arguments, it has kNumConfigRegionAttributes arguments diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index b1e2f914db4cb9b..3e29fbe8cdfbbc3 100644 --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -281,7 +281,7 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, auto launchFunc = builder.create<gpu::LaunchFuncOp>( launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(), - launchOp.getDynamicSharedMemorySize(), operands, + launchOp.getDynamicSharedMemorySizeValue(builder), operands, asyncToken ? 
asyncToken.getType() : nullptr, launchOp.getAsyncDependencies()); launchOp.replaceAllUsesWith(launchFunc); diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir index 28c121a550100c2..b032a4035230990 100644 --- a/mlir/test/Dialect/GPU/outlining.mlir +++ b/mlir/test/Dialect/GPU/outlining.mlir @@ -372,3 +372,36 @@ func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) { } // CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>} + + +// ----- + +// CHECK-LABEL: func.func @dynamic_shared_memory( +// CHECK-SAME: %[[arg0:.+]]: i32 +func.func @dynamic_shared_memory(%shmemSize : i32) { + %c1 = arith.constant 1 : index + gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1) + threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1) + dynamic_shared_memory_size %shmemSize + { + gpu.terminator + } + gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1) + threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1) + dynamic_shared_memory_size 200 + { + gpu.terminator + } + gpu.launch blocks(%bx, %by, %bz) in (%sbx = %c1, %sby = %c1, %sbz = %c1) + threads(%tx, %ty, %tz) in (%stx = %c1, %sty = %c1, %stz = %c1) + { + gpu.terminator + } + + +// CHECK: gpu.launch_func @dynamic_shared_memory_kernel::@dynamic_shared_memory_kernel blocks in (%{{.+}}, %{{.+}}, %{{.+}}) threads in (%{{.+}}, %{{.+}}, %{{.+}}) dynamic_shared_memory_size %[[arg0]] +// CHECK: %[[c200:.+]] = arith.constant 200 : i32 +// CHECK: gpu.launch_func @dynamic_shared_memory_kernel_0::@dynamic_shared_memory_kernel blocks in (%{{.+}}, %{{.+}}, %{{.+}}) threads in (%{{.+}}, %{{.+}}, %{{.+}}) dynamic_shared_memory_size %[[c200]] + return +} + |
| ✅ With the latest revision this PR passed the C/C++ code formatter. |
| Please fix the wrapping of the PR title in the description |
joker-eph left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Your PR description says "what" this is doing, but seems to miss the more important part: the "why". Can you elaborate on why this is desirable to have?
dynamic_shared_memory_sizedynamic_shared_memory_size
This PR allows integer attributes as the
`dynamic_shared_memory_size` parameter of `gpu.launch`. See the example IR below; `200` doesn't have to be an SSA value anymore. Motivation:
When the shared memory size is known, we can leverage it for IR verification.