[AMDGPU] Add flag to enable expensive trip counts for runtime unroll. #171735
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: None (carlobertolli)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/171735.diff

2 Files Affected:

```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 35406a387cf0f..3ed918fb570e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -83,6 +83,11 @@ static cl::opt<unsigned> MemcpyLoopUnroll(
                  "operations when lowering memcpy as a loop"),
     cl::init(16), cl::Hidden);
 
+static cl::opt<bool> UnrollRuntimeExpensiveTripCount(
+    "amdgpu-unroll-runtime-expensive-trip-count",
+    cl::desc("Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for runtime unrolling"),
+    cl::init(false), cl::Hidden);
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -124,6 +129,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(
   // We want to run unroll even for the loops which have been vectorized.
   UP.UnrollVectorizedLoop = true;
 
+  UP.AllowExpensiveTripCount = UnrollRuntimeExpensiveTripCount;
+
   // TODO: Do we want runtime unrolling?
 
   // Maximum alloca size than can fit registers. Reserve 16 registers.
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
new file mode 100644
index 0000000000000..29f1a2a1bc22f
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
@@ -0,0 +1,77 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-NOUNROLL %s
+; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-UNROLL %s
+
+; CHECK-LABEL: @_Z6kernelPilll(
+; CHECK: for.body:
+
+; CHECK-NOUNROLL: store
+; CHECK-NOUNROLL-NOT: store
+; CHECK-NOUNROLL: br
+
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: br
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
+define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
+entry:
+  %0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %1 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %2 = load i32, ptr addrspace(4) %0, align 4
+  %3 = icmp ult i32 %1, %2
+  %4 = select i1 %3, i64 12, i64 18
+  %5 = getelementptr inbounds nuw i8, ptr addrspace(4) %0, i64 %4
+  %6 = load i16, ptr addrspace(4) %5, align 2
+  %7 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+  %mul = mul nsw i64 %k, %n
+  %mul.i.i = sub nsw i64 0, %s
+  %cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
+  %cmp1.i.i = icmp slt i64 %mul, %s
+  %or.cond.i.i = and i1 %cmp.not.i.i, %cmp1.i.i
+  %cmp.i.i = icmp eq i64 %s, 0
+  %8 = add i64 %mul, 1
+  %or.cond.i.i.i.i = icmp ult i64 %8, 2
+  %9 = lshr i64 %mul, 63
+  %spec.select.i.i.i.i = add nsw i64 %9, %mul
+  %cmp2.i.i = icmp slt i64 %mul, 0
+  %add.i.i = select i1 %cmp2.i.i, i64 %s, i64 0
+  %spec.select.i.i = add nsw i64 %add.i.i, %mul
+  %conv.i.i4 = zext i16 %6 to i32
+  %10 = and i1 %cmp.i.i, %or.cond.i.i.i.i
+  %spec.select = select i1 %10, i64 %spec.select.i.i.i.i, i64 -1
+  %retval.0.i.i = select i1 %or.cond.i.i, i64 %spec.select.i.i, i64 %spec.select
+  %conv5 = zext nneg i32 %7 to i64
+  %cmp6 = icmp sgt i64 %retval.0.i.i, %conv5
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %conv8 = phi i64 [ %conv, %for.body ], [ %conv5, %for.body.preheader ]
+  %i3.07 = phi i32 [ %add, %for.body ], [ %7, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds nuw i32, ptr addrspace(1) %a.coerce, i64 %conv8
+  store i32 %i3.07, ptr addrspace(1) %arrayidx, align 4
+  %add = add nuw nsw i32 %i3.07, %conv.i.i4
+  %conv = zext nneg i32 %add to i64
+  %cmp = icmp sgt i64 %retval.0.i.i, %conv
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !152
+}
+
+!47 = !{!"llvm.loop.mustprogress"}
+!152 = distinct !{!152, !47, !153}
+!153 = !{!"llvm.loop.unroll.enable"}
```
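For context on what the option gates: when the trip count is not a compile-time constant, runtime unrolling must materialize it (and the number of leftover iterations) in the preheader, and the flag's own description says that can require expensive instructions such as divisions. Below is a rough, hypothetical sketch of that preheader shape; the value names and the unroll factor of 8 are invented for illustration and are not output from this patch.

```llvm
; Hypothetical illustration only -- not produced by this patch.
; With an unroll factor of 8, the runtime unroller needs the trip count and
; the leftover-iteration count up front. If the trip count is only expressible
; with a division, that division is the "expensive" instruction the flag allows.
preheader:
  %trip.count = udiv i64 %bound, %step   ; expensive trip-count expansion
  %xtraiter = and i64 %trip.count, 7     ; iterations left for the remainder loop
  %lcmp = icmp ult i64 %trip.count, 8    ; too few iterations for the unrolled body?
  br i1 %lcmp, label %remainder.loop, label %unrolled.body
```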
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed: dc80e5f → 7f7e824

Review thread on llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll:

```llvm
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
entry:
  %0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
```
Reviewer: use named variables

carlobertolli (author): done
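For illustration, this is roughly what the reviewer is asking for: the same opening instructions with descriptive value names instead of numbered temporaries. The names below are made up and are not the ones used in the updated patch.

```llvm
entry:
  ; Hypothetical renaming of %0..%3 above for readability.
  %implicitarg = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %workgroup.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %first.word = load i32, ptr addrspace(4) %implicitarg, align 4
  %in.range = icmp ult i32 %workgroup.id, %first.word
```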
Review thread on the RUN lines of the new test:

```llvm
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-NOUNROLL %s
; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-UNROLL %s
```
Reviewer: suggest auto-generated checklines

carlobertolli (author): I usually prefer minimal checks if the output is simple enough. There's no chance we are going to get more store instructions in the output, given there is a single one in the input, unless some form of unrolling happens.

Reviewer: I'll not insist, but at least this can be improved by https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-count-directive.

carlobertolli (author): Done, thanks for the suggestion.
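For reference, the directive linked above can collapse the repeated checks. A minimal sketch, assuming the unrolled body contains exactly eight stores as the explicit CHECK-UNROLL lines imply:

```llvm
; Same intent as the eight repeated "CHECK-UNROLL: store" lines, expressed
; with FileCheck's COUNT form for the CHECK-UNROLL prefix.
; CHECK-UNROLL-COUNT-8: store
; CHECK-UNROLL: br
```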
Review thread on the test IR:

```llvm
  %mul = mul nsw i64 %k, %n
  %mul.i.i = sub nsw i64 0, %s
  %cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
  %cmp1.i.i = icmp slt i64 %mul, %s
```
Reviewer: can you trim the IR further?

carlobertolli (author): I'd rather not: it needs to be like this to reflect the real test case in the normalization kernel of PyTorch. This computation is part of the tensor-size call, and we need to make sure we can unroll the related loop. The related call is
https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/Normalization.cuh#L131
This flag is one of the two steps needed to unroll the loop in that function so we can obtain software pipelining.

Reviewer: I don't think it has to be exactly the same as where you originally got it from. A small reproducer is enough as long as it reflects the intent of the change. There are a lot of unnecessary attributes here as well.

carlobertolli (author): Done, I hope this is small enough now.
Force-pushed: 7f7e824 → b49ad87

Review thread on AMDGPUTargetTransformInfo.cpp:

```cpp
  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  UP.AllowExpensiveTripCount = UnrollRuntimeExpensiveTripCount;
```
Reviewer: Isn't there a generic flag that sets this in the unroll pass? The proliferation of target-specific flags just to set an option in the backend implementation of some generic thing is not good.

carlobertolli (author): I see two places that set the related boolean to true in LoopUnrollPass.cpp:

```cpp
if (UserUnrollCount || (PragmaCount > 0)) {
```

and

```cpp
if (L->getHeader()->getParent()->hasProfileData()) {
```

I see two other targets that set it, IBM's P and Z systems. I can't see a top-level flag for this. Agreed that proliferation of this at the target level is undesirable, but there isn't an alternative right now.

Reviewer: Right, you can move the flag you are adding here to there instead.

carlobertolli (author): I assume just setting it in gatherUnrollingPreferences will work?

carlobertolli (author): done
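A minimal sketch of what the reviewer's suggestion could look like in LoopUnrollPass.cpp, assuming a new generic option: the flag name and its exact placement inside gatherUnrollingPreferences are hypothetical and not taken from this patch.

```cpp
// Hypothetical generic option in llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
// (the flag name is invented for illustration, not an existing LLVM option).
static cl::opt<bool> UnrollRuntimeAllowExpensiveTripCount(
    "unroll-runtime-allow-expensive-trip-count", cl::init(false), cl::Hidden,
    cl::desc("Allow expensive instructions (such as divisions) when computing "
             "the trip count of a loop for runtime unrolling"));

// Sketch of the wiring inside gatherUnrollingPreferences(), after the
// defaults are set, so the command line can override the target default:
//   if (UnrollRuntimeAllowExpensiveTripCount.getNumOccurrences() > 0)
//     UP.AllowExpensiveTripCount = UnrollRuntimeAllowExpensiveTripCount;
```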
Force-pushed: b49ad87 → 317eda7
No description provided.