[AMDGPU] Add flag to enable expensive trip counts for runtime unroll. #171735
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: None (carlobertolli)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/171735.diff

2 Files Affected:

```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 35406a387cf0f..3ed918fb570e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -83,6 +83,11 @@ static cl::opt<unsigned> MemcpyLoopUnroll(
                  "operations when lowering memcpy as a loop"),
     cl::init(16), cl::Hidden);
 
+static cl::opt<bool> UnrollRuntimeExpensiveTripCount(
+    "amdgpu-unroll-runtime-expensive-trip-count",
+    cl::desc("Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for runtime unrolling"),
+    cl::init(false), cl::Hidden);
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -124,6 +129,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(
   // We want to run unroll even for the loops which have been vectorized.
   UP.UnrollVectorizedLoop = true;
 
+  UP.AllowExpensiveTripCount = UnrollRuntimeExpensiveTripCount;
+
   // TODO: Do we want runtime unrolling?
 
   // Maximum alloca size than can fit registers. Reserve 16 registers.
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
new file mode 100644
index 0000000000000..29f1a2a1bc22f
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
@@ -0,0 +1,77 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-NOUNROLL %s
+; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-UNROLL %s
+
+; CHECK-LABEL: @_Z6kernelPilll(
+; CHECK: for.body:
+
+; CHECK-NOUNROLL: store
+; CHECK-NOUNROLL-NOT: store
+; CHECK-NOUNROLL: br
+
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: br
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
+define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
+entry:
+  %0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %1 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %2 = load i32, ptr addrspace(4) %0, align 4
+  %3 = icmp ult i32 %1, %2
+  %4 = select i1 %3, i64 12, i64 18
+  %5 = getelementptr inbounds nuw i8, ptr addrspace(4) %0, i64 %4
+  %6 = load i16, ptr addrspace(4) %5, align 2
+  %7 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+  %mul = mul nsw i64 %k, %n
+  %mul.i.i = sub nsw i64 0, %s
+  %cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
+  %cmp1.i.i = icmp slt i64 %mul, %s
+  %or.cond.i.i = and i1 %cmp.not.i.i, %cmp1.i.i
+  %cmp.i.i = icmp eq i64 %s, 0
+  %8 = add i64 %mul, 1
+  %or.cond.i.i.i.i = icmp ult i64 %8, 2
+  %9 = lshr i64 %mul, 63
+  %spec.select.i.i.i.i = add nsw i64 %9, %mul
+  %cmp2.i.i = icmp slt i64 %mul, 0
+  %add.i.i = select i1 %cmp2.i.i, i64 %s, i64 0
+  %spec.select.i.i = add nsw i64 %add.i.i, %mul
+  %conv.i.i4 = zext i16 %6 to i32
+  %10 = and i1 %cmp.i.i, %or.cond.i.i.i.i
+  %spec.select = select i1 %10, i64 %spec.select.i.i.i.i, i64 -1
+  %retval.0.i.i = select i1 %or.cond.i.i, i64 %spec.select.i.i, i64 %spec.select
+  %conv5 = zext nneg i32 %7 to i64
+  %cmp6 = icmp sgt i64 %retval.0.i.i, %conv5
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %conv8 = phi i64 [ %conv, %for.body ], [ %conv5, %for.body.preheader ]
+  %i3.07 = phi i32 [ %add, %for.body ], [ %7, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds nuw i32, ptr addrspace(1) %a.coerce, i64 %conv8
+  store i32 %i3.07, ptr addrspace(1) %arrayidx, align 4
+  %add = add nuw nsw i32 %i3.07, %conv.i.i4
+  %conv = zext nneg i32 %add to i64
+  %cmp = icmp sgt i64 %retval.0.i.i, %conv
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !152
+}
+
+!47 = !{!"llvm.loop.mustprogress"}
+!152 = distinct !{!152, !47, !153}
+!153 = !{!"llvm.loop.unroll.enable"}
```
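For context on what the option gates: when the trip count is not a compile-time constant, runtime unrolling must materialize it (and the number of leftover iterations) in the preheader, and the flag's own description says that can require expensive instructions such as divisions. Below is a rough, hypothetical sketch of that preheader shape; the value names and the unroll factor of 8 are invented for illustration and are not output from this patch.

```llvm
; Hypothetical illustration only -- not produced by this patch.
; With an unroll factor of 8, the runtime unroller needs the trip count and
; the leftover-iteration count up front. If the trip count is only expressible
; with a division, that division is the "expensive" instruction the flag allows.
preheader:
  %trip.count = udiv i64 %bound, %step   ; expensive trip-count expansion
  %xtraiter = and i64 %trip.count, 7     ; iterations left for the remainder loop
  %lcmp = icmp ult i64 %trip.count, 8    ; too few iterations for the unrolled body?
  br i1 %lcmp, label %remainder.loop, label %unrolled.body
```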
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed: dc80e5f → 7f7e824

Review thread on llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll:

```llvm
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
entry:
  %0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
```
Reviewer: use named variables

carlobertolli (author): done
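For illustration, this is roughly what the reviewer is asking for: the same opening instructions with descriptive value names instead of numbered temporaries. The names below are made up and are not the ones used in the updated patch.

```llvm
entry:
  ; Hypothetical renaming of %0..%3 above for readability.
  %implicitarg = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %workgroup.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %first.word = load i32, ptr addrspace(4) %implicitarg, align 4
  %in.range = icmp ult i32 %workgroup.id, %first.word
```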
Review thread on the RUN lines of the new test:

```llvm
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-NOUNROLL %s
; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-UNROLL %s
```
Reviewer: suggest auto-generated checklines

carlobertolli (author): I usually prefer minimal checks if the output is simple enough. There's no chance we are going to get more store instructions in the output, given there is a single one in the input, unless some form of unrolling happens.

Reviewer: I'll not insist, but at least this can be improved by https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-count-directive.

carlobertolli (author): Done, thanks for the suggestion.
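For reference, the directive linked above can collapse the repeated checks. A minimal sketch, assuming the unrolled body contains exactly eight stores as the explicit CHECK-UNROLL lines imply:

```llvm
; Same intent as the eight repeated "CHECK-UNROLL: store" lines, expressed
; with FileCheck's COUNT form for the CHECK-UNROLL prefix.
; CHECK-UNROLL-COUNT-8: store
; CHECK-UNROLL: br
```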
Review thread on the test IR:

```llvm
  %mul = mul nsw i64 %k, %n
  %mul.i.i = sub nsw i64 0, %s
  %cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
  %cmp1.i.i = icmp slt i64 %mul, %s
```
Reviewer: can you trim the IR further?

carlobertolli (author): I'd rather not: it needs to be like this to reflect the real test case in the normalization kernel of PyTorch. This computation is part of the tensor-size call, and we need to make sure we can unroll the related loop. The related call is
https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/Normalization.cuh#L131
This flag is one of the two steps needed to unroll the loop in that function so we can obtain software pipelining.

Reviewer: I don't think it has to be exactly the same as where you originally got it from. A small reproducer is enough as long as it reflects the intent of the change. There are a lot of unnecessary attributes here as well.

carlobertolli (author): Done, I hope this is small enough now.
Force-pushed: 7f7e824 → b49ad87

Review thread on AMDGPUTargetTransformInfo.cpp:

```cpp
  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  UP.AllowExpensiveTripCount = UnrollRuntimeExpensiveTripCount;
```
Reviewer: Isn't there a generic flag that sets this in the unroll pass? The proliferation of target-specific flags just to set an option in the backend implementation of some generic thing is not good.

carlobertolli (author): I see two places that set the related boolean to true in LoopUnrollPass.cpp:

```cpp
if (UserUnrollCount || (PragmaCount > 0)) {
```

and

```cpp
if (L->getHeader()->getParent()->hasProfileData()) {
```

I see two other targets that set it, IBM's P and Z systems. I can't see a top-level flag for this. Agreed that proliferation of this at the target level is undesirable, but there isn't an alternative right now.

Reviewer: Right, you can move the flag you are adding here to there instead.

carlobertolli (author): I assume just setting it in gatherUnrollingPreferences will work?

carlobertolli (author): done
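A minimal sketch of what the reviewer's suggestion could look like in LoopUnrollPass.cpp, assuming a new generic option: the flag name and its exact placement inside gatherUnrollingPreferences are hypothetical and not taken from this patch.

```cpp
// Hypothetical generic option in llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
// (the flag name is invented for illustration, not an existing LLVM option).
static cl::opt<bool> UnrollRuntimeAllowExpensiveTripCount(
    "unroll-runtime-allow-expensive-trip-count", cl::init(false), cl::Hidden,
    cl::desc("Allow expensive instructions (such as divisions) when computing "
             "the trip count of a loop for runtime unrolling"));

// Sketch of the wiring inside gatherUnrollingPreferences(), after the
// defaults are set, so the command line can override the target default:
//   if (UnrollRuntimeAllowExpensiveTripCount.getNumOccurrences() > 0)
//     UP.AllowExpensiveTripCount = UnrollRuntimeAllowExpensiveTripCount;
```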
Force-pushed: b49ad87 → 317eda7
No description provided.