
Conversation

@carlobertolli (Member)

No description provided.

@llvmbot (Member) commented Dec 10, 2025

@llvm/pr-subscribers-llvm-transforms

Author: None (carlobertolli)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/171735.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+7)
  • (added) llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll (+77)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 35406a387cf0f..3ed918fb570e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -83,6 +83,11 @@ static cl::opt<unsigned> MemcpyLoopUnroll(
              "operations when lowering memcpy as a loop"),
     cl::init(16), cl::Hidden);
 
+static cl::opt<bool> UnrollRuntimeExpensiveTripCount(
+    "amdgpu-unroll-runtime-expensive-trip-count",
+    cl::desc("Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for runtime unrolling"),
+    cl::init(false), cl::Hidden);
+
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
   const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -124,6 +129,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(
   // We want to run unroll even for the loops which have been vectorized.
   UP.UnrollVectorizedLoop = true;
 
+  UP.AllowExpensiveTripCount = UnrollRuntimeExpensiveTripCount;
+
   // TODO: Do we want runtime unrolling?
 
   // Maximum alloca size than can fit registers. Reserve 16 registers.
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
new file mode 100644
index 0000000000000..29f1a2a1bc22f
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/expensive-tripcount.ll
@@ -0,0 +1,77 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-NOUNROLL %s
+; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-UNROLL %s
+
+
+; CHECK-LABEL: @_Z6kernelPilll(
+; CHECK: for.body:
+
+; CHECK-NOUNROLL: store
+; CHECK-NOUNROLL-NOT: store
+; CHECK-NOUNROLL: br
+
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: store
+; CHECK-UNROLL: br
+
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
+define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
+entry:
+  %0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %1 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %2 = load i32, ptr addrspace(4) %0, align 4
+  %3 = icmp ult i32 %1, %2
+  %4 = select i1 %3, i64 12, i64 18
+  %5 = getelementptr inbounds nuw i8, ptr addrspace(4) %0, i64 %4
+  %6 = load i16, ptr addrspace(4) %5, align 2
+  %7 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+  %mul = mul nsw i64 %k, %n
+  %mul.i.i = sub nsw i64 0, %s
+  %cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
+  %cmp1.i.i = icmp slt i64 %mul, %s
+  %or.cond.i.i = and i1 %cmp.not.i.i, %cmp1.i.i
+  %cmp.i.i = icmp eq i64 %s, 0
+  %8 = add i64 %mul, 1
+  %or.cond.i.i.i.i = icmp ult i64 %8, 2
+  %9 = lshr i64 %mul, 63
+  %spec.select.i.i.i.i = add nsw i64 %9, %mul
+  %cmp2.i.i = icmp slt i64 %mul, 0
+  %add.i.i = select i1 %cmp2.i.i, i64 %s, i64 0
+  %spec.select.i.i = add nsw i64 %add.i.i, %mul
+  %conv.i.i4 = zext i16 %6 to i32
+  %10 = and i1 %cmp.i.i, %or.cond.i.i.i.i
+  %spec.select = select i1 %10, i64 %spec.select.i.i.i.i, i64 -1
+  %retval.0.i.i = select i1 %or.cond.i.i, i64 %spec.select.i.i, i64 %spec.select
+  %conv5 = zext nneg i32 %7 to i64
+  %cmp6 = icmp sgt i64 %retval.0.i.i, %conv5
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %conv8 = phi i64 [ %conv, %for.body ], [ %conv5, %for.body.preheader ]
+  %i3.07 = phi i32 [ %add, %for.body ], [ %7, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds nuw i32, ptr addrspace(1) %a.coerce, i64 %conv8
+  store i32 %i3.07, ptr addrspace(1) %arrayidx, align 4
+  %add = add nuw nsw i32 %i3.07, %conv.i.i4
+  %conv = zext nneg i32 %add to i64
+  %cmp = icmp sgt i64 %retval.0.i.i, %conv
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !152
+}
+
+!47 = !{!"llvm.loop.mustprogress"}
+!152 = distinct !{!152, !47, !153}
+!153 = !{!"llvm.loop.unroll.enable"}
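Why AllowExpensiveTripCount matters for this test: the loop's stride (%conv.i.i4) is only known at run time, so the runtime unroller has to emit a division to compute the trip count, and with the default setting it refuses to do so and skips unrolling. A rough, self-contained sketch of the kind of computation involved, with illustrative value names rather than the exact IR the unroller emits:

; Sketch only: the runtime trip count for a variable-stride loop is roughly
; (bound - start) / stride, so expanding it needs a division. That division
; is the "expensive" instruction gated by AllowExpensiveTripCount.
define i64 @trip_count_sketch(i64 %start, i64 %bound, i64 %stride) {
entry:
  %range = sub i64 %bound, %start
  %tc    = udiv i64 %range, %stride   ; expensive trip-count computation
  %rem   = urem i64 %tc, 8            ; iterations left for the remainder loop after unrolling by 8
  ret i64 %rem
}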
@llvmbot (Member) commented Dec 10, 2025

@llvm/pr-subscribers-backend-amdgpu

@github-actions (bot) commented Dec 10, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@carlobertolli force-pushed the expensive.tripcount.unrolling.flag.tr branch from dc80e5f to 7f7e824 on December 10, 2025 at 23:22
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: write)
define protected amdgpu_kernel void @_Z6kernelPilll(ptr addrspace(1) noundef writeonly captures(none) %a.coerce, i64 noundef %n, i64 noundef %k, i64 noundef %s) local_unnamed_addr #0 {
entry:
%0 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
Contributor:

use named variables

Member Author:

done

@@ -0,0 +1,77 @@
; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-NOUNROLL %s
; RUN: opt -amdgpu-unroll-runtime-expensive-trip-count -mtriple=amdgcn-amd-amdhsa -passes=loop-unroll -S %s -o - | FileCheck -check-prefixes=CHECK-UNROLL %s
Contributor:

suggest auto-generated check lines

Member Author:

I usually prefer minimal checks when the output is simple enough. There is no chance of getting more store instructions in the output than the single one in the input, unless some form of unrolling happens.

Contributor:

I won't insist, but this can at least be improved with the CHECK-COUNT directive: https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-count-directive.

Member Author:

Done, thanks for the suggestion.
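For reference, a minimal sketch of what the suggested CHECK-COUNT form could look like with the CHECK-UNROLL prefix used in this test (the committed check lines may differ):

; The COUNT form matches the "store" pattern exactly 8 times, replacing the
; eight repeated "CHECK-UNROLL: store" lines with a single directive.
; CHECK-UNROLL-COUNT-8: store
; CHECK-UNROLL: br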

%mul = mul nsw i64 %k, %n
%mul.i.i = sub nsw i64 0, %s
%cmp.not.i.i = icmp sge i64 %mul, %mul.i.i
%cmp1.i.i = icmp slt i64 %mul, %s
Contributor:

Can you trim the IR further?

Member Author:

I'd rather not: the IR needs to stay like this to reflect the real test case, the normalization kernel in PyTorch. This computation is part of the tensor-size call, and we need to make sure we can unroll the related loop. The call in question is
https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cuda/Normalization.cuh#L131
This flag is one of the two steps needed to unroll the loop in that function so we can obtain software pipelining.

Contributor:

I don't think it has to be exactly the same as the code you originally took it from. A small reproducer is enough as long as it reflects the intent of the change. There are also a lot of unnecessary attributes here.

Member Author:

Done, I hope this is small enough now.

@carlobertolli force-pushed the expensive.tripcount.unrolling.flag.tr branch from 7f7e824 to b49ad87 on December 11, 2025 at 15:55
// We want to run unroll even for the loops which have been vectorized.
UP.UnrollVectorizedLoop = true;

UP.AllowExpensiveTripCount = UnrollRuntimeExpensiveTripCount;
Contributor:

Isn't there a generic flag that sets this in the unroll pass? The proliferation of target-specific flags just to set an option in the backend implementation of some generic thing is not good.

Member Author:

I see two places in LoopUnrollPass.cpp that set the related boolean to true:

if (UserUnrollCount || (PragmaCount > 0)) {

and

if (L->getHeader()->getParent()->hasProfileData()) {

I also see two other targets that set it, IBM's P and Z systems, but I can't find a top-level flag for this. Agreed that proliferation of this at the target level is undesirable, but there isn't an alternative right now.

Contributor:

Right, you can move the flag you are adding here to there instead.

Contributor:

I assume just setting it in gatherUnrollingPreferences will work?

Member Author:

done
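A minimal sketch of how a generic option could be wired into gatherUnrollingPreferences in LoopUnrollPass.cpp, assuming a hypothetical -unroll-runtime-allow-expensive-trip-count flag name (the name and exact placement in the actual patch may differ):

// Hypothetical generic option; the name is illustrative only.
// (Uses llvm::cl from llvm/Support/CommandLine.h, already included by LoopUnrollPass.cpp.)
static cl::opt<bool> UnrollRuntimeAllowExpensiveTripCount(
    "unroll-runtime-allow-expensive-trip-count", cl::init(false), cl::Hidden,
    cl::desc("Allow emitting expensive instructions (such as divisions) when "
             "computing the trip count of a loop for runtime unrolling"));

// Inside gatherUnrollingPreferences(), after the target's
// getUnrollingPreferences() hook has run, alongside the other
// command-line overrides:
if (UnrollRuntimeAllowExpensiveTripCount.getNumOccurrences() > 0)
  UP.AllowExpensiveTripCount = UnrollRuntimeAllowExpensiveTripCount;

Placed there, any target (or any user via the flag) can opt in without adding a backend-specific option, which is what the reviewer is suggesting.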

@carlobertolli force-pushed the expensive.tripcount.unrolling.flag.tr branch from b49ad87 to 317eda7 on December 15, 2025 at 17:05