- Notifications
You must be signed in to change notification settings - Fork 15k
[SCEV] Rewrite A - B = UMin(1, A - B) lazily for A != B loop guards. #163787
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
| @llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-analysis Author: Florian Hahn (fhahn) ChangesFollow-up to 2d02726 (#160500) Creating the SCEV subtraction eagerly is very expensive. To soften the blow, just collect a map with inequalities and check if we can apply the subtract rewrite when rewriting SCEVAddExpr. Restores most of the regression: There is still some negative impact compared to before 2d02726, but there's probably not much we could do reduce this even more. Compile-time improvement with 2d02726 reverted on top of the current PR: http://llvm-compile-time-tracker.com/compare.php?from=7fca35db60fe6f423ea6051b45226046c067c252&to=98dd152bdfc76b30d00190d3850d89406ca3c21f&stat=instructions:u stage1-O3: 60628M (-0.03%) Full diff: https://github.com/llvm/llvm-project/pull/163787.diff 2 Files Affected:
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index e5a6c8cc0a6aa..96d3ef6706843 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1345,6 +1345,7 @@ class ScalarEvolution { class LoopGuards { DenseMap<const SCEV *, const SCEV *> RewriteMap; + DenseMap<const SCEV *, SmallPtrSet<const SCEV *, 2>> NotEqualMap; bool PreserveNUW = false; bool PreserveNSW = false; ScalarEvolution &SE; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index a64b93d541943..ce6f5fd17e294 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15772,19 +15772,25 @@ void ScalarEvolution::LoopGuards::collectFromBlock( GetNextSCEVDividesByDivisor(One, DividesBy); To = SE.getUMaxExpr(FromRewritten, OneAlignedUp); } else { + // LHS != RHS can be rewritten as (LHS - RHS) = UMax(1, LHS - RHS), + // but creating the subtraction eagerly is expensive. Track the + // inequalities in a separate map, and materialize the rewrite lazily + // when encountering a suitable subtraction while re-writing. if (LHS->getType()->isPointerTy()) { LHS = SE.getLosslessPtrToIntExpr(LHS); RHS = SE.getLosslessPtrToIntExpr(RHS); if (isa<SCEVCouldNotCompute>(LHS) || isa<SCEVCouldNotCompute>(RHS)) break; } - auto AddSubRewrite = [&](const SCEV *A, const SCEV *B) { - const SCEV *Sub = SE.getMinusSCEV(A, B); - AddRewrite(Sub, Sub, - SE.getUMaxExpr(Sub, SE.getOne(From->getType()))); - }; - AddSubRewrite(LHS, RHS); - AddSubRewrite(RHS, LHS); + const SCEVConstant *C; + const SCEV *A, *B; + if (match(RHS, m_scev_Add(m_SCEVConstant(C), m_SCEV(A))) && + match(LHS, m_scev_Add(m_scev_Specific(C), m_SCEV(B)))) { + RHS = A; + LHS = B; + } + Guards.NotEqualMap[LHS].insert(RHS); + Guards.NotEqualMap[RHS].insert(LHS); continue; } break; @@ -15918,13 +15924,15 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { class SCEVLoopGuardRewriter : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> { const DenseMap<const SCEV *, const SCEV *> ⤅ + const DenseMap<const SCEV *, SmallPtrSet<const SCEV *, 2>> &NotEqualMap; SCEV::NoWrapFlags FlagMask = SCEV::FlagAnyWrap; public: SCEVLoopGuardRewriter(ScalarEvolution &SE, const ScalarEvolution::LoopGuards &Guards) - : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap) { + : SCEVRewriteVisitor(SE), Map(Guards.RewriteMap), + NotEqualMap(Guards.NotEqualMap) { if (Guards.PreserveNUW) FlagMask = ScalarEvolution::setFlags(FlagMask, SCEV::FlagNUW); if (Guards.PreserveNSW) @@ -15979,14 +15987,35 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { } const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { + // Helper to check if S is a subtraction (A - B) where A != B, and if so, + // return UMax(S, 1). + auto RewriteSubtraction = [&](const SCEV *S) -> const SCEV * { + const SCEV *LHS, *RHS; + if (MatchBinarySub(S, LHS, RHS)) { + auto It = NotEqualMap.find(LHS); + if (It != NotEqualMap.end() && It->second.contains(RHS)) + return SE.getUMaxExpr(S, SE.getOne(S->getType())); + } + return nullptr; + }; + + // Check if Expr itself is a subtraction pattern with guard info. + if (const SCEV *Rewritten = RewriteSubtraction(Expr)) + return Rewritten; + // Trip count expressions sometimes consist of adding 3 operands, i.e. // (Const + A + B). There may be guard info for A + B, and if so, apply // it. // TODO: Could more generally apply guards to Add sub-expressions. if (isa<SCEVConstant>(Expr->getOperand(0)) && Expr->getNumOperands() == 3) { - if (const SCEV *S = Map.lookup( - SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2)))) + const SCEV *Add = + SE.getAddExpr(Expr->getOperand(1), Expr->getOperand(2)); + if (const SCEV *Rewritten = RewriteSubtraction(Add)) + return SE.getAddExpr( + Expr->getOperand(0), Rewritten, + ScalarEvolution::maskFlags(Expr->getNoWrapFlags(), FlagMask)); + if (const SCEV *S = Map.lookup(Add)) return SE.getAddExpr(Expr->getOperand(0), S); } SmallVector<const SCEV *, 2> Operands; @@ -16021,7 +16050,7 @@ const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { } }; - if (RewriteMap.empty()) + if (RewriteMap.empty() && NotEqualMap.empty()) return Expr; SCEVLoopGuardRewriter Rewriter(SE, *this); |
| | ||
| class LoopGuards { | ||
| DenseMap<const SCEV *, const SCEV *> RewriteMap; | ||
| DenseMap<const SCEV *, SmallPtrSet<const SCEV *, 2>> NotEqualMap; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| DenseMap<const SCEV *, SmallPtrSet<const SCEV *, 2>> NotEqualMap; | |
| DenseSet<std::pair<const SCEV *, const SCEV *>> NotEqualMap; |
I think something like this would be simpler than the nested set?
(Could also sort the pair to avoid inserting both orders, but that's probably more hassle than it's worth.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using SmallDenseMap here might make sense as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to use SmallDenseSet, thanks
| const SCEVConstant *C; | ||
| const SCEV *A, *B; | ||
| if (match(RHS, m_scev_Add(m_SCEVConstant(C), m_SCEV(A))) && | ||
| match(LHS, m_scev_Add(m_scev_Specific(C), m_SCEV(B)))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't really need the other operand to be SCEVConstant, though I guess the non-constant case may be less practically relevant...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I couldn't find a case where it would matter. I guess we could check both sides to find a common SCEV on both sides.
Extra test coverage for #163787.
Follow-up to 2d02726 (llvm#160500) Creating the SCEV subtraction eagerly is very expensive. To soften the blow, just collect a map with inequalities and check if we can apply the subtract rewrite when rewriting SCEVAddExpr. Restores most of the regression: http://llvm-compile-time-tracker.com/compare.php?from=0792478e4e133be96650444f3264e89d002fc058&to=7fca35db60fe6f423ea6051b45226046c067c252&stat=instructions:u stage1-O3: -0.10% stage1-ReleaseThinLTO: -0.09% stage1-ReleaseLTO-g: -0.10% stage1-O0-g: +0.02% stage1-aarch64-O3: -0.09% stage1-aarch64-O0-g: +0.00% stage2-O3: -0.17% stage2-O0-g: -0.05% stage2-clang: -0.07% There is still some negative impact compared to before 2d02726, but there's probably not much we could do reduce this even more. Compile-time improvement with 2d02726 reverted on top of the current PR: http://llvm-compile-time-tracker.com/compare.php?from=7fca35db60fe6f423ea6051b45226046c067c252&to=98dd152bdfc76b30d00190d3850d89406ca3c21f&stat=instructions:u stage1-O3: 60628M (-0.03%) stage1-ReleaseThinLTO: 76388M (-0.04%) stage1-ReleaseLTO-g: 89228M (-0.02%) stage1-O0-g: 18523M (-0.03%) stage1-aarch64-O3: 67623M (-0.03%) stage1-aarch64-O0-g: 22595M (+0.01%) stage2-O3: 52336M (+0.01%) stage2-O0-g: 16174M (+0.00%) stage2-clang: 34890032M (-0.03%)
7f22a82 to 4b0dd35 Compare | | ||
| class LoopGuards { | ||
| DenseMap<const SCEV *, const SCEV *> RewriteMap; | ||
| DenseMap<const SCEV *, SmallPtrSet<const SCEV *, 2>> NotEqualMap; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to use SmallDenseSet, thanks
| const SCEVConstant *C; | ||
| const SCEV *A, *B; | ||
| if (match(RHS, m_scev_Add(m_SCEVConstant(C), m_SCEV(A))) && | ||
| match(LHS, m_scev_Add(m_scev_Specific(C), m_SCEV(B)))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I couldn't find a case where it would matter. I guess we could check both sides to find a common SCEV on both sides.
| ; N32: [[LOOP_LATCH]]: | ||
| ; N32-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 | ||
| ; N32-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UMAX]] | ||
| ; N32-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[PTR_DIFF]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Found that there's some improvements with the change added in 0590c9e
| ; N32: [[LOOP_BODY]]: | ||
| ; N32-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_BODY]] ], [ [[START]], %[[LOOP_BODY_PREHEADER]] ] | ||
| ; N32-NEXT: [[TMP0:%.*]] = call i1 @cond() | ||
| ; N32-NEXT: br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_BODY]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's also one case that regressed, but there are quite a few improvements with unrolling and others on macOS.
Extra test coverage for llvm/llvm-project#163787.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…op guards. (#163787) Follow-up to 2d02726 (llvm/llvm-project#160500) Creating the SCEV subtraction eagerly is very expensive. To soften the blow, just collect a map with inequalities and check if we can apply the subtract rewrite when rewriting SCEVAddExpr. Restores most of the regression: http://llvm-compile-time-tracker.com/compare.php?from=0792478e4e133be96650444f3264e89d002fc058&to=7fca35db60fe6f423ea6051b45226046c067c252&stat=instructions:u stage1-O3: -0.10% stage1-ReleaseThinLTO: -0.09% stage1-ReleaseLTO-g: -0.10% stage1-O0-g: +0.02% stage1-aarch64-O3: -0.09% stage1-aarch64-O0-g: +0.00% stage2-O3: -0.17% stage2-O0-g: -0.05% stage2-clang: -0.07% There is still some negative impact compared to before 2d02726, but there's probably not much we could do reduce this even more. Compile-time improvement with 2d02726 reverted on top of the current PR: http://llvm-compile-time-tracker.com/compare.php?from=7fca35db60fe6f423ea6051b45226046c067c252&to=98dd152bdfc76b30d00190d3850d89406ca3c21f&stat=instructions:u stage1-O3: 60628M (-0.03%) stage1-ReleaseThinLTO: 76388M (-0.04%) stage1-ReleaseLTO-g: 89228M (-0.02%) stage1-O0-g: 18523M (-0.03%) stage1-aarch64-O3: 67623M (-0.03%) stage1-aarch64-O0-g: 22595M (+0.01%) stage2-O3: 52336M (+0.01%) stage2-O0-g: 16174M (+0.00%) stage2-clang: 34890032M (-0.03%) PR: llvm/llvm-project#163787
| LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/13277 Here is the relevant piece of the build log for the reference |
Follow-up to 2d02726 (#160500)
Creating the SCEV subtraction eagerly is very expensive. To soften the blow, just collect a map with inequalities and check if we can apply the subtract rewrite when rewriting SCEVAddExpr.
Restores most of the regression:
http://llvm-compile-time-tracker.com/compare.php?from=0792478e4e133be96650444f3264e89d002fc058&to=7fca35db60fe6f423ea6051b45226046c067c252&stat=instructions:u stage1-O3: -0.10%
stage1-ReleaseThinLTO: -0.09%
stage1-ReleaseLTO-g: -0.10%
stage1-O0-g: +0.02%
stage1-aarch64-O3: -0.09%
stage1-aarch64-O0-g: +0.00%
stage2-O3: -0.17%
stage2-O0-g: -0.05%
stage2-clang: -0.07%
There is still some negative impact compared to before 2d02726, but there's probably not much we could do reduce this even more.
Compile-time improvement with 2d02726 reverted on top of the current PR: http://llvm-compile-time-tracker.com/compare.php?from=7fca35db60fe6f423ea6051b45226046c067c252&to=98dd152bdfc76b30d00190d3850d89406ca3c21f&stat=instructions:u
stage1-O3: 60628M (-0.03%)
stage1-ReleaseThinLTO: 76388M (-0.04%)
stage1-ReleaseLTO-g: 89228M (-0.02%)
stage1-O0-g: 18523M (-0.03%)
stage1-aarch64-O3: 67623M (-0.03%)
stage1-aarch64-O0-g: 22595M (+0.01%)
stage2-O3: 52336M (+0.01%)
stage2-O0-g: 16174M (+0.00%)
stage2-clang: 34890032M (-0.03%)