Skip to content

Commit f996725

Browse files
[LoopVectorize] Fix strict reductions where VF = 1
Currently we allow loops with a fixed-width VF of 1 to vectorize if the -enable-strict-reductions flag is set. However, the loop vectorizer will not use ordered reductions if `VF.isScalar()`, and the resulting vectorized loop will not preserve the original reduction order. This patch removes the `VF.isVector()` condition when checking whether ordered reductions should be used. Also, instead of converting the FAdds to reductions when VF = 1, the operands of the FAdds are changed such that the order is preserved. Reviewed By: david-arm. Differential Revision: https://reviews.llvm.org/D104533
1 parent 80aa7e1 commit f996725

File tree

3 files changed

+103
-22
lines changed

3 files changed

+103
-22
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,8 @@ class LoopVectorizationPlanner {
356356
/// reductions, with one operand being vector and the other being the scalar
357357
/// reduction chain.
358358
void adjustRecipesForInLoopReductions(VPlanPtr &Plan,
359-
VPRecipeBuilder &RecipeBuilder);
359+
VPRecipeBuilder &RecipeBuilder,
360+
ElementCount MinVF);
360361
};
361362

362363
} // namespace llvm

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4344,8 +4344,7 @@ void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
43444344
// any loop invariant values.
43454345
BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
43464346

4347-
bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
4348-
Cost->useOrderedReductions(RdxDesc);
4347+
bool IsOrdered = IsInLoopReductionPhi && Cost->useOrderedReductions(RdxDesc);
43494348

43504349
for (unsigned Part = 0; Part < UF; ++Part) {
43514350
if (IsOrdered && Part > 0)
@@ -4759,8 +4758,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
47594758
Type *VecTy =
47604759
ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
47614760

4762-
bool IsOrdered = State.VF.isVector() &&
4763-
Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4761+
bool IsOrdered = Cost->isInLoopReduction(cast<PHINode>(PN)) &&
47644762
Cost->useOrderedReductions(*RdxDesc);
47654763
unsigned LastPartForNewPhi = IsOrdered ? 1 : State.UF;
47664764
for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
@@ -9280,8 +9278,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
92809278
}
92819279

92829280
// Adjust the recipes for any inloop reductions.
9283-
if (Range.Start.isVector())
9284-
adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
9281+
adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start);
92859282

92869283
// Finally, if tail is folded by masking, introduce selects between the phi
92879284
// and the live-out instruction of each reduction, at the end of the latch.
@@ -9356,12 +9353,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
93569353
// reductions, with one operand being vector and the other being the scalar
93579354
// reduction chain.
93589355
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9359-
VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9356+
VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
93609357
for (auto &Reduction : CM.getInLoopReductionChains()) {
93619358
PHINode *Phi = Reduction.first;
93629359
RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
93639360
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
93649361

9362+
if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9363+
continue;
9364+
93659365
// ReductionOperations are orders top-down from the phi's use to the
93669366
// LoopExitValue. We keep a track of the previous item (the Chain) to tell
93679367
// which of the two operands will remain scalar and which will be reduced.
@@ -9378,7 +9378,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
93789378
"Expected to replace a VPWidenSelectSC");
93799379
FirstOpId = 1;
93809380
} else {
9381-
assert(isa<VPWidenRecipe>(WidenRecipe) &&
9381+
assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
93829382
"Expected to replace a VPWidenSC");
93839383
FirstOpId = 0;
93849384
}
@@ -9527,8 +9527,13 @@ void VPReductionRecipe::execute(VPTransformState &State) {
95279527
Value *NewRed;
95289528
Value *NextInChain;
95299529
if (IsOrdered) {
9530-
NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9531-
PrevInChain);
9530+
if (State.VF.isVector())
9531+
NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9532+
PrevInChain);
9533+
else
9534+
NewRed = State.Builder.CreateBinOp(
9535+
(Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
9536+
PrevInChain, NewVecOp);
95329537
PrevInChain = NewRed;
95339538
} else {
95349539
PrevInChain = State.get(getChainOp(), Part);

llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll

Lines changed: 86 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -693,14 +693,89 @@ for.end:
693693
ret float %add6
694694
}
695695

696-
!0 = distinct !{!0, !4, !7, !9}
697-
!1 = distinct !{!1, !4, !8, !9}
698-
!2 = distinct !{!2, !5, !7, !9}
699-
!3 = distinct !{!3, !6, !7, !9, !10}
700-
!4 = !{!"llvm.loop.vectorize.width", i32 8}
701-
!5 = !{!"llvm.loop.vectorize.width", i32 4}
702-
!6 = !{!"llvm.loop.vectorize.width", i32 2}
703-
!7 = !{!"llvm.loop.interleave.count", i32 1}
704-
!8 = !{!"llvm.loop.interleave.count", i32 4}
705-
!9 = !{!"llvm.loop.vectorize.enable", i1 true}
706-
!10 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
696+
; Test reductions for a VF of 1 and a UF > 1.
697+
define float @fadd_scalar_vf(float* noalias nocapture readonly %a, i64 %n) {
698+
; CHECK-ORDERED-LABEL: @fadd_scalar_vf
699+
; CHECK-ORDERED: vector.body
700+
; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, {{.*}} ], [ %[[FADD4:.*]], %vector.body ]
701+
; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float*
702+
; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
703+
; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
704+
; CHECK-ORDERED: %[[LOAD4:.*]] = load float, float*
705+
; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[VEC_PHI]], %[[LOAD1]]
706+
; CHECK-ORDERED: %[[FADD2:.*]] = fadd float %[[FADD1]], %[[LOAD2]]
707+
; CHECK-ORDERED: %[[FADD3:.*]] = fadd float %[[FADD2]], %[[LOAD3]]
708+
; CHECK-ORDERED: %[[FADD4]] = fadd float %[[FADD3]], %[[LOAD4]]
709+
; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd
710+
; CHECK-ORDERED: scalar.ph
711+
; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[FADD4]], %middle.block ]
712+
; CHECK-ORDERED: for.body
713+
; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
714+
; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float*
715+
; CHECK-ORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
716+
; CHECK-ORDERED: for.end
717+
; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[FADD4]], %middle.block ]
718+
; CHECK-ORDERED: ret float %[[RES_PHI]]
719+
720+
; CHECK-UNORDERED-LABEL: @fadd_scalar_vf
721+
; CHECK-UNORDERED: vector.body
722+
; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD1:.*]], %vector.body ]
723+
; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
724+
; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD3:.*]], %vector.body ]
725+
; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD4:.*]], %vector.body ]
726+
; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
727+
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
728+
; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
729+
; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float*
730+
; CHECK-UNORDERED: %[[FADD1]] = fadd float %[[LOAD1]], %[[VEC_PHI1]]
731+
; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[LOAD2]], %[[VEC_PHI2]]
732+
; CHECK-UNORDERED: %[[FADD3]] = fadd float %[[LOAD3]], %[[VEC_PHI3]]
733+
; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]]
734+
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
735+
; CHECK-UNORDERED: middle.block
736+
; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]]
737+
; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]]
738+
; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]]
739+
; CHECK-UNORDERED: scalar.ph
740+
; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[BIN_RDX3]], %middle.block ]
741+
; CHECK-UNORDERED: for.body
742+
; CHECK-UNORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
743+
; CHECK-UNORDERED: %[[LOAD5:.*]] = load float, float*
744+
; CHECK-UNORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
745+
; CHECK-UNORDERED: for.end
746+
; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[BIN_RDX3]], %middle.block ]
747+
; CHECK-UNORDERED: ret float %[[RES_PHI]]
748+
749+
; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf
750+
; CHECK-NOT-VECTORIZED-NOT: @vector.body
751+
752+
entry:
753+
br label %for.body
754+
755+
for.body:
756+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
757+
%sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
758+
%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
759+
%0 = load float, float* %arrayidx, align 4
760+
%add = fadd float %0, %sum.07
761+
%iv.next = add nuw nsw i64 %iv, 1
762+
%exitcond.not = icmp eq i64 %iv.next, %n
763+
br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4
764+
765+
for.end:
766+
ret float %add
767+
}
768+
769+
!0 = distinct !{!0, !5, !9, !11}
770+
!1 = distinct !{!1, !5, !10, !11}
771+
!2 = distinct !{!2, !6, !9, !11}
772+
!3 = distinct !{!3, !7, !9, !11, !12}
773+
!4 = distinct !{!4, !8, !10, !11}
774+
!5 = !{!"llvm.loop.vectorize.width", i32 8}
775+
!6 = !{!"llvm.loop.vectorize.width", i32 4}
776+
!7 = !{!"llvm.loop.vectorize.width", i32 2}
777+
!8 = !{!"llvm.loop.vectorize.width", i32 1}
778+
!9 = !{!"llvm.loop.interleave.count", i32 1}
779+
!10 = !{!"llvm.loop.interleave.count", i32 4}
780+
!11 = !{!"llvm.loop.vectorize.enable", i1 true}
781+
!12 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

0 commit comments

Comments
 (0)