Skip to content

Commit 5ad30bf

Browse files
committed
[LV] Introduce conditional vector basic block.
This patch adds a transformation that converts flattened control flow into a conditional vector basic block. This transformation helps the program skip masked operations that have no active lane. First, the transformation collects all masked stores and their operands bottom-up, and puts these masked operations into a new vector basic block. Second, it splits the original vector loop, inserts the new basic block between the split blocks, and updates the conditional branch in the original blocks. E.g. Before: { vector.loop: ... BranchOnCount %IV, %TC Successors middle.block, vector.loop } After: { vector.loop: ... %any.active.mask = any-of(%mask) BranchOnCond %any.active.mask Successors vector.if.bb, vector.loop.split vector.if.bb: ... (Masked operations) Successors vector.loop.split vector.loop.split: ... BranchOnCount %IV, %TC Successors middle.block, vector.loop }
1 parent d12fe84 commit 5ad30bf

File tree

4 files changed

+202
-6
lines changed

4 files changed

+202
-6
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
350350
cl::desc(
351351
"Prefer predicating a reduction operation over an after loop select."));
352352

353+
static cl::opt<bool> PreferControlFlow(
354+
"prefer-control-flow", cl::init(false), cl::Hidden,
355+
cl::desc("Generate control flow inside the vector region."));
356+
353357
cl::opt<bool> llvm::EnableVPlanNativePath(
354358
"enable-vplan-native-path", cl::Hidden,
355359
cl::desc("Enable VPlan-native vectorization path with "
@@ -4251,6 +4255,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
42514255
case VPInstruction::ExplicitVectorLength:
42524256
C += VPI->cost(VF, CostCtx);
42534257
break;
4258+
case VPInstruction::AnyOf:
4259+
if (!VPI->getUnderlyingValue())
4260+
C += VPI->cost(VF, CostCtx);
4261+
break;
42544262
default:
42554263
break;
42564264
}
@@ -8296,6 +8304,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
82968304
VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
82978305
*Plan, CM.getMinimalBitwidths());
82988306
VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
8307+
if (PreferControlFlow || TTI.preferControlFlow())
8308+
VPlanTransforms::optimizeConditionalVPBB(*Plan);
82998309
// TODO: try to put it close to addActiveLaneMask().
83008310
if (CM.foldTailWithEVL())
83018311
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4680,3 +4680,159 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
46804680
}
46814681
}
46824682
}
4683+
4684+
void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
4685+
VPDominatorTree VPDT(Plan);
4686+
4687+
VPValue *HeaderMask = findHeaderMask(Plan);
4688+
4689+
// Get the mask from the store recipes.
4690+
auto GetMask = [&HeaderMask](VPRecipeBase &R) -> VPValue * {
4691+
using namespace llvm::VPlanPatternMatch;
4692+
if (!isa<VPWidenStoreRecipe>(R))
4693+
return nullptr;
4694+
VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
4695+
if (!OrigMask || OrigMask == HeaderMask ||
4696+
match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
4697+
m_VPValue(), m_VPValue())))
4698+
return nullptr;
4699+
4700+
return OrigMask;
4701+
};
4702+
4703+
// First, collect all masked stores.
4704+
SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
4705+
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
4706+
Plan.getEntry());
4707+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4708+
for (VPRecipeBase &R : *VPBB) {
4709+
if (VPValue *Mask = GetMask(R))
4710+
MaskedStores.emplace_back(&R, Mask);
4711+
}
4712+
}
4713+
4714+
if (MaskedStores.empty())
4715+
return;
4716+
4717+
DenseSet<VPRecipeBase *> Candidates;
4718+
auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
4719+
for (VPValue *Op : R->operands())
4720+
if (VPRecipeBase *OpR = Op->getDefiningRecipe())
4721+
Candidates.insert(OpR);
4722+
};
4723+
4724+
SmallVector<SetVector<VPRecipeBase *>> Tries;
4725+
while (!MaskedStores.empty()) {
4726+
auto [SR, M] = MaskedStores.pop_back_val();
4727+
Candidates.clear();
4728+
AddOperandsToCandidates(SR);
4729+
4730+
SetVector<VPRecipeBase *> CurrentTree;
4731+
CurrentTree.insert(SR);
4732+
4733+
VPBasicBlock *MaskBlock =
4734+
M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
4735+
4736+
// Don't move recipes before the mask and PHI recipes.
4737+
auto End = MaskBlock == SR->getParent()
4738+
? M->getDefiningRecipe()->getReverseIterator()
4739+
: SR->getParent()->getFirstNonPhi()->getReverseIterator();
4740+
// Also don't move the recipes through any recipe that may have side effects
4741+
// or write to memory.
4742+
for (auto It = std::next(SR->getReverseIterator()); It != End; ++It) {
4743+
if (It->mayHaveSideEffects() || It->mayWriteToMemory()) {
4744+
End = It;
4745+
break;
4746+
}
4747+
}
4748+
4749+
// Greedily add all recipes that are used to compute the stored value to the
4750+
// tree. All users of the added recipe must dominate the store
4751+
// recipe.
4752+
for (VPRecipeBase &R : make_range(SR->getReverseIterator(), End)) {
4753+
// Recipe is not part of the tree
4754+
if (!Candidates.contains(&R))
4755+
continue;
4756+
4757+
if (any_of(R.definedValues(), [&SR = SR, &VPDT](VPValue *Def) {
4758+
for (VPUser *U : Def->users()) {
4759+
if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
4760+
if (UR == SR || VPDT.properlyDominates(UR, SR))
4761+
continue;
4762+
}
4763+
return true;
4764+
}
4765+
return false;
4766+
}))
4767+
continue;
4768+
4769+
CurrentTree.insert(&R);
4770+
AddOperandsToCandidates(&R);
4771+
}
4772+
// The previous traversal could have added recipes that are used by
4773+
// non-added recipes, which need to be removed from the list.
4774+
SmallDenseSet<VPRecipeBase *, 8> ToRemove;
4775+
bool Changed;
4776+
do {
4777+
Changed = false;
4778+
for (VPRecipeBase *R : CurrentTree) {
4779+
if (ToRemove.contains(R))
4780+
continue;
4781+
if (any_of(R->definedValues(), [&](VPValue *Def) {
4782+
for (VPUser *U : Def->users()) {
4783+
if (auto *UR = dyn_cast<VPRecipeBase>(U))
4784+
if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
4785+
return true;
4786+
}
4787+
return false;
4788+
})) {
4789+
Changed = true;
4790+
ToRemove.insert(R);
4791+
}
4792+
}
4793+
} while (Changed);
4794+
4795+
for (VPRecipeBase *R : ToRemove)
4796+
CurrentTree.remove(R);
4797+
4798+
if (CurrentTree.size() > 1)
4799+
Tries.push_back(CurrentTree);
4800+
}
4801+
4802+
for (const auto &List : Tries) {
4803+
VPRecipeBase *SR = List.front();
4804+
VPValue *M = cast<VPWidenMemoryRecipe>(SR)->getMask();
4805+
assert(M && "Mask VPValue must exist at this point");
4806+
auto Recipes = reverse(List.getArrayRef());
4807+
4808+
// Split the current basic block at the store recipe point so that
4809+
// a predicated block can be added in between.
4810+
VPBasicBlock *ParentBB = SR->getParent();
4811+
VPBasicBlock *ContBB = ParentBB->splitAt(SR->getIterator());
4812+
4813+
// Create VPBB and insert it between ParentBB and ContBB.
4814+
VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
4815+
VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
4816+
if (ContBB->getNumSuccessors() == 0)
4817+
ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
4818+
4819+
// Move recipes into the conditional block.
4820+
for (VPRecipeBase *R : Recipes)
4821+
R->moveBefore(*IfBB, IfBB->end());
4822+
4823+
// Add the condition and branch in the parent block.
4824+
auto *ActiveLane =
4825+
new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
4826+
4827+
auto *BranchOnCond =
4828+
new VPInstruction(VPInstruction::BranchOnCond, ActiveLane);
4829+
ParentBB->appendRecipe(ActiveLane);
4830+
ParentBB->appendRecipe(BranchOnCond);
4831+
4832+
// Set proper predecessors and successors for the conditional block.
4833+
ParentBB->clearSuccessors();
4834+
ParentBB->setSuccessors({IfBB, ContBB});
4835+
ContBB->clearPredecessors();
4836+
ContBB->setPredecessors({ParentBB, IfBB});
4837+
}
4838+
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,29 @@ struct VPlanTransforms {
378378
/// users in the original exit block using the VPIRInstruction wrapping to the
379379
/// LCSSA phi.
380380
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
381+
382+
/// Try to convert flattened control flow into a conditional vector basic
383+
/// block. If there are no active bits in the mask, it will skip all masked
384+
/// operations. This transformation will collect all masked operations
385+
/// bottom-up from the masked stores and put all masked operations in a new
386+
/// vector basic block. The original vector.loop will be split and the newly
387+
/// created basic block will be inserted in between.
388+
///
389+
///
390+
/// [ ] <-- vector.loop
391+
/// ^ | %any.active.mask = any-of(%Mask)
392+
/// / | Branch-On-Cond %any.active.mask
393+
/// / |\
394+
/// | (T)| \ (F)
395+
/// | | v
396+
/// | | [ ] <-- vector.if.bb (masked operations)
397+
/// | | |
398+
/// | | v
399+
/// | +-->[ ] <-- vector.loop.split
400+
/// | | |
401+
/// +---------+ v
402+
/// [ ] <-- middle.block
403+
static void optimizeConditionalVPBB(VPlan &Plan);
381404
};
382405

383406
} // namespace llvm

llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2-
; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
2+
; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s
33

44
define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
55
; CHECK-LABEL: define void @test(
@@ -28,20 +28,27 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
2828
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
2929
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
3030
; CHECK: [[VECTOR_BODY]]:
31-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
31+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ]
3232
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
33-
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
33+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
3434
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
3535
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
3636
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
3737
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
3838
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
3939
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
40+
; CHECK-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]]
41+
; CHECK-NEXT: [[TMP13:%.*]] = freeze <4 x i1> [[TMP8]]
42+
; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP13]]
43+
; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
44+
; CHECK-NEXT: br i1 [[TMP11]], label %[[VECTOR_IF_BB:.*]], label %[[VECTOR_BODY_SPLIT]]
45+
; CHECK: [[VECTOR_IF_BB]]:
4046
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
4147
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
42-
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
43-
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
44-
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
48+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr align 8 [[TMP2]], <4 x i1> [[TMP7]])
49+
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr align 8 [[TMP4]], <4 x i1> [[TMP8]])
50+
; CHECK-NEXT: br label %[[VECTOR_BODY_SPLIT]]
51+
; CHECK: [[VECTOR_BODY_SPLIT]]:
4552
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
4653
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
4754
; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

0 commit comments

Comments
 (0)