
Commit 6978329

[LV] Introduce conditional vector basic block.
This patch adds a transformation that converts flattened control flow into a conditional vector basic block. The transformation lets the program skip masked operations when there is no active lane. First, it collects all masked stores and their operands bottom-up and puts these masked operations into a new vector basic block. Second, it splits the original vector loop, inserts the new basic block between the split blocks, and updates the conditional branch in the original blocks.

E.g.

Before:

  vector.loop:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop

After:

  vector.loop:
    ...
    %any.active.mask = any-of(%mask)
    BranchOnCond %any.active.mask
  Successors vector.if.bb, vector.loop.split

  vector.if.bb:
    ... (Masked operations)
  Successors vector.loop.split

  vector.loop.split:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop
1 parent f5c344e commit 6978329
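For context, a loop of roughly the following shape produces the masked stores this transform targets. The function and variable names below are hypothetical and not taken from this patch or its test; it is only a sketch of the kind of input that benefits:

// Illustrative only: a conditional store like this is vectorized into a
// masked store whose mask is the widened `if` condition. With this patch,
// the masked work is placed in vector.if.bb and skipped entirely whenever
// no lane of a vector iteration satisfies the condition.
void update(long *data, long n, long pattern, long flip) {
  for (long i = 0; i < n; ++i) {
    if ((data[i] & pattern) == pattern) // becomes the store mask
      data[i] ^= flip;                  // becomes a masked store
  }
}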

File tree

4 files changed: +205 −6 lines changed


llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 11 additions & 0 deletions
@@ -351,6 +351,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<bool> PreferControlFlow(
+    "prefer-control-flow", cl::init(false), cl::Hidden,
+    cl::desc("Generate control flow inside the vector region."));
+
 cl::opt<bool> llvm::EnableVPlanNativePath(
     "enable-vplan-native-path", cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -4267,6 +4271,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       case VPInstruction::ExplicitVectorLength:
         C += VPI->cost(VF, CostCtx);
         break;
+      case VPInstruction::AnyOf:
+        if (!VPI->getUnderlyingValue())
+          C += VPI->cost(VF, CostCtx);
+        break;
       default:
         break;
       }
@@ -8647,6 +8655,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   }
   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues, *PSE.getSE());
 
+  if (PreferControlFlow || TTI.preferControlFlow())
+    VPlanTransforms::optimizeConditionalVPBB(*Plan);
+
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
 }

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 158 additions & 0 deletions
@@ -5182,3 +5182,161 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
     }
   }
 }
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+  VPDominatorTree VPDT(Plan);
+
+  VPValue *HeaderMask = findHeaderMask(Plan);
+
+  // Get the mask from the store recipes.
+  auto GetMask = [&HeaderMask](VPRecipeBase &R) -> VPValue * {
+    using namespace llvm::VPlanPatternMatch;
+    if (!isa<VPWidenStoreRecipe>(R))
+      return nullptr;
+    VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
+    if (!OrigMask || OrigMask == HeaderMask ||
+        match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+                            m_VPValue(), m_VPValue())))
+      return nullptr;
+
+    return OrigMask;
+  };
+
+  // First, collect all masked stores.
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (VPValue *Mask = GetMask(R))
+        MaskedStores.emplace_back(&R, Mask);
+    }
+  }
+
+  if (MaskedStores.empty())
+    return;
+
+  DenseSet<VPRecipeBase *> Candidates;
+  auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+    for (VPValue *Op : R->operands())
+      if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+        Candidates.insert(OpR);
+  };
+
+  SmallVector<SetVector<VPRecipeBase *>> Tries;
+  while (!MaskedStores.empty()) {
+    auto [SR, M] = MaskedStores.pop_back_val();
+    Candidates.clear();
+    AddOperandsToCandidates(SR);
+
+    SetVector<VPRecipeBase *> CurrentTree;
+    CurrentTree.insert(SR);
+
+    VPBasicBlock *MaskBlock =
+        M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+
+    // Don't move recipes before the mask and PHI recipes.
+    auto End =
+        MaskBlock == SR->getParent()
+            ? M->getDefiningRecipe()->getReverseIterator()
+            : std::next(
+                  SR->getParent()->getFirstNonPhi()->getReverseIterator());
+    // Also don't move the recipes through any recipe that may have side effects
+    // or write to memory.
+    for (auto It = std::next(SR->getReverseIterator()); It != End; ++It) {
+      if (It->mayHaveSideEffects() || It->mayWriteToMemory()) {
+        End = It;
+        break;
+      }
+    }
+
+    // Greedily add all recipes that are used to compute the stored value to the
+    // tree. All users of the added recipe must dominate the store
+    // recipe.
+    for (VPRecipeBase &R : make_range(SR->getReverseIterator(), End)) {
+      // Recipe is not part of the tree
+      if (!Candidates.contains(&R))
+        continue;
+
+      if (any_of(R.definedValues(), [&SR = SR, &VPDT](VPValue *Def) {
+            for (VPUser *U : Def->users()) {
+              if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+                if (UR == SR || VPDT.properlyDominates(UR, SR))
+                  continue;
+              }
+              return true;
+            }
+            return false;
+          }))
+        continue;
+
+      CurrentTree.insert(&R);
+      AddOperandsToCandidates(&R);
+    }
+    // The previous traversal could have added recipes that are used by
+    // non-added recipes, which need to be removed from the list.
+    SmallDenseSet<VPRecipeBase *, 8> ToRemove;
+    bool Changed;
+    do {
+      Changed = false;
+      for (VPRecipeBase *R : CurrentTree) {
+        if (ToRemove.contains(R))
+          continue;
+        if (any_of(R->definedValues(), [&](VPValue *Def) {
+              for (VPUser *U : Def->users()) {
+                if (auto *UR = dyn_cast<VPRecipeBase>(U))
+                  if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+                    return true;
+              }
+              return false;
+            })) {
+          Changed = true;
+          ToRemove.insert(R);
+        }
+      }
+    } while (Changed);
+
+    for (VPRecipeBase *R : ToRemove)
+      CurrentTree.remove(R);
+
+    if (CurrentTree.size() > 1)
+      Tries.push_back(CurrentTree);
+  }
+
+  for (const auto &List : Tries) {
+    VPRecipeBase *SR = List.front();
+    VPValue *M = cast<VPWidenMemoryRecipe>(SR)->getMask();
+    assert(M && "Mask VPValue must exist at this point");
+    auto Recipes = reverse(List.getArrayRef());
+
+    // Split the current basic block at the store recipe point so that
+    // a predicated block can be added in between.
+    VPBasicBlock *ParentBB = SR->getParent();
+    VPBasicBlock *ContBB = ParentBB->splitAt(SR->getIterator());
+
+    // Create VPBB and insert it between ParentBB and ContBB.
+    VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+    VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+    if (ContBB->getNumSuccessors() == 0)
+      ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+    // Move recipes into the conditional block.
+    for (VPRecipeBase *R : Recipes)
+      R->moveBefore(*IfBB, IfBB->end());
+
+    // Add the condition and branch in the parent block.
+    auto *ActiveLane = new VPInstruction(VPInstruction::AnyOf, {M}, {}, {},
+                                         DebugLoc::getUnknown(), "any.of.mask");
+
+    auto *BranchOnCond =
+        new VPInstruction(VPInstruction::BranchOnCond, ActiveLane);
+    ParentBB->appendRecipe(ActiveLane);
+    ParentBB->appendRecipe(BranchOnCond);
+
+    // Set proper predecessors and successors for the conditional block.
+    ParentBB->clearSuccessors();
+    ParentBB->setSuccessors({IfBB, ContBB});
+    ContBB->clearPredecessors();
+    ContBB->setPredecessors({ParentBB, IfBB});
+  }
+}

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 23 additions & 0 deletions
@@ -402,6 +402,29 @@ struct VPlanTransforms {
   /// users in the original exit block using the VPIRInstruction wrapping to the
   /// LCSSA phi.
   static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+  /// Try to convert flattened control flow into a conditional vector basic
+  /// block. If there are no active bits in the mask, it will skip all masked
+  /// operations. This transformation will collect all masked operations
+  /// bottom-up from the masked stores and put all masked operations in a new
+  /// vector basic block. The original vector.loop will be split and the newly
+  /// created basic block will be inserted in between.
+  ///
+  ///
+  ///    [ ] <-- vector.loop
+  ///     ^ |      %any.active.mask = any-of(%Mask)
+  ///    /  |      Branch-On-Count %any.active.mask, 0
+  ///   /   |\
+  ///   | (T)| \ (F)
+  ///   |    |  v
+  ///   |    | [ ] <-- vector.if.bb (masked operations)
+  ///   |    |  |
+  ///   |    |  v
+  ///   |    +-->[ ] <-- vector.loop.split
+  ///   |        |   |
+  ///   +---------+  v
+  ///           [ ] <-- middle.block
+  static void optimizeConditionalVPBB(VPlan &Plan);
 };
 
 } // namespace llvm
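To make the doc comment above concrete, here is a small, self-contained C++ model of one vector iteration. It is illustrative only (plain scalar code, not LLVM or VPlan API), showing how the any-of(%Mask) guard skips the masked operations when no lane is active:

#include <array>
#include <cstdint>
#include <cstdio>

constexpr int Lanes = 4;
using Mask = std::array<bool, Lanes>;
using Vec = std::array<int64_t, Lanes>;

// Masked store: only lanes with an active mask bit are written.
static void maskedStore(Vec &Dst, const Vec &Src, const Mask &M) {
  for (int L = 0; L < Lanes; ++L)
    if (M[L])
      Dst[L] = Src[L];
}

int main() {
  Vec Data = {1, 2, 3, 4};
  Vec Update = {10, 20, 30, 40};
  Mask M = {false, false, false, false}; // no active lane this iteration

  // %any.active.mask = any-of(%Mask)
  bool AnyActive = false;
  for (bool B : M)
    AnyActive |= B;

  // BranchOnCond %any.active.mask: enter vector.if.bb only if a lane is live.
  if (AnyActive)
    maskedStore(Data, Update, M); // the masked operations in vector.if.bb

  // Prints "1 2 3 4": the masked stores were skipped for this iteration.
  for (int64_t V : Data)
    std::printf("%lld ", static_cast<long long>(V));
  std::printf("\n");
  return 0;
}

In the flattened form, the masked work would run unconditionally on every iteration and simply write nothing when the mask is all-false; the guard avoids paying for it at all in that case.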

llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll

Lines changed: 13 additions & 6 deletions
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s
 
 define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
 ; CHECK-LABEL: define void @test(
@@ -28,20 +28,27 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[IF_THEN9_SPLIT:.*]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i64 4
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[VECTOR_IF_BB:.*]], label %[[IF_THEN9_SPLIT]]
+; CHECK:       [[VECTOR_IF_BB]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr align 8 [[TMP2]], <4 x i1> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr align 8 [[TMP4]], <4 x i1> [[TMP8]])
+; CHECK-NEXT:    br label %[[IF_THEN9_SPLIT]]
+; CHECK:       [[IF_THEN9_SPLIT]]:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
