[LV] Convert scatter w/uniform addr and mask being header mask to scalar store. #172799
Conversation
This patch converts a scatter with a uniform address whose mask is the header mask into `last-active-lane` + `extract-lane` + a scalar store. The header mask guarantees that the scatter has at least one active lane, so the scatter can safely be converted to a scalar store. Note that `extract-lane` will generate some dead instructions; a follow-up PR will clean them up.
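For illustration, here is a minimal before/after LLVM IR sketch of the rewrite, distilled from the `uniform_store` test in `sve-tail-folding.ll` below; the value names (`%vals`, `%dst.splat`, `%dst`, `%index`, `%n`) are placeholders rather than names taken from the test:

```llvm
; Before: every active lane of the header mask scatters to the same uniform
; address %dst, so only the last active lane's value is observable in memory.
%mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %n)
call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %vals, <vscale x 4 x ptr> align 4 %dst.splat, <vscale x 4 x i1> %mask)

; After: find the last active lane of the header mask, extract that lane, and
; emit a single scalar store. When all lanes are active, cttz.elts on the
; inverted mask returns the vector length, so the index is still the last lane.
%not.mask = xor <vscale x 4 x i1> %mask, splat (i1 true)
%first.inactive = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %not.mask, i1 false)
%last.active = sub i64 %first.inactive, 1
%val = extractelement <vscale x 4 x i32> %vals, i64 %last.active
store i32 %val, ptr %dst, align 4
```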
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Elvis Wang (ElvisWang123)

Changes

This patch converts a scatter with a uniform address whose mask is the header mask into `last-active-lane` + `extract-lane` + a scalar store. The header mask guarantees that the scatter has at least one active lane, so the scatter can safely be converted to a scalar store. Note that `extract-lane` will generate some dead instructions.

Patch is 60.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172799.diff

12 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 25a4b60e9a533..00ea3a932a209 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1601,11 +1601,24 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { // Only convert the scatter to a scalar store if it is unmasked. // TODO: Support converting scatter masked by the header mask to scalar // store. - if (Mask) + if (Mask && !vputils::isHeaderMask(Mask, Plan)) continue; - auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane, - {WidenStoreR->getOperand(1)}); + VPInstruction *Extract; + if (!Mask) { + Extract = new VPInstruction(VPInstruction::ExtractLastLane, + {WidenStoreR->getOperand(1)}); + } else { + // If the mask is the header mask, this mask contains at least one + // active lane. So it is safe to convert the scatter to a scalar + // store. + assert(vputils::isHeaderMask(Mask, Plan) && + "Mask must be header mask."); + auto *Idx = new VPInstruction(VPInstruction::LastActiveLane, Mask); + Idx->insertBefore(WidenStoreR); + Extract = new VPInstruction(VPInstruction::ExtractLane, + {Idx, WidenStoreR->getOperand(1)}); + } Extract->insertBefore(WidenStoreR); // TODO: Sink the scalar store recipe to middle block if possible. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 0a11e8e4390cb..b4f7c77ff607e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -911,51 +911,130 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFNONE: [[END]]: ; TFNONE-NEXT: ret void ; -; TFCOMMON-LABEL: define void @test_widen_exp_v2( -; TFCOMMON-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { -; TFCOMMON-NEXT: [[ENTRY:.*]]: -; TFCOMMON-NEXT: br label %[[LOOP:.*]] -; TFCOMMON: [[LOOP]]: -; TFCOMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2]], align 8 -; TFCOMMON-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR7:[0-9]+]] -; TFCOMMON-NEXT: [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00 -; TFCOMMON-NEXT: [[SINK:%.*]] = select i1 [[COND1]], double 0.000000e+00, double 1.000000e+00 -; TFCOMMON-NEXT: store double [[SINK]], ptr [[P]], align 8 -; TFCOMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; TFCOMMON-NEXT: [[COND2:%.*]] = icmp eq i64 [[IV]], [[N]] -; TFCOMMON-NEXT: br i1 [[COND2]], label %[[END:.*]], label %[[LOOP]] -; TFCOMMON: [[END]]: -; TFCOMMON-NEXT: ret void +; TFALWAYS-LABEL: define void @test_widen_exp_v2( +; TFALWAYS-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; TFALWAYS-NEXT: [[ENTRY:.*]]: +; TFALWAYS-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; TFALWAYS-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; TFALWAYS-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2 +; TFALWAYS-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; TFALWAYS-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 +; TFALWAYS-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], [[TMP4]] +; TFALWAYS-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP0]], [[TMP4]] +; TFALWAYS-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 0 +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; 
TFALWAYS-NEXT: br label %[[VECTOR_BODY:.*]] +; TFALWAYS: [[VECTOR_BODY]]: +; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFALWAYS-NEXT: [[TMP8:%.*]] = load double, ptr [[P2]], align 8 +; TFALWAYS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP8]], i64 0 +; TFALWAYS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer +; TFALWAYS-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP9]], zeroinitializer +; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFALWAYS-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true) +; TFALWAYS-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP11]], i1 false) +; TFALWAYS-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; TFALWAYS-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]] +; TFALWAYS-NEXT: store double [[TMP14]], ptr [[P]], align 8 +; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP7]]) +; TFALWAYS-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; TFALWAYS-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; TFALWAYS-NEXT: br i1 [[TMP16]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TFALWAYS: [[END]]: +; TFALWAYS-NEXT: ret void +; +; TFFALLBACK-LABEL: define void @test_widen_exp_v2( +; TFFALLBACK-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; TFFALLBACK-NEXT: [[ENTRY:.*]]: +; TFFALLBACK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; TFFALLBACK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2 +; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 +; TFFALLBACK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], [[TMP4]] +; TFFALLBACK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP0]], [[TMP4]] +; TFFALLBACK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 0 +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; TFFALLBACK-NEXT: br label %[[VECTOR_BODY:.*]] +; TFFALLBACK: [[VECTOR_BODY]]: +; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[TMP8:%.*]] = load double, ptr [[P2]], align 8 +; TFFALLBACK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP8]], i64 0 +; TFFALLBACK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> 
[[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer +; TFFALLBACK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP9]], zeroinitializer +; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFFALLBACK-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true) +; TFFALLBACK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP11]], i1 false) +; TFFALLBACK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; TFFALLBACK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]] +; TFFALLBACK-NEXT: store double [[TMP14]], ptr [[P]], align 8 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP7]]) +; TFFALLBACK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; TFFALLBACK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; TFFALLBACK-NEXT: br i1 [[TMP16]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TFFALLBACK: [[END]]: +; TFFALLBACK-NEXT: ret void ; ; TFA_INTERLEAVE-LABEL: define void @test_widen_exp_v2( ; TFA_INTERLEAVE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { ; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]: ; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP7]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[TMP10]] +; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], [[TMP10]] ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ult i64 0, [[TMP0]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]] +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[TMP0]]) ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: -; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP9:.*]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP9]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP9]] ] +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT1:%.*]], %[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT3:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8 -; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]] -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP8]], double 0.000000e+00, double 1.000000e+00 -; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = or i1 [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK2]] -; TFA_INTERLEAVE-NEXT: br i1 [[TMP14]], label %[[BB8:.*]], label %[[TMP9]] -; TFA_INTERLEAVE: [[BB8]]: +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP4]], i64 0 +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP11]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP13]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP14]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], splat (i1 true) +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]], splat (i1 true) +; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2 +; TFA_INTERLEAVE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP16]], i1 false) +; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = add i64 [[TMP29]], [[FIRST_INACTIVE_LANE]] +; TFA_INTERLEAVE-NEXT: [[FIRST_INACTIVE_LANE4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP15]], i1 false) +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = icmp ne i64 [[FIRST_INACTIVE_LANE4]], [[TMP29]] +; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = select i1 [[TMP20]], i64 [[FIRST_INACTIVE_LANE4]], i64 [[TMP30]] +; TFA_INTERLEAVE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP31]], 1 +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]] +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], [[TMP23]] +; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI4]], i64 [[TMP25]] +; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = icmp uge i64 
[[LAST_ACTIVE_LANE]], [[TMP23]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP27]], double [[TMP26]], double [[TMP24]] ; TFA_INTERLEAVE-NEXT: store double [[PREDPHI3]], ptr [[P]], align 8 -; TFA_INTERLEAVE-NEXT: br label %[[TMP9]] -; TFA_INTERLEAVE: [[TMP9]]: -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = icmp ult i64 [[TMP20]], [[TMP3]] +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT1]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP3]]) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT3]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP19]], i64 [[TMP3]]) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT1]], i32 0 ; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true ; TFA_INTERLEAVE-NEXT: br i1 [[TMP21]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TFA_INTERLEAVE: [[END]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 8cc9e431e6214..3b1d37a1f123c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -397,15 +397,20 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true) +; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP14]], i1 false) +; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP15]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], i64 [[LAST_ACTIVE_LANE]] +; CHECK-NEXT: store i32 [[TMP16]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll index 36ebd422b5d7b..12da17de09ee5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll @@ -31,17 +31,21 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[COND:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT1]], splat (i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[BOXES]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[NBRBOXES]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 ... [truncated] |
@llvm/pr-subscribers-backend-risc-v

Author: Elvis Wang (ElvisWang123)

Changes

This patch converts a scatter with a uniform address whose mask is the header mask into `last-active-lane` + `extract-lane` + a scalar store. The header mask guarantees that the scatter has at least one active lane, so the scatter can safely be converted to a scalar store. Note that `extract-lane` will generate some dead instructions.

Full patch: https://github.com/llvm/llvm-project/pull/172799.diff
Did you verify that this is profitable on all targets?
This patch converts a scatter with a uniform address whose mask is the header mask into `last-active-lane` + `extract-lane` + a scalar store. The header mask guarantees that the scatter has at least one active lane, so the scatter can safely be converted to a scalar store.

Note that `extract-lane` will generate some dead instructions (see the test changes in RISCV/uniform-load-store.ll). Will fix in #172798.