
Conversation

ElvisWang123 (Contributor) commented Dec 18, 2025

This patch converts a scatter with a uniform address whose mask is the header mask into last-active-lane + extract-lane + scalar store.

The header mask guarantees that the scatter has at least one active lane, so it is safe to convert it to a scalar store.

Note that extract-lane generates some dead instructions (see e.g. the test changes in RISCV/uniform-load-store.ll); these will be cleaned up in #172798.
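
At the IR level, the scalarized form looks roughly like the sketch below. It is paraphrased from the sve-tail-folding.ll test change in this patch rather than copied verbatim, and the function and value names are illustrative. Because the header mask is a prefix mask, the lane just before the first inactive lane (found with cttz.elts on the negated mask) is the last active lane; and since a scatter to a single address is last-active-lane-wins, storing only that element preserves the final memory state.

```llvm
; Sketch: scalar store replacing a header-masked scatter to the uniform
; address %dst. The replaced scatter was roughly:
;   call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %wide.load,
;        <vscale x 4 x ptr> align 4 %dst.splat, <vscale x 4 x i1> %header.mask)
define void @store_last_active_lane(<vscale x 4 x i32> %wide.load, <vscale x 4 x i1> %header.mask, ptr %dst) {
  ; The header mask is a prefix mask, so the lane before the first inactive
  ; lane is the last active lane.
  %not.mask = xor <vscale x 4 x i1> %header.mask, splat (i1 true)
  %first.inactive = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> %not.mask, i1 false)
  %last.active = sub i64 %first.inactive, 1
  ; Extract the element from the last active lane and store it scalarly; all
  ; earlier active lanes of the scatter would have been overwritten anyway.
  %val = extractelement <vscale x 4 x i32> %wide.load, i64 %last.active
  store i32 %val, ptr %dst, align 4
  ret void
}

declare i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1>, i1)
```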

llvmbot (Member) commented Dec 18, 2025

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Elvis Wang (ElvisWang123)

Changes

This patch converts a scatter with a uniform address whose mask is the header mask into last-active-lane + extract-lane + scalar store.

The header mask guarantees that the scatter has at least one active lane, so it is safe to convert it to a scalar store.

Note that extract-lane generates some dead instructions; these will be cleaned up in #172798.


Patch is 60.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172799.diff

12 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+16-3)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll (+114-35)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll (+8-3)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll (+21-10)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll (+6-3)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/pointer-induction.ll (+18-11)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll (+38-20)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll (+26-22)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll (+17-18)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll (+11-4)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll (+18-12)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll (+3-40)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 25a4b60e9a533..00ea3a932a209 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1601,11 +1601,24 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { // Only convert the scatter to a scalar store if it is unmasked. // TODO: Support converting scatter masked by the header mask to scalar // store. - if (Mask) + if (Mask && !vputils::isHeaderMask(Mask, Plan)) continue; - auto *Extract = new VPInstruction(VPInstruction::ExtractLastLane, - {WidenStoreR->getOperand(1)}); + VPInstruction *Extract; + if (!Mask) { + Extract = new VPInstruction(VPInstruction::ExtractLastLane, + {WidenStoreR->getOperand(1)}); + } else { + // If the mask is the header mask, this mask contains at least one + // active lane. So it is safe to convert the scatter to a scalar + // store. + assert(vputils::isHeaderMask(Mask, Plan) && + "Mask must be header mask."); + auto *Idx = new VPInstruction(VPInstruction::LastActiveLane, Mask); + Idx->insertBefore(WidenStoreR); + Extract = new VPInstruction(VPInstruction::ExtractLane, + {Idx, WidenStoreR->getOperand(1)}); + } Extract->insertBefore(WidenStoreR); // TODO: Sink the scalar store recipe to middle block if possible. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 0a11e8e4390cb..b4f7c77ff607e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -911,51 +911,130 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFNONE: [[END]]: ; TFNONE-NEXT: ret void ; -; TFCOMMON-LABEL: define void @test_widen_exp_v2( -; TFCOMMON-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { -; TFCOMMON-NEXT: [[ENTRY:.*]]: -; TFCOMMON-NEXT: br label %[[LOOP:.*]] -; TFCOMMON: [[LOOP]]: -; TFCOMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2]], align 8 -; TFCOMMON-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR7:[0-9]+]] -; TFCOMMON-NEXT: [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00 -; TFCOMMON-NEXT: [[SINK:%.*]] = select i1 [[COND1]], double 0.000000e+00, double 1.000000e+00 -; TFCOMMON-NEXT: store double [[SINK]], ptr [[P]], align 8 -; TFCOMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; TFCOMMON-NEXT: [[COND2:%.*]] = icmp eq i64 [[IV]], [[N]] -; TFCOMMON-NEXT: br i1 [[COND2]], label %[[END:.*]], label %[[LOOP]] -; TFCOMMON: [[END]]: -; TFCOMMON-NEXT: ret void +; TFALWAYS-LABEL: define void @test_widen_exp_v2( +; TFALWAYS-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; TFALWAYS-NEXT: [[ENTRY:.*]]: +; TFALWAYS-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; TFALWAYS-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; TFALWAYS-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2 +; TFALWAYS-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; TFALWAYS-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 +; TFALWAYS-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], [[TMP4]] +; TFALWAYS-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP0]], [[TMP4]] +; TFALWAYS-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 0 +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; 
TFALWAYS-NEXT: br label %[[VECTOR_BODY:.*]] +; TFALWAYS: [[VECTOR_BODY]]: +; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFALWAYS-NEXT: [[TMP8:%.*]] = load double, ptr [[P2]], align 8 +; TFALWAYS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP8]], i64 0 +; TFALWAYS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer +; TFALWAYS-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP9]], zeroinitializer +; TFALWAYS-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFALWAYS-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true) +; TFALWAYS-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP11]], i1 false) +; TFALWAYS-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; TFALWAYS-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]] +; TFALWAYS-NEXT: store double [[TMP14]], ptr [[P]], align 8 +; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP7]]) +; TFALWAYS-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; TFALWAYS-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; TFALWAYS-NEXT: br i1 [[TMP16]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; TFALWAYS: [[END]]: +; TFALWAYS-NEXT: ret void +; +; TFFALLBACK-LABEL: define void @test_widen_exp_v2( +; TFFALLBACK-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; TFFALLBACK-NEXT: [[ENTRY:.*]]: +; TFFALLBACK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; TFFALLBACK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 2 +; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 +; TFFALLBACK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], [[TMP4]] +; TFFALLBACK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP0]], [[TMP4]] +; TFFALLBACK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 0 +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; TFFALLBACK-NEXT: br label %[[VECTOR_BODY:.*]] +; TFFALLBACK: [[VECTOR_BODY]]: +; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[TMP8:%.*]] = load double, ptr [[P2]], align 8 +; TFFALLBACK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP8]], i64 0 +; TFFALLBACK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> 
[[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer +; TFFALLBACK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: [[TMP10:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP9]], zeroinitializer +; TFFALLBACK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFFALLBACK-NEXT: [[TMP11:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true) +; TFFALLBACK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP11]], i1 false) +; TFFALLBACK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; TFFALLBACK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]] +; TFFALLBACK-NEXT: store double [[TMP14]], ptr [[P]], align 8 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP7]]) +; TFFALLBACK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; TFFALLBACK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true +; TFFALLBACK-NEXT: br i1 [[TMP16]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; TFFALLBACK: [[END]]: +; TFFALLBACK-NEXT: ret void ; ; TFA_INTERLEAVE-LABEL: define void @test_widen_exp_v2( ; TFA_INTERLEAVE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { ; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]: ; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP7]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[TMP10]] +; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], [[TMP10]] ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ult i64 0, [[TMP0]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[TMP0]] +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]]) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[TMP0]]) ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: -; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[TMP9:.*]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP9]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP9]] ] +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT1:%.*]], %[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT3:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8 -; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]] -; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP8]], double 0.000000e+00, double 1.000000e+00 -; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = or i1 [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK2]] -; TFA_INTERLEAVE-NEXT: br i1 [[TMP14]], label %[[BB8:.*]], label %[[TMP9]] -; TFA_INTERLEAVE: [[BB8]]: +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP4]], i64 0 +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]]) +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP11]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP13]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP12]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select <vscale x 2 x i1> [[TMP14]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00) +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], splat (i1 true) +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK3]], splat (i1 true) +; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2 +; TFA_INTERLEAVE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP16]], i1 false) +; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = add i64 [[TMP29]], [[FIRST_INACTIVE_LANE]] +; TFA_INTERLEAVE-NEXT: [[FIRST_INACTIVE_LANE4:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP15]], i1 false) +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = icmp ne i64 [[FIRST_INACTIVE_LANE4]], [[TMP29]] +; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = select i1 [[TMP20]], i64 [[FIRST_INACTIVE_LANE4]], i64 [[TMP30]] +; TFA_INTERLEAVE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP31]], 1 +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI]], i64 [[LAST_ACTIVE_LANE]] +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = sub i64 [[LAST_ACTIVE_LANE]], [[TMP23]] +; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <vscale x 2 x double> [[PREDPHI4]], i64 [[TMP25]] +; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = icmp uge i64 
[[LAST_ACTIVE_LANE]], [[TMP23]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP27]], double [[TMP26]], double [[TMP24]] ; TFA_INTERLEAVE-NEXT: store double [[PREDPHI3]], ptr [[P]], align 8 -; TFA_INTERLEAVE-NEXT: br label %[[TMP9]] -; TFA_INTERLEAVE: [[TMP9]]: -; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 1 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX]], [[TMP3]] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = icmp ult i64 [[TMP20]], [[TMP3]] +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT1]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP3]]) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT3]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP19]], i64 [[TMP3]]) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT1]], i32 0 ; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true ; TFA_INTERLEAVE-NEXT: br i1 [[TMP21]], label %[[END:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TFA_INTERLEAVE: [[END]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 8cc9e431e6214..3b1d37a1f123c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -397,15 +397,20 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], splat (i1 true) +; CHECK-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP14]], i1 false) +; CHECK-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP15]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP10]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], i64 [[LAST_ACTIVE_LANE]] +; CHECK-NEXT: store i32 [[TMP16]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll index 36ebd422b5d7b..12da17de09ee5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll @@ -31,17 +31,21 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i1> poison, i1 [[COND:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i1> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 4 x i1> [[BROADCAST_SPLAT1]], splat (i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[BOXES]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[NBRBOXES]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 ... [truncated] 
llvmbot (Member) commented Dec 18, 2025

@llvm/pr-subscribers-backend-risc-v

Contributor

Did you verify that this is profitable on all targets?
