51 changes: 8 additions & 43 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29755,65 +29755,30 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
unsigned NumElts = VT.getVectorNumElements();

// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.

// We'll take different approaches for signed and unsigned.
// For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
// and use pmullw to calculate the full 16-bit product.
// For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
// words and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
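// A quick worked example of the trick: take signed bytes a = -3 and b = 5.
// After the shifts, (a << 8) = 0xFD00 (-768) and (b << 8) = 0x0500 (1280);
// pmulhw returns the high 16 bits of -768 * 1280 = -983040 (0xFFF10000),
// which is 0xFFF1 = -15 = a * b, so no explicit sign extension is needed.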

MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);

SDValue ALo, AHi;
SDValue ALo, AHi, BLo, BHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
}

SDValue BLo, BHi;
if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
// If the RHS is a constant, manually unpackl/unpackh and extend.
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
SDValue LoOp = B.getOperand(i + j);
SDValue HiOp = B.getOperand(i + j + 8);

if (IsSigned) {
LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
DAG.getConstant(8, dl, MVT::i16));
HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
DAG.getConstant(8, dl, MVT::i16));
} else {
LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
}

LoOps.push_back(LoOp);
HiOps.push_back(HiOp);
}
}

BLo = DAG.getBuildVector(ExVT, dl, LoOps);
BHi = DAG.getBuildVector(ExVT, dl, HiOps);
} else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}

@@ -29826,7 +29791,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);

return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
}

static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2927,7 +2927,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,37632]
; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -2947,7 +2947,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632]
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: paddb %xmm0, %xmm1
@@ -2971,7 +2971,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
@@ -3044,7 +3044,7 @@ define <16 x i8> @pr38658(<16 x i8> %x) {
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632]
; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/X86/combine-udiv.ll
@@ -665,14 +665,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
; XOP-NEXT: movl $171, %eax
; XOP-NEXT: movl $249, %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
; XOP-NEXT: movl $249, %eax
; XOP-NEXT: vmovd %eax, %xmm2
; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
Contributor

Is it a regression? I remember we prefer to narrow the load size if a load is needed.

Collaborator Author

We've picked up an extra X86ISD::VZEXT_MOVL someplace that causes the scalar_to_vector to constant fold - I'll see if there's an easy fix.

Collaborator Author

Because we keep the bitcasts, we bump the instruction recursion depth just enough that SimplifyDemandedVectorElts can't remove the VZEXT_MOVL node for us - and then we perform this in combineTargetShuffle for all VZEXT_MOVL(SCALAR_TO_VECTOR(CONSTANT)) cases:

// Load a scalar integer constant directly to XMM instead of transferring an
// immediate value from GPR.
// vzext_movl (scalar_to_vector C) --> load [C,0...]
if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
  if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
    // Create a vector constant - scalar constant followed by zeros.
    EVT ScalarVT = N0.getOperand(0).getValueType();
    Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
    Constant *Zero = ConstantInt::getNullValue(ScalarTy);
    SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
    ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());

    // Load the vector constant from constant pool.
    MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
    SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
    MachinePointerInfo MPI =
        MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
    Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
    return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
                       MachineMemOperand::MOLoad);
  }
}

It's odd that if we just have SCALAR_TO_VECTOR(CONSTANT) we keep the gpr->xmm transfer, especially as the VZEXT_MOVL will most likely disappear due to the implicit zeroing of the upper elements by MOVD/Q. IIRC we've encountered this many times: sometimes we've tried to avoid a load, and other times we've wanted to fold the load to reduce register pressure - we've never come up with a solution that works in all cases, although I suspect trying to solve this in the DAG is where we're going wrong.
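For reference, this is the form the old XOP codegen above used to materialize the constant:

movl $171, %eax
vmovd %eax, %xmm1

vmovd from a GPR already zeroes bits 32..127 of the destination register, so the VZEXT_MOVL wrapped around that SCALAR_TO_VECTOR should be free here.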

Contributor

Thanks for checking! Yeah, unless we have a precise register model, we don't know which is best in different scenarios. I see the rest are NFC, so +1 for the change.

; XOP-NEXT: vpsrlw $8, %xmm2, %xmm2
; XOP-NEXT: vpshlb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; XOP-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; XOP-NEXT: retq
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
; AVX256BW: # %bb.0:
; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2335,10 +2335,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [34048,34048,26368,37632,21760,33024,22016,35072]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [20224,26368,6912,30976,33024,33024,33024,12032]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0]
@@ -2369,10 +2369,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm4
; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [2304,0,10496,37632,33024,33024,21760,36096]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [22016,24320,37632,11008,12544,32512,16640,37632]
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
@@ -2417,10 +2417,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [34048,34048,26368,37632,21760,33024,22016,35072,2304,0,10496,37632,33024,33024,21760,36096]
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [20224,26368,6912,30976,33024,33024,33024,12032,22016,24320,37632,11008,12544,32512,16640,37632]
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]