
Commit e588c7f

[X86] Attempt to fold trunc(srl(load(p),amt)) -> load(p+amt/8) (#165266)
As reported in #164853, we only attempt to reduce shifted loads for constant shift amounts, but we could do more with non-constant values if value tracking can confirm basic alignments.

This patch determines whether a truncated, shifted scalar integer load shifts by a byte-aligned amount and, if so, replaces the non-constant shift amount with a pointer offset instead.

I had hoped to make this a generic DAG fold, but reduceLoadWidth isn't ready to be converted to a KnownBits value tracking mechanism, and other targets don't have complex address math like X86.

Fixes #164853
1 parent 4678f16 commit e588c7f
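To illustrate the pattern, here is a minimal IR sketch (not taken from the patch's test files; the function name and exact shape are hypothetical) of a truncated, variable-shift load that the new combine targets, assuming value tracking can prove the shift amount is a multiple of 8 and small enough that the truncation only reads loaded bits:

; Hypothetical example: masking %idx bounds the bit offset to a multiple of 8
; that is at most 56, which satisfies the KnownBits checks in the patch.
define i8 @shifted_byte_load(ptr %p, i64 %idx) {
  %byteofs = and i64 %idx, 7           ; byte index in [0,7]
  %bitofs = shl i64 %byteofs, 3        ; bit offset: multiple of 8, at most 56
  %wide = load i64, ptr %p, align 8    ; simple, non-extending scalar load
  %shifted = lshr i64 %wide, %bitofs
  %byte = trunc i64 %shifted to i8
  ret i8 %byte
}

With the fold, the backend should be able to select a single byte load from %p plus the masked index instead of a 64-bit load followed by a variable shift and truncation.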


5 files changed (+177, -1618 lines changed)


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 34 additions & 0 deletions
@@ -54634,6 +54634,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
   SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
   SDLoc DL(N);
 
   // Attempt to pre-truncate inputs to arithmetic ops instead.
@@ -54652,6 +54653,39 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
     return V;
 
+  // Fold trunc(srl(load(p),amt)) -> load(p+amt/8)
+  // If we're shifting down byte aligned bit chunks from a larger load for
+  // truncation, see if we can convert the shift into a pointer offset instead.
+  // Limit this to normal (non-ext) scalar integer loads.
+  if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
+      Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+      ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+    auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
+    if (Ld->isSimple() && VT.isByteSized() &&
+        isPowerOf2_64(VT.getSizeInBits())) {
+      SDValue ShAmt = Src.getOperand(1);
+      KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+      // Check the shift amount is byte aligned.
+      // Check the truncation doesn't use any shifted in (zero) top bits.
+      if (KnownAmt.countMinTrailingZeros() >= 3 &&
+          KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
+                                     VT.getSizeInBits())) {
+        EVT PtrVT = Ld->getBasePtr().getValueType();
+        SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
+        SDValue PtrByteOfs =
+            DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+                        DAG.getShiftAmountConstant(3, PtrVT, DL));
+        SDValue NewPtr = DAG.getMemBasePlusOffset(
+            Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
+        SDValue NewLoad =
+            DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getMemOperand());
+        DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
+                                      NewLoad.getValue(1));
+        return NewLoad;
+      }
+    }
+  }
+
   // The bitcast source is a direct mmx result.
   // Detect bitcasts between i32 to x86mmx
   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {

llvm/test/CodeGen/X86/bfloat-calling-conv.ll

Lines changed: 2 additions & 4 deletions
@@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; SSE2-LABEL: call_ret_v3bf16:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movl 4(%rdi), %eax
-; SSE2-NEXT: pinsrw $0, %eax, %xmm1
+; SSE2-NEXT: pinsrw $0, 4(%rdi), %xmm1
 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: callq returns_v3bf16@PLT
@@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; AVXNECONVERT-LABEL: call_ret_v3bf16:
 ; AVXNECONVERT: # %bb.0:
 ; AVXNECONVERT-NEXT: pushq %rax
-; AVXNECONVERT-NEXT: movl 4(%rdi), %eax
-; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNECONVERT-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0
 ; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT
