Skip to content

Commit da709f5

Browse files
authored
[X86] combinePTESTCC - fold PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets (#165676)
If the PTEST is just using the ZF result and one of the operands is an i32/i64 sign mask, we can use the TESTPD/PS instructions instead and avoid the use of an extra constant. Fixes some codegen identified in #156233.
1 parent 5c5cef3 commit da709f5

File tree

2 files changed

+38
-66
lines changed

2 files changed

+38
-66
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48860,6 +48860,26 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
4886048860
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
4886148861
}
4886248862

48863+
// Attempt to convert PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets.
48864+
if (EFLAGS.getOpcode() == X86ISD::PTEST && Subtarget.hasAVX()) {
48865+
KnownBits KnownOp1 = DAG.computeKnownBits(Op1);
48866+
assert(KnownOp1.getBitWidth() == 64 &&
48867+
"Illegal PTEST vector element width");
48868+
if (KnownOp1.isConstant()) {
48869+
const APInt &Mask = KnownOp1.getConstant();
48870+
if (Mask.isSignMask()) {
48871+
MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64);
48872+
Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
48873+
return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
48874+
}
48875+
if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) {
48876+
MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32);
48877+
Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0));
48878+
return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0);
48879+
}
48880+
}
48881+
}
48882+
4886348883
// TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
4886448884
// TODO: Add COND_NE handling?
4886548885
if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {

llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll

Lines changed: 18 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -875,28 +875,12 @@ define i1 @mask_v8i32(<8 x i32> %a0) {
875875
; SSE41-NEXT: sete %al
876876
; SSE41-NEXT: retq
877877
;
878-
; AVX1-LABEL: mask_v8i32:
879-
; AVX1: # %bb.0:
880-
; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
881-
; AVX1-NEXT: sete %al
882-
; AVX1-NEXT: vzeroupper
883-
; AVX1-NEXT: retq
884-
;
885-
; AVX2-LABEL: mask_v8i32:
886-
; AVX2: # %bb.0:
887-
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
888-
; AVX2-NEXT: vptest %ymm1, %ymm0
889-
; AVX2-NEXT: sete %al
890-
; AVX2-NEXT: vzeroupper
891-
; AVX2-NEXT: retq
892-
;
893-
; AVX512-LABEL: mask_v8i32:
894-
; AVX512: # %bb.0:
895-
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
896-
; AVX512-NEXT: vptest %ymm1, %ymm0
897-
; AVX512-NEXT: sete %al
898-
; AVX512-NEXT: vzeroupper
899-
; AVX512-NEXT: retq
878+
; AVX-LABEL: mask_v8i32:
879+
; AVX: # %bb.0:
880+
; AVX-NEXT: vtestps %ymm0, %ymm0
881+
; AVX-NEXT: sete %al
882+
; AVX-NEXT: vzeroupper
883+
; AVX-NEXT: retq
900884
%1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
901885
%2 = and i32 %1, 2147483648
902886
%3 = icmp eq i32 %2, 0
@@ -965,28 +949,12 @@ define i1 @signtest_v8i32(<8 x i32> %a0) {
965949
; SSE41-NEXT: sete %al
966950
; SSE41-NEXT: retq
967951
;
968-
; AVX1-LABEL: signtest_v8i32:
969-
; AVX1: # %bb.0:
970-
; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
971-
; AVX1-NEXT: sete %al
972-
; AVX1-NEXT: vzeroupper
973-
; AVX1-NEXT: retq
974-
;
975-
; AVX2-LABEL: signtest_v8i32:
976-
; AVX2: # %bb.0:
977-
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
978-
; AVX2-NEXT: vptest %ymm1, %ymm0
979-
; AVX2-NEXT: sete %al
980-
; AVX2-NEXT: vzeroupper
981-
; AVX2-NEXT: retq
982-
;
983-
; AVX512-LABEL: signtest_v8i32:
984-
; AVX512: # %bb.0:
985-
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
986-
; AVX512-NEXT: vptest %ymm1, %ymm0
987-
; AVX512-NEXT: sete %al
988-
; AVX512-NEXT: vzeroupper
989-
; AVX512-NEXT: retq
952+
; AVX-LABEL: signtest_v8i32:
953+
; AVX: # %bb.0:
954+
; AVX-NEXT: vtestps %ymm0, %ymm0
955+
; AVX-NEXT: sete %al
956+
; AVX-NEXT: vzeroupper
957+
; AVX-NEXT: retq
990958
%1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
991959
%2 = icmp sgt i32 %1, -1
992960
ret i1 %2
@@ -1010,28 +978,12 @@ define i1 @signtest_v4i64(<4 x i64> %a0) {
1010978
; SSE41-NEXT: sete %al
1011979
; SSE41-NEXT: retq
1012980
;
1013-
; AVX1-LABEL: signtest_v4i64:
1014-
; AVX1: # %bb.0:
1015-
; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
1016-
; AVX1-NEXT: sete %al
1017-
; AVX1-NEXT: vzeroupper
1018-
; AVX1-NEXT: retq
1019-
;
1020-
; AVX2-LABEL: signtest_v4i64:
1021-
; AVX2: # %bb.0:
1022-
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1023-
; AVX2-NEXT: vptest %ymm1, %ymm0
1024-
; AVX2-NEXT: sete %al
1025-
; AVX2-NEXT: vzeroupper
1026-
; AVX2-NEXT: retq
1027-
;
1028-
; AVX512-LABEL: signtest_v4i64:
1029-
; AVX512: # %bb.0:
1030-
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
1031-
; AVX512-NEXT: vptest %ymm1, %ymm0
1032-
; AVX512-NEXT: sete %al
1033-
; AVX512-NEXT: vzeroupper
1034-
; AVX512-NEXT: retq
981+
; AVX-LABEL: signtest_v4i64:
982+
; AVX: # %bb.0:
983+
; AVX-NEXT: vtestpd %ymm0, %ymm0
984+
; AVX-NEXT: sete %al
985+
; AVX-NEXT: vzeroupper
986+
; AVX-NEXT: retq
1035987
%1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0)
1036988
%2 = icmp sgt i64 %1, -1
1037989
ret i1 %2

0 commit comments

Comments
 (0)