Skip to content

Commit f8edcba

Browse files
authored
[DAG] visitTRUNCATE - more aggressively fold trunc(add(x,x)) -> add(trunc(x),trunc(x)) (#164227)
We're very careful not to truncate binary arithmetic ops if doing so would affect legality or cause additional truncation instructions, hence we currently limit this fold to cases where one operand is constant. But if both operands are the same node (i.e. for some add/mul cases) then folding does not increase the number of truncations, so we can be slightly more aggressive at folding the truncation.
1 parent 909f429 commit f8edcba

File tree

3 files changed

+48
-61
lines changed

3 files changed

+48
-61
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16433,7 +16433,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
1643316433
case ISD::OR:
1643416434
case ISD::XOR:
1643516435
if (!LegalOperations && N0.hasOneUse() &&
16436-
(isConstantOrConstantVector(N0.getOperand(0), true) ||
16436+
(N0.getOperand(0) == N0.getOperand(1) ||
16437+
isConstantOrConstantVector(N0.getOperand(0), true) ||
1643716438
isConstantOrConstantVector(N0.getOperand(1), true))) {
1643816439
// TODO: We already restricted this to pre-legalization, but for vectors
1643916440
// we are extra cautious to not create an unsupported operation.

llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll

Lines changed: 44 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,32 +5,30 @@ define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16
55
; CHECK-LABEL: lower_trunc_16xi8:
66
; CHECK: // %bb.0:
77
; CHECK-NEXT: fmov s0, w0
8-
; CHECK-NEXT: ldr h1, [sp]
8+
; CHECK-NEXT: mov x8, sp
9+
; CHECK-NEXT: mov v0.b[1], w1
10+
; CHECK-NEXT: mov v0.b[2], w2
11+
; CHECK-NEXT: mov v0.b[3], w3
12+
; CHECK-NEXT: mov v0.b[4], w4
13+
; CHECK-NEXT: mov v0.b[5], w5
14+
; CHECK-NEXT: mov v0.b[6], w6
15+
; CHECK-NEXT: mov v0.b[7], w7
16+
; CHECK-NEXT: ld1 { v0.b }[8], [x8]
917
; CHECK-NEXT: add x8, sp, #8
10-
; CHECK-NEXT: ld1 { v1.h }[1], [x8]
18+
; CHECK-NEXT: ld1 { v0.b }[9], [x8]
1119
; CHECK-NEXT: add x8, sp, #16
12-
; CHECK-NEXT: mov v0.h[1], w1
13-
; CHECK-NEXT: ld1 { v1.h }[2], [x8]
20+
; CHECK-NEXT: ld1 { v0.b }[10], [x8]
1421
; CHECK-NEXT: add x8, sp, #24
15-
; CHECK-NEXT: mov v0.h[2], w2
16-
; CHECK-NEXT: ld1 { v1.h }[3], [x8]
22+
; CHECK-NEXT: ld1 { v0.b }[11], [x8]
1723
; CHECK-NEXT: add x8, sp, #32
18-
; CHECK-NEXT: mov v0.h[3], w3
19-
; CHECK-NEXT: ld1 { v1.h }[4], [x8]
24+
; CHECK-NEXT: ld1 { v0.b }[12], [x8]
2025
; CHECK-NEXT: add x8, sp, #40
21-
; CHECK-NEXT: ld1 { v1.h }[5], [x8]
26+
; CHECK-NEXT: ld1 { v0.b }[13], [x8]
2227
; CHECK-NEXT: add x8, sp, #48
23-
; CHECK-NEXT: mov v0.h[4], w4
24-
; CHECK-NEXT: ld1 { v1.h }[6], [x8]
28+
; CHECK-NEXT: ld1 { v0.b }[14], [x8]
2529
; CHECK-NEXT: add x8, sp, #56
26-
; CHECK-NEXT: mov v0.h[5], w5
27-
; CHECK-NEXT: ld1 { v1.h }[7], [x8]
28-
; CHECK-NEXT: mov v0.h[6], w6
29-
; CHECK-NEXT: add v2.8h, v1.8h, v1.8h
30-
; CHECK-NEXT: mov v0.h[7], w7
31-
; CHECK-NEXT: add v3.8h, v0.8h, v0.8h
32-
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
33-
; CHECK-NEXT: uzp1 v1.16b, v3.16b, v2.16b
30+
; CHECK-NEXT: ld1 { v0.b }[15], [x8]
31+
; CHECK-NEXT: add v1.16b, v0.16b, v0.16b
3432
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
3533
; CHECK-NEXT: ret
3634
%a1 = insertelement <16 x i16> poison, i16 %a, i16 0
@@ -59,18 +57,15 @@ define <16 x i8> @lower_trunc_16xi8(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16
5957
define <8 x i16> @lower_trunc_8xi16(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) {
6058
; CHECK-LABEL: lower_trunc_8xi16:
6159
; CHECK: // %bb.0:
62-
; CHECK-NEXT: fmov s0, w4
63-
; CHECK-NEXT: fmov s1, w0
64-
; CHECK-NEXT: mov v0.s[1], w5
65-
; CHECK-NEXT: mov v1.s[1], w1
66-
; CHECK-NEXT: mov v0.s[2], w6
67-
; CHECK-NEXT: mov v1.s[2], w2
68-
; CHECK-NEXT: mov v0.s[3], w7
69-
; CHECK-NEXT: mov v1.s[3], w3
70-
; CHECK-NEXT: add v2.4s, v0.4s, v0.4s
71-
; CHECK-NEXT: add v3.4s, v1.4s, v1.4s
72-
; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
73-
; CHECK-NEXT: uzp1 v1.8h, v3.8h, v2.8h
60+
; CHECK-NEXT: fmov s0, w0
61+
; CHECK-NEXT: mov v0.h[1], w1
62+
; CHECK-NEXT: mov v0.h[2], w2
63+
; CHECK-NEXT: mov v0.h[3], w3
64+
; CHECK-NEXT: mov v0.h[4], w4
65+
; CHECK-NEXT: mov v0.h[5], w5
66+
; CHECK-NEXT: mov v0.h[6], w6
67+
; CHECK-NEXT: mov v0.h[7], w7
68+
; CHECK-NEXT: add v1.8h, v0.8h, v0.8h
7469
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
7570
; CHECK-NEXT: ret
7671
%a1 = insertelement <8 x i32> poison, i32 %a, i32 0
@@ -91,14 +86,11 @@ define <8 x i16> @lower_trunc_8xi16(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32
9186
define <4 x i32> @lower_trunc_4xi32(i64 %a, i64 %b, i64 %c, i64 %d) {
9287
; CHECK-LABEL: lower_trunc_4xi32:
9388
; CHECK: // %bb.0:
94-
; CHECK-NEXT: fmov d0, x2
95-
; CHECK-NEXT: fmov d1, x0
96-
; CHECK-NEXT: mov v0.d[1], x3
97-
; CHECK-NEXT: mov v1.d[1], x1
98-
; CHECK-NEXT: add v2.2d, v0.2d, v0.2d
99-
; CHECK-NEXT: add v3.2d, v1.2d, v1.2d
100-
; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
101-
; CHECK-NEXT: uzp1 v1.4s, v3.4s, v2.4s
89+
; CHECK-NEXT: fmov s0, w0
90+
; CHECK-NEXT: mov v0.s[1], w1
91+
; CHECK-NEXT: mov v0.s[2], w2
92+
; CHECK-NEXT: mov v0.s[3], w3
93+
; CHECK-NEXT: add v1.4s, v0.4s, v0.4s
10294
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
10395
; CHECK-NEXT: ret
10496
%a1 = insertelement <4 x i64> poison, i64 %a, i64 0
@@ -115,24 +107,20 @@ define <4 x i32> @lower_trunc_4xi32(i64 %a, i64 %b, i64 %c, i64 %d) {
115107
define <8 x i32> @lower_trunc_8xi32(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h) {
116108
; CHECK-LABEL: lower_trunc_8xi32:
117109
; CHECK: // %bb.0:
118-
; CHECK-NEXT: fmov d0, x2
119-
; CHECK-NEXT: fmov d1, x0
120-
; CHECK-NEXT: fmov d2, x6
121-
; CHECK-NEXT: fmov d3, x4
122-
; CHECK-NEXT: mov v0.d[1], x3
123-
; CHECK-NEXT: mov v1.d[1], x1
124-
; CHECK-NEXT: mov v2.d[1], x7
125-
; CHECK-NEXT: mov v3.d[1], x5
126-
; CHECK-NEXT: add v4.2d, v0.2d, v0.2d
127-
; CHECK-NEXT: add v5.2d, v1.2d, v1.2d
128-
; CHECK-NEXT: add v6.2d, v2.2d, v2.2d
129-
; CHECK-NEXT: add v7.2d, v3.2d, v3.2d
110+
; CHECK-NEXT: fmov d0, x6
111+
; CHECK-NEXT: fmov d1, x4
112+
; CHECK-NEXT: fmov d2, x2
113+
; CHECK-NEXT: fmov d3, x0
114+
; CHECK-NEXT: mov v0.d[1], x7
115+
; CHECK-NEXT: mov v1.d[1], x5
116+
; CHECK-NEXT: mov v2.d[1], x3
117+
; CHECK-NEXT: mov v3.d[1], x1
118+
; CHECK-NEXT: uzp1 v1.4s, v1.4s, v0.4s
130119
; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
131-
; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
132-
; CHECK-NEXT: uzp1 v3.4s, v5.4s, v4.4s
133-
; CHECK-NEXT: uzp1 v1.4s, v7.4s, v6.4s
134-
; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b
135-
; CHECK-NEXT: eor v1.16b, v2.16b, v1.16b
120+
; CHECK-NEXT: add v3.4s, v1.4s, v1.4s
121+
; CHECK-NEXT: add v0.4s, v2.4s, v2.4s
122+
; CHECK-NEXT: eor v1.16b, v1.16b, v3.16b
123+
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
136124
; CHECK-NEXT: ret
137125
%a1 = insertelement <8 x i64> poison, i64 %a, i64 0
138126
%b1 = insertelement <8 x i64> %a1, i64 %b, i64 1

llvm/test/CodeGen/AArch64/zext-shuffle.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -674,10 +674,8 @@ define <4 x i32> @isUndefDeInterleave_t1_bad(<8 x i16> %a) {
674674
define i16 @undeftop(<8 x i16> %0) {
675675
; CHECK-LABEL: undeftop:
676676
; CHECK: // %bb.0:
677-
; CHECK-NEXT: dup v0.8h, v0.h[4]
678-
; CHECK-NEXT: uaddl v0.4s, v0.4h, v0.4h
679-
; CHECK-NEXT: xtn v0.4h, v0.4s
680-
; CHECK-NEXT: umov w0, v0.h[0]
677+
; CHECK-NEXT: add v0.8h, v0.8h, v0.8h
678+
; CHECK-NEXT: umov w0, v0.h[4]
681679
; CHECK-NEXT: ret
682680
%2 = shufflevector <8 x i16> %0, <8 x i16> zeroinitializer, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 9, i32 7, i32 5, i32 3>
683681
%3 = zext <8 x i16> %2 to <8 x i64>

0 commit comments

Comments
 (0)