Skip to content

Commit b0b94e1

Browse files
committed
[CostModel][X86] Improve single src shuffle costs
Add missing SK_PermuteSingleSrc costs for AVX2 targets and earlier. Also add some of the simpler SK_PermuteTwoSrc costs to support splitting of SK_PermuteSingleSrc shuffles.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310632 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 92e2e47 commit b0b94e1

File tree

2 files changed

+96
-71
lines changed

2 files changed

+96
-71
lines changed

lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 36 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -838,6 +838,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
838838
{ TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
839839
{ TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
840840

841+
{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
842+
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
841843
{ TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
842844
{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
843845
{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
@@ -872,7 +874,16 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
872874
{ TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
873875
{ TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
874876
{ TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
875-
{ TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
877+
{ TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor
878+
879+
{ TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
880+
{ TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
881+
{ TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
882+
{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
883+
{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
884+
// + 2*por + vinsertf128
885+
{ TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
886+
// + 2*por + vinsertf128
876887
};
877888

878889
if (ST->hasAVX())
@@ -899,11 +910,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
899910
{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
900911
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
901912

902-
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
903-
{ TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por
913+
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
914+
{ TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
904915

905916
{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
906-
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb
917+
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
918+
919+
{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
920+
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
907921
};
908922

909923
if (ST->hasSSSE3())
@@ -914,13 +928,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
914928
{ TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
915929
{ TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
916930
{ TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
917-
{ TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
931+
{ TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
918932
{ TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
919933

920934
{ TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
921935
{ TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
922936
{ TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
923-
{ TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
937+
{ TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
924938
{ TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
925939
// + 2*pshufd + 2*unpck + packus
926940

@@ -930,18 +944,29 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
930944
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
931945
{ TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
932946

933-
{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
934-
{ TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd
947+
{ TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
948+
{ TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
949+
{ TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
950+
{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
951+
// + pshufd/unpck
952+
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
953+
// + 2*pshufd + 2*unpck + 2*packus
954+
955+
{ TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
956+
{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
957+
{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
935958
};
936959

937960
if (ST->hasSSE2())
938961
if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
939962
return LT.first * Entry->Cost;
940963

941964
static const CostTblEntry SSE1ShuffleTbl[] = {
942-
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
943-
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
944-
{ TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
965+
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
966+
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
967+
{ TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
968+
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
969+
{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
945970
};
946971

947972
if (ST->hasSSE1())

test/Analysis/CostModel/X86/shuffle-single-src.ll

Lines changed: 60 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -13,33 +13,33 @@
1313
; CHECK-LABEL: 'test_vXf64'
1414
define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024) {
1515

16-
; SSE2: cost of 2 {{.*}} %V128 = shufflevector
17-
; SSSE3: cost of 2 {{.*}} %V128 = shufflevector
18-
; SSE42: cost of 2 {{.*}} %V128 = shufflevector
19-
; AVX1: cost of 2 {{.*}} %V128 = shufflevector
20-
; AVX2: cost of 2 {{.*}} %V128 = shufflevector
16+
; SSE2: cost of 1 {{.*}} %V128 = shufflevector
17+
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
18+
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
19+
; AVX1: cost of 1 {{.*}} %V128 = shufflevector
20+
; AVX2: cost of 1 {{.*}} %V128 = shufflevector
2121
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
2222
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 1>
2323

24-
; SSE2: cost of 4 {{.*}} %V256 = shufflevector
25-
; SSSE3: cost of 4 {{.*}} %V256 = shufflevector
26-
; SSE42: cost of 4 {{.*}} %V256 = shufflevector
27-
; AVX1: cost of 6 {{.*}} %V256 = shufflevector
28-
; AVX2: cost of 6 {{.*}} %V256 = shufflevector
24+
; SSE2: cost of 2 {{.*}} %V256 = shufflevector
25+
; SSSE3: cost of 2 {{.*}} %V256 = shufflevector
26+
; SSE42: cost of 2 {{.*}} %V256 = shufflevector
27+
; AVX1: cost of 3 {{.*}} %V256 = shufflevector
28+
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
2929
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
3030
%V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
3131

32-
; SSE2: cost of 24 {{.*}} %V512 = shufflevector
33-
; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
34-
; SSE42: cost of 24 {{.*}} %V512 = shufflevector
32+
; SSE2: cost of 12 {{.*}} %V512 = shufflevector
33+
; SSSE3: cost of 12 {{.*}} %V512 = shufflevector
34+
; SSE42: cost of 12 {{.*}} %V512 = shufflevector
3535
; AVX1: cost of 12 {{.*}} %V512 = shufflevector
3636
; AVX2: cost of 12 {{.*}} %V512 = shufflevector
3737
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
3838
%V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
3939

40-
; SSE2: cost of 112 {{.*}} %V1024 = shufflevector
41-
; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector
42-
; SSE42: cost of 112 {{.*}} %V1024 = shufflevector
40+
; SSE2: cost of 56 {{.*}} %V1024 = shufflevector
41+
; SSSE3: cost of 56 {{.*}} %V1024 = shufflevector
42+
; SSE42: cost of 56 {{.*}} %V1024 = shufflevector
4343
; AVX1: cost of 72 {{.*}} %V1024 = shufflevector
4444
; AVX2: cost of 72 {{.*}} %V1024 = shufflevector
4545
; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
@@ -59,17 +59,17 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
5959
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
6060
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
6161

62-
; SSE2: cost of 8 {{.*}} %V256 = shufflevector
63-
; SSSE3: cost of 8 {{.*}} %V256 = shufflevector
64-
; SSE42: cost of 8 {{.*}} %V256 = shufflevector
65-
; AVX1: cost of 8 {{.*}} %V256 = shufflevector
62+
; SSE2: cost of 2 {{.*}} %V256 = shufflevector
63+
; SSSE3: cost of 2 {{.*}} %V256 = shufflevector
64+
; SSE42: cost of 2 {{.*}} %V256 = shufflevector
65+
; AVX1: cost of 3 {{.*}} %V256 = shufflevector
6666
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
6767
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
6868
%V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
6969

70-
; SSE2: cost of 48 {{.*}} %V512 = shufflevector
71-
; SSSE3: cost of 48 {{.*}} %V512 = shufflevector
72-
; SSE42: cost of 48 {{.*}} %V512 = shufflevector
70+
; SSE2: cost of 12 {{.*}} %V512 = shufflevector
71+
; SSSE3: cost of 12 {{.*}} %V512 = shufflevector
72+
; SSE42: cost of 12 {{.*}} %V512 = shufflevector
7373
; AVX1: cost of 16 {{.*}} %V512 = shufflevector
7474
; AVX2: cost of 16 {{.*}} %V512 = shufflevector
7575
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
@@ -81,25 +81,25 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
8181
; CHECK-LABEL: 'test_vXf32'
8282
define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
8383

84-
; SSE2: cost of 6 {{.*}} %V128 = shufflevector
85-
; SSSE3: cost of 6 {{.*}} %V128 = shufflevector
86-
; SSE42: cost of 6 {{.*}} %V128 = shufflevector
87-
; AVX1: cost of 6 {{.*}} %V128 = shufflevector
88-
; AVX2: cost of 6 {{.*}} %V128 = shufflevector
84+
; SSE2: cost of 1 {{.*}} %V128 = shufflevector
85+
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
86+
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
87+
; AVX1: cost of 1 {{.*}} %V128 = shufflevector
88+
; AVX2: cost of 1 {{.*}} %V128 = shufflevector
8989
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
9090
%V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
9191

92-
; SSE2: cost of 12 {{.*}} %V256 = shufflevector
93-
; SSSE3: cost of 12 {{.*}} %V256 = shufflevector
94-
; SSE42: cost of 12 {{.*}} %V256 = shufflevector
95-
; AVX1: cost of 14 {{.*}} %V256 = shufflevector
96-
; AVX2: cost of 14 {{.*}} %V256 = shufflevector
92+
; SSE2: cost of 4 {{.*}} %V256 = shufflevector
93+
; SSSE3: cost of 4 {{.*}} %V256 = shufflevector
94+
; SSE42: cost of 4 {{.*}} %V256 = shufflevector
95+
; AVX1: cost of 4 {{.*}} %V256 = shufflevector
96+
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
9797
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
9898
%V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
9999

100-
; SSE2: cost of 72 {{.*}} %V512 = shufflevector
101-
; SSSE3: cost of 72 {{.*}} %V512 = shufflevector
102-
; SSE42: cost of 72 {{.*}} %V512 = shufflevector
100+
; SSE2: cost of 24 {{.*}} %V512 = shufflevector
101+
; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
102+
; SSE42: cost of 24 {{.*}} %V512 = shufflevector
103103
; AVX1: cost of 28 {{.*}} %V512 = shufflevector
104104
; AVX2: cost of 28 {{.*}} %V512 = shufflevector
105105
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
@@ -119,25 +119,25 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512
119119
; AVX512: cost of 1 {{.*}} %V128 = shufflevector
120120
%V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
121121

122-
; SSE2: cost of 16 {{.*}} %V256 = shufflevector
123-
; SSSE3: cost of 16 {{.*}} %V256 = shufflevector
124-
; SSE42: cost of 16 {{.*}} %V256 = shufflevector
125-
; AVX1: cost of 16 {{.*}} %V256 = shufflevector
122+
; SSE2: cost of 4 {{.*}} %V256 = shufflevector
123+
; SSSE3: cost of 4 {{.*}} %V256 = shufflevector
124+
; SSE42: cost of 4 {{.*}} %V256 = shufflevector
125+
; AVX1: cost of 4 {{.*}} %V256 = shufflevector
126126
; AVX2: cost of 1 {{.*}} %V256 = shufflevector
127127
; AVX512: cost of 1 {{.*}} %V256 = shufflevector
128128
%V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
129129

130-
; SSE2: cost of 96 {{.*}} %V512 = shufflevector
131-
; SSSE3: cost of 96 {{.*}} %V512 = shufflevector
132-
; SSE42: cost of 96 {{.*}} %V512 = shufflevector
130+
; SSE2: cost of 24 {{.*}} %V512 = shufflevector
131+
; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
132+
; SSE42: cost of 24 {{.*}} %V512 = shufflevector
133133
; AVX1: cost of 32 {{.*}} %V512 = shufflevector
134134
; AVX2: cost of 32 {{.*}} %V512 = shufflevector
135135
; AVX512: cost of 1 {{.*}} %V512 = shufflevector
136136
%V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
137137

138-
; SSE2: cost of 448 {{.*}} %V1024 = shufflevector
139-
; SSSE3: cost of 448 {{.*}} %V1024 = shufflevector
140-
; SSE42: cost of 448 {{.*}} %V1024 = shufflevector
138+
; SSE2: cost of 112 {{.*}} %V1024 = shufflevector
139+
; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector
140+
; SSE42: cost of 112 {{.*}} %V1024 = shufflevector
141141
; AVX1: cost of 192 {{.*}} %V1024 = shufflevector
142142
; AVX2: cost of 192 {{.*}} %V1024 = shufflevector
143143
; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
@@ -148,7 +148,7 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512
148148
; CHECK-LABEL: 'test_vXi16'
149149
define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) {
150150

151-
; SSE2: cost of 16 {{.*}} %V128 = shufflevector
151+
; SSE2: cost of 5 {{.*}} %V128 = shufflevector
152152
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
153153
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
154154
; AVX1: cost of 1 {{.*}} %V128 = shufflevector
@@ -158,26 +158,26 @@ define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src51
158158
%V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
159159

160160
; SSE2: cost of 32 {{.*}} %V256 = shufflevector
161-
; SSSE3: cost of 32 {{.*}} %V256 = shufflevector
162-
; SSE42: cost of 32 {{.*}} %V256 = shufflevector
163-
; AVX1: cost of 32 {{.*}} %V256 = shufflevector
161+
; SSSE3: cost of 6 {{.*}} %V256 = shufflevector
162+
; SSE42: cost of 6 {{.*}} %V256 = shufflevector
163+
; AVX1: cost of 8 {{.*}} %V256 = shufflevector
164164
; AVX2: cost of 4 {{.*}} %V256 = shufflevector
165165
; AVX512F: cost of 4 {{.*}} %V256 = shufflevector
166166
; AVX512BW: cost of 1 {{.*}} %V256 = shufflevector
167167
%V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
168168

169169
; SSE2: cost of 192 {{.*}} %V512 = shufflevector
170-
; SSSE3: cost of 192 {{.*}} %V512 = shufflevector
171-
; SSE42: cost of 192 {{.*}} %V512 = shufflevector
170+
; SSSE3: cost of 36 {{.*}} %V512 = shufflevector
171+
; SSE42: cost of 36 {{.*}} %V512 = shufflevector
172172
; AVX1: cost of 64 {{.*}} %V512 = shufflevector
173173
; AVX2: cost of 64 {{.*}} %V512 = shufflevector
174174
; AVX512F: cost of 64 {{.*}} %V512 = shufflevector
175175
; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
176176
%V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
177177

178178
; SSE2: cost of 896 {{.*}} %V1024 = shufflevector
179-
; SSSE3: cost of 896 {{.*}} %V1024 = shufflevector
180-
; SSE42: cost of 896 {{.*}} %V1024 = shufflevector
179+
; SSSE3: cost of 168 {{.*}} %V1024 = shufflevector
180+
; SSE42: cost of 168 {{.*}} %V1024 = shufflevector
181181
; AVX1: cost of 384 {{.*}} %V1024 = shufflevector
182182
; AVX2: cost of 384 {{.*}} %V1024 = shufflevector
183183
; AVX512F: cost of 384 {{.*}} %V1024 = shufflevector
@@ -188,7 +188,7 @@ define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src51
188188

189189
; CHECK-LABEL: 'test_vXi8'
190190
define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
191-
; SSE2: cost of 32 {{.*}} %V128 = shufflevector
191+
; SSE2: cost of 10 {{.*}} %V128 = shufflevector
192192
; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
193193
; SSE42: cost of 1 {{.*}} %V128 = shufflevector
194194
; AVX1: cost of 1 {{.*}} %V128 = shufflevector
@@ -197,17 +197,17 @@ define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512)
197197
%V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
198198

199199
; SSE2: cost of 64 {{.*}} %V256 = shufflevector
200-
; SSSE3: cost of 64 {{.*}} %V256 = shufflevector
201-
; SSE42: cost of 64 {{.*}} %V256 = shufflevector
202-
; AVX1: cost of 64 {{.*}} %V256 = shufflevector
200+
; SSSE3: cost of 6 {{.*}} %V256 = shufflevector
201+
; SSE42: cost of 6 {{.*}} %V256 = shufflevector
202+
; AVX1: cost of 8 {{.*}} %V256 = shufflevector
203203
; AVX2: cost of 4 {{.*}} %V256 = shufflevector
204204
; AVX512F: cost of 4 {{.*}} %V256 = shufflevector
205205
; AVX512BW: cost of 3 {{.*}} %V256 = shufflevector
206206
%V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
207207

208208
; SSE2: cost of 384 {{.*}} %V512 = shufflevector
209-
; SSSE3: cost of 384 {{.*}} %V512 = shufflevector
210-
; SSE42: cost of 384 {{.*}} %V512 = shufflevector
209+
; SSSE3: cost of 36 {{.*}} %V512 = shufflevector
210+
; SSE42: cost of 36 {{.*}} %V512 = shufflevector
211211
; AVX1: cost of 128 {{.*}} %V512 = shufflevector
212212
; AVX2: cost of 128 {{.*}} %V512 = shufflevector
213213
; AVX512F: cost of 128 {{.*}} %V512 = shufflevector

0 commit comments

Comments
 (0)