; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers when the shift amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
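; In other words, shifting left by <1,1,2,3,7,0,9,11> is the same as multiplying
; by <2,2,4,8,128,1,512,2048> (1 << amount in each lane); that is the constant-pool
; vector the pmullw reads, visible as the [2,2,4,8,128,1,512,2048] operand in
; test5 below.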

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) when the vector of shift
; counts is a constant build_vector.
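; Plain SSE2 has no pmulld, and pmuludq only multiplies the even 32-bit lanes,
; so the SSE2 lowering of test3 does two pmuludq (one on the even lanes, one on
; the shuffled odd lanes) and interleaves the results with pshufd/punpckldq.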

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below produces
; a single vpmullw.
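; Without AVX2 the 256-bit operation is legalized by splitting it into two
; 128-bit halves; both pmullw instructions reuse the same multiplier constant
; ([2,2,4,8,128,1,512,2048], i.e. 1 << amount per lane), loaded into a register
; once.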

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below produces a
; single vpsllvd instead.
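; With AVX2, 32-bit elements have a true variable per-lane shift (vpsllvd), so
; the shift-count vector is used directly instead of being converted into a
; multiply; the SSE paths still multiply (pmulld, or pmuludq pairs on SSE2).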

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. With SSE, we instead split the shift into four
; 128-bit parts and convert each part into a pmullw.
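; AVX512F alone does not provide a 512-bit vpmullw (that requires AVX512BW),
; so the zmm operand is split into two ymm halves with vextracti64x4 and the
; result is reassembled with vinserti64x4.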

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support we produce a
; single vpsllvd/vpsllvq instead of a pair of them.
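; That is because 512-bit vpsllvd/vpsllvq are part of AVX512F itself, so test8
; and test9 each compile to one instruction there, while the 16-bit case in
; test7 still has to be split into ymm halves.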

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm4, %xmm1
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Without AVX2/AVX512F support, the elements of 'test9' are shifted separately
; (with immediate shifts) and the results are blended back together.
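; Concretely, in the SSE lowerings below each 128-bit piece whose two lanes use
; different counts (<2,3>) is shifted twice with psllq immediates and blended
; (movsd on SSE2, pblendw on SSE4.1), while pieces whose lanes both shift by 1
; become a paddq, since x + x == x << 1.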

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm2
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test9:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm2
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}