; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2


; Verify that the following shifts are lowered into a sequence of two shifts plus
; a blend. On pre-AVX2 targets, instead of scalarizing a logical or arithmetic
; packed shift right by a constant build_vector, the backend should always try to
; emit the simpler sequence of two shifts plus a blend when possible.

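; lshr <8 x i16>, splitting at element 2: expect two shifts by immediate plus a
; blend (movss on SSE, vpblendw on AVX1, vpblendd on AVX2).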
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test1:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

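; lshr <8 x i16>, splitting at element 4 (a 64-bit boundary): the SSE blend can
; be a movsd, and the AVX blends select the low half of the vector.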
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test2:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

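; lshr <4 x i32>, splitting at element 1: two psrld plus a blend on SSE/AVX1.
; AVX2 instead uses a single variable shift (vpsrlvd) with a constant vector of
; shift amounts loaded from memory.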
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $2, %xmm1
; SSE-NEXT:    psrld $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

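; lshr <4 x i32>, splitting at element 2: as above, with a low-half blend on
; SSE/AVX1; AVX2 again folds everything into one vpsrlvd.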
define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $2, %xmm1
; SSE-NEXT:    psrld $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test4:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

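; ashr <8 x i16>, splitting at element 2: arithmetic counterpart of test1, using
; psraw instead of psrlw.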
define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $2, %xmm1
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test5:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}

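; ashr <8 x i16>, splitting at element 4: arithmetic counterpart of test2.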
define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $2, %xmm1
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}

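; ashr <4 x i32>, splitting at element 1: two psrad plus a blend on SSE/AVX1;
; AVX2 uses a single vpsravd.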
define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    psrad $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %ashr
}

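; ashr <4 x i32>, splitting at element 2: arithmetic counterpart of test4; AVX2
; uses vpsravd.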
define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    psrad $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %ashr
}