1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
4
5;
6; Unary shuffle indices from registers
7;
8
; Variable shuffle: all four <4 x double> lanes selected by runtime i64 indices.
; Expected codegen (autogenerated CHECKs, shared ALL prefix): build an rbp frame,
; align rsp to 32 and spill %ymm0 to the stack, mask each index with `andl $3`,
; then rebuild the result from scalar loads (vmovsd/vmovhpd) plus vinsertf128.
9define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
10; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
11; ALL:       # BB#0:
12; ALL-NEXT:    pushq %rbp
13; ALL-NEXT:    movq %rsp, %rbp
14; ALL-NEXT:    andq $-32, %rsp
15; ALL-NEXT:    subq $64, %rsp
16; ALL-NEXT:    andl $3, %ecx
17; ALL-NEXT:    andl $3, %edx
18; ALL-NEXT:    andl $3, %esi
19; ALL-NEXT:    andl $3, %edi
20; ALL-NEXT:    vmovaps %ymm0, (%rsp)
21; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
22; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
23; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
24; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
25; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
26; ALL-NEXT:    movq %rbp, %rsp
27; ALL-NEXT:    popq %rbp
28; ALL-NEXT:    retq
; IR: extract each lane at a variable index, then reinsert in order 0..3.
29  %x0 = extractelement <4 x double> %x, i64 %i0
30  %x1 = extractelement <4 x double> %x, i64 %i1
31  %x2 = extractelement <4 x double> %x, i64 %i2
32  %x3 = extractelement <4 x double> %x, i64 %i3
33  %r0 = insertelement <4 x double> undef, double %x0, i32 0
34  %r1 = insertelement <4 x double>   %r0, double %x1, i32 1
35  %r2 = insertelement <4 x double>   %r1, double %x2, i32 2
36  %r3 = insertelement <4 x double>   %r2, double %x3, i32 3
37  ret <4 x double> %r3
38}
39
; Variant with lane 0 undef and lane 3 a constant 0.0 ("uxx0"): only two runtime
; indices (%i1, %i2) feed the result, so only %edx/%esi are masked in the
; expected codegen; %x0/%x3 extracts are dead. The CHECKs expect the usual
; aligned stack spill, then vmovsd+vmovddup for the low half and a single
; vmovsd (zero-filling the upper lane) for the high half.
40define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
41; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
42; ALL:       # BB#0:
43; ALL-NEXT:    pushq %rbp
44; ALL-NEXT:    movq %rsp, %rbp
45; ALL-NEXT:    andq $-32, %rsp
46; ALL-NEXT:    subq $64, %rsp
47; ALL-NEXT:    andl $3, %edx
48; ALL-NEXT:    andl $3, %esi
49; ALL-NEXT:    vmovaps %ymm0, (%rsp)
50; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
51; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
52; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
53; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
54; ALL-NEXT:    movq %rbp, %rsp
55; ALL-NEXT:    popq %rbp
56; ALL-NEXT:    retq
57  %x0 = extractelement <4 x double> %x, i64 %i0
58  %x1 = extractelement <4 x double> %x, i64 %i1
59  %x2 = extractelement <4 x double> %x, i64 %i2
60  %x3 = extractelement <4 x double> %x, i64 %i3
61  %r0 = insertelement <4 x double> undef, double undef, i32 0
62  %r1 = insertelement <4 x double>   %r0, double   %x1, i32 1
63  %r2 = insertelement <4 x double>   %r1, double   %x2, i32 2
64  %r3 = insertelement <4 x double>   %r2, double   0.0, i32 3
65  ret <4 x double> %r3
66}
67
; Widening shuffle: a <4 x double> result gathered from a <2 x double> source.
; Indices are masked with `andl $1` (2-element source) and the 16-byte source
; spill needs no rbp frame / 32-byte realignment in the expected codegen —
; a plain xmm store below rsp suffices.
68define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
69; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
70; ALL:       # BB#0:
71; ALL-NEXT:    andl $1, %ecx
72; ALL-NEXT:    andl $1, %edx
73; ALL-NEXT:    andl $1, %esi
74; ALL-NEXT:    andl $1, %edi
75; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
76; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
77; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
78; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
79; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
80; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
81; ALL-NEXT:    retq
82  %x0 = extractelement <2 x double> %x, i64 %i0
83  %x1 = extractelement <2 x double> %x, i64 %i1
84  %x2 = extractelement <2 x double> %x, i64 %i2
85  %x3 = extractelement <2 x double> %x, i64 %i3
86  %r0 = insertelement <4 x double> undef, double %x0, i32 0
87  %r1 = insertelement <4 x double>   %r0, double %x1, i32 1
88  %r2 = insertelement <4 x double>   %r1, double %x2, i32 2
89  %r3 = insertelement <4 x double>   %r2, double %x3, i32 3
90  ret <4 x double> %r3
91}
92
; Integer twin of the v4f64 all-variable shuffle. The AVX1/AVX2 sequences differ
; only in the final 128-bit insert: vinsertf128 (AVX1, float domain) vs
; vinserti128 (AVX2, integer domain); both gather lanes with vmovq +
; vpunpcklqdq from the aligned stack spill after masking indices with `andl $3`.
93define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
94; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
95; AVX1:       # BB#0:
96; AVX1-NEXT:    pushq %rbp
97; AVX1-NEXT:    movq %rsp, %rbp
98; AVX1-NEXT:    andq $-32, %rsp
99; AVX1-NEXT:    subq $64, %rsp
100; AVX1-NEXT:    andl $3, %ecx
101; AVX1-NEXT:    andl $3, %edx
102; AVX1-NEXT:    andl $3, %esi
103; AVX1-NEXT:    andl $3, %edi
104; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
105; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
106; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
107; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
108; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
109; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
110; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
111; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
112; AVX1-NEXT:    movq %rbp, %rsp
113; AVX1-NEXT:    popq %rbp
114; AVX1-NEXT:    retq
115;
116; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
117; AVX2:       # BB#0:
118; AVX2-NEXT:    pushq %rbp
119; AVX2-NEXT:    movq %rsp, %rbp
120; AVX2-NEXT:    andq $-32, %rsp
121; AVX2-NEXT:    subq $64, %rsp
122; AVX2-NEXT:    andl $3, %ecx
123; AVX2-NEXT:    andl $3, %edx
124; AVX2-NEXT:    andl $3, %esi
125; AVX2-NEXT:    andl $3, %edi
126; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
127; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
128; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
129; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
130; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
131; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
132; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
133; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
134; AVX2-NEXT:    movq %rbp, %rsp
135; AVX2-NEXT:    popq %rbp
136; AVX2-NEXT:    retq
137  %x0 = extractelement <4 x i64> %x, i64 %i0
138  %x1 = extractelement <4 x i64> %x, i64 %i1
139  %x2 = extractelement <4 x i64> %x, i64 %i2
140  %x3 = extractelement <4 x i64> %x, i64 %i3
141  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
142  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
143  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
144  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
145  ret <4 x i64> %r3
146}
147
; "xx00" variant: lanes 2 and 3 are constant zero, so only %i0/%i1 are live.
; Expected codegen masks just %esi/%edi, gathers the low 128 bits from the
; spill, and materializes the zero upper half with vpxor before the 128-bit
; insert (vinsertf128 on AVX1, vinserti128 on AVX2).
148define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
149; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
150; AVX1:       # BB#0:
151; AVX1-NEXT:    pushq %rbp
152; AVX1-NEXT:    movq %rsp, %rbp
153; AVX1-NEXT:    andq $-32, %rsp
154; AVX1-NEXT:    subq $64, %rsp
155; AVX1-NEXT:    andl $3, %esi
156; AVX1-NEXT:    andl $3, %edi
157; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
158; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
159; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
160; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
161; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
162; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
163; AVX1-NEXT:    movq %rbp, %rsp
164; AVX1-NEXT:    popq %rbp
165; AVX1-NEXT:    retq
166;
167; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
168; AVX2:       # BB#0:
169; AVX2-NEXT:    pushq %rbp
170; AVX2-NEXT:    movq %rsp, %rbp
171; AVX2-NEXT:    andq $-32, %rsp
172; AVX2-NEXT:    subq $64, %rsp
173; AVX2-NEXT:    andl $3, %esi
174; AVX2-NEXT:    andl $3, %edi
175; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
176; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
177; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
178; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
179; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
180; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
181; AVX2-NEXT:    movq %rbp, %rsp
182; AVX2-NEXT:    popq %rbp
183; AVX2-NEXT:    retq
184  %x0 = extractelement <4 x i64> %x, i64 %i0
185  %x1 = extractelement <4 x i64> %x, i64 %i1
186  %x2 = extractelement <4 x i64> %x, i64 %i2
187  %x3 = extractelement <4 x i64> %x, i64 %i3
188  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
189  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
190  %r2 = insertelement <4 x i64>   %r1, i64   0, i32 2
191  %r3 = insertelement <4 x i64>   %r2, i64   0, i32 3
192  ret <4 x i64> %r3
193}
194
; Widening integer shuffle from a <2 x i64> source: indices masked with
; `andl $1`, 16-byte spill below rsp (no frame needed), lanes gathered with
; vmovq + vpunpcklqdq; AVX1 vs AVX2 differ only in vinsertf128 vs vinserti128.
195define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
196; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
197; AVX1:       # BB#0:
198; AVX1-NEXT:    andl $1, %ecx
199; AVX1-NEXT:    andl $1, %edx
200; AVX1-NEXT:    andl $1, %esi
201; AVX1-NEXT:    andl $1, %edi
202; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
203; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
204; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
205; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
206; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
207; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
208; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
209; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
210; AVX1-NEXT:    retq
211;
212; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
213; AVX2:       # BB#0:
214; AVX2-NEXT:    andl $1, %ecx
215; AVX2-NEXT:    andl $1, %edx
216; AVX2-NEXT:    andl $1, %esi
217; AVX2-NEXT:    andl $1, %edi
218; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
219; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
220; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
221; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
222; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
223; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
224; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
225; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
226; AVX2-NEXT:    retq
227  %x0 = extractelement <2 x i64> %x, i64 %i0
228  %x1 = extractelement <2 x i64> %x, i64 %i1
229  %x2 = extractelement <2 x i64> %x, i64 %i2
230  %x3 = extractelement <2 x i64> %x, i64 %i3
231  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
232  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
233  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
234  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
235  ret <4 x i64> %r3
236}
237
; Eight-lane f32 shuffle with i32 indices. Indices arrive as 32-bit values in
; 64-bit registers (hence the `# kill:` implicit-def markers); args 7 and 8
; come from the caller's stack (16(%rbp)/24(%rbp)). AVX1 expects the
; spill-and-scalar-gather sequence; AVX2 instead expects one vpermps per index
; (variable cross-lane permute), with results stitched by vinsertps.
238define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
239; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
240; AVX1:       # BB#0:
241; AVX1-NEXT:    pushq %rbp
242; AVX1-NEXT:    movq %rsp, %rbp
243; AVX1-NEXT:    andq $-32, %rsp
244; AVX1-NEXT:    subq $64, %rsp
245; AVX1-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
246; AVX1-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
247; AVX1-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
248; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
249; AVX1-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
250; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
251; AVX1-NEXT:    andl $7, %edi
252; AVX1-NEXT:    andl $7, %esi
253; AVX1-NEXT:    andl $7, %edx
254; AVX1-NEXT:    andl $7, %ecx
255; AVX1-NEXT:    andl $7, %r8d
256; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
257; AVX1-NEXT:    andl $7, %r9d
258; AVX1-NEXT:    movl 16(%rbp), %r10d
259; AVX1-NEXT:    andl $7, %r10d
260; AVX1-NEXT:    movl 24(%rbp), %eax
261; AVX1-NEXT:    andl $7, %eax
262; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
263; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
264; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
265; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
266; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
267; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
268; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
269; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
270; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
271; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
272; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
273; AVX1-NEXT:    movq %rbp, %rsp
274; AVX1-NEXT:    popq %rbp
275; AVX1-NEXT:    retq
276;
277; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
278; AVX2:       # BB#0:
279; AVX2-NEXT:    vmovd %edi, %xmm1
280; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
281; AVX2-NEXT:    vmovd %esi, %xmm2
282; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm2
283; AVX2-NEXT:    vmovd %edx, %xmm3
284; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
285; AVX2-NEXT:    vmovd %ecx, %xmm4
286; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm4
287; AVX2-NEXT:    vmovd %r8d, %xmm5
288; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm5
289; AVX2-NEXT:    vmovd %r9d, %xmm6
290; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm6
291; AVX2-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
292; AVX2-NEXT:    vpermps %ymm0, %ymm7, %ymm7
293; AVX2-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
294; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
295; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
296; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
297; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
298; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
299; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
300; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
301; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
302; AVX2-NEXT:    retq
303  %x0 = extractelement <8 x float> %x, i32 %i0
304  %x1 = extractelement <8 x float> %x, i32 %i1
305  %x2 = extractelement <8 x float> %x, i32 %i2
306  %x3 = extractelement <8 x float> %x, i32 %i3
307  %x4 = extractelement <8 x float> %x, i32 %i4
308  %x5 = extractelement <8 x float> %x, i32 %i5
309  %x6 = extractelement <8 x float> %x, i32 %i6
310  %x7 = extractelement <8 x float> %x, i32 %i7
311  %r0 = insertelement <8 x float> undef, float %x0, i32 0
312  %r1 = insertelement <8 x float>   %r0, float %x1, i32 1
313  %r2 = insertelement <8 x float>   %r1, float %x2, i32 2
314  %r3 = insertelement <8 x float>   %r2, float %x3, i32 3
315  %r4 = insertelement <8 x float>   %r3, float %x4, i32 4
316  %r5 = insertelement <8 x float>   %r4, float %x5, i32 5
317  %r6 = insertelement <8 x float>   %r5, float %x6, i32 6
318  %r7 = insertelement <8 x float>   %r6, float %x7, i32 7
319  ret <8 x float> %r7
320}
321
; Widening shuffle: <8 x float> result from a <4 x float> source. AVX1 and AVX2
; share one ALL sequence: xmm spill below rsp (no frame), indices masked with
; `andl $3`, two stack-passed indices loaded via {{[0-9]+}}(%rsp), and lanes
; assembled with vmovss + vinsertps before the final vinsertf128.
322define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
323; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
324; ALL:       # BB#0:
325; ALL-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
326; ALL-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
327; ALL-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
328; ALL-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
329; ALL-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
330; ALL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
331; ALL-NEXT:    andl $3, %edi
332; ALL-NEXT:    andl $3, %esi
333; ALL-NEXT:    andl $3, %edx
334; ALL-NEXT:    andl $3, %ecx
335; ALL-NEXT:    andl $3, %r8d
336; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
337; ALL-NEXT:    andl $3, %r9d
338; ALL-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
339; ALL-NEXT:    andl $3, %r10d
340; ALL-NEXT:    movl {{[0-9]+}}(%rsp), %eax
341; ALL-NEXT:    andl $3, %eax
342; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
343; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
344; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
345; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
346; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
347; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
348; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
349; ALL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
350; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
351; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
352; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
353; ALL-NEXT:    retq
354  %x0 = extractelement <4 x float> %x, i32 %i0
355  %x1 = extractelement <4 x float> %x, i32 %i1
356  %x2 = extractelement <4 x float> %x, i32 %i2
357  %x3 = extractelement <4 x float> %x, i32 %i3
358  %x4 = extractelement <4 x float> %x, i32 %i4
359  %x5 = extractelement <4 x float> %x, i32 %i5
360  %x6 = extractelement <4 x float> %x, i32 %i6
361  %x7 = extractelement <4 x float> %x, i32 %i7
362  %r0 = insertelement <8 x float> undef, float %x0, i32 0
363  %r1 = insertelement <8 x float>   %r0, float %x1, i32 1
364  %r2 = insertelement <8 x float>   %r1, float %x2, i32 2
365  %r3 = insertelement <8 x float>   %r2, float %x3, i32 3
366  %r4 = insertelement <8 x float>   %r3, float %x4, i32 4
367  %r5 = insertelement <8 x float>   %r4, float %x5, i32 5
368  %r6 = insertelement <8 x float>   %r5, float %x6, i32 6
369  %r7 = insertelement <8 x float>   %r6, float %x7, i32 7
370  ret <8 x float> %r7
371}
372
; Sixteen-lane i16 shuffle; indices %i6..%i15 spill to the caller's stack and
; are read back at 16..88(%rbp). Expected codegen: aligned ymm spill, each
; index masked with `andl $15`, lanes gathered one at a time with movzwl +
; vmovd/vpinsrw; the high xmm is built first (from stack args), the low xmm
; from the register args, then joined by vinsertf128 (AVX1) / vinserti128 (AVX2).
373define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
374; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
375; AVX1:       # BB#0:
376; AVX1-NEXT:    pushq %rbp
377; AVX1-NEXT:    movq %rsp, %rbp
378; AVX1-NEXT:    andq $-32, %rsp
379; AVX1-NEXT:    subq $64, %rsp
380; AVX1-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
381; AVX1-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
382; AVX1-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
383; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
384; AVX1-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
385; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
386; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
387; AVX1-NEXT:    movl 32(%rbp), %eax
388; AVX1-NEXT:    andl $15, %eax
389; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
390; AVX1-NEXT:    vmovd %eax, %xmm0
391; AVX1-NEXT:    movl 40(%rbp), %eax
392; AVX1-NEXT:    andl $15, %eax
393; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
394; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
395; AVX1-NEXT:    movl 48(%rbp), %eax
396; AVX1-NEXT:    andl $15, %eax
397; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
398; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
399; AVX1-NEXT:    movl 56(%rbp), %eax
400; AVX1-NEXT:    andl $15, %eax
401; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
402; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
403; AVX1-NEXT:    movl 64(%rbp), %eax
404; AVX1-NEXT:    andl $15, %eax
405; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
406; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
407; AVX1-NEXT:    movl 72(%rbp), %eax
408; AVX1-NEXT:    andl $15, %eax
409; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
410; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
411; AVX1-NEXT:    movl 80(%rbp), %eax
412; AVX1-NEXT:    andl $15, %eax
413; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
414; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
415; AVX1-NEXT:    movl 88(%rbp), %eax
416; AVX1-NEXT:    andl $15, %eax
417; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
418; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
419; AVX1-NEXT:    andl $15, %edi
420; AVX1-NEXT:    movzwl (%rsp,%rdi,2), %eax
421; AVX1-NEXT:    vmovd %eax, %xmm1
422; AVX1-NEXT:    andl $15, %esi
423; AVX1-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
424; AVX1-NEXT:    andl $15, %edx
425; AVX1-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
426; AVX1-NEXT:    andl $15, %ecx
427; AVX1-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
428; AVX1-NEXT:    andl $15, %r8d
429; AVX1-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
430; AVX1-NEXT:    andl $15, %r9d
431; AVX1-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
432; AVX1-NEXT:    movl 16(%rbp), %eax
433; AVX1-NEXT:    andl $15, %eax
434; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
435; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
436; AVX1-NEXT:    movl 24(%rbp), %eax
437; AVX1-NEXT:    andl $15, %eax
438; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
439; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
440; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
441; AVX1-NEXT:    movq %rbp, %rsp
442; AVX1-NEXT:    popq %rbp
443; AVX1-NEXT:    retq
444;
445; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
446; AVX2:       # BB#0:
447; AVX2-NEXT:    pushq %rbp
448; AVX2-NEXT:    movq %rsp, %rbp
449; AVX2-NEXT:    andq $-32, %rsp
450; AVX2-NEXT:    subq $64, %rsp
451; AVX2-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
452; AVX2-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
453; AVX2-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
454; AVX2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
455; AVX2-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
456; AVX2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
457; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
458; AVX2-NEXT:    movl 32(%rbp), %eax
459; AVX2-NEXT:    andl $15, %eax
460; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
461; AVX2-NEXT:    vmovd %eax, %xmm0
462; AVX2-NEXT:    movl 40(%rbp), %eax
463; AVX2-NEXT:    andl $15, %eax
464; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
465; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
466; AVX2-NEXT:    movl 48(%rbp), %eax
467; AVX2-NEXT:    andl $15, %eax
468; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
469; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
470; AVX2-NEXT:    movl 56(%rbp), %eax
471; AVX2-NEXT:    andl $15, %eax
472; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
473; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
474; AVX2-NEXT:    movl 64(%rbp), %eax
475; AVX2-NEXT:    andl $15, %eax
476; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
477; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
478; AVX2-NEXT:    movl 72(%rbp), %eax
479; AVX2-NEXT:    andl $15, %eax
480; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
481; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
482; AVX2-NEXT:    movl 80(%rbp), %eax
483; AVX2-NEXT:    andl $15, %eax
484; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
485; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
486; AVX2-NEXT:    movl 88(%rbp), %eax
487; AVX2-NEXT:    andl $15, %eax
488; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
489; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
490; AVX2-NEXT:    andl $15, %edi
491; AVX2-NEXT:    movzwl (%rsp,%rdi,2), %eax
492; AVX2-NEXT:    vmovd %eax, %xmm1
493; AVX2-NEXT:    andl $15, %esi
494; AVX2-NEXT:    vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
495; AVX2-NEXT:    andl $15, %edx
496; AVX2-NEXT:    vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
497; AVX2-NEXT:    andl $15, %ecx
498; AVX2-NEXT:    vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
499; AVX2-NEXT:    andl $15, %r8d
500; AVX2-NEXT:    vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
501; AVX2-NEXT:    andl $15, %r9d
502; AVX2-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
503; AVX2-NEXT:    movl 16(%rbp), %eax
504; AVX2-NEXT:    andl $15, %eax
505; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
506; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
507; AVX2-NEXT:    movl 24(%rbp), %eax
508; AVX2-NEXT:    andl $15, %eax
509; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
510; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
511; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
512; AVX2-NEXT:    movq %rbp, %rsp
513; AVX2-NEXT:    popq %rbp
514; AVX2-NEXT:    retq
515  %x0  = extractelement <16 x i16> %x, i32 %i0
516  %x1  = extractelement <16 x i16> %x, i32 %i1
517  %x2  = extractelement <16 x i16> %x, i32 %i2
518  %x3  = extractelement <16 x i16> %x, i32 %i3
519  %x4  = extractelement <16 x i16> %x, i32 %i4
520  %x5  = extractelement <16 x i16> %x, i32 %i5
521  %x6  = extractelement <16 x i16> %x, i32 %i6
522  %x7  = extractelement <16 x i16> %x, i32 %i7
523  %x8  = extractelement <16 x i16> %x, i32 %i8
524  %x9  = extractelement <16 x i16> %x, i32 %i9
525  %x10 = extractelement <16 x i16> %x, i32 %i10
526  %x11 = extractelement <16 x i16> %x, i32 %i11
527  %x12 = extractelement <16 x i16> %x, i32 %i12
528  %x13 = extractelement <16 x i16> %x, i32 %i13
529  %x14 = extractelement <16 x i16> %x, i32 %i14
530  %x15 = extractelement <16 x i16> %x, i32 %i15
531  %r0  = insertelement <16 x i16> undef, i16 %x0 , i32 0
532  %r1  = insertelement <16 x i16>  %r0 , i16 %x1 , i32 1
533  %r2  = insertelement <16 x i16>  %r1 , i16 %x2 , i32 2
534  %r3  = insertelement <16 x i16>  %r2 , i16 %x3 , i32 3
535  %r4  = insertelement <16 x i16>  %r3 , i16 %x4 , i32 4
536  %r5  = insertelement <16 x i16>  %r4 , i16 %x5 , i32 5
537  %r6  = insertelement <16 x i16>  %r5 , i16 %x6 , i32 6
538  %r7  = insertelement <16 x i16>  %r6 , i16 %x7 , i32 7
539  %r8  = insertelement <16 x i16>  %r7 , i16 %x8 , i32 8
540  %r9  = insertelement <16 x i16>  %r8 , i16 %x9 , i32 9
541  %r10 = insertelement <16 x i16>  %r9 , i16 %x10, i32 10
542  %r11 = insertelement <16 x i16>  %r10, i16 %x11, i32 11
543  %r12 = insertelement <16 x i16>  %r11, i16 %x12, i32 12
544  %r13 = insertelement <16 x i16>  %r12, i16 %x13, i32 13
545  %r14 = insertelement <16 x i16>  %r13, i16 %x14, i32 14
546  %r15 = insertelement <16 x i16>  %r14, i16 %x15, i32 15
547  ret <16 x i16> %r15
548}
549
; Widening i16 shuffle from an <8 x i16> source: indices masked with `andl $7`,
; 16-byte xmm spill below rsp (no frame; stack-arg indices read via the
; regex-matched {{[0-9]+}}(%rsp) offsets), lanes gathered with movzwl +
; vmovd/vpinsrw from -24(%rsp,...); AVX1 and AVX2 differ only in the final
; vinsertf128 vs vinserti128.
550define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
551; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
552; AVX1:       # BB#0:
553; AVX1-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
554; AVX1-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
555; AVX1-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
556; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
557; AVX1-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
558; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
559; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
560; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
561; AVX1-NEXT:    andl $7, %eax
562; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
563; AVX1-NEXT:    vmovd %eax, %xmm0
564; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
565; AVX1-NEXT:    andl $7, %eax
566; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
567; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
568; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
569; AVX1-NEXT:    andl $7, %eax
570; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
571; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
572; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
573; AVX1-NEXT:    andl $7, %eax
574; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
575; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
576; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
577; AVX1-NEXT:    andl $7, %eax
578; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
579; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
580; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
581; AVX1-NEXT:    andl $7, %eax
582; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
583; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
584; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
585; AVX1-NEXT:    andl $7, %eax
586; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
587; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
588; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
589; AVX1-NEXT:    andl $7, %eax
590; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
591; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
592; AVX1-NEXT:    andl $7, %edi
593; AVX1-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
594; AVX1-NEXT:    vmovd %eax, %xmm1
595; AVX1-NEXT:    andl $7, %esi
596; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
597; AVX1-NEXT:    andl $7, %edx
598; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
599; AVX1-NEXT:    andl $7, %ecx
600; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
601; AVX1-NEXT:    andl $7, %r8d
602; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
603; AVX1-NEXT:    andl $7, %r9d
604; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
605; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
606; AVX1-NEXT:    andl $7, %eax
607; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
608; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
609; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
610; AVX1-NEXT:    andl $7, %eax
611; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
612; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
613; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
614; AVX1-NEXT:    retq
615;
616; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
617; AVX2:       # BB#0:
618; AVX2-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
619; AVX2-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
620; AVX2-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
621; AVX2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
622; AVX2-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
623; AVX2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
624; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
625; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
626; AVX2-NEXT:    andl $7, %eax
627; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
628; AVX2-NEXT:    vmovd %eax, %xmm0
629; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
630; AVX2-NEXT:    andl $7, %eax
631; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
632; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
633; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
634; AVX2-NEXT:    andl $7, %eax
635; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
636; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
637; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
638; AVX2-NEXT:    andl $7, %eax
639; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
640; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
641; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
642; AVX2-NEXT:    andl $7, %eax
643; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
644; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
645; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
646; AVX2-NEXT:    andl $7, %eax
647; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
648; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
649; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
650; AVX2-NEXT:    andl $7, %eax
651; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
652; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
653; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
654; AVX2-NEXT:    andl $7, %eax
655; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
656; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
657; AVX2-NEXT:    andl $7, %edi
658; AVX2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
659; AVX2-NEXT:    vmovd %eax, %xmm1
660; AVX2-NEXT:    andl $7, %esi
661; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
662; AVX2-NEXT:    andl $7, %edx
663; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
664; AVX2-NEXT:    andl $7, %ecx
665; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
666; AVX2-NEXT:    andl $7, %r8d
667; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
668; AVX2-NEXT:    andl $7, %r9d
669; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
670; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
671; AVX2-NEXT:    andl $7, %eax
672; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
673; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
674; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
675; AVX2-NEXT:    andl $7, %eax
676; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
677; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
678; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
679; AVX2-NEXT:    retq
680  %x0  = extractelement <8 x i16> %x, i32 %i0
681  %x1  = extractelement <8 x i16> %x, i32 %i1
682  %x2  = extractelement <8 x i16> %x, i32 %i2
683  %x3  = extractelement <8 x i16> %x, i32 %i3
684  %x4  = extractelement <8 x i16> %x, i32 %i4
685  %x5  = extractelement <8 x i16> %x, i32 %i5
686  %x6  = extractelement <8 x i16> %x, i32 %i6
687  %x7  = extractelement <8 x i16> %x, i32 %i7
688  %x8  = extractelement <8 x i16> %x, i32 %i8
689  %x9  = extractelement <8 x i16> %x, i32 %i9
690  %x10 = extractelement <8 x i16> %x, i32 %i10
691  %x11 = extractelement <8 x i16> %x, i32 %i11
692  %x12 = extractelement <8 x i16> %x, i32 %i12
693  %x13 = extractelement <8 x i16> %x, i32 %i13
694  %x14 = extractelement <8 x i16> %x, i32 %i14
695  %x15 = extractelement <8 x i16> %x, i32 %i15
696  %r0  = insertelement <16 x i16> undef, i16 %x0 , i32 0
697  %r1  = insertelement <16 x i16>  %r0 , i16 %x1 , i32 1
698  %r2  = insertelement <16 x i16>  %r1 , i16 %x2 , i32 2
699  %r3  = insertelement <16 x i16>  %r2 , i16 %x3 , i32 3
700  %r4  = insertelement <16 x i16>  %r3 , i16 %x4 , i32 4
701  %r5  = insertelement <16 x i16>  %r4 , i16 %x5 , i32 5
702  %r6  = insertelement <16 x i16>  %r5 , i16 %x6 , i32 6
703  %r7  = insertelement <16 x i16>  %r6 , i16 %x7 , i32 7
704  %r8  = insertelement <16 x i16>  %r7 , i16 %x8 , i32 8
705  %r9  = insertelement <16 x i16>  %r8 , i16 %x9 , i32 9
706  %r10 = insertelement <16 x i16>  %r9 , i16 %x10, i32 10
707  %r11 = insertelement <16 x i16>  %r10, i16 %x11, i32 11
708  %r12 = insertelement <16 x i16>  %r11, i16 %x12, i32 12
709  %r13 = insertelement <16 x i16>  %r12, i16 %x13, i32 13
710  %r14 = insertelement <16 x i16>  %r13, i16 %x14, i32 14
711  %r15 = insertelement <16 x i16>  %r14, i16 %x15, i32 15
712  ret <16 x i16> %r15
713}
714
715;
716; Unary shuffle indices from memory
717;
718
; Variable shuffle of <4 x i64> using four i64 indices loaded from memory.
; Expected codegen (AVX1 and AVX2 differ only in vinsertf128 vs vinserti128):
; realign the stack to 32 bytes, spill %ymm0, mask each loaded index to
; 0..3 (andl $3), reload the selected elements as scalars, and rebuild the
; vector with vpunpcklqdq + vinsert[f/i]128.
define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    andl $3, %eax
; AVX1-NEXT:    andl $3, %ecx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    andl $3, %edx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    andl $3, %esi
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    andl $3, %eax
; AVX2-NEXT:    andl $3, %ecx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    andl $3, %edx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    andl $3, %esi
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
; Addresses of the four consecutive index slots i[0..3].
  %p0  = getelementptr inbounds i64, i64* %i, i32 0
  %p1  = getelementptr inbounds i64, i64* %i, i32 1
  %p2  = getelementptr inbounds i64, i64* %i, i32 2
  %p3  = getelementptr inbounds i64, i64* %i, i32 3
; Load the indices; note the deliberately weak `align 4` on the i64 loads.
  %i0  = load i64, i64* %p0, align 4
  %i1  = load i64, i64* %p1, align 4
  %i2  = load i64, i64* %p2, align 4
  %i3  = load i64, i64* %p3, align 4
; Variable-index element extracts from %x.
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
; Reassemble the shuffled result one lane at a time.
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}
789
; Variable shuffle building a <4 x i64> from a <2 x i64> source, with the
; four i64 indices loaded from memory. Expected codegen: each index is
; masked to 0..1 (andl $1); the 128-bit source is spilled below %rsp
; (no frame setup / stack realignment in the checks), then the selected
; elements are reloaded as scalars and recombined with vpunpcklqdq +
; vinsert[f/i]128 (the only AVX1 vs AVX2 difference).
define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    andl $1, %eax
; AVX1-NEXT:    andl $1, %ecx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    andl $1, %edx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    andl $1, %esi
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    andl $1, %eax
; AVX2-NEXT:    andl $1, %ecx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    andl $1, %edx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    andl $1, %esi
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
; Addresses of the four consecutive index slots i[0..3].
  %p0  = getelementptr inbounds i64, i64* %i, i32 0
  %p1  = getelementptr inbounds i64, i64* %i, i32 1
  %p2  = getelementptr inbounds i64, i64* %i, i32 2
  %p3  = getelementptr inbounds i64, i64* %i, i32 3
; Load the indices; note the deliberately weak `align 4` on the i64 loads.
  %i0  = load i64, i64* %p0, align 4
  %i1  = load i64, i64* %p1, align 4
  %i2  = load i64, i64* %p2, align 4
  %i3  = load i64, i64* %p3, align 4
; Variable-index extracts from the narrower <2 x i64> source.
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
; Reassemble into the wider <4 x i64> result one lane at a time.
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}
848