1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
7; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
8; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
9;
10; Verify that the DAG combiner correctly folds bitwise operations across
11; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
12; basic and always-safe patterns. Also test that the DAG combiner will combine
13; target-specific shuffle instructions where reasonable.
14
15target triple = "x86_64-unknown-unknown"
16
17declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
18declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
19declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
20
; Chains of pshufd / pshuflw / pshufhw intrinsic calls; the DAG combiner
; should fold each chain to a no-op or to a single remaining shuffle.
; NOTE: the CHECK lines are autogenerated — regenerate with
; utils/update_llc_test_checks.py if the IR is ever changed.
;
; Two pshufd with the reversing mask 27 (0x1B = [3,2,1,0]) cancel out.
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}

; A pshuflw with the identity mask -28 (0xE4 = [0,1,2,3]) sits between two
; canceling pshufds, so the whole chain folds away.
define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

; Same as combine_pshufd2 but with an identity pshufhw in the middle.
define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

; The two pshufds (mask -31 = 0xE1) cancel; only the pshufhw survives.
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd4:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}

; The two pshufds (mask -76 = 0xB4) cancel; only the pshuflw survives.
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd5:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}

; Composing masks 0 and 8 yields a splat of element 0; on AVX2 this is
; recognized as a broadcast.
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pshufd6:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pshufd6:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}

; Two pshuflw with the reversing mask 27 cancel out.
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}

; Identity pshufhw (-28) between two canceling pshuflws: folds away entirely.
define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; The outer pshuflw pair cancels; only the middle pshufhw survives.
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; The outer pshufhw pair cancels; only the middle pshuflw survives.
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
170
; A bitwise op of two identically-swizzled vectors: the swizzle should be
; moved past the and/or/xor so that only one shuffle remains.
;
; AND across matching [0,2,1,3] swizzles of %a and %b.
define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; OR across matching [0,2,1,3] swizzles of %a and %b.
define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; XOR across matching [0,2,1,3] swizzles of %a and %b.
define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1, but %a/%b are selected via the second shuffle operand ([4,6,5,7]).
define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; As test2, but %a/%b are selected via the second shuffle operand ([4,6,5,7]).
define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; As test3, but %a/%b are selected via the second shuffle operand ([4,6,5,7]).
define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
278
279
; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the shuffles
; are not performing a swizzle operation.
282
; The [0,5,2,7] masks blend %a/%b with %c rather than swizzling a single
; source; the bitwise op should still be done first, followed by a single
; blend with %c (expanded to pshufd+punpckldq before SSE4.1).
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; OR variant of test1b.
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    orps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; XOR variant: the shared %c lanes xor to zero, so they are simply zeroed
; (constant and-mask on SSE2/SSSE3, blend with a zero register on SSE4.1+).
define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3b:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1b, but with %c in the even lanes and the and-result in the odd lanes.
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; OR variant of test4b.
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    orps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5b:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; As test3b (xor zeroes the shared %c lanes), with the lane roles swapped.
define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6b:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
482
; Cross-half [0,2,5,7] masks: the bitwise op is done first, then a single
; two-source shufps (or insertps with zeroing in the xor cases).
define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1c:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1c:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; OR variant of test1c.
define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2c:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; XOR variant: the shared %c lanes cancel to zero, so the upper half is
; zeroed (shufps with a zero register pre-SSE4.1, insertps zeroing after).
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1c, but with %c providing the low half and the and-result the high half.
define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4c:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4c:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; OR variant of test4c.
define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5c:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5c:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; As test3c (xor zeroes the shared %c lanes), with the halves swapped.
define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
622
623define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
624; SSE-LABEL: combine_nested_undef_test1:
625; SSE:       # %bb.0:
626; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
627; SSE-NEXT:    retq
628;
629; AVX-LABEL: combine_nested_undef_test1:
630; AVX:       # %bb.0:
631; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
632; AVX-NEXT:    retq
633  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
634  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
635  ret <4 x i32> %2
636}
637
638define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
639; SSE-LABEL: combine_nested_undef_test2:
640; SSE:       # %bb.0:
641; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
642; SSE-NEXT:    retq
643;
644; AVX-LABEL: combine_nested_undef_test2:
645; AVX:       # %bb.0:
646; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
647; AVX-NEXT:    retq
648  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
649  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
650  ret <4 x i32> %2
651}
652
653define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
654; SSE-LABEL: combine_nested_undef_test3:
655; SSE:       # %bb.0:
656; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
657; SSE-NEXT:    retq
658;
659; AVX-LABEL: combine_nested_undef_test3:
660; AVX:       # %bb.0:
661; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
662; AVX-NEXT:    retq
663  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
664  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
665  ret <4 x i32> %2
666}
667
668define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
669; SSE-LABEL: combine_nested_undef_test4:
670; SSE:       # %bb.0:
671; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
672; SSE-NEXT:    retq
673;
674; AVX1-LABEL: combine_nested_undef_test4:
675; AVX1:       # %bb.0:
676; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
677; AVX1-NEXT:    retq
678;
679; AVX2-LABEL: combine_nested_undef_test4:
680; AVX2:       # %bb.0:
681; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
682; AVX2-NEXT:    retq
683  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
684  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
685  ret <4 x i32> %2
686}
687
688define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
689; SSE-LABEL: combine_nested_undef_test5:
690; SSE:       # %bb.0:
691; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
692; SSE-NEXT:    retq
693;
694; AVX-LABEL: combine_nested_undef_test5:
695; AVX:       # %bb.0:
696; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
697; AVX-NEXT:    retq
698  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
699  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
700  ret <4 x i32> %2
701}
702
703define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
704; SSE-LABEL: combine_nested_undef_test6:
705; SSE:       # %bb.0:
706; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
707; SSE-NEXT:    retq
708;
709; AVX-LABEL: combine_nested_undef_test6:
710; AVX:       # %bb.0:
711; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
712; AVX-NEXT:    retq
713  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
714  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
715  ret <4 x i32> %2
716}
717
718define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
719; SSE-LABEL: combine_nested_undef_test7:
720; SSE:       # %bb.0:
721; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
722; SSE-NEXT:    retq
723;
724; AVX-LABEL: combine_nested_undef_test7:
725; AVX:       # %bb.0:
726; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
727; AVX-NEXT:    retq
728  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
729  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
730  ret <4 x i32> %2
731}
732
733define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
734; SSE-LABEL: combine_nested_undef_test8:
735; SSE:       # %bb.0:
736; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
737; SSE-NEXT:    retq
738;
739; AVX-LABEL: combine_nested_undef_test8:
740; AVX:       # %bb.0:
741; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
742; AVX-NEXT:    retq
743  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
744  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
745  ret <4 x i32> %2
746}
747
748define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
749; SSE-LABEL: combine_nested_undef_test9:
750; SSE:       # %bb.0:
751; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
752; SSE-NEXT:    retq
753;
754; AVX-LABEL: combine_nested_undef_test9:
755; AVX:       # %bb.0:
756; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
757; AVX-NEXT:    retq
758  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
759  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
760  ret <4 x i32> %2
761}
762
763define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
764; SSE-LABEL: combine_nested_undef_test10:
765; SSE:       # %bb.0:
766; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
767; SSE-NEXT:    retq
768;
769; AVX-LABEL: combine_nested_undef_test10:
770; AVX:       # %bb.0:
771; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
772; AVX-NEXT:    retq
773  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
774  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
775  ret <4 x i32> %2
776}
777
778define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
779; SSE-LABEL: combine_nested_undef_test11:
780; SSE:       # %bb.0:
781; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
782; SSE-NEXT:    retq
783;
784; AVX-LABEL: combine_nested_undef_test11:
785; AVX:       # %bb.0:
786; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
787; AVX-NEXT:    retq
788  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
789  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
790  ret <4 x i32> %2
791}
792
define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test12:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
  ; Only lane 0 of %A survives the composition; AVX2 further recognizes the
  ; result as a broadcast of %A lane 0.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}
812
813; The following pair of shuffles is folded into vector %A.
define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: combine_nested_undef_test13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  ; Composed mask keeps %A's lanes 1 and 2 in place and leaves the rest
  ; undef, so both shuffles fold away and %A is returned untouched.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
  ret <4 x i32> %2
}
822
823; The following pair of shuffles is folded into vector %B.
define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  ; Composed mask keeps %B's lanes 0 and 2 in place and leaves the rest
  ; undef, so the pair folds to a plain copy of %B.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}
838
839
840; Verify that we don't optimize the following cases. We expect more than one shuffle.
841;
842; FIXME: Many of these already don't make sense, and the rest should stop
; making sense with the new vector shuffle lowering. Revisit at least testing for
844; it.
845
define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
  ; Negative test: the composed mask <A3,B0,A0,A1> mixes lanes from both
  ; inputs in an order no single x86 shuffle provides, so code remains.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
885
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  ; Negative test: the composed mask <A2,B1,A0,B3> needs lanes from both
  ; inputs, so it cannot be reduced to a single shuffle.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
916
define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  ; Negative test: the composed mask <A3,A1,B0,A1> needs lanes from both
  ; inputs, so two instructions remain.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
945
define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT:    retq
  ; The second mask never reads %1's lane 2 (the only lane taken from %A),
  ; so the pair folds to a single shuffle of %B.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
960
define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test19:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test19:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test19:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT:    retq
  ; Negative test: the composed mask <B1,A0,A0,A0> needs lanes from both
  ; inputs, so two instructions remain.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
  ret <4 x i32> %2
}
989
define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test20:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test20:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test20:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test20:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX-NEXT:    retq
  ; Negative test: the composed mask <B0,A2,A3,B0> needs lanes from both
  ; inputs, so two instructions remain.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1020
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test21:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test21:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test21:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  ; Negative test: the composed mask <B0,A1,B0,A1> needs lanes from both
  ; inputs, so two instructions remain.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1055
1056
1057; Test that we correctly combine shuffles according to rule
1058;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1059
define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
; AVX-NEXT:    retq
  ; Only %B lanes survive the composition (lane 2 of %1, from %A, is never
  ; read), so the pair becomes one shuffle of %B.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
  ret <4 x i32> %2
}
1074
define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
; AVX-NEXT:    retq
  ; Only %B lanes survive the composition, so the pair becomes one shuffle
  ; of %B with mask [0,1,0,3].
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1089
define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test24:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test24:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
; AVX-NEXT:    retq
  ; Composed mask is <B0,B3,B2,undef>; the single %A lane in %1 is never
  ; read, so the pair folds to one shuffle of %B.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
  ret <4 x i32> %2
}
1104
define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test25:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test25:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test25:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  ; Operands are swapped in the first shuffle; the composition reads only %A
  ; (second operand), giving <A0,A1,A0,A1> — one shuffle of xmm0.
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
  ret <4 x i32> %2
}
1124
define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test26:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test26:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  ; The composition reads only %A lanes (the second operand of the first
  ; shuffle): <A2,A3,A2,A3> — one shuffle of xmm0.
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x i32> %2
}
1139
define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test27:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test27:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test27:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  ; The composition reads only %A lanes: <A0,A1,A0,A1> — one shuffle of
  ; xmm0 (vmovddup on AVX2).
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x i32> %2
}
1159
define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test28:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test28:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    retq
  ; The composition reads only %A lanes: <A0,A1,A1,A0> — one shuffle of
  ; xmm0.
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
  ret <4 x i32> %2
}
1174
define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  ; The composed mask selects exactly <b0,b1,b2,b3>, so the pair folds to a
  ; plain copy of %b.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}
1189
define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  ; Composed mask is <a0,b1,b2,b3>: a single blend of %a and %b.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}
1216
define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  ; Composed mask is <a0,a1,b0,b1>: the low halves of %a and %b (movlhps).
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}
1231
define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  ; Composed mask is <b2,b3,a2,a3>: the high halves of %b and %a.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
1246
define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  ; Composed mask is <b0,a1,b2,b3>: a single blend on SSE4.1+/AVX.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %2
}
1273
define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  ; Integer version of combine_test1: the composed mask selects exactly %b.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %2
}
1288
define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  ; Integer version of combine_test2: composed mask is <a0,b1,b2,b3>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}
1315
define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  ; Integer version of combine_test3: composed mask is <a0,a1,b0,b1>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i32> %2
}
1330
define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  ; Integer version of combine_test4: composed mask is <b2,b3,a2,a3>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}
1346
define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test10:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test10:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test10:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  ; Integer version of combine_test5: composed mask is <b0,a1,b2,b3>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i32> %2
}
1373
define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: combine_test11:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  ; The second shuffle re-selects every lane of %a back into its original
  ; position, so the pair folds away and %a is returned untouched.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}
1382
define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test12:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test12:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test12:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  ; Composed mask is <a0,b1,b2,b3>: a single blend, like combine_test2.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %2
}
1409
define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test13:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  ; Composed mask is <a0,a1,b0,b1>: the low halves of %a and %b (movlhps).
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}
1424
define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  ; Composed mask is <a2,a3,b2,b3>: the high halves of %a and %b.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}
1439
define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  ; Composed mask is <b0,a1,b2,b3>: a single blend, like combine_test5.
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}
1466
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: combine_test16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  ; Integer version of combine_test11: the second shuffle restores every
  ; lane of %a, so the pair folds away entirely.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}
1475
define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  ; Integer version of combine_test12: composed mask is <a0,b1,b2,b3>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}
1502
define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  ; Integer version of combine_test13: composed mask is <a0,a1,b0,b1>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1517
define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  ; Integer version of combine_test14: composed mask is <a2,a3,b2,b3>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}
1532
define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test20:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test20:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  ; Integer version of combine_test15: composed mask is <b0,a1,b2,b3>.
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1559
define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
; SSE-LABEL: combine_test21:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movaps %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test21:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    vmovaps %xmm2, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  ; Extracts two <4 x i32> subvectors from the 256-bit input: %1 (lanes
  ; 0,1,4,5) is stored to %ptr, %2 (lanes 2,3,6,7) is returned.
  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x i32> %1, ptr %ptr, align 16
  ret <4 x i32> %2
}
1582
define <8 x float> @combine_test22(ptr %a, ptr %b) {
; SSE-LABEL: combine_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
; Current AVX2 lowering of this is still awful, not adding a test case.
  ; Two loaded <2 x float>s are concatenated into the low half of an
  ; <8 x float>; the upper four lanes are undef.
  %1 = load <2 x float>, ptr %a, align 8
  %2 = load <2 x float>, ptr %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}
1601
1602; PR22359
1603define void @combine_test23(<8 x float> %v, ptr %ptr) {
1604; SSE-LABEL: combine_test23:
1605; SSE:       # %bb.0:
1606; SSE-NEXT:    movups %xmm0, (%rdi)
1607; SSE-NEXT:    retq
1608;
1609; AVX-LABEL: combine_test23:
1610; AVX:       # %bb.0:
1611; AVX-NEXT:    vmovups %xmm0, (%rdi)
1612; AVX-NEXT:    vzeroupper
1613; AVX-NEXT:    retq
1614  %idx2 = getelementptr inbounds <2 x float>, ptr %ptr, i64 1
1615  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
1616  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
1617  store <2 x float> %shuffle0, ptr %ptr, align 8
1618  store <2 x float> %shuffle1, ptr %idx2, align 8
1619  ret void
1620}
1621
1622; Check some negative cases.
1623; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1624
; combine_test1b: composing the two masks gives b[0],b[1],b[2],b[0] — every
; output lane comes from %b, so the pair folds to one shuffle of %b.
1625define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1626; SSE-LABEL: combine_test1b:
1627; SSE:       # %bb.0:
1628; SSE-NEXT:    movaps %xmm1, %xmm0
1629; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1630; SSE-NEXT:    retq
1631;
1632; AVX-LABEL: combine_test1b:
1633; AVX:       # %bb.0:
1634; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
1635; AVX-NEXT:    retq
1636  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1637  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1638  ret <4 x float> %2
1639}
1640
; combine_test2b: the composed result is b[0],b[1],b[0],b[1] — a splat of b's
; low 64 bits, so this becomes a movddup (movlhps on plain SSE2).
1641define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1642; SSE2-LABEL: combine_test2b:
1643; SSE2:       # %bb.0:
1644; SSE2-NEXT:    movaps %xmm1, %xmm0
1645; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1646; SSE2-NEXT:    retq
1647;
1648; SSSE3-LABEL: combine_test2b:
1649; SSSE3:       # %bb.0:
1650; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1651; SSSE3-NEXT:    retq
1652;
1653; SSE41-LABEL: combine_test2b:
1654; SSE41:       # %bb.0:
1655; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1656; SSE41-NEXT:    retq
1657;
1658; AVX-LABEL: combine_test2b:
1659; AVX:       # %bb.0:
1660; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
1661; AVX-NEXT:    retq
1662  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1663  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1664  ret <4 x float> %2
1665}
1666
; combine_test3b: the composed result is a[0],b[3],b[2],b[3] — it needs
; elements from both inputs in a pattern no single shuffle covers, so two
; instructions remain (blend + permute on SSE4.1/AVX).
1667define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
1668; SSE2-LABEL: combine_test3b:
1669; SSE2:       # %bb.0:
1670; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1671; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1672; SSE2-NEXT:    retq
1673;
1674; SSSE3-LABEL: combine_test3b:
1675; SSSE3:       # %bb.0:
1676; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1677; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1678; SSSE3-NEXT:    retq
1679;
1680; SSE41-LABEL: combine_test3b:
1681; SSE41:       # %bb.0:
1682; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1683; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1684; SSE41-NEXT:    retq
1685;
1686; AVX-LABEL: combine_test3b:
1687; AVX:       # %bb.0:
1688; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1689; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1690; AVX-NEXT:    retq
1691  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
1692  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
1693  ret <4 x float> %2
1694}
1695
; combine_test4b: the composed result is b[1],b[1],b[2],b[3] — all lanes from
; %b, so the pair folds to one shuffle of %b.
1696define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
1697; SSE-LABEL: combine_test4b:
1698; SSE:       # %bb.0:
1699; SSE-NEXT:    movaps %xmm1, %xmm0
1700; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
1701; SSE-NEXT:    retq
1702;
1703; AVX-LABEL: combine_test4b:
1704; AVX:       # %bb.0:
1705; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
1706; AVX-NEXT:    retq
1707  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1708  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
1709  ret <4 x float> %2
1710}
1711
1712
1713; Verify that we correctly fold shuffles even when we use illegal vector types.
1714
; combine_test1c: the composed result is A[0],B[1],B[2],B[3] — a blend taking
; only byte lane 0 from %A, hence the <0,255,255,255> select mask.
1715define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
1716; SSE2-LABEL: combine_test1c:
1717; SSE2:       # %bb.0:
1718; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1719; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1720; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1721; SSE2-NEXT:    andps %xmm0, %xmm2
1722; SSE2-NEXT:    andnps %xmm1, %xmm0
1723; SSE2-NEXT:    orps %xmm2, %xmm0
1724; SSE2-NEXT:    retq
1725;
1726; SSSE3-LABEL: combine_test1c:
1727; SSSE3:       # %bb.0:
1728; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1729; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1730; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1731; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1732; SSSE3-NEXT:    retq
1733;
1734; SSE41-LABEL: combine_test1c:
1735; SSE41:       # %bb.0:
1736; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1737; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1738; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1739; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1740; SSE41-NEXT:    movdqa %xmm1, %xmm0
1741; SSE41-NEXT:    retq
1742;
1743; AVX-LABEL: combine_test1c:
1744; AVX:       # %bb.0:
1745; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1746; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1747; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1748; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1749; AVX-NEXT:    retq
1750  %A = load <4 x i8>, ptr %a
1751  %B = load <4 x i8>, ptr %b
1752  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1753  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1754  ret <4 x i8> %2
1755}
1756
; combine_test2c: the composed result is A[0],A[1],B[0],B[1] — an unpack of
; the two loaded values.
1757define <4 x i8> @combine_test2c(ptr %a, ptr %b) {
1758; SSE-LABEL: combine_test2c:
1759; SSE:       # %bb.0:
1760; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1761; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1762; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1763; SSE-NEXT:    retq
1764;
1765; AVX-LABEL: combine_test2c:
1766; AVX:       # %bb.0:
1767; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1768; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1769; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1770; AVX-NEXT:    retq
1771  %A = load <4 x i8>, ptr %a
1772  %B = load <4 x i8>, ptr %b
1773  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
1774  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1775  ret <4 x i8> %2
1776}
1777
; combine_test3c: the composed result is B[2],B[3],A[2],A[3] — the high halves
; of the two loads, swapped.
1778define <4 x i8> @combine_test3c(ptr %a, ptr %b) {
1779; SSE-LABEL: combine_test3c:
1780; SSE:       # %bb.0:
1781; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1782; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1783; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1784; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1785; SSE-NEXT:    retq
1786;
1787; AVX-LABEL: combine_test3c:
1788; AVX:       # %bb.0:
1789; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1790; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1791; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1792; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1793; AVX-NEXT:    retq
1794  %A = load <4 x i8>, ptr %a
1795  %B = load <4 x i8>, ptr %b
1796  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1797  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1798  ret <4 x i8> %2
1799}
1800
; combine_test4c: the composed result is B[0],A[1],B[2],B[3] — a blend taking
; only byte lane 1 from %A, hence the <255,0,255,255> select mask.
1801define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
1802; SSE2-LABEL: combine_test4c:
1803; SSE2:       # %bb.0:
1804; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1805; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1806; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1807; SSE2-NEXT:    andps %xmm0, %xmm2
1808; SSE2-NEXT:    andnps %xmm1, %xmm0
1809; SSE2-NEXT:    orps %xmm2, %xmm0
1810; SSE2-NEXT:    retq
1811;
1812; SSSE3-LABEL: combine_test4c:
1813; SSSE3:       # %bb.0:
1814; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1815; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1816; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1817; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
1818; SSSE3-NEXT:    retq
1819;
1820; SSE41-LABEL: combine_test4c:
1821; SSE41:       # %bb.0:
1822; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1823; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1824; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1825; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1826; SSE41-NEXT:    movdqa %xmm1, %xmm0
1827; SSE41-NEXT:    retq
1828;
1829; AVX-LABEL: combine_test4c:
1830; AVX:       # %bb.0:
1831; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1832; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1833; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
1834; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1835; AVX-NEXT:    retq
1836  %A = load <4 x i8>, ptr %a
1837  %B = load <4 x i8>, ptr %b
1838  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1839  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1840  ret <4 x i8> %2
1841}
1842
1843
1844; The following test cases are generated from this C++ code
1845;
1846;__m128 blend_01(__m128 a, __m128 b)
1847;{
1848;  __m128 s = a;
1849;  s = _mm_blend_ps( s, b, 1<<0 );
1850;  s = _mm_blend_ps( s, b, 1<<1 );
1851;  return s;
1852;}
1853;
1854;__m128 blend_02(__m128 a, __m128 b)
1855;{
1856;  __m128 s = a;
1857;  s = _mm_blend_ps( s, b, 1<<0 );
1858;  s = _mm_blend_ps( s, b, 1<<2 );
1859;  return s;
1860;}
1861;
1862;__m128 blend_123(__m128 a, __m128 b)
1863;{
1864;  __m128 s = a;
1865;  s = _mm_blend_ps( s, b, 1<<1 );
1866;  s = _mm_blend_ps( s, b, 1<<2 );
1867;  s = _mm_blend_ps( s, b, 1<<3 );
1868;  return s;
1869;}
1870
1871; Ideally, we should collapse the following shuffles into a single one.
1872
; combine_blend_01: lanes 0-1 come from %b and lanes 2-3 from %a — a single
; blend (movsd on pre-SSE4.1 targets).
1873define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
1874; SSE2-LABEL: combine_blend_01:
1875; SSE2:       # %bb.0:
1876; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1877; SSE2-NEXT:    retq
1878;
1879; SSSE3-LABEL: combine_blend_01:
1880; SSSE3:       # %bb.0:
1881; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1882; SSSE3-NEXT:    retq
1883;
1884; SSE41-LABEL: combine_blend_01:
1885; SSE41:       # %bb.0:
1886; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1887; SSE41-NEXT:    retq
1888;
1889; AVX-LABEL: combine_blend_01:
1890; AVX:       # %bb.0:
1891; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1892; AVX-NEXT:    retq
1893  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
1894  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1895  ret <4 x float> %shuffle6
1896}
1897
; combine_blend_02: lanes 0 and 2 come from %b and lanes 1 and 3 from %a;
; SSE2 has no per-element blend and must emulate it with two shufps.
1898define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
1899; SSE2-LABEL: combine_blend_02:
1900; SSE2:       # %bb.0:
1901; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1902; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1903; SSE2-NEXT:    movaps %xmm1, %xmm0
1904; SSE2-NEXT:    retq
1905;
1906; SSSE3-LABEL: combine_blend_02:
1907; SSSE3:       # %bb.0:
1908; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
1909; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1910; SSSE3-NEXT:    movaps %xmm1, %xmm0
1911; SSSE3-NEXT:    retq
1912;
1913; SSE41-LABEL: combine_blend_02:
1914; SSE41:       # %bb.0:
1915; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1916; SSE41-NEXT:    retq
1917;
1918; AVX-LABEL: combine_blend_02:
1919; AVX:       # %bb.0:
1920; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1921; AVX-NEXT:    retq
1922  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
1923  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1924  ret <4 x float> %shuffle6
1925}
1926
; combine_blend_123: lane 0 comes from %a and lanes 1-3 from %b — the three
; chained blends collapse to a single movss/blendps.
1927define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
1928; SSE2-LABEL: combine_blend_123:
1929; SSE2:       # %bb.0:
1930; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1931; SSE2-NEXT:    movaps %xmm1, %xmm0
1932; SSE2-NEXT:    retq
1933;
1934; SSSE3-LABEL: combine_blend_123:
1935; SSSE3:       # %bb.0:
1936; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1937; SSSE3-NEXT:    movaps %xmm1, %xmm0
1938; SSSE3-NEXT:    retq
1939;
1940; SSE41-LABEL: combine_blend_123:
1941; SSE41:       # %bb.0:
1942; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1943; SSE41-NEXT:    retq
1944;
1945; AVX-LABEL: combine_blend_123:
1946; AVX:       # %bb.0:
1947; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1948; AVX-NEXT:    retq
1949  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
1950  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
1951  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1952  ret <4 x float> %shuffle12
1953}
1954
; combine_test_movhl_1: the composed masks yield b[2],b[3],a[2],a[3], which is
; exactly a movhlps/unpckhpd of (%b, %a), so one instruction suffices.
1955define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
1956; SSE-LABEL: combine_test_movhl_1:
1957; SSE:       # %bb.0:
1958; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1959; SSE-NEXT:    movaps %xmm1, %xmm0
1960; SSE-NEXT:    retq
1961;
1962; AVX-LABEL: combine_test_movhl_1:
1963; AVX:       # %bb.0:
1964; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1965; AVX-NEXT:    retq
1966  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
1967  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
1968  ret <4 x i32> %2
1969}
1970
; combine_test_movhl_2: different intermediate masks, same composed result
; b[2],b[3],a[2],a[3] — again a single unpckhpd.
1971define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
1972; SSE-LABEL: combine_test_movhl_2:
1973; SSE:       # %bb.0:
1974; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1975; SSE-NEXT:    movaps %xmm1, %xmm0
1976; SSE-NEXT:    retq
1977;
1978; AVX-LABEL: combine_test_movhl_2:
1979; AVX:       # %bb.0:
1980; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1981; AVX-NEXT:    retq
1982  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
1983  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
1984  ret <4 x i32> %2
1985}
1986
; combine_test_movhl_3: a third mask variant that also composes to
; b[2],b[3],a[2],a[3] and folds to one unpckhpd.
1987define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
1988; SSE-LABEL: combine_test_movhl_3:
1989; SSE:       # %bb.0:
1990; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1991; SSE-NEXT:    movaps %xmm1, %xmm0
1992; SSE-NEXT:    retq
1993;
1994; AVX-LABEL: combine_test_movhl_3:
1995; AVX:       # %bb.0:
1996; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1997; AVX-NEXT:    retq
1998  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
1999  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
2000  ret <4 x i32> %2
2001}
2002
; combine_and_or_shuffle: each shuffle interleaves its source bytes with
; zeros (index 16 selects the zeroinitializer operand), the OR merges the two
; zero-interleaved vectors, and the trailing AND clears byte 1 (mask element 1
; is 0). With pshufb available this should stay as two pshufbs + por + pand.
2003define <16 x i8> @combine_and_or_shuffle(<16 x i8> %x, <16 x i8> %y) {
2004; SSE2-LABEL: combine_and_or_shuffle:
2005; SSE2:       # %bb.0:
2006; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
2007; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2008; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2009; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,2,4,5,6,7]
2010; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,7,7]
2011; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2012; SSE2-NEXT:    pxor %xmm3, %xmm3
2013; SSE2-NEXT:    movdqa %xmm1, %xmm0
2014; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
2015; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
2016; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,1,3]
2017; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535]
2018; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2019; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2020; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,0,2,1,4,5,6,7]
2021; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
2022; SSE2-NEXT:    pand %xmm0, %xmm1
2023; SSE2-NEXT:    pandn %xmm4, %xmm0
2024; SSE2-NEXT:    por %xmm1, %xmm0
2025; SSE2-NEXT:    packuswb %xmm0, %xmm0
2026; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2027; SSE2-NEXT:    por %xmm2, %xmm0
2028; SSE2-NEXT:    retq
2029;
2030; SSSE3-LABEL: combine_and_or_shuffle:
2031; SSSE3:       # %bb.0:
2032; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2033; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2034; SSSE3-NEXT:    por %xmm1, %xmm0
2035; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2036; SSSE3-NEXT:    retq
2037;
2038; SSE41-LABEL: combine_and_or_shuffle:
2039; SSE41:       # %bb.0:
2040; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2041; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2042; SSE41-NEXT:    por %xmm1, %xmm0
2043; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2044; SSE41-NEXT:    retq
2045;
2046; AVX-LABEL: combine_and_or_shuffle:
2047; AVX:       # %bb.0:
2048; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
2049; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
2050; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2051; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2052; AVX-NEXT:    retq
2053  %1 = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 15, i32 16, i32 1, i32 16, i32 14, i32 16, i32 2, i32 16, i32 13, i32 16, i32 3, i32 16, i32 16>
2054  %2 = shufflevector <16 x i8> %y, <16 x i8> zeroinitializer, <16 x i32> <i32 7, i32 16, i32 0, i32 16, i32 8, i32 16, i32 1, i32 16, i32 9, i32 16, i32 10, i32 16, i32 7, i32 16, i32 7, i32 16>
2055  %3 = or <16 x i8> %1, %2
2056  %4 = and <16 x i8> %3, <i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
2057  ret <16 x i8> %4
2058}
2059
2060; Verify that we fold shuffles according to rule:
2061;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2062
; combine_undef_input_test1: composes to b[0],b[1],a[2],a[3] — one blend.
2063define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
2064; SSE2-LABEL: combine_undef_input_test1:
2065; SSE2:       # %bb.0:
2066; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2067; SSE2-NEXT:    retq
2068;
2069; SSSE3-LABEL: combine_undef_input_test1:
2070; SSSE3:       # %bb.0:
2071; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2072; SSSE3-NEXT:    retq
2073;
2074; SSE41-LABEL: combine_undef_input_test1:
2075; SSE41:       # %bb.0:
2076; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2077; SSE41-NEXT:    retq
2078;
2079; AVX-LABEL: combine_undef_input_test1:
2080; AVX:       # %bb.0:
2081; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2082; AVX-NEXT:    retq
2083  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2084  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2085  ret <4 x float> %2
2086}
2087
; combine_undef_input_test2: composes to a[0],a[1],b[0],b[1] — one movlhps.
2088define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
2089; SSE-LABEL: combine_undef_input_test2:
2090; SSE:       # %bb.0:
2091; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2092; SSE-NEXT:    retq
2093;
2094; AVX-LABEL: combine_undef_input_test2:
2095; AVX:       # %bb.0:
2096; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2097; AVX-NEXT:    retq
2098  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2099  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2100  ret <4 x float> %2
2101}
2102
; combine_undef_input_test3: composes to a[0],a[1],b[0],undef — still matches
; a single movlhps.
2103define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
2104; SSE-LABEL: combine_undef_input_test3:
2105; SSE:       # %bb.0:
2106; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2107; SSE-NEXT:    retq
2108;
2109; AVX-LABEL: combine_undef_input_test3:
2110; AVX:       # %bb.0:
2111; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2112; AVX-NEXT:    retq
2113  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2114  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2115  ret <4 x float> %2
2116}
2117
; combine_undef_input_test4: composes to b[2],b[3],a[2],a[3] — one
; movhlps/unpckhpd of (%b, %a).
2118define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
2119; SSE-LABEL: combine_undef_input_test4:
2120; SSE:       # %bb.0:
2121; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2122; SSE-NEXT:    retq
2123;
2124; AVX-LABEL: combine_undef_input_test4:
2125; AVX:       # %bb.0:
2126; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2127; AVX-NEXT:    retq
2128  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2129  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2130  ret <4 x float> %2
2131}
2132
; combine_undef_input_test5: composes to a[0],a[1],b[2],b[3] — one blend.
2133define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
2134; SSE2-LABEL: combine_undef_input_test5:
2135; SSE2:       # %bb.0:
2136; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2137; SSE2-NEXT:    retq
2138;
2139; SSSE3-LABEL: combine_undef_input_test5:
2140; SSSE3:       # %bb.0:
2141; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2142; SSSE3-NEXT:    retq
2143;
2144; SSE41-LABEL: combine_undef_input_test5:
2145; SSE41:       # %bb.0:
2146; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2147; SSE41-NEXT:    retq
2148;
2149; AVX-LABEL: combine_undef_input_test5:
2150; AVX:       # %bb.0:
2151; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2152; AVX-NEXT:    retq
2153  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2154  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2155  ret <4 x float> %2
2156}
2157
2158
2159; Verify that we fold shuffles according to rule:
2160;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)

; combine_undef_input_test6: the two shuffles compose to the identity of %a,
; so no instruction should be emitted.
2162define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
2163; CHECK-LABEL: combine_undef_input_test6:
2164; CHECK:       # %bb.0:
2165; CHECK-NEXT:    retq
2166  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2167  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2168  ret <4 x float> %2
2169}
2170
; combine_undef_input_test7: composes to a[0],a[1],a[0],a[1] — a splat of the
; low 64 bits (movddup where available, movlhps on SSE2).
2171define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
2172; SSE2-LABEL: combine_undef_input_test7:
2173; SSE2:       # %bb.0:
2174; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2175; SSE2-NEXT:    retq
2176;
2177; SSSE3-LABEL: combine_undef_input_test7:
2178; SSSE3:       # %bb.0:
2179; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2180; SSSE3-NEXT:    retq
2181;
2182; SSE41-LABEL: combine_undef_input_test7:
2183; SSE41:       # %bb.0:
2184; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2185; SSE41-NEXT:    retq
2186;
2187; AVX-LABEL: combine_undef_input_test7:
2188; AVX:       # %bb.0:
2189; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2190; AVX-NEXT:    retq
2191  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2192  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2193  ret <4 x float> %2
2194}
2195
; combine_undef_input_test8: different masks, same composed splat
; a[0],a[1],a[0],a[1].
2196define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
2197; SSE2-LABEL: combine_undef_input_test8:
2198; SSE2:       # %bb.0:
2199; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2200; SSE2-NEXT:    retq
2201;
2202; SSSE3-LABEL: combine_undef_input_test8:
2203; SSSE3:       # %bb.0:
2204; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2205; SSSE3-NEXT:    retq
2206;
2207; SSE41-LABEL: combine_undef_input_test8:
2208; SSE41:       # %bb.0:
2209; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2210; SSE41-NEXT:    retq
2211;
2212; AVX-LABEL: combine_undef_input_test8:
2213; AVX:       # %bb.0:
2214; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2215; AVX-NEXT:    retq
2216  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2217  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2218  ret <4 x float> %2
2219}
2220
; combine_undef_input_test9: composes to a[2],a[3],a[2],a[3] — a splat of the
; high 64 bits.
2221define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
2222; SSE-LABEL: combine_undef_input_test9:
2223; SSE:       # %bb.0:
2224; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
2225; SSE-NEXT:    retq
2226;
2227; AVX-LABEL: combine_undef_input_test9:
2228; AVX:       # %bb.0:
2229; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
2230; AVX-NEXT:    retq
2231  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2232  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2233  ret <4 x float> %2
2234}
2235
; combine_undef_input_test10: composes to the identity of %a — no code needed.
2236define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
2237; CHECK-LABEL: combine_undef_input_test10:
2238; CHECK:       # %bb.0:
2239; CHECK-NEXT:    retq
2240  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2241  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2242  ret <4 x float> %2
2243}
2244
; combine_undef_input_test11: like test1 but with the one-input shuffle as the
; SECOND operand of the outer shuffle; still composes to b[0,1],a[2,3].
2245define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
2246; SSE2-LABEL: combine_undef_input_test11:
2247; SSE2:       # %bb.0:
2248; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2249; SSE2-NEXT:    retq
2250;
2251; SSSE3-LABEL: combine_undef_input_test11:
2252; SSSE3:       # %bb.0:
2253; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2254; SSSE3-NEXT:    retq
2255;
2256; SSE41-LABEL: combine_undef_input_test11:
2257; SSE41:       # %bb.0:
2258; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2259; SSE41-NEXT:    retq
2260;
2261; AVX-LABEL: combine_undef_input_test11:
2262; AVX:       # %bb.0:
2263; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2264; AVX-NEXT:    retq
2265  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2266  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
2267  ret <4 x float> %2
2268}
2269
; combine_undef_input_test12: commuted form of test2 — composes to
; a[0],a[1],b[0],b[1] (movlhps).
2270define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
2271; SSE-LABEL: combine_undef_input_test12:
2272; SSE:       # %bb.0:
2273; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2274; SSE-NEXT:    retq
2275;
2276; AVX-LABEL: combine_undef_input_test12:
2277; AVX:       # %bb.0:
2278; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2279; AVX-NEXT:    retq
2280  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2281  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
2282  ret <4 x float> %2
2283}
2284
; combine_undef_input_test13: commuted form of test3 — still a movlhps.
2285define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
2286; SSE-LABEL: combine_undef_input_test13:
2287; SSE:       # %bb.0:
2288; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2289; SSE-NEXT:    retq
2290;
2291; AVX-LABEL: combine_undef_input_test13:
2292; AVX:       # %bb.0:
2293; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2294; AVX-NEXT:    retq
2295  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2296  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
2297  ret <4 x float> %2
2298}
2299
; combine_undef_input_test14: commuted form of test4 — composes to
; b[2],b[3],a[2],a[3] (movhlps/unpckhpd).
2300define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
2301; SSE-LABEL: combine_undef_input_test14:
2302; SSE:       # %bb.0:
2303; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2304; SSE-NEXT:    retq
2305;
2306; AVX-LABEL: combine_undef_input_test14:
2307; AVX:       # %bb.0:
2308; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2309; AVX-NEXT:    retq
2310  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2311  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2312  ret <4 x float> %2
2313}
2314
; combine_undef_input_test15: commuted form of test5 — composes to
; a[0],a[1],b[2],b[3] (blend).
2315define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
2316; SSE2-LABEL: combine_undef_input_test15:
2317; SSE2:       # %bb.0:
2318; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2319; SSE2-NEXT:    retq
2320;
2321; SSSE3-LABEL: combine_undef_input_test15:
2322; SSSE3:       # %bb.0:
2323; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2324; SSSE3-NEXT:    retq
2325;
2326; SSE41-LABEL: combine_undef_input_test15:
2327; SSE41:       # %bb.0:
2328; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2329; SSE41-NEXT:    retq
2330;
2331; AVX-LABEL: combine_undef_input_test15:
2332; AVX:       # %bb.0:
2333; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2334; AVX-NEXT:    retq
2335  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2336  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2337  ret <4 x float> %2
2338}
2339
2340
2341; Verify that shuffles are canonicalized according to rules:
2342;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2343;
2344; This allows to trigger the following combine rule:
2345;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2346;
2347; As a result, all the shuffle pairs in each function below should be
2348; combined into a single legal shuffle operation.
2349
; The shuffle pair composes to the identity permutation of %a, so no shuffle
; instructions should be emitted at all.
define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}
2358
; The shuffle pair reduces to a[0,1,0,1] (low 64 bits duplicated): movlhps on
; SSE2, movddup/vmovddup once available.
define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2383
; Same as test17 via different masks: reduces to a[0,1,0,1] (last lane undef),
; so the same movlhps/movddup is expected.
define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}
2408
; The shuffle pair reduces to a[2,3,2,3] (high 64 bits duplicated): movhlps on
; SSE, vpermilpd on AVX.
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2423
; The shuffle pair composes to the identity permutation of %a, so no shuffle
; instructions should be emitted at all.
define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test20:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2432
2433; These tests are designed to test the ability to combine away unnecessary
2434; operations feeding into a shuffle. The AVX cases are the important ones as
2435; they leverage operations which cannot be done naturally on the entire vector
2436; and thus are decomposed into multiple smaller operations.
2437
; The shuffle only reads lanes 7,6,5,4 (the upper 128-bit subvector) of the
; add result, so only that half of the add needs to be computed; the SSE and
; AVX1 codegen below reflect that narrowing.
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}
2479
; Two-input variant: the shuffle reads b[7..4] and c[15..12], i.e. only the
; upper 128-bit halves of both inputs, so only the upper half of the add is
; actually needed.
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}
2507
; The shuffle pair produces <b2, a1, a2, a3> — a single-element insertion of
; b[2] into lane 0 of a — so it should become one insertps on SSE4.1/AVX.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}
2537
; The shuffle pair produces <a0, b2, a2, a3> — insertion of b[2] into lane 1
; of a — so it should become one insertps on SSE4.1/AVX.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}
2567
; The shuffle pair produces <a0, a1, b0, a3> — insertion of b[0] into lane 2
; of a — so it should become one insertps on SSE4.1/AVX.
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}
2595
; The shuffle pair produces <a0, a1, a2, b0> — insertion of b[0] into lane 3
; of a — so it should become one insertps on SSE4.1/AVX.
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}
2623
; A scalar double load whose upper lanes are blended with zero should fold to
; a single zero-extending movsd load plus the store.
define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) {
; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
  %1 = load double, ptr %a0, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
  %4 = bitcast <2 x double> %3 to <4 x float>
  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  store <4 x float> %5, ptr %a1, align 16
  ret void
}
2644
2645; PR30371
; Insert-then-shuffle with a constant second operand should combine to a
; single blend/movss of %f into a constant-pool vector (lane 0 of the
; constant is unused, hence the <u,...> constant).
define <4 x float> @combine_constant_insertion_v4f32(float %f) {
; SSE2-LABEL: combine_constant_insertion_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX-NEXT:    retq
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %ret
}
2674
; Integer version of the test above: insert-then-shuffle combines to a
; movss/pinsrd of %f into a constant-pool vector.
define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
; SSE2-LABEL: combine_constant_insertion_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <u,4,5,30>
; SSE41-NEXT:    pinsrd $0, %edi, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,4,5,30>
; AVX-NEXT:    vpinsrd $0, %edi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
  %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %ret
}
2705
; PR22377: odd/even lane splits feeding an fadd and an interleave; with SSSE3+
; the odd+even add is recognized as a horizontal add (haddps).
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22377:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22377:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movaps %xmm0, %xmm1
; SSSE3-NEXT:    haddps %xmm0, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22377:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}
2745
; PR22390: %s2 is the rotated %s1 with lane 0 replaced by b[0]; codegen should
; do the rotation once, blend in b[0], then add — no duplicated shuffles.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}
2782
; PR22412: a cross-lane two-input shuffle pair over 256-bit vectors; expects a
; blend + lane swap + in-lane shufps rather than a full variable shuffle.
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR22412:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}
2809
; PR30264: a chain of shuffles with constant operands should collapse to a
; single insertps of x[0] (with a zeroed lane 1) into a constant vector on
; SSE4.1/AVX.
define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR30264:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR30264:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR30264:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %shuf2
}
2843
; PR39549: shuffle of bytes 8..15 followed by shl/ashr by 8 is a sign
; extension of the high 8 bytes — expect punpckhbw + psraw, not a shuffle
; plus two shifts.
define <8 x i16> @PR39549(<16 x i8> %x) {
; SSE-LABEL: PR39549:
; SSE:       # %bb.0:
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR39549:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
  %b = bitcast <16 x i8> %a to <8 x i16>
  %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %d
}
2862
; PR41545: the byte de-interleave / zext / shl / or chain reassembles %a1's
; original i32 lanes byte-for-byte, so the whole sequence should fold away,
; leaving only the paddd.
define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) {
; SSE-LABEL: PR41545:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR41545:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %2  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %3  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %4  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %5  = zext <4 x i8> %1 to <4 x i32>
  %6  = zext <4 x i8> %2 to <4 x i32>
  %7  = zext <4 x i8> %3 to <4 x i32>
  %8  = zext <4 x i8> %4 to <4 x i32>
  %9  = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8>
  %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
  %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24>
  %12 = or <4 x i32> %5, %9
  %13 = or <4 x i32> %12, %10
  %14 = or <4 x i32> %13, %11
  %15 = add <4 x i32> %a0, %14
  ret <4 x i32> %15
}
2890
; The extract/insert chain builds the fixed permutation <2,1,0,3,6,5,4,7> of
; %a; it should combine to pshuflw+pshufhw (or one pshufb with fast
; per-lane variable shuffles).
define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) {
; SSE-LABEL: shuffle_extract_insert:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_extract_insert:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_extract_insert:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_extract_insert:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
; AVX2-FAST-NEXT:    retq
  %a0 = extractelement <8 x i16> %a, i32 0
  %a1 = extractelement <8 x i16> %a, i32 1
  %a3 = extractelement <8 x i16> %a, i32 3
  %a4 = extractelement <8 x i16> %a, i32 4
  %a5 = extractelement <8 x i16> %a, i32 5
  %a6 = extractelement <8 x i16> %a, i32 6
  %a7 = extractelement <8 x i16> %a, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a1, i32 1
  %3 = insertelement <8 x i16> %2, i16 %a0, i32 2
  %4 = insertelement <8 x i16> %3, i16 %a3, i32 3
  %5 = insertelement <8 x i16> %4, i16 %a6, i32 4
  %6 = insertelement <8 x i16> %5, i16 %a5, i32 5
  %7 = insertelement <8 x i16> %6, i16 %a4, i32 6
  %8 = insertelement <8 x i16> %7, i16 %a7, i32 7
  ret <8 x i16> %8
}
2931
; Two-input variant: the extract/insert chain interleaves even-indexed
; elements of %a (permuted) with odd-indexed elements of %b; it should
; combine to two per-input shuffles plus a punpcklwd.
define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_extract_insert_double:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_extract_insert_double:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_extract_insert_double:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_extract_insert_double:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %a0 = extractelement <8 x i16> %a, i32 0
  %a4 = extractelement <8 x i16> %a, i32 4
  %a6 = extractelement <8 x i16> %a, i32 6
  %b11 = extractelement <8 x i16> %b, i32 3
  %b13 = extractelement <8 x i16> %b, i32 5
  %b15 = extractelement <8 x i16> %b, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
  ret <8 x i16> %7
}
2981
; Same pattern as shuffle_extract_insert_double but with %a built by
; concatenating two <4 x i16> halves; the concat should lower to a single
; punpcklqdq before the combined shuffles.
define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
; SSE2-LABEL: shuffle_extract_concat_insert:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_extract_concat_insert:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_extract_concat_insert:
; SSE41:       # %bb.0:
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_extract_concat_insert:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %a0 = extractelement <8 x i16> %a, i32 0
  %a4 = extractelement <8 x i16> %a, i32 4
  %a6 = extractelement <8 x i16> %a, i32 6
  %b11 = extractelement <8 x i16> %b, i32 3
  %b13 = extractelement <8 x i16> %b, i32 5
  %b15 = extractelement <8 x i16> %b, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
  ret <8 x i16> %7
}
3036
define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) {
; SSE2-LABEL: shuffle_scalar_to_vector_extract:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pextrw $7, %xmm1, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movsbl (%rsi), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    movsbl (%rdx), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    psraw $8, %xmm1
; SSSE3-NEXT:    movsbl (%rsi), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSSE3-NEXT:    movsbl (%rdx), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_scalar_to_vector_extract:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
; SSE41-NEXT:    pextrw $4, %xmm0, %eax
; SSE41-NEXT:    pextrw $7, %xmm0, %ecx
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pinsrw $1, %eax, %xmm0
; SSE41-NEXT:    movl $65531, %eax # imm = 0xFFFB
; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE41-NEXT:    movsbl (%rsi), %eax
; SSE41-NEXT:    pinsrw $5, %eax, %xmm0
; SSE41-NEXT:    movsbl (%rdx), %eax
; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_scalar_to_vector_extract:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
; AVX-NEXT:    vpextrw $4, %xmm0, %eax
; AVX-NEXT:    vpextrw $7, %xmm0, %ecx
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX-NEXT:    movl $65531, %eax # imm = 0xFFFB
; AVX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; AVX-NEXT:    movsbl (%rsi), %eax
; AVX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX-NEXT:    movsbl (%rdx), %eax
; AVX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
; Sign-extend an <8 x i8> load, pull out lanes 4 and 7, and mix them with two
; sign-extended scalar byte loads (%p1, %p2) and the constant -5 through a
; chain of insertelements.  The final shufflevector's lane 2 reads index 10,
; which comes from the undef second operand, so that lane is undef.  Checks
; that shuffle combining turns the extract/insert chain into vector shuffles
; and inserts instead of fully scalarizing it.
  %tmp = load <8 x i8>, ptr %p0, align 1
  %tmp1 = sext <8 x i8> %tmp to <8 x i16>
  %tmp2 = load i8, ptr %p1, align 1
  %cvt1 = sext i8 %tmp2 to i16
  %tmp3 = load i8, ptr %p2, align 1
  %cvt2 = sext i8 %tmp3 to i16
  %tmp4 = extractelement <8 x i16> %tmp1, i32 4
  %tmp5 = extractelement <8 x i16> %tmp1, i32 7
  %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0
  %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1
  %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3
  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4
  %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5
  %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6
  %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7
  %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %tmp13
}
3123
3124; Bug noticed in D96345
define i32 @shuffle_binops_with_undef() {
; SSE-LABEL: shuffle_binops_with_undef:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa (%rax), %xmm0
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    psrlw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_binops_with_undef:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovdqa (%rax), %xmm0
; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rax)
; AVX-NEXT:    retq
; Reduction from D96345: two adds (%addi and %addi24) feed a shuffle that
; selects one i64 lane from each; %shuf0 takes all of its elements from the
; all-zero half of a constant second operand, so %bc1 is known zero.  Checks
; that combining the binops across %shuf2 with undef/poison-laden operands
; stays correct and collapses to a single paddw before the psrlw intrinsic
; (whose shift amount is a ptrtoint constant expression).
entry:
  %load0 = load <8 x i16>, ptr undef, align 16
  %load1 = load <8 x i16>, ptr undef, align 16
  %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %addi = add <8 x i16> %load0, %load1
  %bc0 = bitcast <8 x i16> %addi to <2 x i64>
  %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16>
  %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %addi24 = add <8 x i16> %shuf1, %bc1
  %bc2 = bitcast <8 x i16> %addi24 to <2 x i64>
  %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 2>
  %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16>
  %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (ptr @shuffle_binops_with_undef to i32))
  store <8 x i16> %psrli, ptr undef, align 16
  ret i32 undef
}
3159declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
3160
define void @PR43024() {
; SSE2-LABEL: PR43024:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE2-NEXT:    movaps %xmm0, (%rax)
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    movss %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR43024:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSSE3-NEXT:    movaps %xmm0, (%rax)
; SSSE3-NEXT:    addss %xmm0, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    addss %xmm1, %xmm0
; SSSE3-NEXT:    movss %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR43024:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE41-NEXT:    movaps %xmm0, (%rax)
; SSE41-NEXT:    addss %xmm0, %xmm0
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    addss %xmm1, %xmm0
; SSE41-NEXT:    movss %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR43024:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT:    vmovaps %xmm0, (%rax)
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovss %xmm0, (%rax)
; AVX-NEXT:    retq
; PR43024: the stored vector has NaN in lanes 0/1, so the fmul by +0.0 must
; not be folded away to zero (NaN * 0.0 == NaN without fast-math).  Only
; scalar lane 0 of the fadd chain is ultimately extracted and stored; checks
; the shuffle/extract combines scalarize correctly without dropping the NaN
; operands.
  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16
  %1 = load <4 x float>, ptr undef, align 16
  %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %2, %3
  %5 = fadd <4 x float> zeroinitializer, %4
  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %7 = fadd <4 x float> %6, %5
  %8 = extractelement <4 x float> %7, i32 0
  store float %8, ptr undef, align 8
  ret void
}
3217
define void @PR45604(ptr %dst, ptr %src) {
; SSE2-LABEL: PR45604:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rsi), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm1, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0]
; SSE2-NEXT:    por %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    pandn %xmm4, %xmm5
; SSE2-NEXT:    por %xmm1, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,2,2,2]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pandn %xmm4, %xmm6
; SSE2-NEXT:    por %xmm1, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm3, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR45604:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa (%rsi), %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[2,3],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0]
; SSSE3-NEXT:    por %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[4,5],zero,zero,zero,zero,zero,zero,xmm3[6,7],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    por %xmm2, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[8,9],zero,zero,zero,zero,zero,zero,xmm4[10,11],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    por %xmm2, %xmm4
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13],zero,zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    por %xmm2, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, 48(%rdi)
; SSSE3-NEXT:    movdqa %xmm4, 32(%rdi)
; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR45604:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rsi), %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <u,0,11,0,u,0,11,0>
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3],xmm4[4],xmm2[5,6,7]
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm3, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR45604:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR45604:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm1, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
; PR45604: widen the loaded <8 x i16> with zeros, then interleave it 4-wide
; with the constant pattern {src[i], 11, 0, 0} into a <32 x i16> store.
; Checks the wide two-input shuffle is lowered as per-chunk shuffles blended
; with a constant vector rather than scalarized element stores.
  %v1 = load <8 x i16>, ptr %src, align 16
  %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i16> %v3, ptr %dst, align 16
  ret void
}
3334
3335; getFauxShuffle AND/ANDN decoding wrongly assumed an undef src always gives an undef dst.
define <2 x i64> @PR55157(ptr %0) {
; SSE-LABEL: PR55157:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR55157:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
; PR55157: the final shuffle only reads lanes 8-15 of the select; in those
; lanes the select chooses between 0 (true side) and pavgb(0, 0) == 0, so the
; whole result is zero.  The poison lanes 0-7 must not propagate through the
; AND/ANDN faux-shuffle decode: an undef AND/ANDN source does not imply an
; undef destination.  Expect a plain zeroing of xmm0.
  %2 = load <16 x i8>, ptr %0, align 16
  %3 = icmp eq <16 x i8> %2, zeroinitializer
  %4 = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer)
  %5 = select <16 x i1> %3, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %4
  %6 = shufflevector <16 x i8> %5, <16 x i8> poison, <16 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %7 = bitcast <16 x i8> %6 to <2 x i64>
  ret <2 x i64> %7
}
3354declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>)
3355
3356; SelectionDAG::isSplatValue - incorrect handling of undef sub-elements
define <2 x i64> @PR56520(<16 x i8> %0) {
; SSE-LABEL: PR56520:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    movsbl %al, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR56520:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    movsbl %al, %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR56520:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX2-SLOW-NEXT:    movsbl %al, %eax
; AVX2-SLOW-NEXT:    vmovd %eax, %xmm0
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR56520:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
; AVX2-FAST-NEXT:    movsbl %al, %eax
; AVX2-FAST-NEXT:    vmovd %eax, %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX2-FAST-NEXT:    retq
; PR56520: sign-extend bit 0 of the byte compare into lane 0 of a zero
; vector, zext to <2 x i64>, then splat lane 0.  The splat shuffle must not be
; treated as a splat of an undef value just because other sub-elements of its
; source are undef/zero (SelectionDAG::isSplatValue undef-sub-element bug);
; the compare result has to survive into both i64 lanes.
  %2 = icmp eq <16 x i8> zeroinitializer, %0
  %3 = extractelement <16 x i1> %2, i64 0
  %4 = sext i1 %3 to i32
  %5 = insertelement <2 x i32> zeroinitializer, i32 %4, i64 0
  %6 = zext <2 x i32> %5 to <2 x i64>
  %7 = shufflevector <2 x i64> %6, <2 x i64> zeroinitializer, <2 x i32> zeroinitializer
  ret <2 x i64> %7
}
3405
3406; Test case reported on D105827
define void @SpinningCube() {
; SSE2-LABEL: SpinningCube:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movapd {{.*#+}} xmm2 = <u,u,-2.0E+0,u>
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    xorps %xmm3, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE2-NEXT:    addps %xmm3, %xmm1
; SSE2-NEXT:    movaps %xmm1, (%rax)
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE2-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: SpinningCube:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movapd {{.*#+}} xmm2 = <u,u,-2.0E+0,u>
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSSE3-NEXT:    xorps %xmm3, %xmm3
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSSE3-NEXT:    addps %xmm3, %xmm1
; SSSE3-NEXT:    movaps %xmm1, (%rax)
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
; SSSE3-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT:    addps %xmm0, %xmm1
; SSSE3-NEXT:    movaps %xmm1, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: SpinningCube:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps %xmm1, %xmm3
; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
; SSE41-NEXT:    movaps %xmm0, %xmm4
; SSE41-NEXT:    insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
; SSE41-NEXT:    addps %xmm3, %xmm4
; SSE41-NEXT:    movaps %xmm4, (%rax)
; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    addps %xmm0, %xmm2
; SSE41-NEXT:    movaps %xmm2, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: SpinningCube:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vmovaps %xmm2, (%rax)
; AVX1-NEXT:    vbroadcastss (%rax), %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rax)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: SpinningCube:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u>
; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX2-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vmovaps %xmm2, (%rax)
; AVX2-NEXT:    vbroadcastss (%rax), %xmm2
; AVX2-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rax)
; AVX2-NEXT:    retq
; Reduction from D105827: two rounds of splat + fmul by <0.0, -2.0> feeding
; insert/shuffle chains that are mostly poison/undef except for the 1.0 in
; lane 3 of %2 and the -2.0 products.  Checks that shuffle combining keeps
; the defined lanes (e.g. the NaN inserted into lane 1 is dead because %12
; and %13 overwrite neighbours with undef) while folding the rest, instead
; of miscombining through the undef operands.
entry:
  store float 1.000000e+00, ptr undef, align 4
  %0 = load float, ptr undef, align 4
  %1 = fmul float undef, 0.000000e+00
  %2 = insertelement <4 x float> poison, float %0, i32 3
  %3 = load float, ptr undef, align 4
  %4 = insertelement <2 x float> poison, float %3, i32 0
  %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
  %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00>
  %7 = fadd float %1, undef
  %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
  %10 = insertelement <4 x float> %9, float %7, i32 3
  %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1
  %12 = insertelement <4 x float> %11, float undef, i32 0
  %13 = insertelement <4 x float> %12, float undef, i32 2
  %14 = fadd <4 x float> %10, %13
  store <4 x float> %14, ptr undef, align 16
  %15 = load float, ptr undef, align 4
  %16 = insertelement <2 x float> poison, float %15, i32 0
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer
  %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00>
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
  %21 = fadd <4 x float> %20, %2
  store <4 x float> %21, ptr undef, align 16
  ret void
}
3524