1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
7;
8; Verify that the DAG combiner correctly folds bitwise operations across
9; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
10; basic and always-safe patterns. Also test that the DAG combiner will combine
11; target-specific shuffle instructions where reasonable.
12
13target triple = "x86_64-unknown-unknown"
14
15declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
16declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
17declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
18
; Mask 27 = 0b00011011 selects [3,2,1,0] (a full reverse); reversing twice is
; the identity, so both pshufds fold away.
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd1:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}

; The inner pshuflw mask (-28 = 0xE4 = [0,1,2,3]) is an identity word shuffle,
; and the two reversing pshufds (27) cancel, so the whole chain folds to %a.
define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd2:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

; Same as combine_pshufd2 but with an identity pshufhw (0xE4) in the middle;
; again everything folds to %a.
define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd3:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

; pshufd -31 (0xE1 = [1,0,2,3]) swaps only the low two dwords and is its own
; inverse; it is disjoint from the pshufhw (which permutes only the high four
; words), so the two pshufds cancel and only the high-word reverse remains.
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd4:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}

; pshufd -76 (0xB4 = [0,1,3,2]) swaps only the high two dwords and is its own
; inverse; disjoint from the pshuflw's low-word reverse, so the pshufds cancel
; and a single pshuflw survives.
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd5:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}

; pshufd 0 splats element 0; any further shuffle of a splat is still the same
; splat, so the pair folds to one splat (a broadcast on AVX2).
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pshufd6:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pshufd6:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}
113
; pshuflw 27 reverses the low four words and is its own inverse; two in a row
; fold to the identity.
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw1:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}

; The pshufhw in the middle uses the identity mask (-28 = 0xE4), and the two
; low-word reverses cancel, so the whole chain folds to %a.
define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw2:
; ALL:       # BB#0: # %entry
; ALL-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; pshuflw and pshufhw permute disjoint halves of the vector, so the two
; low-word reverses cancel around the pshufhw, leaving a single pshufhw.
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; Mirror image of combine_pshuflw3: the two pshufhws cancel and only the
; pshuflw remains.
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
168
; Both shuffles use the same mask [0,2,1,3] and read only their first operand
; (%a resp. %b); %c is never used, so the AND is performed first and a single
; pshufd is emitted after it.
define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE:       # BB#0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; Same as test1 but with OR.
define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE:       # BB#0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; Same as test1 but with XOR.
define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1-3 but the shuffles read only their SECOND operand (indices 4-7);
; %c is still unused and the same and+pshufd pair is produced.
define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE:       # BB#0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX:       # BB#0:
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; Same as test4 but with OR.
define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE:       # BB#0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; Same as test4 but with XOR.
define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE:       # BB#0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
276
277
; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the
; shuffles are not performing a swizzle operation.
280
; Mask [0,5,2,7] blends lanes 0,2 of %a/%b with lanes 1,3 of %c. Since
; c AND c == c, the AND is done first and %c is blended back in afterwards
; (punpck on SSE2/SSSE3, pblendw/pblendd on SSE4.1/AVX).
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2:       # BB#0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test1b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test1b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; Same as test1b with OR (c OR c == c), so again op-then-blend.
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2:       # BB#0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41:       # BB#0:
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test2b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test2b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; With XOR, c XOR c == 0, so the blended %c lanes become zero: the XOR runs
; first and lanes 1,3 are cleared (constant AND mask on SSE2/SSSE3, blend
; with a zeroed register on SSE4.1/AVX).
define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test3b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test3b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
399
; As test1b but with the operand order swapped: %c now supplies lanes 0,2 and
; %a/%b lanes 1,3. c AND c == c, so op first, then blend %c back in.
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2:       # BB#0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test4b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test4b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; Same as test4b with OR.
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2:       # BB#0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41:       # BB#0:
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test5b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test5b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; Same as test4b with XOR: c XOR c == 0, so lanes 0,2 are zeroed instead of
; blended from %c.
define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_bitwise_ops_test6b:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_bitwise_ops_test6b:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX2-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
518
; Mask [0,2,5,7] takes the even lanes of %a/%b and the odd lanes of %c —
; exactly a shufps pattern. c AND c == c, so the AND sinks before a single
; shufps that merges in %c.
define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1c:
; SSE:       # BB#0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1c:
; AVX:       # BB#0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; Same as test1c with OR.
define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2c:
; SSE:       # BB#0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2c:
; AVX:       # BB#0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; With XOR the %c lanes become zero (c XOR c == 0): the high two result lanes
; are cleared via a zeroed register (SSE2/SSSE3) or insertps zeroing.
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
; SSE41:       # BB#0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; As test1c with the operands swapped: %c supplies the low two result lanes.
define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4c:
; SSE:       # BB#0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4c:
; AVX:       # BB#0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

; Same as test4c with OR.
define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5c:
; SSE:       # BB#0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5c:
; AVX:       # BB#0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; Same as test4c with XOR: the %c lanes (now lanes 0,1 of the result) are
; zeroed (c XOR c == 0).
define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
; SSE41:       # BB#0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
658
; Every lane of the final result comes from %A or is undef, so the nested
; pair collapses to one pshufd of %A.
define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

; The outer shuffle never reads the lane taken from %B (index 5), so this
; folds to a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

; Same shape as test2 with a different %B index; still a single pshufd of %A.
define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

; Folds to a splat of the low qword of %A ([0,1,0,1]); AVX2 can use a
; vpbroadcastq for that pattern.
define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test4:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test4:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

; The %B lanes and the undef lanes drop out; a single pshufd of %A remains.
define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
  ret <4 x i32> %2
}

; Again only %A lanes survive the outer shuffle; folds to one pshufd.
define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}

; The outer shuffle reads only the lanes the inner shuffle took from %A
; (indices 0 and 2); folds to a single pshufd.
define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  ret <4 x i32> %2
}
768
; The outer mask selects only the %A lanes of the inner result (the rest are
; undef), so the pair folds to one pshufd of %A.
define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %2
}

; Same story: only %A lanes (or undef) reach the result; one pshufd remains.
define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %2
}

; Folds to a single pshufd of %A; the %B lanes are masked out by the outer
; shuffle's undef indices.
define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}

; Same pattern, different lane choice; still one pshufd of %A.
define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
  ret <4 x i32> %2
}

; Folds to a low-qword splat of %A ([0,1,0,1]); vpbroadcastq on AVX2.
define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test12:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test12:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test12:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}
848
849; The following pair of shuffles is folded into vector %A.
850define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
851; ALL-LABEL: combine_nested_undef_test13:
852; ALL:       # BB#0:
853; ALL-NEXT:    retq
854  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
855  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
856  ret <4 x i32> %2
857}
858
859; The following pair of shuffles is folded into vector %B.
860define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
861; SSE-LABEL: combine_nested_undef_test14:
862; SSE:       # BB#0:
863; SSE-NEXT:    movaps %xmm1, %xmm0
864; SSE-NEXT:    retq
865;
866; AVX-LABEL: combine_nested_undef_test14:
867; AVX:       # BB#0:
868; AVX-NEXT:    vmovaps %xmm1, %xmm0
869; AVX-NEXT:    retq
870  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
871  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
872  ret <4 x i32> %2
873}
874
875
876; Verify that we don't optimize the following cases. We expect more than one shuffle.
877;
; FIXME: Many of these already don't make sense, and the rest should stop
; making sense with the new vector shuffle lowering. Revisit at least testing
; for it.
881
; Lanes from both %A and %B survive the outer mask (result is A3,B0,A0,A1), so
; two instructions (or a blend + shuffle) are still required.
define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Result mixes %A and %B lanes (A2,B1,A0,B3), so it cannot fold to a single
; shuffle of one input.
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test16:
; SSE41:       # BB#0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Result mixes %A and %B lanes (A3,A1,B0,A1); still needs a blend plus a
; shuffle.
define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test17:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test17:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; All lanes read by the outer mask come from %B, so this pair does collapse to
; a single pshufd of %B (xmm1).
define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test18:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Result is B1,A0,A0,A0 - lanes from both inputs - so a blend/unpack plus a
; shuffle are required.
define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test19:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test19:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test19:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test19:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test19:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
  ret <4 x i32> %2
}

; Result is B0,A2,A3,B0 - lanes from both inputs - so more than one shuffle is
; expected.
define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test20:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test20:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test20:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test20:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test20:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Result is B0,A1,B0,A1 - lanes from both inputs - so a blend followed by a
; qword splat is required.
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test21:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test21:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test21:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
1115
1116
1117; Test that we correctly combine shuffles according to rule
1118;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1119
; The outer mask reads only lanes the inner shuffle took from %B, so the pair
; folds to a single pshufd of %B (xmm1).
define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test22:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test22:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
  ret <4 x i32> %2
}

; Same shape as test22 with a different outer mask; again only %B lanes
; survive, giving one pshufd of xmm1.
define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test23:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test23:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; The outer mask reads inner lanes 0, 3, 2 (all from %B) plus one undef, so
; this folds to one pshufd of xmm1.
define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test24:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test24:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
  ret <4 x i32> %2
}

; Note the inner shuffle's operands are (%B, %A); the outer mask reads only the
; %A lanes, so the result is a low-qword splat of %A (xmm0).
define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test25:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test25:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test25:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
  ret <4 x i32> %2
}

; Inner operands are (%B, %A); only the %A lanes (2 and 3) survive the outer
; mask, giving a single pshufd of xmm0.
define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test26:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test26:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x i32> %2
}

; Inner operands are (%B, %A); the surviving lanes are A0 and A1, so again this
; becomes a low-qword splat of xmm0.
define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test27:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test27:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test27:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x i32> %2
}

; Inner operands are (%B, %A); only A0 and A1 survive, arranged as 0,1,1,0, so
; one pshufd of xmm0 suffices.
define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test28:
; SSE:       # BB#0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test28:
; AVX:       # BB#0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
  ret <4 x i32> %2
}
1234
; The second shuffle re-selects every lane of %b, so the pair folds to a plain
; register copy of %b.
define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}

; Folds to a single blend taking lane 0 from %a and lanes 1-3 from %b
; (movss pre-SSE4.1, blendps after).
define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}

; Combined result is a0,a1,b0,b1 - the low-qword interleave - so the pair
; folds to unpcklpd.
define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test3:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

; Combined result is b2,b3,a2,a3 - high qwords with %b first - matching
; movhlps / unpckhpd with swapped operands.
define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4:
; SSE:       # BB#0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

; Combined result is b0,a1,b2,b3 - a single blend taking lane 1 from %a and
; the rest from %b (two shufps pre-SSE4.1).
define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test5:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test5:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test5:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test5:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %2
}

; Integer version of combine_test1: folds to a copy of %b.
define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test6:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test6:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %2
}

; Integer version of combine_test2: folds to one blend (lane 0 from %a,
; lanes 1-3 from %b).
define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test7:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test7:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test7:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test7:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test7:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}

; Integer version of combine_test3: folds to punpcklqdq.
define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test8:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test8:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i32> %2
}

; Integer version of combine_test4: folds to punpckhqdq with %b first.
define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test9:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test9:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

; Integer version of combine_test5: folds to one blend (lane 1 from %a,
; remaining lanes from %b).
define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test10:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test10:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test10:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test10:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test10:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i32> %2
}
1443
; The second shuffle (with %a as its second operand) exactly undoes the first,
; so the whole pair folds away and no instruction is emitted.
define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
; ALL-LABEL: combine_test11:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}

; Combined result is a0,b1,b2,b3 - one blend taking lane 0 from %a and
; lanes 1-3 from %b.
define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test12:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test12:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test12:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test12:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %2
}

; Combined result is a0,a1,b0,b1 - the low-qword interleave - so this folds to
; unpcklpd.
define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test13:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test13:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}

; Combined result is a2,a3,b2,b3 - the high-qword interleave - so this folds
; to unpckhpd.
define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test14:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test14:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

; Combined result is b0,a1,b2,b3 - one blend taking lane 1 from %a and the
; rest from %b (two shufps pre-SSE4.1).
define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test15:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}

; Integer version of combine_test11: the pair folds away entirely.
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; ALL-LABEL: combine_test16:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}

; Integer version of combine_test12: folds to one blend (lane 0 from %a,
; lanes 1-3 from %b).
define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test17:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test17:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

; Integer version of combine_test13: folds to punpcklqdq.
define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE:       # BB#0:
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}

; Integer version of combine_test14: folds to punpckhqdq.
define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE:       # BB#0:
; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test19:
; AVX:       # BB#0:
; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

; Integer version of combine_test15: folds to one blend (lane 1 from %a,
; remaining lanes from %b).
define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test20:
; SSE41:       # BB#0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_test20:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_test20:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}
1639
1640define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
1641; SSE-LABEL: combine_test21:
1642; SSE:       # BB#0:
1643; SSE-NEXT:    movdqa %xmm0, %xmm2
1644; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1645; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1646; SSE-NEXT:    movdqa %xmm2, (%rdi)
1647; SSE-NEXT:    retq
1648;
1649; AVX1-LABEL: combine_test21:
1650; AVX1:       # BB#0:
1651; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1652; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1653; AVX1-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1654; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
1655; AVX1-NEXT:    vzeroupper
1656; AVX1-NEXT:    retq
1657;
1658; AVX2-LABEL: combine_test21:
1659; AVX2:       # BB#0:
1660; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1661; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1662; AVX2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1663; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
1664; AVX2-NEXT:    vzeroupper
1665; AVX2-NEXT:    retq
; Two extracting shuffles of an <8 x i32>: <0,1,4,5> is stored and <2,3,6,7>
; is returned; both are formed with one unpck each from the two 128-bit halves.
1666  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1667  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1668  store <4 x i32> %1, <4 x i32>* %ptr, align 16
1669  ret <4 x i32> %2
1670}
1671
1672define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
1673; SSE-LABEL: combine_test22:
1674; SSE:       # BB#0:
1675; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
1676; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1677; SSE-NEXT:    retq
1678;
1679; AVX-LABEL: combine_test22:
1680; AVX:       # BB#0:
1681; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1682; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1683; AVX-NEXT:    retq
1684; Current AVX2 lowering of this is still awful, not adding a test case.
; The two <2 x float> loads are concatenated into the low four lanes of the
; <8 x float> result (upper lanes undef): a movsd load plus a movhpd load.
1685  %1 = load <2 x float>, <2 x float>* %a, align 8
1686  %2 = load <2 x float>, <2 x float>* %b, align 8
1687  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1688  ret <8 x float> %3
1689}
1690
1691; PR22359
1692define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
1693; SSE-LABEL: combine_test23:
1694; SSE:       # BB#0:
1695; SSE-NEXT:    movups %xmm0, (%rdi)
1696; SSE-NEXT:    retq
1697;
1698; AVX-LABEL: combine_test23:
1699; AVX:       # BB#0:
1700; AVX-NEXT:    vmovups %xmm0, (%rdi)
1701; AVX-NEXT:    vzeroupper
1702; AVX-NEXT:    retq
; Lanes 0-1 and 2-3 of %v are stored to adjacent <2 x float> slots; the two
; stores are merged into a single unaligned 16-byte store (movups).
1703  %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
1704  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
1705  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
1706  store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
1707  store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
1708  ret void
1709}
1710
1711; Check some negative cases.
1712; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1713
1714define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1715; SSE-LABEL: combine_test1b:
1716; SSE:       # BB#0:
1717; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1718; SSE-NEXT:    movaps %xmm1, %xmm0
1719; SSE-NEXT:    retq
1720;
1721; AVX-LABEL: combine_test1b:
1722; AVX:       # BB#0:
1723; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
1724; AVX-NEXT:    retq
; The composed mask is <b0,b1,b2,b0>: every lane comes from %b, so this folds
; to a single-input permute of %b and %a is dead.
1725  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1726  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1727  ret <4 x float> %2
1728}
1729
1730define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1731; SSE2-LABEL: combine_test2b:
1732; SSE2:       # BB#0:
1733; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
1734; SSE2-NEXT:    movaps %xmm1, %xmm0
1735; SSE2-NEXT:    retq
1736;
1737; SSSE3-LABEL: combine_test2b:
1738; SSSE3:       # BB#0:
1739; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1740; SSSE3-NEXT:    retq
1741;
1742; SSE41-LABEL: combine_test2b:
1743; SSE41:       # BB#0:
1744; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
1745; SSE41-NEXT:    retq
1746;
1747; AVX-LABEL: combine_test2b:
1748; AVX:       # BB#0:
1749; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
1750; AVX-NEXT:    retq
; The composed mask is <b0,b1,b0,b1>: a 64-bit splat of %b's low half,
; lowered as movddup where available (movlhps on plain SSE2).
1751  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1752  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1753  ret <4 x float> %2
1754}
1755
1756define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
1757; SSE2-LABEL: combine_test3b:
1758; SSE2:       # BB#0:
1759; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1760; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1761; SSE2-NEXT:    retq
1762;
1763; SSSE3-LABEL: combine_test3b:
1764; SSSE3:       # BB#0:
1765; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1766; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1767; SSSE3-NEXT:    retq
1768;
1769; SSE41-LABEL: combine_test3b:
1770; SSE41:       # BB#0:
1771; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1772; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1773; SSE41-NEXT:    retq
1774;
1775; AVX-LABEL: combine_test3b:
1776; AVX:       # BB#0:
1777; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1778; AVX-NEXT:    retq
; The composed mask is <a0,b3,b2,b3>: both inputs stay live, so this cannot
; collapse to one shuffle and still needs a two-instruction sequence.
1780  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
1781  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
1782  ret <4 x float> %2
1783}
1784
1785define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
1786; SSE-LABEL: combine_test4b:
1787; SSE:       # BB#0:
1788; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
1789; SSE-NEXT:    movaps %xmm1, %xmm0
1790; SSE-NEXT:    retq
1791;
1792; AVX-LABEL: combine_test4b:
1793; AVX:       # BB#0:
1794; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
1795; AVX-NEXT:    retq
; The composed mask is <b1,b1,b2,b3>: every lane comes from %b, so this folds
; to a single-input permute of %b and %a is dead.
1796  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1797  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
1798  ret <4 x float> %2
1799}
1800
1801
1802; Verify that we correctly fold shuffles even when we use illegal vector types.
1803
1804define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
1805; SSE2-LABEL: combine_test1c:
1806; SSE2:       # BB#0:
1807; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1808; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1809; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1810; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1811; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1812; SSE2-NEXT:    retq
1813;
1814; SSSE3-LABEL: combine_test1c:
1815; SSSE3:       # BB#0:
1816; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1817; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1818; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1819; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1820; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1821; SSSE3-NEXT:    retq
1822;
1823; SSE41-LABEL: combine_test1c:
1824; SSE41:       # BB#0:
1825; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1826; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1827; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1828; SSE41-NEXT:    retq
1829;
1830; AVX1-LABEL: combine_test1c:
1831; AVX1:       # BB#0:
1832; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1833; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1834; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1835; AVX1-NEXT:    retq
1836;
1837; AVX2-LABEL: combine_test1c:
1838; AVX2:       # BB#0:
1839; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1840; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1841; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1842; AVX2-NEXT:    retq
; Illegal <4 x i8> operands are widened; the composed mask is <A0,B1,B2,B3>,
; i.e. a single blend of the two zero-extended loads.
1843  %A = load <4 x i8>, <4 x i8>* %a
1844  %B = load <4 x i8>, <4 x i8>* %b
1845  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1846  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1847  ret <4 x i8> %2
1848}
1849
1850define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
1851; SSE2-LABEL: combine_test2c:
1852; SSE2:       # BB#0:
1853; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1854; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1855; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1856; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1857; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1858; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1859; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1860; SSE2-NEXT:    retq
1861;
1862; SSSE3-LABEL: combine_test2c:
1863; SSSE3:       # BB#0:
1864; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1865; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1866; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1867; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1868; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1869; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1870; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1871; SSSE3-NEXT:    retq
1872;
1873; SSE41-LABEL: combine_test2c:
1874; SSE41:       # BB#0:
1875; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1876; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1877; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1878; SSE41-NEXT:    retq
1879;
1880; AVX-LABEL: combine_test2c:
1881; AVX:       # BB#0:
1882; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1883; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1884; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1885; AVX-NEXT:    retq
; The composed mask is <A0,A1,B0,B1>, so on the widened operands the pair of
; shuffles becomes a single punpcklqdq.
1886  %A = load <4 x i8>, <4 x i8>* %a
1887  %B = load <4 x i8>, <4 x i8>* %b
1888  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
1889  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1890  ret <4 x i8> %2
1891}
1892
1893define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
1894; SSE2-LABEL: combine_test3c:
1895; SSE2:       # BB#0:
1896; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1897; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1898; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1899; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1900; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1901; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1902; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1903; SSE2-NEXT:    retq
1904;
1905; SSSE3-LABEL: combine_test3c:
1906; SSSE3:       # BB#0:
1907; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1908; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1909; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1910; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1911; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1912; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1913; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1914; SSSE3-NEXT:    retq
1915;
1916; SSE41-LABEL: combine_test3c:
1917; SSE41:       # BB#0:
1918; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1919; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1920; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1921; SSE41-NEXT:    retq
1922;
1923; AVX-LABEL: combine_test3c:
1924; AVX:       # BB#0:
1925; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1926; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1927; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1928; AVX-NEXT:    retq
; The composed mask is <B2,B3,A2,A3>, so on the widened operands the pair of
; shuffles becomes a single punpckhqdq.
1929  %A = load <4 x i8>, <4 x i8>* %a
1930  %B = load <4 x i8>, <4 x i8>* %b
1931  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1932  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1933  ret <4 x i8> %2
1934}
1935
1936define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
1937; SSE2-LABEL: combine_test4c:
1938; SSE2:       # BB#0:
1939; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1940; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1941; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1942; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1943; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1944; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1945; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1946; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1947; SSE2-NEXT:    retq
1948;
1949; SSSE3-LABEL: combine_test4c:
1950; SSSE3:       # BB#0:
1951; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1952; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1953; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1954; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1955; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1956; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1957; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1958; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1959; SSSE3-NEXT:    retq
1960;
1961; SSE41-LABEL: combine_test4c:
1962; SSE41:       # BB#0:
1963; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1964; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1965; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1966; SSE41-NEXT:    retq
1967;
1968; AVX1-LABEL: combine_test4c:
1969; AVX1:       # BB#0:
1970; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1971; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1972; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1973; AVX1-NEXT:    retq
1974;
1975; AVX2-LABEL: combine_test4c:
1976; AVX2:       # BB#0:
1977; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1978; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1979; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1980; AVX2-NEXT:    retq
; The composed mask is <B0,A1,B2,B3>: a blend taking only lane 1 from %A, so
; SSE4.1+ targets emit a single (p)blend on the widened operands.
1981  %A = load <4 x i8>, <4 x i8>* %a
1982  %B = load <4 x i8>, <4 x i8>* %b
1983  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1984  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1985  ret <4 x i8> %2
1986}
1987
1988
1989; The following test cases are generated from this C++ code
1990;
1991;__m128 blend_01(__m128 a, __m128 b)
1992;{
1993;  __m128 s = a;
1994;  s = _mm_blend_ps( s, b, 1<<0 );
1995;  s = _mm_blend_ps( s, b, 1<<1 );
1996;  return s;
1997;}
1998;
1999;__m128 blend_02(__m128 a, __m128 b)
2000;{
2001;  __m128 s = a;
2002;  s = _mm_blend_ps( s, b, 1<<0 );
2003;  s = _mm_blend_ps( s, b, 1<<2 );
2004;  return s;
2005;}
2006;
2007;__m128 blend_123(__m128 a, __m128 b)
2008;{
2009;  __m128 s = a;
2010;  s = _mm_blend_ps( s, b, 1<<1 );
2011;  s = _mm_blend_ps( s, b, 1<<2 );
2012;  s = _mm_blend_ps( s, b, 1<<3 );
2013;  return s;
2014;}
2015
2016; Ideally, we should collapse the following shuffles into a single one.
2017
2018define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
2019; SSE2-LABEL: combine_blend_01:
2020; SSE2:       # BB#0:
2021; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2022; SSE2-NEXT:    retq
2023;
2024; SSSE3-LABEL: combine_blend_01:
2025; SSSE3:       # BB#0:
2026; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2027; SSSE3-NEXT:    retq
2028;
2029; SSE41-LABEL: combine_blend_01:
2030; SSE41:       # BB#0:
2031; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2032; SSE41-NEXT:    retq
2033;
2034; AVX-LABEL: combine_blend_01:
2035; AVX:       # BB#0:
2036; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2037; AVX-NEXT:    retq
; The two blends compose to <b0,b1,a2,a3> (the undef lane lets them merge),
; giving a single movsd/blendpd.
2038  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
2039  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
2040  ret <4 x float> %shuffle6
2041}
2042
2043define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
2044; SSE2-LABEL: combine_blend_02:
2045; SSE2:       # BB#0:
2046; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
2047; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
2048; SSE2-NEXT:    movaps %xmm1, %xmm0
2049; SSE2-NEXT:    retq
2050;
2051; SSSE3-LABEL: combine_blend_02:
2052; SSSE3:       # BB#0:
2053; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
2054; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
2055; SSSE3-NEXT:    movaps %xmm1, %xmm0
2056; SSSE3-NEXT:    retq
2057;
2058; SSE41-LABEL: combine_blend_02:
2059; SSE41:       # BB#0:
2060; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2061; SSE41-NEXT:    retq
2062;
2063; AVX-LABEL: combine_blend_02:
2064; AVX:       # BB#0:
2065; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2066; AVX-NEXT:    retq
; The two blends compose to <b0,a1,b2,a3>; SSE4.1+ emits one blendps, while
; pre-SSE4.1 needs a two-shufps emulation.
2067  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
2068  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
2069  ret <4 x float> %shuffle6
2070}
2071
2072define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
2073; SSE2-LABEL: combine_blend_123:
2074; SSE2:       # BB#0:
2075; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2076; SSE2-NEXT:    movaps %xmm1, %xmm0
2077; SSE2-NEXT:    retq
2078;
2079; SSSE3-LABEL: combine_blend_123:
2080; SSSE3:       # BB#0:
2081; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2082; SSSE3-NEXT:    movaps %xmm1, %xmm0
2083; SSSE3-NEXT:    retq
2084;
2085; SSE41-LABEL: combine_blend_123:
2086; SSE41:       # BB#0:
2087; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2088; SSE41-NEXT:    retq
2089;
2090; AVX-LABEL: combine_blend_123:
2091; AVX:       # BB#0:
2092; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2093; AVX-NEXT:    retq
; The chain of three blends composes to <a0,b1,b2,b3>, collapsing to a single
; movss/blendps.
2094  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
2095  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
2096  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
2097  ret <4 x float> %shuffle12
2098}
2099
2100define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
2101; SSE-LABEL: combine_test_movhl_1:
2102; SSE:       # BB#0:
2103; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2104; SSE-NEXT:    movdqa %xmm1, %xmm0
2105; SSE-NEXT:    retq
2106;
2107; AVX-LABEL: combine_test_movhl_1:
2108; AVX:       # BB#0:
2109; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2110; AVX-NEXT:    retq
; The pair of shuffles composes to <b2,b3,a2,a3>: punpckhqdq with %b first.
2111  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
2112  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
2113  ret <4 x i32> %2
2114}
2115
2116define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
2117; SSE-LABEL: combine_test_movhl_2:
2118; SSE:       # BB#0:
2119; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2120; SSE-NEXT:    movdqa %xmm1, %xmm0
2121; SSE-NEXT:    retq
2122;
2123; AVX-LABEL: combine_test_movhl_2:
2124; AVX:       # BB#0:
2125; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2126; AVX-NEXT:    retq
; Different intermediate masks, same composed result <b2,b3,a2,a3>: a single
; punpckhqdq with %b first.
2127  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
2128  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
2129  ret <4 x i32> %2
2130}
2131
2132define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
2133; SSE-LABEL: combine_test_movhl_3:
2134; SSE:       # BB#0:
2135; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2136; SSE-NEXT:    movdqa %xmm1, %xmm0
2137; SSE-NEXT:    retq
2138;
2139; AVX-LABEL: combine_test_movhl_3:
2140; AVX:       # BB#0:
2141; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2142; AVX-NEXT:    retq
; Yet another mask pair composing to <b2,b3,a2,a3>: a single punpckhqdq with
; %b first.
2143  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
2144  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
2145  ret <4 x i32> %2
2146}
2147
2148
2149; Verify that we fold shuffles according to rule:
2150;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2151
2152define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
2153; SSE2-LABEL: combine_undef_input_test1:
2154; SSE2:       # BB#0:
2155; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2156; SSE2-NEXT:    retq
2157;
2158; SSSE3-LABEL: combine_undef_input_test1:
2159; SSSE3:       # BB#0:
2160; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2161; SSSE3-NEXT:    retq
2162;
2163; SSE41-LABEL: combine_undef_input_test1:
2164; SSE41:       # BB#0:
2165; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2166; SSE41-NEXT:    retq
2167;
2168; AVX-LABEL: combine_undef_input_test1:
2169; AVX:       # BB#0:
2170; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2171; AVX-NEXT:    retq
; (shuffle(shuffle %a, undef), %b) composes to <b0,b1,a2,a3>: one
; movsd/blendpd.
2172  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2173  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2174  ret <4 x float> %2
2175}
2176
2177define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
2178; SSE-LABEL: combine_undef_input_test2:
2179; SSE:       # BB#0:
2180; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2181; SSE-NEXT:    retq
2182;
2183; AVX-LABEL: combine_undef_input_test2:
2184; AVX:       # BB#0:
2185; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2186; AVX-NEXT:    retq
; (shuffle(shuffle %a, undef), %b) composes to <a0,a1,b0,b1>: one unpcklpd.
2187  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2188  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2189  ret <4 x float> %2
2190}
2191
2192define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
2193; SSE-LABEL: combine_undef_input_test3:
2194; SSE:       # BB#0:
2195; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2196; SSE-NEXT:    retq
2197;
2198; AVX-LABEL: combine_undef_input_test3:
2199; AVX:       # BB#0:
2200; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2201; AVX-NEXT:    retq
; Composes to <a0,a1,b0,undef>; the undef lane still allows a single
; unpcklpd.
2202  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2203  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2204  ret <4 x float> %2
2205}
2206
2207define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
2208; SSE-LABEL: combine_undef_input_test4:
2209; SSE:       # BB#0:
2210; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2211; SSE-NEXT:    retq
2212;
2213; AVX-LABEL: combine_undef_input_test4:
2214; AVX:       # BB#0:
2215; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2216; AVX-NEXT:    retq
; Composes to <b2,b3,a2,a3>: one movhlps/unpckhpd with %b first.
2217  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2218  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2219  ret <4 x float> %2
2220}
2221
2222define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
2223; SSE2-LABEL: combine_undef_input_test5:
2224; SSE2:       # BB#0:
2225; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2226; SSE2-NEXT:    movapd %xmm1, %xmm0
2227; SSE2-NEXT:    retq
2228;
2229; SSSE3-LABEL: combine_undef_input_test5:
2230; SSSE3:       # BB#0:
2231; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2232; SSSE3-NEXT:    movapd %xmm1, %xmm0
2233; SSSE3-NEXT:    retq
2234;
2235; SSE41-LABEL: combine_undef_input_test5:
2236; SSE41:       # BB#0:
2237; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2238; SSE41-NEXT:    retq
2239;
2240; AVX-LABEL: combine_undef_input_test5:
2241; AVX:       # BB#0:
2242; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2243; AVX-NEXT:    retq
; Composes to <a0,a1,b2,b3>: one movsd/blendpd keeping %a's low half.
2244  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2245  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2246  ret <4 x float> %2
2247}
2248
2249
2250; Verify that we fold shuffles according to rule:
2251;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2252
2253define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
2254; ALL-LABEL: combine_undef_input_test6:
2255; ALL:       # BB#0:
2256; ALL-NEXT:    retq
; Both inputs are %a and the masks compose to the identity, so no shuffle
; instruction should be emitted.
2257  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2258  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
2259  ret <4 x float> %2
2260}
2261
2262define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
2263; SSE2-LABEL: combine_undef_input_test7:
2264; SSE2:       # BB#0:
2265; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2266; SSE2-NEXT:    retq
2267;
2268; SSSE3-LABEL: combine_undef_input_test7:
2269; SSSE3:       # BB#0:
2270; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2271; SSSE3-NEXT:    retq
2272;
2273; SSE41-LABEL: combine_undef_input_test7:
2274; SSE41:       # BB#0:
2275; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2276; SSE41-NEXT:    retq
2277;
2278; AVX-LABEL: combine_undef_input_test7:
2279; AVX:       # BB#0:
2280; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2281; AVX-NEXT:    retq
; Composes to <a0,a1,a0,a1>: a 64-bit splat of %a's low half (movddup where
; available, movlhps on plain SSE2).
2282  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2283  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
2284  ret <4 x float> %2
2285}
2286
2287define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
2288; SSE2-LABEL: combine_undef_input_test8:
2289; SSE2:       # BB#0:
2290; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2291; SSE2-NEXT:    retq
2292;
2293; SSSE3-LABEL: combine_undef_input_test8:
2294; SSSE3:       # BB#0:
2295; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2296; SSSE3-NEXT:    retq
2297;
2298; SSE41-LABEL: combine_undef_input_test8:
2299; SSE41:       # BB#0:
2300; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
2301; SSE41-NEXT:    retq
2302;
2303; AVX-LABEL: combine_undef_input_test8:
2304; AVX:       # BB#0:
2305; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2306; AVX-NEXT:    retq
; Composes to <a0,a1,a0,undef>; the undef lane still matches the 64-bit splat
; of %a's low half.
2307  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2308  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
2309  ret <4 x float> %2
2310}
2311
2312define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
2313; SSE-LABEL: combine_undef_input_test9:
2314; SSE:       # BB#0:
2315; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
2316; SSE-NEXT:    retq
2317;
2318; AVX-LABEL: combine_undef_input_test9:
2319; AVX:       # BB#0:
2320; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
2321; AVX-NEXT:    retq
; Composes to <a2,a3,a2,a3>: a 64-bit splat of %a's high half.
2322  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2323  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
2324  ret <4 x float> %2
2325}
2326
2327define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
2328; ALL-LABEL: combine_undef_input_test10:
2329; ALL:       # BB#0:
2330; ALL-NEXT:    retq
; Both inputs are %a and the masks compose to the identity, so no shuffle
; instruction should be emitted.
2331  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2332  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2333  ret <4 x float> %2
2334}
2335
2336define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
2337; SSE2-LABEL: combine_undef_input_test11:
2338; SSE2:       # BB#0:
2339; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2340; SSE2-NEXT:    retq
2341;
2342; SSSE3-LABEL: combine_undef_input_test11:
2343; SSSE3:       # BB#0:
2344; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2345; SSSE3-NEXT:    retq
2346;
2347; SSE41-LABEL: combine_undef_input_test11:
2348; SSE41:       # BB#0:
2349; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2350; SSE41-NEXT:    retq
2351;
2352; AVX-LABEL: combine_undef_input_test11:
2353; AVX:       # BB#0:
2354; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2355; AVX-NEXT:    retq
; Same as combine_undef_input_test1 but with the inner shuffle as the second
; operand of the outer one; still composes to <b0,b1,a2,a3>.
2356  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2357  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
2358  ret <4 x float> %2
2359}
2360
; The pair folds to a single unpcklpd interleaving the low doubles of %a
; and %b.
define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test12:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2375
; Same result as test12 via a different mask pair; still folds to a single
; unpcklpd of the low doubles of %a and %b.
define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
; SSE:       # BB#0:
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test13:
; AVX:       # BB#0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}
2390
; The pair folds to a single high-double interleave of %b and %a
; (movhlps / unpckhpd).
define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
; SSE:       # BB#0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test14:
; AVX:       # BB#0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2405
; The pair folds to a single double-width blend of the low 64 bits of %a
; with the high 64 bits of %b (movsd before SSE4.1, blendpd after).
define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test15:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT:    movapd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test15:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test15:
; AVX:       # BB#0:
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2432
2433
2434; Verify that shuffles are canonicalized according to rules:
2435;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2436;
; This makes it possible to trigger the following combine rule:
2438;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2439;
2440; As a result, all the shuffle pairs in each function below should be
2441; combined into a single legal shuffle operation.
2442
; Mirror of test1 with the shuffle operands swapped in the second shuffle;
; canonicalization lets the pair fold to the identity (no code emitted).
define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test16:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}
2451
; The pair folds to a single low-double broadcast of %a (movlhps on plain
; SSE2, movddup once SSE3 is available).
define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}
2476
; Same result as test17 via a different mask pair; still folds to a single
; low-double broadcast of %a.
define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2:       # BB#0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41:       # BB#0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX:       # BB#0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}
2501
; The pair folds to a single high-double duplicate of %a (mask [1,1]).
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE:       # BB#0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}
2516
; The second shuffle reassembles %a in its original element order, so the
; pair folds to the identity and no shuffle instruction is emitted.
define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test20:
; ALL:       # BB#0:
; ALL-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2525
2526; These tests are designed to test the ability to combine away unnecessary
2527; operations feeding into a shuffle. The AVX cases are the important ones as
2528; they leverage operations which cannot be done naturally on the entire vector
2529; and thus are decomposed into multiple smaller operations.
2530
; The shuffle mask only reads elements 4-7, so only the high 128-bit half of
; the add result is actually needed. SSE code adds only xmm1 (the high half);
; AVX1 still performs the extract/insert dance before the cross-lane shuffle.
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}
2558
; The mask only reads the high halves of %b and of the add result, so only
; the high 128-bit half of the add is needed (SSE adds only xmm1 and
; shuffles only xmm1/xmm3).
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}
2586
; The shuffle pair combines into a single insertps placing element 2 of %b
; into lane 0 of %a; pre-SSE4.1 targets lower it as two shufps instead.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}
2616
; The shuffle pair combines into a single insertps placing element 2 of %b
; into lane 1 of %a; pre-SSE4.1 targets lower it as two shufps instead.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}
2646
; The shuffle pair combines into a single insertps placing element 0 of %b
; into lane 2 of %a; pre-SSE4.1 targets lower it as two shufps instead.
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}
2674
; The shuffle pair combines into a single insertps placing element 0 of %b
; into lane 3 of %a; pre-SSE4.1 targets lower it as two shufps instead.
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}
2702
; A scalar double load, zero in the upper half, and a blend-with-zero
; shuffle should all fold into a single zero-extending movsd load.
define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
; SSE:       # BB#0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
  %1 = load double, double* %a0, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
  %4 = bitcast <2 x double> %3 to <4 x float>
  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  store <4 x float> %5, <4 x float>* %a1, align 16
  ret void
}
2723
2724; PR30371
; PR30371 regression test (float case): inserting a scalar into lane 0 and
; shuffling with a constant vector should become a single blend against a
; constant-pool load (movss on pre-SSE4.1 targets).
define <4 x float> @combine_constant_insertion_v4f32(float %f) {
; SSE2-LABEL: combine_constant_insertion_v4f32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4f32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4f32:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX-NEXT:    retq
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %ret
}
2753
; PR30371 regression test (integer case): same pattern as the float variant,
; lowered as a movd of the scalar plus a blend with a constant-pool vector.
define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
; SSE2-LABEL: combine_constant_insertion_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movd %edi, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_constant_insertion_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_constant_insertion_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX2-NEXT:    retq
  %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
  %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %ret
}
2790
; PR22377 regression test: %s2 is used both by the fadd and by the final
; interleave, so the shuffle combiner must not fold %s3 in a way that loses
; the odd/even split feeding the add.
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    addps %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}
2815
; PR22390 regression test: %s1 (the rotated %a) is used by both the blend
; and the fadd, so the two shuffles must not be merged into one that would
; drop the shared rotated value.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}
2854
; PR22412 regression test: a two-input 256-bit blend followed by an
; element-reversing shuffle must be lowered correctly across the 128-bit
; lane boundary (note the cross-lane vperm2f128 / vpermpd in the AVX output).
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: PR22412:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22412:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT:    movapd %xmm2, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSSE3-NEXT:    movaps %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22412:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}
2902