; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
3
; Even/odd deinterleave of two <8 x i8> loads feeding a multiply; both
; shuffle results are live, so this lowers to a single VUZP.8 on d-regs.
define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; even lanes
	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; odd lanes
        %tmp5 = mul <8 x i8> %tmp3, %tmp4
	ret <8 x i8> %tmp5
}
20
; Same deinterleave expressed as one wide shuffle (evens then odds); the
; VUZP.8 result pair is returned directly as a q-register in r0-r3.
define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	ret <16 x i8> %tmp3
}
35
; 16-bit element variant of vuzpi8: even/odd split plus multiply should
; select VUZP.16 on d-registers.
define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.16 d17, d16
; CHECK-NEXT:    vmul.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
	%tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
        %tmp5 = mul <4 x i16> %tmp3, %tmp4
	ret <4 x i16> %tmp5
}
52
; Wide-shuffle form for i16: both halves of the VUZP.16 result are
; returned as one <8 x i16> in r0-r3.
define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.16 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
	ret <8 x i16> %tmp3
}
67
; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors, so there are no
; 64-bit (<2 x i32>) VUZP tests here; the 128-bit cases follow.
69
; 128-bit version: even/odd deinterleave of two <16 x i8> vectors plus an
; add should select VUZP.8 on q-registers.
define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vadd.i8 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
	%tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
        %tmp5 = add <16 x i8> %tmp3, %tmp4
	ret <16 x i8> %tmp5
}
87
; Wide-shuffle form producing <32 x i8>: too big for registers, so the
; two VUZP.8 result q-regs are stored through the pointer passed in r0.
define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
	ret <32 x i8> %tmp3
}
102
; 128-bit i16 deinterleave plus add; expects VUZP.16 on q-registers.
define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
        %tmp5 = add <8 x i16> %tmp3, %tmp4
	ret <8 x i16> %tmp5
}
120
; Wide-shuffle i16 form producing <16 x i16>; result is stored through
; the pointer in r0 rather than returned in registers.
define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
	ret <16 x i16> %tmp3
}
135
; 128-bit i32 deinterleave plus add; expects VUZP.32 on q-registers.
define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
	%tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
        %tmp5 = add <4 x i32> %tmp3, %tmp4
	ret <4 x i32> %tmp5
}
153
; Wide-shuffle i32 form producing <8 x i32>; result stored through r0.
define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
	ret <8 x i32> %tmp3
}
168
; Float variant: same lane pattern as vuzpQi32 but with fadd; still uses
; the integer VUZP.32 (lane moves are type-agnostic).
define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x float>, <4 x float>* %A
	%tmp2 = load <4 x float>, <4 x float>* %B
	%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
	%tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
        %tmp5 = fadd <4 x float> %tmp3, %tmp4
	ret <4 x float> %tmp5
}
186
; Wide-shuffle float form producing <8 x float>; result stored through r0.
define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x float>, <4 x float>* %A
	%tmp2 = load <4 x float>, <4 x float>* %B
	%tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
	ret <8 x float> %tmp3
}
201
; Undef shuffle indices should not prevent matching to VUZP:
203
; Same as vuzpi8 but with some mask elements undef; lowering should still
; match VUZP.8 (undef lanes are compatible with any lane choice).
define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
	%tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
        %tmp5 = mul <8 x i8> %tmp3, %tmp4
	ret <8 x i8> %tmp5
}
220
; Wide-shuffle form with undef mask elements; still matches VUZP.8.
define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
	ret <16 x i8> %tmp3
}
235
; q-register i16 case with undef mask elements; still matches VUZP.16.
define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
	%tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
        %tmp5 = add <8 x i16> %tmp3, %tmp4
	ret <8 x i16> %tmp5
}
253
; Wide-shuffle i16 case with undef mask elements; result stored via r0.
define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
	ret <16 x i16> %tmp3
}
268
; The entire lower half of the mask is undef; lowering still uses VUZP.16
; and returns the register pair holding the odd lanes (d18/d19).
define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK:       @ BB#0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vuzp.16 q8, q9
; CHECK-NEXT:    vmov r0, r1, d18
; CHECK-NEXT:    vmov r2, r3, d19
; CHECK-NEXT:    mov pc, lr
entry:
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}
285
; Mask <0,0,1,3> repeats lane 0: lowered as a VDUP of lane 0 feeding the
; VUZP.32, then a VEXT.32 to rotate the desired lanes into position.
define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK:       @ BB#0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q9, d16[0]
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vext.32 q8, q9, q9, #2
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}
303
; Mask <1,3,0,2> is an odd/even (reversed) unzip: matched by VREV64.32 on
; one operand followed by VUZP.32, with the result stored through r2.
define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK:       @ BB#0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.32 q9, q8
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vst1.64 {d18, d19}, [r2]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}
320
define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vuzp to perform the final truncation for i8.
; CHECK-LABEL: vuzp_trunc:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r4, r5, r11, lr}
; CHECK-NEXT:    push {r4, r5, r11, lr}
; CHECK-NEXT:    add r12, sp, #48
; CHECK-NEXT:    add lr, sp, #16
; CHECK-NEXT:    add r4, sp, #64
; CHECK-NEXT:    add r5, sp, #32
; CHECK-NEXT:    vld1.64 {d16, d17}, [r5]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r4]
; CHECK-NEXT:    vld1.64 {d20, d21}, [lr]
; CHECK-NEXT:    vld1.64 {d22, d23}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vcgt.u32 q9, q11, q10
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmovn.i32 d17, q9
; CHECK-NEXT:    vmov.i8 d18, #0x7
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vneg.s8 d16, d18
; CHECK-NEXT:    vshl.i8 d17, d17, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.s8 d16, d17, d16
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r4, r5, r11, lr}
; CHECK-NEXT:    mov pc, lr
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}
356
; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
; to perform the vuzp and get the vbsl mask.
; Compare result (<4 x i1>) is concatenated with a truncated <4 x i8> load;
; the load is widened with vmovl.u8 so a single vuzp.8 can build the vbsl mask.
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldr r12, [sp, #40]
; CHECK-NEXT:    add lr, sp, #24
; CHECK-NEXT:    add r4, sp, #8
; CHECK-NEXT:    vld1.64 {d16, d17}, [r4]
; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
; CHECK-NEXT:    vld1.32 {d20[0]}, [r12:32]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    vmovl.u8 q9, d20
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r4, lr}
; CHECK-NEXT:    mov pc, lr
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}
393
; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector with some of the operands undefs.
; High half of the mask shuffle is undef: no second load/extend is needed,
; and the vuzp.8 operates with an arbitrary second operand (d18).
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    add lr, sp, #8
; CHECK-NEXT:    vld1.64 {d16, d17}, [lr]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}
426
; Low half of the mask shuffle is undef: instead of a VUZP, the lowering
; picks even bytes with a VTBL whose constant-pool index vector (.LCPI22_0)
; uses 0xff (out-of-range -> zero) for the don't-care lanes.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    add lr, sp, #8
; CHECK-NEXT:    vldr d20, .LCPI22_0
; CHECK-NEXT:    vld1.64 {d16, d17}, [lr]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vtbl.8 d16, {d16}, d20
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ BB#1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 6 @ 0x6
                         <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}
469
; We're using large data types here, and we have to fill with undef values until we
; get some vector size that we can represent.
; <10 x i8>/<5 x i32> are not legal NEON types: the <5 x i32> compares are
; widened and done piecewise with lane loads, the mask bytes are combined
; via vuzp.8 plus a vtbl (constant pool .LCPI23_0), and the final select
; is a q-register vbsl. The aligned frame (bic sp, sp, #15) comes from the
; stack-passed vector arguments.
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
; CHECK-LABEL: vuzp_wide_type:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    .setfp r11, sp, #16
; CHECK-NEXT:    add r11, sp, #16
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, sp, #8
; CHECK-NEXT:    bic sp, sp, #15
; CHECK-NEXT:    add r5, r11, #52
; CHECK-NEXT:    add r7, r11, #32
; CHECK-NEXT:    add r4, r11, #44
; CHECK-NEXT:    add r6, r11, #24
; CHECK-NEXT:    add r12, r11, #60
; CHECK-NEXT:    add lr, r11, #40
; CHECK-NEXT:    vld1.32 {d17[0]}, [r7:32]
; CHECK-NEXT:    vld1.32 {d19[0]}, [r5:32]
; CHECK-NEXT:    vld1.32 {d22[0]}, [r12:32]
; CHECK-NEXT:    ldr r12, [r11, #64]
; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
; CHECK-NEXT:    add r7, r11, #48
; CHECK-NEXT:    add r5, r11, #28
; CHECK-NEXT:    vld1.32 {d16[0]}, [r6:32]
; CHECK-NEXT:    vld1.32 {d18[0]}, [r4:32]
; CHECK-NEXT:    add r6, r11, #56
; CHECK-NEXT:    add r4, r11, #36
; CHECK-NEXT:    vcgt.u32 q10, q11, q10
; CHECK-NEXT:    vld1.32 {d19[1]}, [r6:32]
; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
; CHECK-NEXT:    add r6, r12, #4
; CHECK-NEXT:    vld1.32 {d18[1]}, [r7:32]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r5:32]
; CHECK-NEXT:    ldr r7, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d18, q10
; CHECK-NEXT:    vmov.32 d21[0], r7
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.u8 r7, d21[3]
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov.8 d23[0], r7
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    add r7, r11, #8
; CHECK-NEXT:    vldr d18, .LCPI23_0
; CHECK-NEXT:    vld1.8 {d23[1]}, [r6]
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d20, d16, d17
; CHECK-NEXT:    vmov.i8 q8, #0x7
; CHECK-NEXT:    vneg.s8 q8, q8
; CHECK-NEXT:    vtbl.8 d22, {d20, d21}, d18
; CHECK-NEXT:    vld1.64 {d18, d19}, [r7]
; CHECK-NEXT:    vshl.i8 q10, q11, #7
; CHECK-NEXT:    vmov d23, r2, r3
; CHECK-NEXT:    vmov d22, r0, r1
; CHECK-NEXT:    vshl.s8 q8, q10, q8
; CHECK-NEXT:    vbsl q8, q11, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    sub sp, r11, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ BB#1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 1 @ 0x1
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 3 @ 0x3
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 9 @ 0x9
; CHECK-NEXT:    .byte 10 @ 0xa
                            <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
  %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}
553
; Mimics the vuzp_u8 NEON intrinsic's return type: two <8 x i8> halves.
%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
; Both even- and odd-lane extractions of a single <16 x i8> argument;
; one operand is copied (vorr) so the in-place VUZP.8 can produce both
; struct fields in separate d-registers.
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
; CHECK-LABEL: vuzp_extract_subvector:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vorr d18, d17, d17
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d18
; CHECK-NEXT:    mov pc, lr

  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
  ret %struct.uint8x8x2_t %.fca.0.1.insert
}
572