; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

; rdar://12471808

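; The v_bsl* functions check that the bitwise-select pattern
; (a & b) | (~a & c), written out as plain and/xor/or IR, is combined into a
; single NEON bitwise-select instruction. Which variant is emitted depends on
; register allocation: here the destination register holds the "false"
; operand, so vbit is used (d16 <- (d17 & d18) | (d16 & ~d18)).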
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: v_bsli8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = load <8 x i8>, <8 x i8>* %C
	%tmp4 = and <8 x i8> %tmp1, %tmp2
	%tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
	%tmp6 = and <8 x i8> %tmp5, %tmp3
	%tmp7 = or <8 x i8> %tmp4, %tmp6
	ret <8 x i8> %tmp7
}

define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: v_bsli16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = load <4 x i16>, <4 x i16>* %C
	%tmp4 = and <4 x i16> %tmp1, %tmp2
	%tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
	%tmp6 = and <4 x i16> %tmp5, %tmp3
	%tmp7 = or <4 x i16> %tmp4, %tmp6
	ret <4 x i16> %tmp7
}

define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: v_bsli32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = load <2 x i32>, <2 x i32>* %C
	%tmp4 = and <2 x i32> %tmp1, %tmp2
	%tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
	%tmp6 = and <2 x i32> %tmp5, %tmp3
	%tmp7 = or <2 x i32> %tmp4, %tmp6
	ret <2 x i32> %tmp7
}

define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
; CHECK-LABEL: v_bsli64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <1 x i64>, <1 x i64>* %A
	%tmp2 = load <1 x i64>, <1 x i64>* %B
	%tmp3 = load <1 x i64>, <1 x i64>* %C
	%tmp4 = and <1 x i64> %tmp1, %tmp2
	%tmp5 = xor <1 x i64> %tmp1, < i64 -1 >
	%tmp6 = and <1 x i64> %tmp5, %tmp3
	%tmp7 = or <1 x i64> %tmp4, %tmp6
	ret <1 x i64> %tmp7
}

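; The v_bslQ* variants repeat the pattern for 128-bit (Q) vectors: the
; operands are loaded as D-register pairs with vld1.64 and selected with a
; single Q-register vbit.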
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
; CHECK-LABEL: v_bslQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = load <16 x i8>, <16 x i8>* %B
	%tmp3 = load <16 x i8>, <16 x i8>* %C
	%tmp4 = and <16 x i8> %tmp1, %tmp2
	%tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
	%tmp6 = and <16 x i8> %tmp5, %tmp3
	%tmp7 = or <16 x i8> %tmp4, %tmp6
	ret <16 x i8> %tmp7
}

define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: v_bslQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = load <8 x i16>, <8 x i16>* %B
	%tmp3 = load <8 x i16>, <8 x i16>* %C
	%tmp4 = and <8 x i16> %tmp1, %tmp2
	%tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
	%tmp6 = and <8 x i16> %tmp5, %tmp3
	%tmp7 = or <8 x i16> %tmp4, %tmp6
	ret <8 x i16> %tmp7
}

define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: v_bslQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = load <4 x i32>, <4 x i32>* %B
	%tmp3 = load <4 x i32>, <4 x i32>* %C
	%tmp4 = and <4 x i32> %tmp1, %tmp2
	%tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
	%tmp6 = and <4 x i32> %tmp5, %tmp3
	%tmp7 = or <4 x i32> %tmp4, %tmp6
	ret <4 x i32> %tmp7
}

define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: v_bslQi64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <2 x i64>, <2 x i64>* %A
	%tmp2 = load <2 x i64>, <2 x i64>* %B
	%tmp3 = load <2 x i64>, <2 x i64>* %C
	%tmp4 = and <2 x i64> %tmp1, %tmp2
	%tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
	%tmp6 = and <2 x i64> %tmp5, %tmp3
	%tmp7 = or <2 x i64> %tmp4, %tmp6
	ret <2 x i64> %tmp7
}

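; The remaining functions call the @llvm.arm.neon.vbsl.* intrinsics directly.
; The operands arrive per the AAPCS: 64-bit vectors pass %a in r0/r1, %b in
; r2/r3 and %c on the stack, while 128-bit vectors use r0-r3 for %a and place
; %b and %c at [sp] and [sp, #16].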
define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
  ret <8 x i8> %vbsl.i
}

define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
  ret <4 x i16> %vbsl3.i
}

define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
  ret <2 x i32> %vbsl3.i
}

define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f4:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind
  ret <2 x float> %vbsl4.i
}

define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
  ret <16 x i8> %vbsl.i
}

define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
  ret <8 x i16> %vbsl3.i
}

define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
  ret <4 x i32> %vbsl3.i
}

define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g4:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl4.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind
  ret <4 x float> %vbsl4.i
}

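; vbsl makes no signed/unsigned distinction, so the s64 and u64 tests call
; the same v1i64/v2i64 intrinsics and must produce identical code.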
define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbsl_s64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
  ret <1 x i64> %vbsl3.i
}

define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbsl_u64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
  ret <1 x i64> %vbsl3.i
}

define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbslq_s64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
  ret <2 x i64> %vbsl3.i
}

define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbslq_u64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
  ret <2 x i64> %vbsl3.i
}

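; Declarations for the NEON vbsl intrinsics exercised above.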
declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone