; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -mattr=+v6t2 | FileCheck %s --check-prefixes=CHECK,SCALAR
; RUN: llc < %s -mtriple=arm-eabi -mattr=+v6t2 -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON

declare i8 @llvm.fshl.i8(i8, i8, i8)
declare i16 @llvm.fshl.i16(i16, i16, i16)
declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i64 @llvm.fshl.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)

declare i8 @llvm.fshr.i8(i8, i8, i8)
declare i16 @llvm.fshr.i16(i16, i16, i16)
declare i32 @llvm.fshr.i32(i32, i32, i32)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)

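; Reference semantics, per the LangRef, with N the bit width and c = z mod N:
;   fshl(x, y, z) = high N bits of ((x concat y) << c) = (x << c) | (y >> (N - c))
;   fshr(x, y, z) = low N bits of ((x concat y) >> c)  = (x << (N - c)) | (y >> c)
; When c == 0, fshl returns x and fshr returns y.
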
; General case - all operands can be variables.

define i16 @fshl_i16(i16 %x, i16 %y, i16 %z) {
; CHECK-LABEL: fshl_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
; CHECK-NEXT:    and r1, r2, #15
; CHECK-NEXT:    lsl r0, r0, r1
; CHECK-NEXT:    lsr r0, r0, #16
; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
}
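; The i16 case is lowered by packing both halves into one register: pkhbt
; forms (x << 16) | y, a single variable lsl by (z & 15) funnels the bits,
; and the final lsr #16 extracts the high half.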

define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshl_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r3, #31
; CHECK-NEXT:    lsr r1, r1, #1
; CHECK-NEXT:    bic r3, r3, r2
; CHECK-NEXT:    and r2, r2, #31
; CHECK-NEXT:    lsl r0, r0, r2
; CHECK-NEXT:    orr r0, r0, r1, lsr r3
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
  ret i32 %f
}
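; Without a funnel-shift instruction, the variable i32 case expands to
;   (x << (z & 31)) | ((y >> 1) >> (~z & 31))
; Pre-shifting y by 1 keeps every shift amount in [0, 31], so z & 31 == 0
; cannot produce an undefined shift by 32 (the bic computes ~z & 31).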

; Verify that weird types are minimally supported.
declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR-LABEL: fshl_i37:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; SCALAR-NEXT:    push {r4, r5, r6, r7, r8, lr}
; SCALAR-NEXT:    mov r4, r1
; SCALAR-NEXT:    mov r8, r0
; SCALAR-NEXT:    ldr r0, [sp, #24]
; SCALAR-NEXT:    mov r5, r3
; SCALAR-NEXT:    ldr r1, [sp, #28]
; SCALAR-NEXT:    mov r6, r2
; SCALAR-NEXT:    mov r2, #37
; SCALAR-NEXT:    mov r3, #0
; SCALAR-NEXT:    bl __aeabi_uldivmod
; SCALAR-NEXT:    lsl r1, r5, #27
; SCALAR-NEXT:    ands r12, r2, #32
; SCALAR-NEXT:    orr r1, r1, r6, lsr #5
; SCALAR-NEXT:    mov r3, r8
; SCALAR-NEXT:    and r5, r2, #31
; SCALAR-NEXT:    mov r0, #31
; SCALAR-NEXT:    movne r3, r1
; SCALAR-NEXT:    cmp r12, #0
; SCALAR-NEXT:    bic r2, r0, r2
; SCALAR-NEXT:    lslne r1, r6, #27
; SCALAR-NEXT:    movne r4, r8
; SCALAR-NEXT:    lsl r7, r3, r5
; SCALAR-NEXT:    lsr r0, r1, #1
; SCALAR-NEXT:    lsl r1, r4, r5
; SCALAR-NEXT:    lsr r3, r3, #1
; SCALAR-NEXT:    orr r0, r7, r0, lsr r2
; SCALAR-NEXT:    orr r1, r1, r3, lsr r2
; SCALAR-NEXT:    pop {r4, r5, r6, r7, r8, pc}
;
; NEON-LABEL: fshl_i37:
; NEON:       @ %bb.0:
; NEON-NEXT:    .save {r4, r5, r6, r7, r11, lr}
; NEON-NEXT:    push {r4, r5, r6, r7, r11, lr}
; NEON-NEXT:    mov r4, r1
; NEON-NEXT:    mov r5, r0
; NEON-NEXT:    ldr r0, [sp, #24]
; NEON-NEXT:    mov r7, r3
; NEON-NEXT:    ldr r1, [sp, #28]
; NEON-NEXT:    mov r6, r2
; NEON-NEXT:    mov r2, #37
; NEON-NEXT:    mov r3, #0
; NEON-NEXT:    bl __aeabi_uldivmod
; NEON-NEXT:    mov r0, #31
; NEON-NEXT:    bic r1, r0, r2
; NEON-NEXT:    lsl r0, r7, #27
; NEON-NEXT:    ands r12, r2, #32
; NEON-NEXT:    orr r0, r0, r6, lsr #5
; NEON-NEXT:    mov r7, r5
; NEON-NEXT:    and r2, r2, #31
; NEON-NEXT:    movne r7, r0
; NEON-NEXT:    lslne r0, r6, #27
; NEON-NEXT:    cmp r12, #0
; NEON-NEXT:    lsl r3, r7, r2
; NEON-NEXT:    lsr r0, r0, #1
; NEON-NEXT:    movne r4, r5
; NEON-NEXT:    orr r0, r3, r0, lsr r1
; NEON-NEXT:    lsr r3, r7, #1
; NEON-NEXT:    lsl r2, r4, r2
; NEON-NEXT:    orr r1, r2, r3, lsr r1
; NEON-NEXT:    pop {r4, r5, r6, r7, r11, pc}
  %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
  ret i37 %f
}
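; i37 is legalized by promotion to i64, so the shift amount must first be
; reduced modulo 37. This target has no 64-bit division instruction, hence
; the RTABI libcall: __aeabi_uldivmod leaves the remainder z urem 37 in r2:r3.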

; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011
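; In decimal: ((112 << 2) | (127 >> 5)) & 127 = 64 | 3 = 67.
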
declare i7 @llvm.fshl.i7(i7, i7, i7)
define i7 @fshl_i7_const_fold() {
; CHECK-LABEL: fshl_i7_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #67
; CHECK-NEXT:    bx lr
  %f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
  ret i7 %f
}

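; The overshift cases below fold after reducing the shift amount mod 8:
;   fshl(255, 0, 15): 15 & 7 == 7 -> ((255 << 7) | (0 >> 1)) & 255 = 128
;   fshl(15, 15, 11): 11 & 7 == 3 -> ((15 << 3) | (15 >> 5)) & 255 = 120
;   fshl(0, 225, 8):   8 & 7 == 0 -> returns x = 0
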
define i8 @fshl_i8_const_fold_overshift_1() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #128
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15)
  ret i8 %f
}

define i8 @fshl_i8_const_fold_overshift_2() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #120
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11)
  ret i8 %f
}

define i8 @fshl_i8_const_fold_overshift_3() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #0
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 0, i8 225, i8 8)
  ret i8 %f
}

; With a constant shift amount, this is equivalent to AArch64's 'extr'
; (extract from a register pair); ARM has no single instruction for it, so it
; lowers to lsl plus orr with a shifted operand.

define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_shift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r0, r0, r1, lsr #23
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
  ret i32 %f
}

; Check modulo math on shift amount: 41 mod 32 = 9, so this matches the
; constant-shift test above.

define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r0, r0, r1, lsr #23
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
  ret i32 %f
}

; 64-bit should also work. 105-64 = 41.

define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshl_i64_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r1, r3, #9
; CHECK-NEXT:    orr r2, r1, r2, lsr #23
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r1, r0, r3, lsr #23
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
  %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
  ret i64 %f
}
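; With the i64 halves in register pairs (x in r1:r0, y in r3:r2), the shift
; of 105 mod 64 = 41 splits into per-word shifts of 41-32 = 9 and 64-41 = 23:
;   lo = (y.hi << 9) | (y.lo >> 23), hi = (x.lo << 9) | (y.hi >> 23)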

; This should work without any node-specific logic.

define i8 @fshl_i8_const_fold() {
; CHECK-LABEL: fshl_i8_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #128
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
  ret i8 %f
}

; Repeat everything for funnel shift right.

; General case - all operands can be variables.

define i16 @fshr_i16(i16 %x, i16 %y, i16 %z) {
; CHECK-LABEL: fshr_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
; CHECK-NEXT:    and r1, r2, #15
; CHECK-NEXT:    lsr r0, r0, r1
; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
}
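; Same pkhbt packing as fshl_i16; shifting right leaves the result in the
; low half, so no trailing lsr #16 is needed.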

define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshr_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r3, #31
; CHECK-NEXT:    lsl r0, r0, #1
; CHECK-NEXT:    bic r3, r3, r2
; CHECK-NEXT:    and r2, r2, #31
; CHECK-NEXT:    lsl r0, r0, r3
; CHECK-NEXT:    orr r0, r0, r1, lsr r2
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
  ret i32 %f
}
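; Mirror image of the fshl_i32 expansion:
;   ((x << 1) << (~z & 31)) | (y >> (z & 31))
; with x pre-shifted by 1 so a shift amount of z & 31 == 0 stays defined.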

; Verify that weird types are minimally supported.
declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LABEL: fshr_i37:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    mov r4, r1
; CHECK-NEXT:    mov r6, r0
; CHECK-NEXT:    ldr r0, [sp, #24]
; CHECK-NEXT:    mov r5, r3
; CHECK-NEXT:    ldr r1, [sp, #28]
; CHECK-NEXT:    mov r7, r2
; CHECK-NEXT:    mov r2, #37
; CHECK-NEXT:    mov r3, #0
; CHECK-NEXT:    bl __aeabi_uldivmod
; CHECK-NEXT:    lsl r3, r5, #27
; CHECK-NEXT:    add r0, r2, #27
; CHECK-NEXT:    orr r3, r3, r7, lsr #5
; CHECK-NEXT:    mov r1, #31
; CHECK-NEXT:    ands r12, r0, #32
; CHECK-NEXT:    mov r5, r6
; CHECK-NEXT:    moveq r5, r3
; CHECK-NEXT:    bic r1, r1, r0
; CHECK-NEXT:    lsl r2, r5, #1
; CHECK-NEXT:    lsleq r3, r7, #27
; CHECK-NEXT:    cmp r12, #0
; CHECK-NEXT:    and r7, r0, #31
; CHECK-NEXT:    lsl r2, r2, r1
; CHECK-NEXT:    moveq r4, r6
; CHECK-NEXT:    orr r0, r2, r3, lsr r7
; CHECK-NEXT:    lsl r2, r4, #1
; CHECK-NEXT:    lsl r1, r2, r1
; CHECK-NEXT:    orr r1, r1, r5, lsr r7
; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, pc}
  %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
  ret i37 %f
}
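; Same urem-by-37 libcall as fshl_i37. The extra add of 27 (= 64 - 37)
; appears to rebias the shift amount, since the promoted y operand is
; pre-shifted into the top 37 bits of its i64 container before the funnel
; shift right.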

; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111
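; In decimal: ((112 << 5) | (127 >> 2)) & 127 = 0 | 31 = 31.
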
declare i7 @llvm.fshr.i7(i7, i7, i7)
define i7 @fshr_i7_const_fold() {
; CHECK-LABEL: fshr_i7_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #31
; CHECK-NEXT:    bx lr
  %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
  ret i7 %f
}

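; The fshr overshift cases also reduce the shift amount mod 8:
;   fshr(255, 0, 15): 15 & 7 == 7 -> ((255 << 1) | (0 >> 7)) & 255 = 254
;   fshr(15, 15, 11): 11 & 7 == 3 -> ((15 << 5) | (15 >> 3)) & 255 = 225
;   fshr(0, 255, 8):   8 & 7 == 0 -> returns y = 255
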
define i8 @fshr_i8_const_fold_overshift_1() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #254
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15)
  ret i8 %f
}

define i8 @fshr_i8_const_fold_overshift_2() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #225
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)
  ret i8 %f
}

define i8 @fshr_i8_const_fold_overshift_3() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #255
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)
  ret i8 %f
}

; With a constant shift amount, this is again an 'extr'-style extract,
; lowered to lsl + orr on ARM.

define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_shift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #23
; CHECK-NEXT:    orr r0, r0, r1, lsr #9
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
  ret i32 %f
}

; Check modulo math on shift amount. 41-32=9.

define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #23
; CHECK-NEXT:    orr r0, r0, r1, lsr #9
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
  ret i32 %f
}

; 64-bit should also work. 105-64 = 41.

define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshr_i64_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r2, r0, #23
; CHECK-NEXT:    lsl r1, r1, #23
; CHECK-NEXT:    orr r2, r2, r3, lsr #9
; CHECK-NEXT:    orr r1, r1, r0, lsr #9
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
  %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
  ret i64 %f
}
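; As in the fshl case, 105 mod 64 = 41 splits across the register pair:
;   lo = (x.lo << 23) | (y.hi >> 9), hi = (x.hi << 23) | (x.lo >> 9)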

; This should work without any node-specific logic.

define i8 @fshr_i8_const_fold() {
; CHECK-LABEL: fshr_i8_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #254
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
  ret i8 %f
}

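; Shifting by exactly the bit width is a no-op funnel: 32 mod 32 == 0, so
; fshl returns x (already in r0) and fshr returns y. The vector versions
; below fold the same way, lane by lane.
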
define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
  ret i32 %f
}

define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
  ret i32 %f
}

define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: fshl_v4i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    bx lr
  %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
  ret <4 x i32> %f
}

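; Under AAPCS the second <4 x i32> argument is passed entirely on the stack
; (x occupies r0-r3), so returning y takes loads: ldm for SCALAR, vld1 plus
; vmov for NEON.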
define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; SCALAR-LABEL: fshr_v4i32_shift_by_bitwidth:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    ldm sp, {r0, r1, r2, r3}
; SCALAR-NEXT:    bx lr
;
; NEON-LABEL: fshr_v4i32_shift_by_bitwidth:
; NEON:       @ %bb.0:
; NEON-NEXT:    mov r0, sp
; NEON-NEXT:    vld1.64 {d16, d17}, [r0]
; NEON-NEXT:    vmov r0, r1, d16
; NEON-NEXT:    vmov r2, r3, d17
; NEON-NEXT:    bx lr
  %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
  ret <4 x i32> %f
}