1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
3
4
5; i8 elements: both vectors are function arguments
6
; Keeps %b except for result bytes 0-1, which are bytes 0-1 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
7define <16 x i8> @insert_v16i8_2_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
8; CHECK-LABEL: insert_v16i8_2_1:
9; CHECK:       // %bb.0:
10; CHECK-NEXT:    mov v0.16b, v1.16b
11; CHECK-NEXT:    mov v0.h[0], v2.h[0]
12; CHECK-NEXT:    ret
13  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
14  ret <16 x i8> %s2
15}
16
; Keeps %b except for result bytes 2-3, which are bytes 0-1 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
17define <16 x i8> @insert_v16i8_2_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
18; CHECK-LABEL: insert_v16i8_2_2:
19; CHECK:       // %bb.0:
20; CHECK-NEXT:    mov v0.16b, v1.16b
21; CHECK-NEXT:    mov v0.h[1], v2.h[0]
22; CHECK-NEXT:    ret
23  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
24  ret <16 x i8> %s2
25}
26
; Keeps %b except for result bytes 12-13, which are bytes 0-1 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
27define <16 x i8> @insert_v16i8_2_6(float %tmp, <16 x i8> %b, <16 x i8> %a) {
28; CHECK-LABEL: insert_v16i8_2_6:
29; CHECK:       // %bb.0:
30; CHECK-NEXT:    mov v0.16b, v1.16b
31; CHECK-NEXT:    mov v0.h[6], v2.h[0]
32; CHECK-NEXT:    ret
33  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 30, i32 31>
34  ret <16 x i8> %s2
35}
36
; Keeps %b except for result bytes 0-3, which are bytes 0-3 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
37define <16 x i8> @insert_v16i8_4_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
38; CHECK-LABEL: insert_v16i8_4_1:
39; CHECK:       // %bb.0:
40; CHECK-NEXT:    mov v0.16b, v1.16b
41; CHECK-NEXT:    mov v0.s[0], v2.s[0]
42; CHECK-NEXT:    ret
43  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
44  ret <16 x i8> %s2
45}
46
; Keeps %b except for result bytes 2-5, which are bytes 0-3 of %a. The
; destination is not a multiple of 4 bytes, and the expected output below is a
; table lookup with a constant-pool mask rather than a single lane move.
; NOTE(review): the "_15" suffix presumably denotes position "1.5" - confirm.
47define <16 x i8> @insert_v16i8_4_15(float %tmp, <16 x i8> %b, <16 x i8> %a) {
48; CHECK-LABEL: insert_v16i8_4_15:
49; CHECK:       // %bb.0:
50; CHECK-NEXT:    adrp x8, .LCPI4_0
51; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q2_q3
52; CHECK-NEXT:    mov v3.16b, v1.16b
53; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
54; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
55; CHECK-NEXT:    ret
56  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
57  ret <16 x i8> %s2
58}
59
; Keeps %b except for result bytes 4-7, which are bytes 0-3 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
60define <16 x i8> @insert_v16i8_4_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
61; CHECK-LABEL: insert_v16i8_4_2:
62; CHECK:       // %bb.0:
63; CHECK-NEXT:    mov v0.16b, v1.16b
64; CHECK-NEXT:    mov v0.s[1], v2.s[0]
65; CHECK-NEXT:    ret
66  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
67  ret <16 x i8> %s2
68}
69
; Keeps %b except for result bytes 8-11, which are bytes 0-3 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
70define <16 x i8> @insert_v16i8_4_3(float %tmp, <16 x i8> %b, <16 x i8> %a) {
71; CHECK-LABEL: insert_v16i8_4_3:
72; CHECK:       // %bb.0:
73; CHECK-NEXT:    mov v0.16b, v1.16b
74; CHECK-NEXT:    mov v0.s[2], v2.s[0]
75; CHECK-NEXT:    ret
76  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31>
77  ret <16 x i8> %s2
78}
79
; Keeps %b except for result bytes 12-15, which are bytes 0-3 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
80define <16 x i8> @insert_v16i8_4_4(float %tmp, <16 x i8> %b, <16 x i8> %a) {
81; CHECK-LABEL: insert_v16i8_4_4:
82; CHECK:       // %bb.0:
83; CHECK-NEXT:    mov v0.16b, v1.16b
84; CHECK-NEXT:    mov v0.s[3], v2.s[0]
85; CHECK-NEXT:    ret
86  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 2, i32 3>
87  ret <16 x i8> %s2
88}
89
; 64-bit vector variant: result bytes 0-3 are bytes 0-3 of %a and result bytes
; 4-7 are bytes 4-7 of %b (mask indices 0-7 select from %a, 8-15 from %b).
90define <8 x i8> @insert_v8i8_4_1(float %tmp, <8 x i8> %b, <8 x i8> %a) {
91; CHECK-LABEL: insert_v8i8_4_1:
92; CHECK:       // %bb.0:
93; CHECK-NEXT:    fmov d0, d2
94; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
95; CHECK-NEXT:    mov v0.s[1], v1.s[1]
96; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
97; CHECK-NEXT:    ret
98  %s2 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
99  ret <8 x i8> %s2
100}
101
; 64-bit vector variant: result bytes 0-3 are bytes 0-3 of %b and result bytes
; 4-7 are bytes 0-3 of %a (mask indices 0-7 select from %a, 8-15 from %b).
102define <8 x i8> @insert_v8i8_4_2(float %tmp, <8 x i8> %b, <8 x i8> %a) {
103; CHECK-LABEL: insert_v8i8_4_2:
104; CHECK:       // %bb.0:
105; CHECK-NEXT:    fmov d0, d1
106; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
107; CHECK-NEXT:    mov v0.s[1], v2.s[0]
108; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
109; CHECK-NEXT:    ret
110  %s2 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
111  ret <8 x i8> %s2
112}
113
; Result bytes 0-7 are bytes 0-7 of %a; result bytes 8-15 are bytes 8-15 of %b
; (mask indices 0-15 select from %a, 16-31 select from %b).
114define <16 x i8> @insert_v16i8_8_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
115; CHECK-LABEL: insert_v16i8_8_1:
116; CHECK:       // %bb.0:
117; CHECK-NEXT:    mov v0.16b, v2.16b
118; CHECK-NEXT:    mov v0.d[1], v1.d[1]
119; CHECK-NEXT:    ret
120  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
121  ret <16 x i8> %s2
122}
123
; Result bytes 0-7 are bytes 0-7 of %b; result bytes 8-15 are bytes 0-7 of %a
; (mask indices 0-15 select from %a, 16-31 select from %b).
124define <16 x i8> @insert_v16i8_8_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
125; CHECK-LABEL: insert_v16i8_8_2:
126; CHECK:       // %bb.0:
127; CHECK-NEXT:    mov v0.16b, v1.16b
128; CHECK-NEXT:    mov v0.d[1], v2.d[0]
129; CHECK-NEXT:    ret
130  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
131  ret <16 x i8> %s2
132}
133
134; i16
135
; Keeps %b except for result elements 0-1, which are elements 0-1 of %a
; (mask indices 0-7 select from %a, 8-15 select from %b).
136define <8 x i16> @insert_v8i16_2_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
137; CHECK-LABEL: insert_v8i16_2_1:
138; CHECK:       // %bb.0:
139; CHECK-NEXT:    mov v0.16b, v1.16b
140; CHECK-NEXT:    mov v0.s[0], v2.s[0]
141; CHECK-NEXT:    ret
142  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
143  ret <8 x i16> %s2
144}
145
; Keeps %b except for result elements 1-2, which are elements 0-1 of %a. The
; destination does not start on an even element, and the expected output below
; is a table lookup with a constant-pool mask rather than a single lane move.
146define <8 x i16> @insert_v8i16_2_15(float %tmp, <8 x i16> %b, <8 x i16> %a) {
147; CHECK-LABEL: insert_v8i16_2_15:
148; CHECK:       // %bb.0:
149; CHECK-NEXT:    adrp x8, .LCPI13_0
150; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q2_q3
151; CHECK-NEXT:    mov v3.16b, v1.16b
152; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI13_0]
153; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
154; CHECK-NEXT:    ret
155  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
156  ret <8 x i16> %s2
157}
158
; Keeps %b except for result elements 2-3, which are elements 0-1 of %a
; (mask indices 0-7 select from %a, 8-15 select from %b).
159define <8 x i16> @insert_v8i16_2_2(float %tmp, <8 x i16> %b, <8 x i16> %a) {
160; CHECK-LABEL: insert_v8i16_2_2:
161; CHECK:       // %bb.0:
162; CHECK-NEXT:    mov v0.16b, v1.16b
163; CHECK-NEXT:    mov v0.s[1], v2.s[0]
164; CHECK-NEXT:    ret
165  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 0, i32 1, i32 12, i32 13, i32 14, i32 15>
166  ret <8 x i16> %s2
167}
168
; Keeps %b except for result elements 4-5, which are elements 0-1 of %a
; (mask indices 0-7 select from %a, 8-15 select from %b).
169define <8 x i16> @insert_v8i16_2_3(float %tmp, <8 x i16> %b, <8 x i16> %a) {
170; CHECK-LABEL: insert_v8i16_2_3:
171; CHECK:       // %bb.0:
172; CHECK-NEXT:    mov v0.16b, v1.16b
173; CHECK-NEXT:    mov v0.s[2], v2.s[0]
174; CHECK-NEXT:    ret
175  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 14, i32 15>
176  ret <8 x i16> %s2
177}
178
; Keeps %b except for result elements 6-7, which are elements 0-1 of %a
; (mask indices 0-7 select from %a, 8-15 select from %b).
179define <8 x i16> @insert_v8i16_2_4(float %tmp, <8 x i16> %b, <8 x i16> %a) {
180; CHECK-LABEL: insert_v8i16_2_4:
181; CHECK:       // %bb.0:
182; CHECK-NEXT:    mov v0.16b, v1.16b
183; CHECK-NEXT:    mov v0.s[3], v2.s[0]
184; CHECK-NEXT:    ret
185  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 1>
186  ret <8 x i16> %s2
187}
188
; 64-bit vector variant: result elements 0-1 are elements 0-1 of %a and result
; elements 2-3 are elements 2-3 of %b (mask indices 0-3 select from %a, 4-7
; from %b).
189define <4 x i16> @insert_v4i16_2_1(float %tmp, <4 x i16> %b, <4 x i16> %a) {
190; CHECK-LABEL: insert_v4i16_2_1:
191; CHECK:       // %bb.0:
192; CHECK-NEXT:    fmov d0, d2
193; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
194; CHECK-NEXT:    mov v0.s[1], v1.s[1]
195; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
196; CHECK-NEXT:    ret
197  %s2 = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
198  ret <4 x i16> %s2
199}
200
; 64-bit vector variant: result elements 0-1 are elements 0-1 of %b and result
; elements 2-3 are elements 0-1 of %a (mask indices 0-3 select from %a, 4-7
; from %b).
201define <4 x i16> @insert_v4i16_2_2(float %tmp, <4 x i16> %b, <4 x i16> %a) {
202; CHECK-LABEL: insert_v4i16_2_2:
203; CHECK:       // %bb.0:
204; CHECK-NEXT:    fmov d0, d1
205; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
206; CHECK-NEXT:    mov v0.s[1], v2.s[0]
207; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
208; CHECK-NEXT:    ret
209  %s2 = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
210  ret <4 x i16> %s2
211}
212
; Result elements 0-3 are elements 0-3 of %a; result elements 4-7 are elements
; 4-7 of %b (mask indices 0-7 select from %a, 8-15 select from %b).
213define <8 x i16> @insert_v8i16_4_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
214; CHECK-LABEL: insert_v8i16_4_1:
215; CHECK:       // %bb.0:
216; CHECK-NEXT:    mov v0.16b, v2.16b
217; CHECK-NEXT:    mov v0.d[1], v1.d[1]
218; CHECK-NEXT:    ret
219  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
220  ret <8 x i16> %s2
221}
222
; Result elements 0-3 are elements 0-3 of %b; result elements 4-7 are elements
; 0-3 of %a (mask indices 0-7 select from %a, 8-15 select from %b).
223define <8 x i16> @insert_v8i16_4_2(float %tmp, <8 x i16> %b, <8 x i16> %a) {
224; CHECK-LABEL: insert_v8i16_4_2:
225; CHECK:       // %bb.0:
226; CHECK-NEXT:    mov v0.16b, v1.16b
227; CHECK-NEXT:    mov v0.d[1], v2.d[0]
228; CHECK-NEXT:    ret
229  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
230  ret <8 x i16> %s2
231}
232
233; i32
234
; Result elements 0-1 are elements 0-1 of %a; result elements 2-3 are elements
; 2-3 of %b (mask indices 0-3 select from %a, 4-7 select from %b).
235define <4 x i32> @insert_v4i32_2_1(float %tmp, <4 x i32> %b, <4 x i32> %a) {
236; CHECK-LABEL: insert_v4i32_2_1:
237; CHECK:       // %bb.0:
238; CHECK-NEXT:    mov v0.16b, v2.16b
239; CHECK-NEXT:    mov v0.d[1], v1.d[1]
240; CHECK-NEXT:    ret
241  %s2 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
242  ret <4 x i32> %s2
243}
244
; Result elements 0-1 are elements 0-1 of %b; result elements 2-3 are elements
; 0-1 of %a (mask indices 0-3 select from %a, 4-7 select from %b).
245define <4 x i32> @insert_v4i32_2_2(float %tmp, <4 x i32> %b, <4 x i32> %a) {
246; CHECK-LABEL: insert_v4i32_2_2:
247; CHECK:       // %bb.0:
248; CHECK-NEXT:    mov v0.16b, v1.16b
249; CHECK-NEXT:    mov v0.d[1], v2.d[0]
250; CHECK-NEXT:    ret
251  %s2 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
252  ret <4 x i32> %s2
253}
254
255
256
257
258; i8 elements: the inserted subvector is loaded from memory
259
; Loads a <4 x i8> from %a, widens it to 16 lanes (%s1; lanes 4-15 use undef
; mask indices) and places the four loaded bytes at result bytes 0-3. All
; other result bytes come from %b.
260define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
261; CHECK-LABEL: load_v16i8_4_1:
262; CHECK:       // %bb.0:
263; CHECK-NEXT:    mov v0.16b, v1.16b
264; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
265; CHECK-NEXT:    ret
266  %l = load <4 x i8>, <4 x i8> *%a
267  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
268  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
269  ret <16 x i8> %s2
270}
271
; Loads a <4 x i8> from %a, widens it to 16 lanes (%s1) and places the four
; loaded bytes at result bytes 2-5; all other result bytes come from %b. The
; destination is not a multiple of 4 bytes, and the expected output below is a
; table lookup rather than a lane load.
272define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
273; CHECK-LABEL: load_v16i8_4_15:
274; CHECK:       // %bb.0:
275; CHECK-NEXT:    adrp x8, .LCPI24_0
276; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
277; CHECK-NEXT:    ldr s0, [x0]
278; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_0]
279; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
280; CHECK-NEXT:    ret
281  %l = load <4 x i8>, <4 x i8> *%a
282  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
283  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
284  ret <16 x i8> %s2
285}
286
; Loads a <4 x i8> from %a, widens it to 16 lanes (%s1) and places the four
; loaded bytes at result bytes 4-7. All other result bytes come from %b.
287define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
288; CHECK-LABEL: load_v16i8_4_2:
289; CHECK:       // %bb.0:
290; CHECK-NEXT:    mov v0.16b, v1.16b
291; CHECK-NEXT:    ld1 { v0.s }[1], [x0]
292; CHECK-NEXT:    ret
293  %l = load <4 x i8>, <4 x i8> *%a
294  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
295  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
296  ret <16 x i8> %s2
297}
298
; Loads a <4 x i8> from %a, widens it to 16 lanes (%s1) and places the four
; loaded bytes at result bytes 8-11. All other result bytes come from %b.
299define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
300; CHECK-LABEL: load_v16i8_4_3:
301; CHECK:       // %bb.0:
302; CHECK-NEXT:    mov v0.16b, v1.16b
303; CHECK-NEXT:    ld1 { v0.s }[2], [x0]
304; CHECK-NEXT:    ret
305  %l = load <4 x i8>, <4 x i8> *%a
306  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
307  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31>
308  ret <16 x i8> %s2
309}
310
; Loads a <4 x i8> from %a, widens it to 16 lanes (%s1) and places the four
; loaded bytes at result bytes 12-15. All other result bytes come from %b.
311define <16 x i8> @load_v16i8_4_4(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
312; CHECK-LABEL: load_v16i8_4_4:
313; CHECK:       // %bb.0:
314; CHECK-NEXT:    mov v0.16b, v1.16b
315; CHECK-NEXT:    ld1 { v0.s }[3], [x0]
316; CHECK-NEXT:    ret
317  %l = load <4 x i8>, <4 x i8> *%a
318  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
319  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 2, i32 3>
320  ret <16 x i8> %s2
321}
322
; 64-bit vector variant: loads a <4 x i8> from %a, widens it to 8 lanes (%s1)
; and uses it for result bytes 0-3. Result bytes 4-7 are bytes 4-7 of %b.
323define <8 x i8> @load_v8i8_4_1(float %tmp, <8 x i8> %b, <4 x i8> *%a) {
324; CHECK-LABEL: load_v8i8_4_1:
325; CHECK:       // %bb.0:
326; CHECK-NEXT:    ldr s0, [x0]
327; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
328; CHECK-NEXT:    mov v0.s[1], v1.s[1]
329; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
330; CHECK-NEXT:    ret
331  %l = load <4 x i8>, <4 x i8> *%a
332  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
333  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
334  ret <8 x i8> %s2
335}
336
; 64-bit vector variant: result bytes 0-3 are bytes 0-3 of %b, and result
; bytes 4-7 are the <4 x i8> loaded from %a (widened to 8 lanes in %s1).
337define <8 x i8> @load_v8i8_4_2(float %tmp, <8 x i8> %b, <4 x i8> *%a) {
338; CHECK-LABEL: load_v8i8_4_2:
339; CHECK:       // %bb.0:
340; CHECK-NEXT:    fmov d0, d1
341; CHECK-NEXT:    ldr s1, [x0]
342; CHECK-NEXT:    mov v0.s[1], v1.s[0]
343; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
344; CHECK-NEXT:    ret
345  %l = load <4 x i8>, <4 x i8> *%a
346  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
347  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
348  ret <8 x i8> %s2
349}
350
; Loads an <8 x i8> from %a, widens it to 16 lanes (%s1; lanes 8-15 use undef
; mask indices) and uses it for result bytes 0-7. Result bytes 8-15 are bytes
; 8-15 of %b.
351define <16 x i8> @load_v16i8_8_1(float %tmp, <16 x i8> %b, <8 x i8> *%a) {
352; CHECK-LABEL: load_v16i8_8_1:
353; CHECK:       // %bb.0:
354; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
355; CHECK-NEXT:    ldr d0, [x0]
356; CHECK-NEXT:    mov v0.d[1], v1.d[0]
357; CHECK-NEXT:    ret
358  %l = load <8 x i8>, <8 x i8> *%a
359  %s1 = shufflevector <8 x i8> %l, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
360  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
361  ret <16 x i8> %s2
362}
363
; Result bytes 0-7 are bytes 0-7 of %b; result bytes 8-15 are the <8 x i8>
; loaded from %a (widened to 16 lanes in %s1).
364define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, <8 x i8> *%a) {
365; CHECK-LABEL: load_v16i8_8_2:
366; CHECK:       // %bb.0:
367; CHECK-NEXT:    mov v0.16b, v1.16b
368; CHECK-NEXT:    ldr d1, [x0]
369; CHECK-NEXT:    mov v0.d[1], v1.d[0]
370; CHECK-NEXT:    ret
371  %l = load <8 x i8>, <8 x i8> *%a
372  %s1 = shufflevector <8 x i8> %l, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
373  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
374  ret <16 x i8> %s2
375}
376
377; i16
378
; Loads a <2 x i16> from %a, widens it to 8 lanes (%s1; lanes 2-7 use undef
; mask indices) and places the two loaded elements at result elements 0-1.
; All other result elements come from %b. The expected output below loads the
; two halfwords separately and narrows them before the lane move.
379define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
380; CHECK-LABEL: load_v8i16_2_1:
381; CHECK:       // %bb.0:
382; CHECK-NEXT:    ldrh w9, [x0]
383; CHECK-NEXT:    add x8, x0, #2
384; CHECK-NEXT:    mov v0.16b, v1.16b
385; CHECK-NEXT:    fmov s2, w9
386; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
387; CHECK-NEXT:    xtn v1.4h, v2.4s
388; CHECK-NEXT:    mov v0.s[0], v1.s[0]
389; CHECK-NEXT:    ret
390  %l = load <2 x i16>, <2 x i16> *%a
391  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
392  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
393  ret <8 x i16> %s2
394}
395
; Loads a <2 x i16> from %a, widens it to 8 lanes (%s1) and places the two
; loaded elements at result elements 1-2; result element 0 and elements 3-7
; come from %b. The destination does not start on an even element, and the
; expected output below ends in a table lookup with a constant-pool mask.
396define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
397; CHECK-LABEL: load_v8i16_2_15:
398; CHECK:       // %bb.0:
399; CHECK-NEXT:    ldrh w9, [x0]
400; CHECK-NEXT:    add x8, x0, #2
401; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
402; CHECK-NEXT:    fmov s2, w9
403; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
404; CHECK-NEXT:    adrp x8, .LCPI33_0
405; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI33_0]
406; CHECK-NEXT:    xtn v0.4h, v2.4s
407; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v3.16b
408; CHECK-NEXT:    ret
409  %l = load <2 x i16>, <2 x i16> *%a
410  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
411  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
412  ret <8 x i16> %s2
413}
414
; Loads a <2 x i16> from %a, widens it to 8 lanes (%s1) and places the two
; loaded elements at result elements 2-3. All other result elements come
; from %b.
415define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
416; CHECK-LABEL: load_v8i16_2_2:
417; CHECK:       // %bb.0:
418; CHECK-NEXT:    ldrh w9, [x0]
419; CHECK-NEXT:    add x8, x0, #2
420; CHECK-NEXT:    mov v0.16b, v1.16b
421; CHECK-NEXT:    fmov s2, w9
422; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
423; CHECK-NEXT:    xtn v1.4h, v2.4s
424; CHECK-NEXT:    mov v0.s[1], v1.s[0]
425; CHECK-NEXT:    ret
426  %l = load <2 x i16>, <2 x i16> *%a
427  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
428  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 0, i32 1, i32 12, i32 13, i32 14, i32 15>
429  ret <8 x i16> %s2
430}
431
; Loads a <2 x i16> from %a, widens it to 8 lanes (%s1) and places the two
; loaded elements at result elements 4-5. All other result elements come
; from %b.
432define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
433; CHECK-LABEL: load_v8i16_2_3:
434; CHECK:       // %bb.0:
435; CHECK-NEXT:    ldrh w9, [x0]
436; CHECK-NEXT:    add x8, x0, #2
437; CHECK-NEXT:    mov v0.16b, v1.16b
438; CHECK-NEXT:    fmov s2, w9
439; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
440; CHECK-NEXT:    xtn v1.4h, v2.4s
441; CHECK-NEXT:    mov v0.s[2], v1.s[0]
442; CHECK-NEXT:    ret
443  %l = load <2 x i16>, <2 x i16> *%a
444  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
445  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 14, i32 15>
446  ret <8 x i16> %s2
447}
448
; Loads a <2 x i16> from %a, widens it to 8 lanes (%s1) and places the two
; loaded elements at result elements 6-7. All other result elements come
; from %b.
449define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
450; CHECK-LABEL: load_v8i16_2_4:
451; CHECK:       // %bb.0:
452; CHECK-NEXT:    ldrh w9, [x0]
453; CHECK-NEXT:    add x8, x0, #2
454; CHECK-NEXT:    mov v0.16b, v1.16b
455; CHECK-NEXT:    fmov s2, w9
456; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
457; CHECK-NEXT:    xtn v1.4h, v2.4s
458; CHECK-NEXT:    mov v0.s[3], v1.s[0]
459; CHECK-NEXT:    ret
460  %l = load <2 x i16>, <2 x i16> *%a
461  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
462  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 1>
463  ret <8 x i16> %s2
464}
465
; 64-bit vector variant: loads a <2 x i16> from %a, widens it to 4 lanes (%s1)
; and uses it for result elements 0-1. Result elements 2-3 are elements 2-3
; of %b.
466define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, <2 x i16> *%a) {
467; CHECK-LABEL: load_v4i16_2_1:
468; CHECK:       // %bb.0:
469; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
470; CHECK-NEXT:    add x8, x0, #2
471; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
472; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
473; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
474; CHECK-NEXT:    mov v0.s[1], v1.s[1]
475; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
476; CHECK-NEXT:    ret
477  %l = load <2 x i16>, <2 x i16> *%a
478  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
479  %s2 = shufflevector <4 x i16> %s1, <4 x i16> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
480  ret <4 x i16> %s2
481}
482
; 64-bit vector variant: result elements 0-1 are elements 0-1 of %b, and
; result elements 2-3 are the <2 x i16> loaded from %a (widened to 4 lanes
; in %s1).
483define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, <2 x i16> *%a) {
484; CHECK-LABEL: load_v4i16_2_2:
485; CHECK:       // %bb.0:
486; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
487; CHECK-NEXT:    add x8, x0, #2
488; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
489; CHECK-NEXT:    uzp1 v2.4h, v0.4h, v0.4h
490; CHECK-NEXT:    fmov d0, d1
491; CHECK-NEXT:    mov v0.s[1], v2.s[0]
492; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
493; CHECK-NEXT:    ret
494  %l = load <2 x i16>, <2 x i16> *%a
495  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
496  %s2 = shufflevector <4 x i16> %s1, <4 x i16> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
497  ret <4 x i16> %s2
498}
499
; Loads a <4 x i16> from %a, widens it to 8 lanes (%s1; lanes 4-7 use undef
; mask indices) and uses it for result elements 0-3. Result elements 4-7 are
; elements 4-7 of %b.
500define <8 x i16> @load_v8i16_4_1(float %tmp, <8 x i16> %b, <4 x i16> *%a) {
501; CHECK-LABEL: load_v8i16_4_1:
502; CHECK:       // %bb.0:
503; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
504; CHECK-NEXT:    ldr d0, [x0]
505; CHECK-NEXT:    mov v0.d[1], v1.d[0]
506; CHECK-NEXT:    ret
507  %l = load <4 x i16>, <4 x i16> *%a
508  %s1 = shufflevector <4 x i16> %l, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
509  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
510  ret <8 x i16> %s2
511}
512
; Result elements 0-3 are elements 0-3 of %b; result elements 4-7 are the
; <4 x i16> loaded from %a (widened to 8 lanes in %s1).
513define <8 x i16> @load_v8i16_4_2(float %tmp, <8 x i16> %b, <4 x i16> *%a) {
514; CHECK-LABEL: load_v8i16_4_2:
515; CHECK:       // %bb.0:
516; CHECK-NEXT:    mov v0.16b, v1.16b
517; CHECK-NEXT:    ldr d1, [x0]
518; CHECK-NEXT:    mov v0.d[1], v1.d[0]
519; CHECK-NEXT:    ret
520  %l = load <4 x i16>, <4 x i16> *%a
521  %s1 = shufflevector <4 x i16> %l, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
522  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
523  ret <8 x i16> %s2
524}
525
526; i32
527
; Loads a <2 x i32> from %a, widens it to 4 lanes (%s1; lanes 2-3 use undef
; mask indices) and uses it for result elements 0-1. Result elements 2-3 are
; elements 2-3 of %b.
528define <4 x i32> @load_v4i32_2_1(float %tmp, <4 x i32> %b, <2 x i32> *%a) {
529; CHECK-LABEL: load_v4i32_2_1:
530; CHECK:       // %bb.0:
531; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
532; CHECK-NEXT:    ldr d0, [x0]
533; CHECK-NEXT:    mov v0.d[1], v1.d[0]
534; CHECK-NEXT:    ret
535  %l = load <2 x i32>, <2 x i32> *%a
536  %s1 = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
537  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
538  ret <4 x i32> %s2
539}
540
; Result elements 0-1 are elements 0-1 of %b; result elements 2-3 are the
; <2 x i32> loaded from %a (widened to 4 lanes in %s1).
541define <4 x i32> @load_v4i32_2_2(float %tmp, <4 x i32> %b, <2 x i32> *%a) {
542; CHECK-LABEL: load_v4i32_2_2:
543; CHECK:       // %bb.0:
544; CHECK-NEXT:    mov v0.16b, v1.16b
545; CHECK-NEXT:    ldr d1, [x0]
546; CHECK-NEXT:    mov v0.d[1], v1.d[0]
547; CHECK-NEXT:    ret
548  %l = load <2 x i32>, <2 x i32> *%a
549  %s1 = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
550  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
551  ret <4 x i32> %s2
552}
553
554; More than a single vector
555
; Concatenates two loaded <4 x i8> vectors: result bytes 0-3 come from the
; load of %a and result bytes 4-7 from the load of %b.
556define <8 x i8> @load2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
557; CHECK-LABEL: load2_v4i8:
558; CHECK:       // %bb.0:
559; CHECK-NEXT:    ldr s0, [x0]
560; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
561; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
562; CHECK-NEXT:    ret
563  %la = load <4 x i8>, <4 x i8> *%a
564  %lb = load <4 x i8>, <4 x i8> *%b
565  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
566  ret <8 x i8> %s1
567}
568
; Builds a 16-byte result from three loaded <4 x i8> vectors: bytes 0-3 from
; %a, bytes 4-7 from %b, and bytes 8-11 from the <4 x i8> that follows %a
; (%c). Bytes 12-15 select from the undef second operand of %s2.
; NOTE(review): %d is computed but never used in this function.
569define <16 x i8> @load3_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
570; CHECK-LABEL: load3_v4i8:
571; CHECK:       // %bb.0:
572; CHECK-NEXT:    ldp s0, s1, [x0]
573; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
574; CHECK-NEXT:    mov v0.d[1], v1.d[0]
575; CHECK-NEXT:    ret
576  %la = load <4 x i8>, <4 x i8> *%a
577  %lb = load <4 x i8>, <4 x i8> *%b
578  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
579  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
580  %lc = load <4 x i8>, <4 x i8> *%c
581  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
582  %s2 = shufflevector <4 x i8> %lc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
583  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
584  ret <16 x i8> %s3
585}
586
; Builds a 16-byte result from four loaded <4 x i8> vectors: bytes 0-3 from
; %a, bytes 4-7 from %b, bytes 8-11 from the <4 x i8> that follows %a (%c),
; and bytes 12-15 from the <4 x i8> that follows %b (%d).
587define <16 x i8> @load4_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
588; CHECK-LABEL: load4_v4i8:
589; CHECK:       // %bb.0:
590; CHECK-NEXT:    ldp s0, s1, [x0]
591; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
592; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
593; CHECK-NEXT:    mov v0.d[1], v1.d[0]
594; CHECK-NEXT:    ret
595  %la = load <4 x i8>, <4 x i8> *%a
596  %lb = load <4 x i8>, <4 x i8> *%b
597  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
598  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
599  %lc = load <4 x i8>, <4 x i8> *%c
600  %ld = load <4 x i8>, <4 x i8> *%d
601  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
602  %s2 = shufflevector <4 x i8> %lc, <4 x i8> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
603  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
604  ret <16 x i8> %s3
605}
606
; Concatenates the loads of %a and %b into an 8-byte vector (%s1), then
; concatenates %s1 with itself, so the 8-byte pair appears in both halves of
; the 16-byte result.
607define <16 x i8> @load2multi1_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
608; CHECK-LABEL: load2multi1_v4i8:
609; CHECK:       // %bb.0:
610; CHECK-NEXT:    ldr s0, [x0]
611; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
612; CHECK-NEXT:    mov v0.d[1], v0.d[0]
613; CHECK-NEXT:    ret
614  %la = load <4 x i8>, <4 x i8> *%a
615  %lb = load <4 x i8>, <4 x i8> *%b
616  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
617  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
618  ret <16 x i8> %s3
619}
620
; Each loaded <4 x i8> is concatenated with itself (%s1 = %la,%la and
; %s2 = %lb,%lb), and the two 8-byte vectors are then concatenated, giving
; the lane order %la, %la, %lb, %lb.
621define <16 x i8> @load2multi2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
622; CHECK-LABEL: load2multi2_v4i8:
623; CHECK:       // %bb.0:
624; CHECK-NEXT:    ldr s0, [x1]
625; CHECK-NEXT:    ldr s1, [x0]
626; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
627; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
628; CHECK-NEXT:    mov v0.d[1], v0.d[0]
629; CHECK-NEXT:    mov v1.d[1], v1.d[0]
630; CHECK-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
631; CHECK-NEXT:    ret
632  %la = load <4 x i8>, <4 x i8> *%a
633  %lb = load <4 x i8>, <4 x i8> *%b
634  %s1 = shufflevector <4 x i8> %la, <4 x i8> %la, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
635  %s2 = shufflevector <4 x i8> %lb, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
636  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
637  ret <16 x i8> %s3
638}
639
; Interleaves loads and stores on overlapping byte offsets of %i44:
;   - byte loads from offsets 26, 27, 24 and 25;
;   - a <4 x i8> load from offset 20 (%1), issued before the byte stores to
;     offsets 20 and 21;
;   - a <4 x i8> load from offset 28 (%3), issued before the byte store to
;     offset 30;
;   - an <8 x i8> store to offset 22 of %3 followed by %1, then a byte store
;     to offset 31.
; In the expected output both vector loads appear before the stores that
; overwrite the same bytes.
; NOTE(review): presumably this guards against combining the two <4 x i8>
; loads in a way that moves one past an overlapping store - confirm.
640define void @loads_before_stores(i8* %i44) {
641; CHECK-LABEL: loads_before_stores:
642; CHECK:       // %bb.0: // %bb
643; CHECK-NEXT:    add x8, x0, #20
644; CHECK-NEXT:    ldr s0, [x0, #28]
645; CHECK-NEXT:    ldrh w9, [x0, #26]
646; CHECK-NEXT:    ldrh w10, [x0, #24]
647; CHECK-NEXT:    ld1 { v0.s }[1], [x8]
648; CHECK-NEXT:    strh w9, [x0, #20]
649; CHECK-NEXT:    strh w10, [x0, #30]
650; CHECK-NEXT:    stur d0, [x0, #22]
651; CHECK-NEXT:    ret
652bb:
653  %i45 = getelementptr inbounds i8, i8* %i44, i64 20
654  %i46 = getelementptr inbounds i8, i8* %i44, i64 26
655  %i48 = load i8, i8* %i46, align 1
656  %i49 = getelementptr inbounds i8, i8* %i44, i64 21
657  %i50 = getelementptr inbounds i8, i8* %i44, i64 27
658  %i52 = load i8, i8* %i50, align 1
659  %i53 = getelementptr inbounds i8, i8* %i44, i64 22
660  %i54 = getelementptr inbounds i8, i8* %i44, i64 28
661  %i61 = getelementptr inbounds i8, i8* %i44, i64 24
662  %i62 = getelementptr inbounds i8, i8* %i44, i64 30
663  %i63 = load i8, i8* %i61, align 1
664  %i65 = getelementptr inbounds i8, i8* %i44, i64 25
665  %i66 = getelementptr inbounds i8, i8* %i44, i64 31
666  %i67 = load i8, i8* %i65, align 1
667  %0 = bitcast i8* %i45 to <4 x i8>*
668  %1 = load <4 x i8>, <4 x i8>* %0, align 1
669  store i8 %i48, i8* %i45, align 1
670  store i8 %i52, i8* %i49, align 1
671  %2 = bitcast i8* %i54 to <4 x i8>*
672  %3 = load <4 x i8>, <4 x i8>* %2, align 1
673  store i8 %i63, i8* %i62, align 1
674  %4 = shufflevector <4 x i8> %3, <4 x i8> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
675  %5 = bitcast i8* %i53 to <8 x i8>*
676  store <8 x i8> %4, <8 x i8>* %5, align 1
677  store i8 %i67, i8* %i66, align 1
678  ret void
679}
680