1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+bf16 < %s | FileCheck %s --check-prefixes=CHECK
3
4define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
5; CHECK-LABEL: insert_v2i64_nxv2i64:
6; CHECK:       // %bb.0:
7; CHECK-NEXT:    ptrue p0.d, vl2
8; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
9; CHECK-NEXT:    mov z0.d, p0/m, z1.d
10; CHECK-NEXT:    ret
11  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
12  ret <vscale x 2 x i64> %retval
13}
14
15define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
16; CHECK-LABEL: insert_v2i64_nxv2i64_idx2:
17; CHECK:       // %bb.0:
18; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
19; CHECK-NEXT:    addvl sp, sp, #-1
20; CHECK-NEXT:    cntd x8
21; CHECK-NEXT:    mov w9, #2
22; CHECK-NEXT:    sub x8, x8, #2
23; CHECK-NEXT:    ptrue p0.d
24; CHECK-NEXT:    cmp x8, #2
25; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
26; CHECK-NEXT:    csel x8, x8, x9, lo
27; CHECK-NEXT:    mov x9, sp
28; CHECK-NEXT:    lsl x8, x8, #3
29; CHECK-NEXT:    str q1, [x9, x8]
30; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
31; CHECK-NEXT:    addvl sp, sp, #1
32; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
33; CHECK-NEXT:    ret
34  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
35  ret <vscale x 2 x i64> %retval
36}
37
38define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
39; CHECK-LABEL: insert_v4i32_nxv4i32:
40; CHECK:       // %bb.0:
41; CHECK-NEXT:    ptrue p0.s, vl4
42; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
43; CHECK-NEXT:    mov z0.s, p0/m, z1.s
44; CHECK-NEXT:    ret
45  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
46  ret <vscale x 4 x i32> %retval
47}
48
49define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
50; CHECK-LABEL: insert_v4i32_nxv4i32_idx4:
51; CHECK:       // %bb.0:
52; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
53; CHECK-NEXT:    addvl sp, sp, #-1
54; CHECK-NEXT:    cntw x8
55; CHECK-NEXT:    mov w9, #4
56; CHECK-NEXT:    sub x8, x8, #4
57; CHECK-NEXT:    ptrue p0.s
58; CHECK-NEXT:    cmp x8, #4
59; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
60; CHECK-NEXT:    csel x8, x8, x9, lo
61; CHECK-NEXT:    mov x9, sp
62; CHECK-NEXT:    lsl x8, x8, #2
63; CHECK-NEXT:    str q1, [x9, x8]
64; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
65; CHECK-NEXT:    addvl sp, sp, #1
66; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
67; CHECK-NEXT:    ret
68  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 4)
69  ret <vscale x 4 x i32> %retval
70}
71
72define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
73; CHECK-LABEL: insert_v8i16_nxv8i16:
74; CHECK:       // %bb.0:
75; CHECK-NEXT:    ptrue p0.h, vl8
76; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
77; CHECK-NEXT:    mov z0.h, p0/m, z1.h
78; CHECK-NEXT:    ret
79  %retval = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0)
80  ret <vscale x 8 x i16> %retval
81}
82
83define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
84; CHECK-LABEL: insert_v8i16_nxv8i16_idx8:
85; CHECK:       // %bb.0:
86; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
87; CHECK-NEXT:    addvl sp, sp, #-1
88; CHECK-NEXT:    cnth x8
89; CHECK-NEXT:    mov w9, #8
90; CHECK-NEXT:    sub x8, x8, #8
91; CHECK-NEXT:    ptrue p0.h
92; CHECK-NEXT:    cmp x8, #8
93; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
94; CHECK-NEXT:    csel x8, x8, x9, lo
95; CHECK-NEXT:    mov x9, sp
96; CHECK-NEXT:    lsl x8, x8, #1
97; CHECK-NEXT:    str q1, [x9, x8]
98; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
99; CHECK-NEXT:    addvl sp, sp, #1
100; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
101; CHECK-NEXT:    ret
102  %retval = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 8)
103  ret <vscale x 8 x i16> %retval
104}
105
106define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
107; CHECK-LABEL: insert_v16i8_nxv16i8:
108; CHECK:       // %bb.0:
109; CHECK-NEXT:    ptrue p0.b, vl16
110; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
111; CHECK-NEXT:    mov z0.b, p0/m, z1.b
112; CHECK-NEXT:    ret
113  %retval = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0)
114  ret <vscale x 16 x i8> %retval
115}
116
117define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
118; CHECK-LABEL: insert_v16i8_nxv16i8_idx16:
119; CHECK:       // %bb.0:
120; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
121; CHECK-NEXT:    addvl sp, sp, #-1
122; CHECK-NEXT:    mov x8, #-16
123; CHECK-NEXT:    mov w9, #16
124; CHECK-NEXT:    ptrue p0.b
125; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
126; CHECK-NEXT:    addvl x8, x8, #1
127; CHECK-NEXT:    cmp x8, #16
128; CHECK-NEXT:    csel x8, x8, x9, lo
129; CHECK-NEXT:    mov x9, sp
130; CHECK-NEXT:    str q1, [x9, x8]
131; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
132; CHECK-NEXT:    addvl sp, sp, #1
133; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
134; CHECK-NEXT:    ret
135  %retval = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 16)
136  ret <vscale x 16 x i8> %retval
137}
138
139
140; Insert subvectors into illegal vectors
141
142define void @insert_nxv8i64_nxv16i64(<vscale x 8 x i64> %sv0, <vscale x 8 x i64> %sv1, <vscale x 16 x i64>* %out) {
143; CHECK-LABEL: insert_nxv8i64_nxv16i64:
144; CHECK:       // %bb.0:
145; CHECK-NEXT:    ptrue p0.d
146; CHECK-NEXT:    st1d { z7.d }, p0, [x0, #7, mul vl]
147; CHECK-NEXT:    st1d { z6.d }, p0, [x0, #6, mul vl]
148; CHECK-NEXT:    st1d { z5.d }, p0, [x0, #5, mul vl]
149; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #4, mul vl]
150; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
151; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
152; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
153; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
154; CHECK-NEXT:    ret
155  %v0 = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
156  %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> %v0, <vscale x 8 x i64> %sv1, i64 8)
157  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
158  ret void
159}
160
161define void @insert_nxv8i64_nxv16i64_lo(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
162; CHECK-LABEL: insert_nxv8i64_nxv16i64_lo:
163; CHECK:       // %bb.0:
164; CHECK-NEXT:    ptrue p0.d
165; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
166; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
167; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
168; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
169; CHECK-NEXT:    ret
170  %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
171  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
172  ret void
173}
174
175define void @insert_nxv8i64_nxv16i64_hi(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
176; CHECK-LABEL: insert_nxv8i64_nxv16i64_hi:
177; CHECK:       // %bb.0:
178; CHECK-NEXT:    ptrue p0.d
179; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #7, mul vl]
180; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #6, mul vl]
181; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #5, mul vl]
182; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #4, mul vl]
183; CHECK-NEXT:    ret
184  %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 8)
185  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
186  ret void
187}
188
189define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, <vscale x 16 x i64>* %out) uwtable {
190; CHECK-LABEL: insert_v2i64_nxv16i64:
191; CHECK:       // %bb.0:
192; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
193; CHECK-NEXT:    .cfi_def_cfa_offset 16
194; CHECK-NEXT:    .cfi_offset w29, -16
195; CHECK-NEXT:    addvl sp, sp, #-4
196; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
197; CHECK-NEXT:    ptrue p0.d
198; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
199; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
200; CHECK-NEXT:    str q1, [sp, #32]
201; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
202; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]
203; CHECK-NEXT:    ld1d { z2.d }, p0/z, [sp, #2, mul vl]
204; CHECK-NEXT:    ld1d { z3.d }, p0/z, [sp, #3, mul vl]
205; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
206; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
207; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
208; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
209; CHECK-NEXT:    addvl sp, sp, #4
210; CHECK-NEXT:    .cfi_def_cfa wsp, 16
211; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
212; CHECK-NEXT:    .cfi_def_cfa_offset 0
213; CHECK-NEXT:    .cfi_restore w29
214; CHECK-NEXT:    ret
215  %v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
216  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
217  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
218  ret void
219}
220
221define void @insert_v2i64_nxv16i64_lo0(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
222; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
223; CHECK:       // %bb.0:
224; CHECK-NEXT:    ldr q0, [x0]
225; CHECK-NEXT:    ptrue p0.d
226; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
227; CHECK-NEXT:    ret
228  %sv = load <2 x i64>, <2 x i64>* %psv
229  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
230  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
231  ret void
232}
233
234define void @insert_v2i64_nxv16i64_lo2(<2 x i64>* %psv, <vscale x 16 x i64>* %out) uwtable {
235; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
236; CHECK:       // %bb.0:
237; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
238; CHECK-NEXT:    .cfi_def_cfa_offset 16
239; CHECK-NEXT:    .cfi_offset w29, -16
240; CHECK-NEXT:    addvl sp, sp, #-2
241; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
242; CHECK-NEXT:    ldr q0, [x0]
243; CHECK-NEXT:    ptrue p0.d
244; CHECK-NEXT:    str q0, [sp, #16]
245; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
246; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]
247; CHECK-NEXT:    st1d { z1.d }, p0, [x1, #1, mul vl]
248; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
249; CHECK-NEXT:    addvl sp, sp, #2
250; CHECK-NEXT:    .cfi_def_cfa wsp, 16
251; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
252; CHECK-NEXT:    .cfi_def_cfa_offset 0
253; CHECK-NEXT:    .cfi_restore w29
254; CHECK-NEXT:    ret
255  %sv = load <2 x i64>, <2 x i64>* %psv
256  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
257  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
258  ret void
259}
260
261
262; Insert subvectors that need widening
263
264define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_undef() nounwind {
265; CHECK-LABEL: insert_nxv1i32_nxv4i32_undef:
266; CHECK:       // %bb.0: // %entry
267; CHECK-NEXT:    mov z0.s, #1 // =0x1
268; CHECK-NEXT:    ret
269entry:
270  %0 = insertelement <vscale x 1 x i32> undef, i32 1, i32 0
271  %subvec = shufflevector <vscale x 1 x i32> %0, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer
272  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> %subvec, i64 0)
273  ret <vscale x 4 x i32> %retval
274}
275
276define <vscale x 6 x i16> @insert_nxv1i16_nxv6i16_undef() nounwind {
277; CHECK-LABEL: insert_nxv1i16_nxv6i16_undef:
278; CHECK:       // %bb.0: // %entry
279; CHECK-NEXT:    mov z0.h, #1 // =0x1
280; CHECK-NEXT:    ret
281entry:
282  %0 = insertelement <vscale x 1 x i16> undef, i16 1, i32 0
283  %subvec = shufflevector <vscale x 1 x i16> %0, <vscale x 1 x i16> undef, <vscale x 1 x i32> zeroinitializer
284  %retval = call <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16> undef, <vscale x 1 x i16> %subvec, i64 0)
285  ret <vscale x 6 x i16> %retval
286}
287
288define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_undef(<vscale x 1 x float> %subvec) nounwind {
289; CHECK-LABEL: insert_nxv1f32_nxv4f32_undef:
290; CHECK:       // %bb.0: // %entry
291; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
292; CHECK-NEXT:    ret
293entry:
294  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> undef, <vscale x 1 x float> %subvec, i64 0)
295  ret <vscale x 4 x float> %retval
296}
297
298; This tests promotion of the input operand to INSERT_SUBVECTOR.
299define <vscale x 8 x i16> @insert_nxv8i16_nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in) nounwind {
300; CHECK-LABEL: insert_nxv8i16_nxv2i16:
301; CHECK:       // %bb.0:
302; CHECK-NEXT:    uunpklo z2.s, z0.h
303; CHECK-NEXT:    uunpkhi z0.s, z0.h
304; CHECK-NEXT:    uunpklo z2.d, z2.s
305; CHECK-NEXT:    uzp1 z1.s, z2.s, z1.s
306; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
307; CHECK-NEXT:    ret
308  %r = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in, i64 2)
309  ret <vscale x 8 x i16> %r
310}
311
312define <vscale x 4 x half> @insert_nxv4f16_nxv2f16_0(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1) nounwind {
313; CHECK-LABEL: insert_nxv4f16_nxv2f16_0:
314; CHECK:       // %bb.0:
315; CHECK-NEXT:    uunpkhi z0.d, z0.s
316; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
317; CHECK-NEXT:    ret
318  %v0 = call <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1, i64 0)
319 ret <vscale x 4 x half> %v0
320}
321
322define <vscale x 4 x half> @insert_nxv4f16_nxv2f16_2(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1) nounwind {
323; CHECK-LABEL: insert_nxv4f16_nxv2f16_2:
324; CHECK:       // %bb.0:
325; CHECK-NEXT:    uunpklo z0.d, z0.s
326; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
327; CHECK-NEXT:    ret
328  %v0 = call <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1, i64 2)
329 ret <vscale x 4 x half> %v0
330}
331
332; Test that the index is scaled by vscale if the subvector is scalable.
333define <vscale x 8 x half> @insert_nxv8f16_nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in) nounwind {
334; CHECK-LABEL: insert_nxv8f16_nxv2f16:
335; CHECK:       // %bb.0:
336; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
337; CHECK-NEXT:    addvl sp, sp, #-1
338; CHECK-NEXT:    ptrue p0.h
339; CHECK-NEXT:    ptrue p1.d
340; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
341; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #1, mul vl]
342; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
343; CHECK-NEXT:    addvl sp, sp, #1
344; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
345; CHECK-NEXT:    ret
346  %r = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in, i64 2)
347  ret <vscale x 8 x half> %r
348}
349
350define <vscale x 8 x half> @insert_nxv8f16_nxv4f16_0(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1) nounwind {
351; CHECK-LABEL: insert_nxv8f16_nxv4f16_0:
352; CHECK:       // %bb.0:
353; CHECK-NEXT:    uunpkhi z0.s, z0.h
354; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
355; CHECK-NEXT:    ret
356  %v0 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1, i64 0)
357 ret <vscale x 8 x half> %v0
358}
359
360define <vscale x 8 x half> @insert_nxv8f16_nxv4f16_4(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1) nounwind {
361; CHECK-LABEL: insert_nxv8f16_nxv4f16_4:
362; CHECK:       // %bb.0:
363; CHECK-NEXT:    uunpklo z0.s, z0.h
364; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
365; CHECK-NEXT:    ret
366  %v0 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1, i64 4)
367 ret <vscale x 8 x half> %v0
368}
369
370; Fixed length clamping
371
372define <vscale x 2 x i64> @insert_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind #0 {
373; CHECK-LABEL: insert_fixed_v2i64_nxv2i64:
374; CHECK:       // %bb.0:
375; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
376; CHECK-NEXT:    addvl sp, sp, #-1
377; CHECK-NEXT:    cntd x8
378; CHECK-NEXT:    mov w9, #2
379; CHECK-NEXT:    sub x8, x8, #2
380; CHECK-NEXT:    ptrue p0.d
381; CHECK-NEXT:    cmp x8, #2
382; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
383; CHECK-NEXT:    csel x8, x8, x9, lo
384; CHECK-NEXT:    mov x9, sp
385; CHECK-NEXT:    lsl x8, x8, #3
386; CHECK-NEXT:    str q1, [x9, x8]
387; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
388; CHECK-NEXT:    addvl sp, sp, #1
389; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
390; CHECK-NEXT:    ret
391  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
392  ret <vscale x 2 x i64> %retval
393}
394
395define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, <4 x i64>* %ptr) nounwind #0 {
396; CHECK-LABEL: insert_fixed_v4i64_nxv2i64:
397; CHECK:       // %bb.0:
398; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
399; CHECK-NEXT:    addvl sp, sp, #-1
400; CHECK-NEXT:    cntd x8
401; CHECK-NEXT:    ptrue p0.d
402; CHECK-NEXT:    subs x8, x8, #4
403; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
404; CHECK-NEXT:    csel x8, xzr, x8, lo
405; CHECK-NEXT:    mov w9, #4
406; CHECK-NEXT:    cmp x8, #4
407; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
408; CHECK-NEXT:    csel x8, x8, x9, lo
409; CHECK-NEXT:    mov x9, sp
410; CHECK-NEXT:    st1d { z1.d }, p0, [x9, x8, lsl #3]
411; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
412; CHECK-NEXT:    addvl sp, sp, #1
413; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
414; CHECK-NEXT:    ret
415  %subvec = load <4 x i64>, <4 x i64>* %ptr
416  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> %vec, <4 x i64> %subvec, i64 4)
417  ret <vscale x 2 x i64> %retval
418}
419
420;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
421;;  Upacked types that need result widening
422;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
423
424define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32(<vscale x 2 x i32> %sv0) {
425; CHECK-LABEL: insert_nxv3i32_nxv2i32:
426; CHECK:       // %bb.0:
427; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
428; CHECK-NEXT:    ret
429  %v0 = call <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
430 ret <vscale x 3 x i32> %v0
431}
432
433;; Check that the Subvector is not widen so it does not crash.
434define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1) {
435; CHECK-LABEL: insert_nxv3i32_nxv2i32_2:
436; CHECK:       // %bb.0:
437; CHECK-NEXT:    uunpkhi z0.d, z0.s
438; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
439; CHECK-NEXT:    ret
440  %v0 = call <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1, i64 0)
441  ret <vscale x 3 x i32> %v0
442}
443
444define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) nounwind {
445; CHECK-LABEL: insert_nxv3f32_nxv2f32:
446; CHECK:       // %bb.0:
447; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
448; CHECK-NEXT:    ret
449  %v0 = call <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0)
450 ret <vscale x 3 x float> %v0
451}
452
453define <vscale x 4 x float> @insert_nxv4f32_nxv2f32_0(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1) nounwind {
454; CHECK-LABEL: insert_nxv4f32_nxv2f32_0:
455; CHECK:       // %bb.0:
456; CHECK-NEXT:    uunpkhi z0.d, z0.s
457; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
458; CHECK-NEXT:    ret
459  %v0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1, i64 0)
460 ret <vscale x 4 x float> %v0
461}
462
463define <vscale x 4 x float> @insert_nxv4f32_nxv2f32_2(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1) nounwind {
464; CHECK-LABEL: insert_nxv4f32_nxv2f32_2:
465; CHECK:       // %bb.0:
466; CHECK-NEXT:    uunpklo z0.d, z0.s
467; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
468; CHECK-NEXT:    ret
469  %v0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1, i64 2)
470 ret <vscale x 4 x float> %v0
471}
472
473define <vscale x 6 x i32>  @insert_nxv6i32_nxv2i32(<vscale x 2 x i32> %sv0, <vscale x 2 x i32> %sv1) nounwind {
474; CHECK-LABEL: insert_nxv6i32_nxv2i32:
475; CHECK:       // %bb.0:
476; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
477; CHECK-NEXT:    addvl sp, sp, #-2
478; CHECK-NEXT:    ptrue p0.s
479; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
480; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
481; CHECK-NEXT:    ld1w { z1.s }, p0/z, [sp, #1, mul vl]
482; CHECK-NEXT:    addvl sp, sp, #2
483; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
484; CHECK-NEXT:    ret
485  %v0 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
486  %v1 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> %v0, <vscale x 2 x i32> %sv1, i64 2)
487  ret <vscale x 6 x i32> %v1
488}
489
490;; This only works because the input vector is undef and index is zero
491define  <vscale x 6 x i32> @insert_nxv6i32_nxv3i32(<vscale x 3 x i32> %sv0) {
492; CHECK-LABEL: insert_nxv6i32_nxv3i32:
493; CHECK:       // %bb.0:
494; CHECK-NEXT:    ret
495  %v0 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32> undef, <vscale x 3 x i32> %sv0, i64 0)
496  ret <vscale x 6 x i32> %v0
497}
498
499define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vscale x 4 x i32> %sv1, <vscale x 4 x i32> %sv2) {
500; CHECK-LABEL: insert_nxv12i32_nxv4i32:
501; CHECK:       // %bb.0:
502; CHECK-NEXT:    ret
503  %v0 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> undef, <vscale x 4 x i32> %sv0, i64 0)
504  %v1 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v0, <vscale x 4 x i32> %sv1, i64 4)
505  %v2 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v1, <vscale x 4 x i32> %sv2, i64 8)
506  ret <vscale x 12 x i32> %v2
507}
508
509define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
510; CHECK-LABEL: insert_nxv2bf16_nxv2bf16:
511; CHECK:       // %bb.0:
512; CHECK-NEXT:    mov z0.d, z1.d
513; CHECK-NEXT:    ret
514  %v0 = call <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
515  ret <vscale x 2 x bfloat> %v0
516}
517
518define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
519; CHECK-LABEL: insert_nxv4bf16_nxv4bf16:
520; CHECK:       // %bb.0:
521; CHECK-NEXT:    mov z0.d, z1.d
522; CHECK-NEXT:    ret
523  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
524  ret <vscale x 4 x bfloat> %v0
525}
526
527define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind {
528; CHECK-LABEL: insert_nxv4bf16_v4bf16:
529; CHECK:       // %bb.0:
530; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
531; CHECK-NEXT:    addvl sp, sp, #-1
532; CHECK-NEXT:    ptrue p0.s
533; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
534; CHECK-NEXT:    addpl x8, sp, #4
535; CHECK-NEXT:    str d1, [x8]
536; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
537; CHECK-NEXT:    addvl sp, sp, #1
538; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
539; CHECK-NEXT:    ret
540  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0)
541  ret <vscale x 4 x bfloat> %v0
542}
543
544define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind {
545; CHECK-LABEL: insert_nxv8bf16_nxv8bf16:
546; CHECK:       // %bb.0:
547; CHECK-NEXT:    mov z0.d, z1.d
548; CHECK-NEXT:    ret
549  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0)
550  ret <vscale x 8 x bfloat> %v0
551}
552
553define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind {
554; CHECK-LABEL: insert_nxv8bf16_v8bf16:
555; CHECK:       // %bb.0:
556; CHECK-NEXT:    ptrue p0.h, vl8
557; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
558; CHECK-NEXT:    mov z0.h, p0/m, z1.h
559; CHECK-NEXT:    ret
560  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0)
561  ret <vscale x 8 x bfloat> %v0
562}
563
564define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv4bf16_0(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
565; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_0:
566; CHECK:       // %bb.0:
567; CHECK-NEXT:    uunpkhi z0.s, z0.h
568; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
569; CHECK-NEXT:    ret
570  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
571 ret <vscale x 8 x bfloat> %v0
572}
573
574define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv4bf16_4(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
575; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_4:
576; CHECK:       // %bb.0:
577; CHECK-NEXT:    uunpklo z0.s, z0.h
578; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
579; CHECK-NEXT:    ret
580  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 4)
581 ret <vscale x 8 x bfloat> %v0
582}
583
584define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv2bf16_0(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
585; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_0:
586; CHECK:       // %bb.0:
587; CHECK-NEXT:    uunpkhi z0.d, z0.s
588; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
589; CHECK-NEXT:    ret
590  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
591 ret <vscale x 4 x bfloat> %v0
592}
593
594define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv2bf16_2(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
595; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_2:
596; CHECK:       // %bb.0:
597; CHECK-NEXT:    uunpklo z0.d, z0.s
598; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
599; CHECK-NEXT:    ret
600  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 2)
601 ret <vscale x 4 x bfloat> %v0
602}
603
604; Test predicate inserts of half size.
605define <vscale x 16 x i1> @insert_nxv16i1_nxv8i1_0(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv) {
606; CHECK-LABEL: insert_nxv16i1_nxv8i1_0:
607; CHECK:       // %bb.0:
608; CHECK-NEXT:    punpkhi p0.h, p0.b
609; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
610; CHECK-NEXT:    ret
611  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv, i64 0)
612  ret <vscale x 16 x i1> %v0
613}
614
615define <vscale x 16 x i1> @insert_nxv16i1_nxv8i1_8(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv) {
616; CHECK-LABEL: insert_nxv16i1_nxv8i1_8:
617; CHECK:       // %bb.0:
618; CHECK-NEXT:    punpklo p0.h, p0.b
619; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
620; CHECK-NEXT:    ret
621  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv, i64 8)
622  ret <vscale x 16 x i1> %v0
623}
624
625; Test predicate inserts of less than half the size.
626define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_0(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv) {
627; CHECK-LABEL: insert_nxv16i1_nxv4i1_0:
628; CHECK:       // %bb.0:
629; CHECK-NEXT:    punpklo p2.h, p0.b
630; CHECK-NEXT:    punpkhi p0.h, p0.b
631; CHECK-NEXT:    punpkhi p2.h, p2.b
632; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
633; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
634; CHECK-NEXT:    ret
635  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv, i64 0)
636  ret <vscale x 16 x i1> %v0
637}
638
639define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_12(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv) {
640; CHECK-LABEL: insert_nxv16i1_nxv4i1_12:
641; CHECK:       // %bb.0:
642; CHECK-NEXT:    punpkhi p2.h, p0.b
643; CHECK-NEXT:    punpklo p0.h, p0.b
644; CHECK-NEXT:    punpklo p2.h, p2.b
645; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
646; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
647; CHECK-NEXT:    ret
648  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv, i64 12)
649  ret <vscale x 16 x i1> %v0
650}
651
652; Test predicate insert into undef/zero
653define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_into_zero(<vscale x 4 x i1> %sv) {
654; CHECK-LABEL: insert_nxv16i1_nxv4i1_into_zero:
655; CHECK:       // %bb.0:
656; CHECK-NEXT:    pfalse p1.b
657; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
658; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
659; CHECK-NEXT:    ret
660  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> zeroinitializer, <vscale x 4 x i1> %sv, i64 0)
661  ret <vscale x 16 x i1> %v0
662}
663
664define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_into_poison(<vscale x 4 x i1> %sv) {
665; CHECK-LABEL: insert_nxv16i1_nxv4i1_into_poison:
666; CHECK:       // %bb.0:
667; CHECK-NEXT:    uzp1 p0.h, p0.h, p0.h
668; CHECK-NEXT:    uzp1 p0.b, p0.b, p0.b
669; CHECK-NEXT:    ret
670  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> poison, <vscale x 4 x i1> %sv, i64 0)
671  ret <vscale x 16 x i1> %v0
672}
673
674; Test constant predicate insert into undef
675define <vscale x 2 x i1> @insert_nxv2i1_v8i1_const_true_into_undef() vscale_range(4,8) {
676; CHECK-LABEL: insert_nxv2i1_v8i1_const_true_into_undef:
677; CHECK:       // %bb.0:
678; CHECK-NEXT:    ptrue p0.d
679; CHECK-NEXT:    ret
680  %v0 = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1 (<vscale x 2 x i1> undef, <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
681  ret <vscale x 2 x i1> %v0
682}
683
684define <vscale x 4 x i1> @insert_nxv4i1_v16i1_const_true_into_undef() vscale_range(4,8) {
685; CHECK-LABEL: insert_nxv4i1_v16i1_const_true_into_undef:
686; CHECK:       // %bb.0:
687; CHECK-NEXT:    ptrue p0.s
688; CHECK-NEXT:    ret
689  %v0 = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1 (<vscale x 4 x i1> undef, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
690  ret <vscale x 4 x i1> %v0
691}
692
693define <vscale x 8 x i1> @insert_nxv8i1_v32i1_const_true_into_undef() vscale_range(4,8) {
694; CHECK-LABEL: insert_nxv8i1_v32i1_const_true_into_undef:
695; CHECK:       // %bb.0:
696; CHECK-NEXT:    ptrue p0.h
697; CHECK-NEXT:    ret
698  %v0 = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1 (<vscale x 8 x i1> undef, <32 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
699  ret <vscale x 8 x i1> %v0
700}
701
702define <vscale x 16 x i1> @insert_nxv16i1_v64i1_const_true_into_undef() vscale_range(4,8) {
703; CHECK-LABEL: insert_nxv16i1_v64i1_const_true_into_undef:
704; CHECK:       // %bb.0:
705; CHECK-NEXT:    ptrue p0.b
706; CHECK-NEXT:    ret
707  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1 (<vscale x 16 x i1> undef, <64 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
708  ret <vscale x 16 x i1> %v0
709}
710
711;
712; Insert nxv1i1 type into: nxv2i1
713;
714
715define <vscale x 2 x i1> @insert_nxv1i1_nxv2i1_0(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv) {
716; CHECK-LABEL: insert_nxv1i1_nxv2i1_0:
717; CHECK:       // %bb.0:
718; CHECK-NEXT:    punpkhi p0.h, p0.b
719; CHECK-NEXT:    uzp1 p0.d, p1.d, p0.d
720; CHECK-NEXT:    ret
721  %res = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
722  ret <vscale x 2 x i1> %res
723}
724
725define <vscale x 2 x i1> @insert_nxv1i1_nxv2i1_1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv) {
726; CHECK-LABEL: insert_nxv1i1_nxv2i1_1:
727; CHECK:       // %bb.0:
728; CHECK-NEXT:    punpklo p0.h, p0.b
729; CHECK-NEXT:    uzp1 p0.d, p0.d, p1.d
730; CHECK-NEXT:    ret
731  %res = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
732  ret <vscale x 2 x i1> %res
733}
734
735;
736; Insert nxv1i1 type into: nxv4i1
737;
738
739define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_0(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
740; CHECK-LABEL: insert_nxv1i1_nxv4i1_0:
741; CHECK:       // %bb.0:
742; CHECK-NEXT:    punpklo p2.h, p0.b
743; CHECK-NEXT:    punpkhi p0.h, p0.b
744; CHECK-NEXT:    punpkhi p2.h, p2.b
745; CHECK-NEXT:    uzp1 p1.d, p1.d, p2.d
746; CHECK-NEXT:    uzp1 p0.s, p1.s, p0.s
747; CHECK-NEXT:    ret
748  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
749  ret <vscale x 4 x i1> %res
750}
751
752define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
753; CHECK-LABEL: insert_nxv1i1_nxv4i1_1:
754; CHECK:       // %bb.0:
755; CHECK-NEXT:    punpklo p2.h, p0.b
756; CHECK-NEXT:    punpkhi p0.h, p0.b
757; CHECK-NEXT:    punpklo p2.h, p2.b
758; CHECK-NEXT:    uzp1 p1.d, p2.d, p1.d
759; CHECK-NEXT:    uzp1 p0.s, p1.s, p0.s
760; CHECK-NEXT:    ret
761  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
762  ret <vscale x 4 x i1> %res
763}
764
765define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_2(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
766; CHECK-LABEL: insert_nxv1i1_nxv4i1_2:
767; CHECK:       // %bb.0:
768; CHECK-NEXT:    punpkhi p2.h, p0.b
769; CHECK-NEXT:    punpklo p0.h, p0.b
770; CHECK-NEXT:    punpkhi p2.h, p2.b
771; CHECK-NEXT:    uzp1 p1.d, p1.d, p2.d
772; CHECK-NEXT:    uzp1 p0.s, p0.s, p1.s
773; CHECK-NEXT:    ret
774  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 2)
775  ret <vscale x 4 x i1> %res
776}
777
778define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_3(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
779; CHECK-LABEL: insert_nxv1i1_nxv4i1_3:
780; CHECK:       // %bb.0:
781; CHECK-NEXT:    punpkhi p2.h, p0.b
782; CHECK-NEXT:    punpklo p0.h, p0.b
783; CHECK-NEXT:    punpklo p2.h, p2.b
784; CHECK-NEXT:    uzp1 p1.d, p2.d, p1.d
785; CHECK-NEXT:    uzp1 p0.s, p0.s, p1.s
786; CHECK-NEXT:    ret
787  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 3)
788  ret <vscale x 4 x i1> %res
789}
790
791;
792; Insert nxv1i1 type into: nxv8i1
793;
794
795define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_0(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
796; CHECK-LABEL: insert_nxv1i1_nxv8i1_0:
797; CHECK:       // %bb.0:
798; CHECK-NEXT:    punpklo p2.h, p0.b
799; CHECK-NEXT:    punpkhi p0.h, p0.b
800; CHECK-NEXT:    punpklo p3.h, p2.b
801; CHECK-NEXT:    punpkhi p2.h, p2.b
802; CHECK-NEXT:    punpkhi p3.h, p3.b
803; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
804; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
805; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
806; CHECK-NEXT:    ret
807  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
808  ret <vscale x 8 x i1> %res
809}
810
811define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
812; CHECK-LABEL: insert_nxv1i1_nxv8i1_1:
813; CHECK:       // %bb.0:
814; CHECK-NEXT:    punpklo p2.h, p0.b
815; CHECK-NEXT:    punpkhi p0.h, p0.b
816; CHECK-NEXT:    punpklo p3.h, p2.b
817; CHECK-NEXT:    punpkhi p2.h, p2.b
818; CHECK-NEXT:    punpklo p3.h, p3.b
819; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
820; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
821; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
822; CHECK-NEXT:    ret
823  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
824  ret <vscale x 8 x i1> %res
825}
826
827define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_2(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
828; CHECK-LABEL: insert_nxv1i1_nxv8i1_2:
829; CHECK:       // %bb.0:
830; CHECK-NEXT:    punpklo p2.h, p0.b
831; CHECK-NEXT:    punpkhi p0.h, p0.b
832; CHECK-NEXT:    punpkhi p3.h, p2.b
833; CHECK-NEXT:    punpklo p2.h, p2.b
834; CHECK-NEXT:    punpkhi p3.h, p3.b
835; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
836; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
837; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
838; CHECK-NEXT:    ret
839  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 2)
840  ret <vscale x 8 x i1> %res
841}
842
843define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_3(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
844; CHECK-LABEL: insert_nxv1i1_nxv8i1_3:
845; CHECK:       // %bb.0:
846; CHECK-NEXT:    punpklo p2.h, p0.b
847; CHECK-NEXT:    punpkhi p0.h, p0.b
848; CHECK-NEXT:    punpkhi p3.h, p2.b
849; CHECK-NEXT:    punpklo p2.h, p2.b
850; CHECK-NEXT:    punpklo p3.h, p3.b
851; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
852; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
853; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
854; CHECK-NEXT:    ret
855  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 3)
856  ret <vscale x 8 x i1> %res
857}
858
859define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_4(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
860; CHECK-LABEL: insert_nxv1i1_nxv8i1_4:
861; CHECK:       // %bb.0:
862; CHECK-NEXT:    punpkhi p2.h, p0.b
863; CHECK-NEXT:    punpklo p0.h, p0.b
864; CHECK-NEXT:    punpklo p3.h, p2.b
865; CHECK-NEXT:    punpkhi p2.h, p2.b
866; CHECK-NEXT:    punpkhi p3.h, p3.b
867; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
868; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
869; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
870; CHECK-NEXT:    ret
871  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 4)
872  ret <vscale x 8 x i1> %res
873}
874
875define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_5(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
876; CHECK-LABEL: insert_nxv1i1_nxv8i1_5:
877; CHECK:       // %bb.0:
878; CHECK-NEXT:    punpkhi p2.h, p0.b
879; CHECK-NEXT:    punpklo p0.h, p0.b
880; CHECK-NEXT:    punpklo p3.h, p2.b
881; CHECK-NEXT:    punpkhi p2.h, p2.b
882; CHECK-NEXT:    punpklo p3.h, p3.b
883; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
884; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
885; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
886; CHECK-NEXT:    ret
887  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 5)
888  ret <vscale x 8 x i1> %res
889}
890
891define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_6(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
892; CHECK-LABEL: insert_nxv1i1_nxv8i1_6:
893; CHECK:       // %bb.0:
894; CHECK-NEXT:    punpkhi p2.h, p0.b
895; CHECK-NEXT:    punpklo p0.h, p0.b
896; CHECK-NEXT:    punpkhi p3.h, p2.b
897; CHECK-NEXT:    punpklo p2.h, p2.b
898; CHECK-NEXT:    punpkhi p3.h, p3.b
899; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
900; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
901; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
902; CHECK-NEXT:    ret
903  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 6)
904  ret <vscale x 8 x i1> %res
905}
906
907define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_7(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
908; CHECK-LABEL: insert_nxv1i1_nxv8i1_7:
909; CHECK:       // %bb.0:
910; CHECK-NEXT:    punpkhi p2.h, p0.b
911; CHECK-NEXT:    punpklo p0.h, p0.b
912; CHECK-NEXT:    punpkhi p3.h, p2.b
913; CHECK-NEXT:    punpklo p2.h, p2.b
914; CHECK-NEXT:    punpklo p3.h, p3.b
915; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
916; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
917; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
918; CHECK-NEXT:    ret
919  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 7)
920  ret <vscale x 8 x i1> %res
921}
922
923;
924; Insert nxv1i1 type into: nxv16i1
925;
926
927define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_0(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
928; CHECK-LABEL: insert_nxv1i1_nxv16i1_0:
929; CHECK:       // %bb.0:
930; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
931; CHECK-NEXT:    .cfi_def_cfa_offset 16
932; CHECK-NEXT:    .cfi_offset w29, -16
933; CHECK-NEXT:    addvl sp, sp, #-1
934; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
935; CHECK-NEXT:    punpklo p2.h, p0.b
936; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
937; CHECK-NEXT:    punpklo p3.h, p2.b
938; CHECK-NEXT:    punpkhi p2.h, p2.b
939; CHECK-NEXT:    punpklo p4.h, p3.b
940; CHECK-NEXT:    punpkhi p3.h, p3.b
941; CHECK-NEXT:    punpkhi p4.h, p4.b
942; CHECK-NEXT:    punpkhi p0.h, p0.b
943; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
944; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
945; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
946; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
947; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
948; CHECK-NEXT:    addvl sp, sp, #1
949; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
950; CHECK-NEXT:    ret
951  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
952  ret <vscale x 16 x i1> %res
953}
954
955define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
956; CHECK-LABEL: insert_nxv1i1_nxv16i1_1:
957; CHECK:       // %bb.0:
958; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
959; CHECK-NEXT:    .cfi_def_cfa_offset 16
960; CHECK-NEXT:    .cfi_offset w29, -16
961; CHECK-NEXT:    addvl sp, sp, #-1
962; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
963; CHECK-NEXT:    punpklo p2.h, p0.b
964; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
965; CHECK-NEXT:    punpklo p3.h, p2.b
966; CHECK-NEXT:    punpkhi p2.h, p2.b
967; CHECK-NEXT:    punpklo p4.h, p3.b
968; CHECK-NEXT:    punpkhi p3.h, p3.b
969; CHECK-NEXT:    punpklo p4.h, p4.b
970; CHECK-NEXT:    punpkhi p0.h, p0.b
971; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
972; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
973; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
974; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
975; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
976; CHECK-NEXT:    addvl sp, sp, #1
977; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
978; CHECK-NEXT:    ret
979  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
980  ret <vscale x 16 x i1> %res
981}
982
983define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_2(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
984; CHECK-LABEL: insert_nxv1i1_nxv16i1_2:
985; CHECK:       // %bb.0:
986; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
987; CHECK-NEXT:    .cfi_def_cfa_offset 16
988; CHECK-NEXT:    .cfi_offset w29, -16
989; CHECK-NEXT:    addvl sp, sp, #-1
990; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
991; CHECK-NEXT:    punpklo p2.h, p0.b
992; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
993; CHECK-NEXT:    punpklo p3.h, p2.b
994; CHECK-NEXT:    punpkhi p2.h, p2.b
995; CHECK-NEXT:    punpkhi p4.h, p3.b
996; CHECK-NEXT:    punpklo p3.h, p3.b
997; CHECK-NEXT:    punpkhi p4.h, p4.b
998; CHECK-NEXT:    punpkhi p0.h, p0.b
999; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1000; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1001; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1002; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
1003; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
1004; CHECK-NEXT:    addvl sp, sp, #1
1005; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1006; CHECK-NEXT:    ret
1007  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 2)
1008  ret <vscale x 16 x i1> %res
1009}
1010
1011define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_3(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1012; CHECK-LABEL: insert_nxv1i1_nxv16i1_3:
1013; CHECK:       // %bb.0:
1014; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1015; CHECK-NEXT:    .cfi_def_cfa_offset 16
1016; CHECK-NEXT:    .cfi_offset w29, -16
1017; CHECK-NEXT:    addvl sp, sp, #-1
1018; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1019; CHECK-NEXT:    punpklo p2.h, p0.b
1020; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1021; CHECK-NEXT:    punpklo p3.h, p2.b
1022; CHECK-NEXT:    punpkhi p2.h, p2.b
1023; CHECK-NEXT:    punpkhi p4.h, p3.b
1024; CHECK-NEXT:    punpklo p3.h, p3.b
1025; CHECK-NEXT:    punpklo p4.h, p4.b
1026; CHECK-NEXT:    punpkhi p0.h, p0.b
1027; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1028; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1029; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1030; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
1031; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
1032; CHECK-NEXT:    addvl sp, sp, #1
1033; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1034; CHECK-NEXT:    ret
1035  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 3)
1036  ret <vscale x 16 x i1> %res
1037}
1038
1039define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_4(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1040; CHECK-LABEL: insert_nxv1i1_nxv16i1_4:
1041; CHECK:       // %bb.0:
1042; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1043; CHECK-NEXT:    .cfi_def_cfa_offset 16
1044; CHECK-NEXT:    .cfi_offset w29, -16
1045; CHECK-NEXT:    addvl sp, sp, #-1
1046; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1047; CHECK-NEXT:    punpklo p2.h, p0.b
1048; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1049; CHECK-NEXT:    punpkhi p3.h, p2.b
1050; CHECK-NEXT:    punpklo p2.h, p2.b
1051; CHECK-NEXT:    punpklo p4.h, p3.b
1052; CHECK-NEXT:    punpkhi p3.h, p3.b
1053; CHECK-NEXT:    punpkhi p4.h, p4.b
1054; CHECK-NEXT:    punpkhi p0.h, p0.b
1055; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1056; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1057; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
1058; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1059; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
1060; CHECK-NEXT:    addvl sp, sp, #1
1061; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1062; CHECK-NEXT:    ret
1063  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 4)
1064  ret <vscale x 16 x i1> %res
1065}
1066
1067define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_5(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1068; CHECK-LABEL: insert_nxv1i1_nxv16i1_5:
1069; CHECK:       // %bb.0:
1070; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1071; CHECK-NEXT:    .cfi_def_cfa_offset 16
1072; CHECK-NEXT:    .cfi_offset w29, -16
1073; CHECK-NEXT:    addvl sp, sp, #-1
1074; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1075; CHECK-NEXT:    punpklo p2.h, p0.b
1076; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1077; CHECK-NEXT:    punpkhi p3.h, p2.b
1078; CHECK-NEXT:    punpklo p2.h, p2.b
1079; CHECK-NEXT:    punpklo p4.h, p3.b
1080; CHECK-NEXT:    punpkhi p3.h, p3.b
1081; CHECK-NEXT:    punpklo p4.h, p4.b
1082; CHECK-NEXT:    punpkhi p0.h, p0.b
1083; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1084; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1085; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
1086; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1087; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
1088; CHECK-NEXT:    addvl sp, sp, #1
1089; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1090; CHECK-NEXT:    ret
1091  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 5)
1092  ret <vscale x 16 x i1> %res
1093}
1094
1095define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_6(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1096; CHECK-LABEL: insert_nxv1i1_nxv16i1_6:
1097; CHECK:       // %bb.0:
1098; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1099; CHECK-NEXT:    .cfi_def_cfa_offset 16
1100; CHECK-NEXT:    .cfi_offset w29, -16
1101; CHECK-NEXT:    addvl sp, sp, #-1
1102; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1103; CHECK-NEXT:    punpklo p2.h, p0.b
1104; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1105; CHECK-NEXT:    punpkhi p3.h, p2.b
1106; CHECK-NEXT:    punpklo p2.h, p2.b
1107; CHECK-NEXT:    punpkhi p4.h, p3.b
1108; CHECK-NEXT:    punpklo p3.h, p3.b
1109; CHECK-NEXT:    punpkhi p4.h, p4.b
1110; CHECK-NEXT:    punpkhi p0.h, p0.b
1111; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1112; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1113; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1114; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1115; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
1116; CHECK-NEXT:    addvl sp, sp, #1
1117; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1118; CHECK-NEXT:    ret
1119  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 6)
1120  ret <vscale x 16 x i1> %res
1121}
1122
1123define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_7(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1124; CHECK-LABEL: insert_nxv1i1_nxv16i1_7:
1125; CHECK:       // %bb.0:
1126; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1127; CHECK-NEXT:    .cfi_def_cfa_offset 16
1128; CHECK-NEXT:    .cfi_offset w29, -16
1129; CHECK-NEXT:    addvl sp, sp, #-1
1130; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1131; CHECK-NEXT:    punpklo p2.h, p0.b
1132; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1133; CHECK-NEXT:    punpkhi p3.h, p2.b
1134; CHECK-NEXT:    punpklo p2.h, p2.b
1135; CHECK-NEXT:    punpkhi p4.h, p3.b
1136; CHECK-NEXT:    punpklo p3.h, p3.b
1137; CHECK-NEXT:    punpklo p4.h, p4.b
1138; CHECK-NEXT:    punpkhi p0.h, p0.b
1139; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1140; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1141; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1142; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1143; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
1144; CHECK-NEXT:    addvl sp, sp, #1
1145; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1146; CHECK-NEXT:    ret
1147  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 7)
1148  ret <vscale x 16 x i1> %res
1149}
1150
1151define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_8(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1152; CHECK-LABEL: insert_nxv1i1_nxv16i1_8:
1153; CHECK:       // %bb.0:
1154; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1155; CHECK-NEXT:    .cfi_def_cfa_offset 16
1156; CHECK-NEXT:    .cfi_offset w29, -16
1157; CHECK-NEXT:    addvl sp, sp, #-1
1158; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1159; CHECK-NEXT:    punpkhi p2.h, p0.b
1160; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1161; CHECK-NEXT:    punpklo p3.h, p2.b
1162; CHECK-NEXT:    punpkhi p2.h, p2.b
1163; CHECK-NEXT:    punpklo p4.h, p3.b
1164; CHECK-NEXT:    punpkhi p3.h, p3.b
1165; CHECK-NEXT:    punpkhi p4.h, p4.b
1166; CHECK-NEXT:    punpklo p0.h, p0.b
1167; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1168; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1169; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
1170; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
1171; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1172; CHECK-NEXT:    addvl sp, sp, #1
1173; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1174; CHECK-NEXT:    ret
1175  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 8)
1176  ret <vscale x 16 x i1> %res
1177}
1178
1179define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_9(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1180; CHECK-LABEL: insert_nxv1i1_nxv16i1_9:
1181; CHECK:       // %bb.0:
1182; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1183; CHECK-NEXT:    .cfi_def_cfa_offset 16
1184; CHECK-NEXT:    .cfi_offset w29, -16
1185; CHECK-NEXT:    addvl sp, sp, #-1
1186; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1187; CHECK-NEXT:    punpkhi p2.h, p0.b
1188; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1189; CHECK-NEXT:    punpklo p3.h, p2.b
1190; CHECK-NEXT:    punpkhi p2.h, p2.b
1191; CHECK-NEXT:    punpklo p4.h, p3.b
1192; CHECK-NEXT:    punpkhi p3.h, p3.b
1193; CHECK-NEXT:    punpklo p4.h, p4.b
1194; CHECK-NEXT:    punpklo p0.h, p0.b
1195; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1196; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1197; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
1198; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
1199; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1200; CHECK-NEXT:    addvl sp, sp, #1
1201; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1202; CHECK-NEXT:    ret
1203  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 9)
1204  ret <vscale x 16 x i1> %res
1205}
1206
1207define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_10(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1208; CHECK-LABEL: insert_nxv1i1_nxv16i1_10:
1209; CHECK:       // %bb.0:
1210; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1211; CHECK-NEXT:    .cfi_def_cfa_offset 16
1212; CHECK-NEXT:    .cfi_offset w29, -16
1213; CHECK-NEXT:    addvl sp, sp, #-1
1214; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1215; CHECK-NEXT:    punpkhi p2.h, p0.b
1216; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1217; CHECK-NEXT:    punpklo p3.h, p2.b
1218; CHECK-NEXT:    punpkhi p2.h, p2.b
1219; CHECK-NEXT:    punpkhi p4.h, p3.b
1220; CHECK-NEXT:    punpklo p3.h, p3.b
1221; CHECK-NEXT:    punpkhi p4.h, p4.b
1222; CHECK-NEXT:    punpklo p0.h, p0.b
1223; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1224; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1225; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1226; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
1227; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1228; CHECK-NEXT:    addvl sp, sp, #1
1229; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1230; CHECK-NEXT:    ret
1231  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 10)
1232  ret <vscale x 16 x i1> %res
1233}
1234
1235define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_11(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1236; CHECK-LABEL: insert_nxv1i1_nxv16i1_11:
1237; CHECK:       // %bb.0:
1238; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1239; CHECK-NEXT:    .cfi_def_cfa_offset 16
1240; CHECK-NEXT:    .cfi_offset w29, -16
1241; CHECK-NEXT:    addvl sp, sp, #-1
1242; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1243; CHECK-NEXT:    punpkhi p2.h, p0.b
1244; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1245; CHECK-NEXT:    punpklo p3.h, p2.b
1246; CHECK-NEXT:    punpkhi p2.h, p2.b
1247; CHECK-NEXT:    punpkhi p4.h, p3.b
1248; CHECK-NEXT:    punpklo p3.h, p3.b
1249; CHECK-NEXT:    punpklo p4.h, p4.b
1250; CHECK-NEXT:    punpklo p0.h, p0.b
1251; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1252; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1253; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1254; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
1255; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1256; CHECK-NEXT:    addvl sp, sp, #1
1257; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1258; CHECK-NEXT:    ret
1259  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 11)
1260  ret <vscale x 16 x i1> %res
1261}
1262
1263define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_12(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1264; CHECK-LABEL: insert_nxv1i1_nxv16i1_12:
1265; CHECK:       // %bb.0:
1266; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1267; CHECK-NEXT:    .cfi_def_cfa_offset 16
1268; CHECK-NEXT:    .cfi_offset w29, -16
1269; CHECK-NEXT:    addvl sp, sp, #-1
1270; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1271; CHECK-NEXT:    punpkhi p2.h, p0.b
1272; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1273; CHECK-NEXT:    punpkhi p3.h, p2.b
1274; CHECK-NEXT:    punpklo p2.h, p2.b
1275; CHECK-NEXT:    punpklo p4.h, p3.b
1276; CHECK-NEXT:    punpkhi p3.h, p3.b
1277; CHECK-NEXT:    punpkhi p4.h, p4.b
1278; CHECK-NEXT:    punpklo p0.h, p0.b
1279; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1280; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1281; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
1282; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1283; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1284; CHECK-NEXT:    addvl sp, sp, #1
1285; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1286; CHECK-NEXT:    ret
1287  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 12)
1288  ret <vscale x 16 x i1> %res
1289}
1290
1291define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_13(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1292; CHECK-LABEL: insert_nxv1i1_nxv16i1_13:
1293; CHECK:       // %bb.0:
1294; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1295; CHECK-NEXT:    .cfi_def_cfa_offset 16
1296; CHECK-NEXT:    .cfi_offset w29, -16
1297; CHECK-NEXT:    addvl sp, sp, #-1
1298; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1299; CHECK-NEXT:    punpkhi p2.h, p0.b
1300; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1301; CHECK-NEXT:    punpkhi p3.h, p2.b
1302; CHECK-NEXT:    punpklo p2.h, p2.b
1303; CHECK-NEXT:    punpklo p4.h, p3.b
1304; CHECK-NEXT:    punpkhi p3.h, p3.b
1305; CHECK-NEXT:    punpklo p4.h, p4.b
1306; CHECK-NEXT:    punpklo p0.h, p0.b
1307; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1308; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1309; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
1310; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1311; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1312; CHECK-NEXT:    addvl sp, sp, #1
1313; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1314; CHECK-NEXT:    ret
1315  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 13)
1316  ret <vscale x 16 x i1> %res
1317}
1318
1319define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_14(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1320; CHECK-LABEL: insert_nxv1i1_nxv16i1_14:
1321; CHECK:       // %bb.0:
1322; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1323; CHECK-NEXT:    .cfi_def_cfa_offset 16
1324; CHECK-NEXT:    .cfi_offset w29, -16
1325; CHECK-NEXT:    addvl sp, sp, #-1
1326; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1327; CHECK-NEXT:    punpkhi p2.h, p0.b
1328; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1329; CHECK-NEXT:    punpkhi p3.h, p2.b
1330; CHECK-NEXT:    punpklo p2.h, p2.b
1331; CHECK-NEXT:    punpkhi p4.h, p3.b
1332; CHECK-NEXT:    punpklo p3.h, p3.b
1333; CHECK-NEXT:    punpkhi p4.h, p4.b
1334; CHECK-NEXT:    punpklo p0.h, p0.b
1335; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1336; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1337; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1338; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1339; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1340; CHECK-NEXT:    addvl sp, sp, #1
1341; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1342; CHECK-NEXT:    ret
1343  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 14)
1344  ret <vscale x 16 x i1> %res
1345}
1346
1347define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1348; CHECK-LABEL: insert_nxv1i1_nxv16i1_15:
1349; CHECK:       // %bb.0:
1350; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1351; CHECK-NEXT:    .cfi_def_cfa_offset 16
1352; CHECK-NEXT:    .cfi_offset w29, -16
1353; CHECK-NEXT:    addvl sp, sp, #-1
1354; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1355; CHECK-NEXT:    punpkhi p2.h, p0.b
1356; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1357; CHECK-NEXT:    punpkhi p3.h, p2.b
1358; CHECK-NEXT:    punpklo p2.h, p2.b
1359; CHECK-NEXT:    punpkhi p4.h, p3.b
1360; CHECK-NEXT:    punpklo p3.h, p3.b
1361; CHECK-NEXT:    punpklo p4.h, p4.b
1362; CHECK-NEXT:    punpklo p0.h, p0.b
1363; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1364; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1365; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1366; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1367; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1368; CHECK-NEXT:    addvl sp, sp, #1
1369; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1370; CHECK-NEXT:    ret
1371  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 15)
1372  ret <vscale x 16 x i1> %res
1373}
1374
1375attributes #0 = { vscale_range(2,2) }
1376
1377declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
1378
1379declare <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64)
1380declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
1381declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
1382
1383declare <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
1384declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
1385declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
1386declare <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)
1387declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
1388declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
1389
1390declare <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)
1391declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat>, <vscale x 2 x bfloat>, i64)
1392declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
1393declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
1394declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
1395declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat>, <vscale x 4 x bfloat>, i64)
1396declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
1397
1398declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
1399declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
1400declare <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
1401declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
1402
1403declare <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half>, <vscale x 2 x half>, i64)
1404declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
1405declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half>, <vscale x 4 x half>, i64)
1406
1407declare <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
1408declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64)
1409declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
1410
1411declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1>, <8 x i1>, i64)
1412declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1>, <16 x i1>, i64)
1413declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1>, <32 x i1>, i64)
1414declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1>, <vscale x 1 x i1>, i64)
1415declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1>, <vscale x 1 x i1>, i64)
1416declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
1417declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1>, <vscale x 1 x i1>, i64)
1418declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
1419declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i64)
1420declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1>, <64 x i1>, i64)
1421