1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+bf16 < %s | FileCheck %s --check-prefixes=CHECK 3 4define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind { 5; CHECK-LABEL: insert_v2i64_nxv2i64: 6; CHECK: // %bb.0: 7; CHECK-NEXT: ptrue p0.d, vl2 8; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 9; CHECK-NEXT: mov z0.d, p0/m, z1.d 10; CHECK-NEXT: ret 11 %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0) 12 ret <vscale x 2 x i64> %retval 13} 14 15define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind { 16; CHECK-LABEL: insert_v2i64_nxv2i64_idx2: 17; CHECK: // %bb.0: 18; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 19; CHECK-NEXT: addvl sp, sp, #-1 20; CHECK-NEXT: cntd x8 21; CHECK-NEXT: mov w9, #2 22; CHECK-NEXT: sub x8, x8, #2 23; CHECK-NEXT: ptrue p0.d 24; CHECK-NEXT: cmp x8, #2 25; CHECK-NEXT: st1d { z0.d }, p0, [sp] 26; CHECK-NEXT: csel x8, x8, x9, lo 27; CHECK-NEXT: mov x9, sp 28; CHECK-NEXT: lsl x8, x8, #3 29; CHECK-NEXT: str q1, [x9, x8] 30; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] 31; CHECK-NEXT: addvl sp, sp, #1 32; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 33; CHECK-NEXT: ret 34 %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2) 35 ret <vscale x 2 x i64> %retval 36} 37 38define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind { 39; CHECK-LABEL: insert_v4i32_nxv4i32: 40; CHECK: // %bb.0: 41; CHECK-NEXT: ptrue p0.s, vl4 42; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 43; CHECK-NEXT: mov z0.s, p0/m, z1.s 44; CHECK-NEXT: ret 45 %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0) 46 ret <vscale x 4 x i32> %retval 47} 48 49define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind { 50; CHECK-LABEL: insert_v4i32_nxv4i32_idx4: 51; CHECK: // %bb.0: 52; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 53; CHECK-NEXT: addvl sp, sp, #-1 54; CHECK-NEXT: cntw x8 55; CHECK-NEXT: mov w9, #4 56; CHECK-NEXT: sub x8, x8, #4 57; CHECK-NEXT: ptrue p0.s 58; CHECK-NEXT: cmp x8, #4 59; CHECK-NEXT: st1w { z0.s }, p0, [sp] 60; CHECK-NEXT: csel x8, x8, x9, lo 61; CHECK-NEXT: mov x9, sp 62; CHECK-NEXT: lsl x8, x8, #2 63; CHECK-NEXT: str q1, [x9, x8] 64; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] 65; CHECK-NEXT: addvl sp, sp, #1 66; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 67; CHECK-NEXT: ret 68 %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 4) 69 ret <vscale x 4 x i32> %retval 70} 71 72define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind { 73; CHECK-LABEL: insert_v8i16_nxv8i16: 74; CHECK: // %bb.0: 75; CHECK-NEXT: ptrue p0.h, vl8 76; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 77; CHECK-NEXT: mov z0.h, p0/m, z1.h 78; CHECK-NEXT: ret 79 %retval = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0) 80 ret <vscale x 8 x i16> %retval 81} 82 83define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind { 84; CHECK-LABEL: insert_v8i16_nxv8i16_idx8: 85; CHECK: // %bb.0: 86; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 87; CHECK-NEXT: addvl sp, sp, #-1 88; CHECK-NEXT: cnth x8 89; CHECK-NEXT: mov w9, #8 90; CHECK-NEXT: sub x8, x8, #8 91; CHECK-NEXT: ptrue p0.h 92; CHECK-NEXT: cmp x8, #8 93; CHECK-NEXT: st1h { z0.h }, p0, [sp] 94; CHECK-NEXT: csel x8, x8, x9, lo 95; CHECK-NEXT: mov x9, sp 96; CHECK-NEXT: lsl x8, x8, #1 97; CHECK-NEXT: str q1, [x9, x8] 98; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] 99; CHECK-NEXT: addvl sp, sp, #1 100; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 101; CHECK-NEXT: ret 102 %retval = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 8) 103 ret <vscale x 8 x i16> %retval 104} 105 106define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind { 107; CHECK-LABEL: insert_v16i8_nxv16i8: 108; CHECK: // %bb.0: 109; CHECK-NEXT: ptrue p0.b, vl16 110; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 111; CHECK-NEXT: mov z0.b, p0/m, z1.b 112; CHECK-NEXT: ret 113 %retval = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0) 114 ret <vscale x 16 x i8> %retval 115} 116 117define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind { 118; CHECK-LABEL: insert_v16i8_nxv16i8_idx16: 119; CHECK: // %bb.0: 120; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 121; CHECK-NEXT: addvl sp, sp, #-1 122; CHECK-NEXT: mov x8, #-16 123; CHECK-NEXT: mov w9, #16 124; CHECK-NEXT: ptrue p0.b 125; CHECK-NEXT: st1b { z0.b }, p0, [sp] 126; CHECK-NEXT: addvl x8, x8, #1 127; CHECK-NEXT: cmp x8, #16 128; CHECK-NEXT: csel x8, x8, x9, lo 129; CHECK-NEXT: mov x9, sp 130; CHECK-NEXT: str q1, [x9, x8] 131; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] 132; CHECK-NEXT: addvl sp, sp, #1 133; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 134; CHECK-NEXT: ret 135 %retval = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 16) 136 ret <vscale x 16 x i8> %retval 137} 138 139 140; Insert subvectors into illegal vectors 141 142define void @insert_nxv8i64_nxv16i64(<vscale x 8 x i64> %sv0, <vscale x 8 x i64> %sv1, <vscale x 16 x i64>* %out) { 143; CHECK-LABEL: insert_nxv8i64_nxv16i64: 144; CHECK: // %bb.0: 145; CHECK-NEXT: ptrue p0.d 146; CHECK-NEXT: st1d { z7.d }, p0, [x0, #7, mul vl] 147; CHECK-NEXT: st1d { z6.d }, p0, [x0, #6, mul vl] 148; CHECK-NEXT: st1d { z5.d }, p0, [x0, #5, mul vl] 149; CHECK-NEXT: st1d { z4.d }, p0, [x0, #4, mul vl] 150; CHECK-NEXT: st1d { z3.d }, p0, [x0, #3, mul vl] 151; CHECK-NEXT: st1d { z2.d }, p0, [x0, #2, mul vl] 152; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] 153; CHECK-NEXT: st1d { z0.d }, p0, [x0] 154; CHECK-NEXT: ret 155 %v0 = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0) 156 %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> %v0, <vscale x 8 x i64> %sv1, i64 8) 157 store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out 158 ret void 159} 160 161define void @insert_nxv8i64_nxv16i64_lo(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) { 162; CHECK-LABEL: insert_nxv8i64_nxv16i64_lo: 163; CHECK: // %bb.0: 164; CHECK-NEXT: ptrue p0.d 165; CHECK-NEXT: st1d { z3.d }, p0, [x0, #3, mul vl] 166; CHECK-NEXT: st1d { z2.d }, p0, [x0, #2, mul vl] 167; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] 168; CHECK-NEXT: st1d { z0.d }, p0, [x0] 169; CHECK-NEXT: ret 170 %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0) 171 store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out 172 ret void 173} 174 175define void @insert_nxv8i64_nxv16i64_hi(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) { 176; CHECK-LABEL: insert_nxv8i64_nxv16i64_hi: 177; CHECK: // %bb.0: 178; CHECK-NEXT: ptrue p0.d 179; CHECK-NEXT: st1d { z3.d }, p0, [x0, #7, mul vl] 180; CHECK-NEXT: st1d { z2.d }, p0, [x0, #6, mul vl] 181; CHECK-NEXT: st1d { z1.d }, p0, [x0, #5, mul vl] 182; CHECK-NEXT: st1d { z0.d }, p0, [x0, #4, mul vl] 183; CHECK-NEXT: ret 184 %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 8) 185 store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out 186 ret void 187} 188 189define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, <vscale x 16 x i64>* %out) uwtable { 190; CHECK-LABEL: insert_v2i64_nxv16i64: 191; CHECK: // %bb.0: 192; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 193; CHECK-NEXT: .cfi_def_cfa_offset 16 194; CHECK-NEXT: .cfi_offset w29, -16 195; CHECK-NEXT: addvl sp, sp, #-4 196; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG 197; CHECK-NEXT: ptrue p0.d 198; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 199; CHECK-NEXT: st1d { z0.d }, p0, [sp] 200; CHECK-NEXT: str q1, [sp, #32] 201; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] 202; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp, #1, mul vl] 203; CHECK-NEXT: ld1d { z2.d }, p0/z, [sp, #2, mul vl] 204; CHECK-NEXT: ld1d { z3.d }, p0/z, [sp, #3, mul vl] 205; CHECK-NEXT: st1d { z3.d }, p0, [x0, #3, mul vl] 206; CHECK-NEXT: st1d { z2.d }, p0, [x0, #2, mul vl] 207; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl] 208; CHECK-NEXT: st1d { z0.d }, p0, [x0] 209; CHECK-NEXT: addvl sp, sp, #4 210; CHECK-NEXT: .cfi_def_cfa wsp, 16 211; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 212; CHECK-NEXT: .cfi_def_cfa_offset 0 213; CHECK-NEXT: .cfi_restore w29 214; CHECK-NEXT: ret 215 %v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0) 216 %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4) 217 store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out 218 ret void 219} 220 221define void @insert_v2i64_nxv16i64_lo0(<2 x i64>* %psv, <vscale x 16 x i64>* %out) { 222; CHECK-LABEL: insert_v2i64_nxv16i64_lo0: 223; CHECK: // %bb.0: 224; CHECK-NEXT: ldr q0, [x0] 225; CHECK-NEXT: ptrue p0.d 226; CHECK-NEXT: st1d { z0.d }, p0, [x1] 227; CHECK-NEXT: ret 228 %sv = load <2 x i64>, <2 x i64>* %psv 229 %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0) 230 store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out 231 ret void 232} 233 234define void @insert_v2i64_nxv16i64_lo2(<2 x i64>* %psv, <vscale x 16 x i64>* %out) uwtable { 235; CHECK-LABEL: insert_v2i64_nxv16i64_lo2: 236; CHECK: // %bb.0: 237; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 238; CHECK-NEXT: .cfi_def_cfa_offset 16 239; CHECK-NEXT: .cfi_offset w29, -16 240; CHECK-NEXT: addvl sp, sp, #-2 241; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG 242; CHECK-NEXT: ldr q0, [x0] 243; CHECK-NEXT: ptrue p0.d 244; CHECK-NEXT: str q0, [sp, #16] 245; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] 246; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp, #1, mul vl] 247; CHECK-NEXT: st1d { z1.d }, p0, [x1, #1, mul vl] 248; CHECK-NEXT: st1d { z0.d }, p0, [x1] 249; CHECK-NEXT: addvl sp, sp, #2 250; CHECK-NEXT: .cfi_def_cfa wsp, 16 251; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 252; CHECK-NEXT: .cfi_def_cfa_offset 0 253; CHECK-NEXT: .cfi_restore w29 254; CHECK-NEXT: ret 255 %sv = load <2 x i64>, <2 x i64>* %psv 256 %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2) 257 store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out 258 ret void 259} 260 261 262; Insert subvectors that need widening 263 264define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_undef() nounwind { 265; CHECK-LABEL: insert_nxv1i32_nxv4i32_undef: 266; CHECK: // %bb.0: // %entry 267; CHECK-NEXT: mov z0.s, #1 // =0x1 268; CHECK-NEXT: ret 269entry: 270 %0 = insertelement <vscale x 1 x i32> undef, i32 1, i32 0 271 %subvec = shufflevector <vscale x 1 x i32> %0, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer 272 %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> %subvec, i64 0) 273 ret <vscale x 4 x i32> %retval 274} 275 276define <vscale x 6 x i16> @insert_nxv1i16_nxv6i16_undef() nounwind { 277; CHECK-LABEL: insert_nxv1i16_nxv6i16_undef: 278; CHECK: // %bb.0: // %entry 279; CHECK-NEXT: mov z0.h, #1 // =0x1 280; CHECK-NEXT: ret 281entry: 282 %0 = insertelement <vscale x 1 x i16> undef, i16 1, i32 0 283 %subvec = shufflevector <vscale x 1 x i16> %0, <vscale x 1 x i16> undef, <vscale x 1 x i32> zeroinitializer 284 %retval = call <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16> undef, <vscale x 1 x i16> %subvec, i64 0) 285 ret <vscale x 6 x i16> %retval 286} 287 288define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_undef(<vscale x 1 x float> %subvec) nounwind { 289; CHECK-LABEL: insert_nxv1f32_nxv4f32_undef: 290; CHECK: // %bb.0: // %entry 291; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s 292; CHECK-NEXT: ret 293entry: 294 %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> undef, <vscale x 1 x float> %subvec, i64 0) 295 ret <vscale x 4 x float> %retval 296} 297 298; This tests promotion of the input operand to INSERT_SUBVECTOR. 299define <vscale x 8 x i16> @insert_nxv8i16_nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in) nounwind { 300; CHECK-LABEL: insert_nxv8i16_nxv2i16: 301; CHECK: // %bb.0: 302; CHECK-NEXT: uunpklo z2.s, z0.h 303; CHECK-NEXT: uunpkhi z0.s, z0.h 304; CHECK-NEXT: uunpklo z2.d, z2.s 305; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s 306; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h 307; CHECK-NEXT: ret 308 %r = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in, i64 2) 309 ret <vscale x 8 x i16> %r 310} 311 312define <vscale x 4 x half> @insert_nxv4f16_nxv2f16_0(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1) nounwind { 313; CHECK-LABEL: insert_nxv4f16_nxv2f16_0: 314; CHECK: // %bb.0: 315; CHECK-NEXT: uunpkhi z0.d, z0.s 316; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s 317; CHECK-NEXT: ret 318 %v0 = call <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1, i64 0) 319 ret <vscale x 4 x half> %v0 320} 321 322define <vscale x 4 x half> @insert_nxv4f16_nxv2f16_2(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1) nounwind { 323; CHECK-LABEL: insert_nxv4f16_nxv2f16_2: 324; CHECK: // %bb.0: 325; CHECK-NEXT: uunpklo z0.d, z0.s 326; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s 327; CHECK-NEXT: ret 328 %v0 = call <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1, i64 2) 329 ret <vscale x 4 x half> %v0 330} 331 332; Test that the index is scaled by vscale if the subvector is scalable. 333define <vscale x 8 x half> @insert_nxv8f16_nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in) nounwind { 334; CHECK-LABEL: insert_nxv8f16_nxv2f16: 335; CHECK: // %bb.0: 336; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 337; CHECK-NEXT: addvl sp, sp, #-1 338; CHECK-NEXT: ptrue p0.h 339; CHECK-NEXT: ptrue p1.d 340; CHECK-NEXT: st1h { z0.h }, p0, [sp] 341; CHECK-NEXT: st1h { z1.d }, p1, [sp, #1, mul vl] 342; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] 343; CHECK-NEXT: addvl sp, sp, #1 344; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 345; CHECK-NEXT: ret 346 %r = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in, i64 2) 347 ret <vscale x 8 x half> %r 348} 349 350define <vscale x 8 x half> @insert_nxv8f16_nxv4f16_0(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1) nounwind { 351; CHECK-LABEL: insert_nxv8f16_nxv4f16_0: 352; CHECK: // %bb.0: 353; CHECK-NEXT: uunpkhi z0.s, z0.h 354; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h 355; CHECK-NEXT: ret 356 %v0 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1, i64 0) 357 ret <vscale x 8 x half> %v0 358} 359 360define <vscale x 8 x half> @insert_nxv8f16_nxv4f16_4(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1) nounwind { 361; CHECK-LABEL: insert_nxv8f16_nxv4f16_4: 362; CHECK: // %bb.0: 363; CHECK-NEXT: uunpklo z0.s, z0.h 364; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h 365; CHECK-NEXT: ret 366 %v0 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1, i64 4) 367 ret <vscale x 8 x half> %v0 368} 369 370; Fixed length clamping 371 372define <vscale x 2 x i64> @insert_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind #0 { 373; CHECK-LABEL: insert_fixed_v2i64_nxv2i64: 374; CHECK: // %bb.0: 375; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 376; CHECK-NEXT: addvl sp, sp, #-1 377; CHECK-NEXT: cntd x8 378; CHECK-NEXT: mov w9, #2 379; CHECK-NEXT: sub x8, x8, #2 380; CHECK-NEXT: ptrue p0.d 381; CHECK-NEXT: cmp x8, #2 382; CHECK-NEXT: st1d { z0.d }, p0, [sp] 383; CHECK-NEXT: csel x8, x8, x9, lo 384; CHECK-NEXT: mov x9, sp 385; CHECK-NEXT: lsl x8, x8, #3 386; CHECK-NEXT: str q1, [x9, x8] 387; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] 388; CHECK-NEXT: addvl sp, sp, #1 389; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 390; CHECK-NEXT: ret 391 %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2) 392 ret <vscale x 2 x i64> %retval 393} 394 395define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, <4 x i64>* %ptr) nounwind #0 { 396; CHECK-LABEL: insert_fixed_v4i64_nxv2i64: 397; CHECK: // %bb.0: 398; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 399; CHECK-NEXT: addvl sp, sp, #-1 400; CHECK-NEXT: cntd x8 401; CHECK-NEXT: ptrue p0.d 402; CHECK-NEXT: subs x8, x8, #4 403; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] 404; CHECK-NEXT: csel x8, xzr, x8, lo 405; CHECK-NEXT: mov w9, #4 406; CHECK-NEXT: cmp x8, #4 407; CHECK-NEXT: st1d { z0.d }, p0, [sp] 408; CHECK-NEXT: csel x8, x8, x9, lo 409; CHECK-NEXT: mov x9, sp 410; CHECK-NEXT: st1d { z1.d }, p0, [x9, x8, lsl #3] 411; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] 412; CHECK-NEXT: addvl sp, sp, #1 413; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 414; CHECK-NEXT: ret 415 %subvec = load <4 x i64>, <4 x i64>* %ptr 416 %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> %vec, <4 x i64> %subvec, i64 4) 417 ret <vscale x 2 x i64> %retval 418} 419 420;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 421;; Upacked types that need result widening 422;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 423 424define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32(<vscale x 2 x i32> %sv0) { 425; CHECK-LABEL: insert_nxv3i32_nxv2i32: 426; CHECK: // %bb.0: 427; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s 428; CHECK-NEXT: ret 429 %v0 = call <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> undef, <vscale x 2 x i32> %sv0, i64 0) 430 ret <vscale x 3 x i32> %v0 431} 432 433;; Check that the Subvector is not widen so it does not crash. 434define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1) { 435; CHECK-LABEL: insert_nxv3i32_nxv2i32_2: 436; CHECK: // %bb.0: 437; CHECK-NEXT: uunpkhi z0.d, z0.s 438; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s 439; CHECK-NEXT: ret 440 %v0 = call <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1, i64 0) 441 ret <vscale x 3 x i32> %v0 442} 443 444define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) nounwind { 445; CHECK-LABEL: insert_nxv3f32_nxv2f32: 446; CHECK: // %bb.0: 447; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s 448; CHECK-NEXT: ret 449 %v0 = call <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0) 450 ret <vscale x 3 x float> %v0 451} 452 453define <vscale x 4 x float> @insert_nxv4f32_nxv2f32_0(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1) nounwind { 454; CHECK-LABEL: insert_nxv4f32_nxv2f32_0: 455; CHECK: // %bb.0: 456; CHECK-NEXT: uunpkhi z0.d, z0.s 457; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s 458; CHECK-NEXT: ret 459 %v0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1, i64 0) 460 ret <vscale x 4 x float> %v0 461} 462 463define <vscale x 4 x float> @insert_nxv4f32_nxv2f32_2(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1) nounwind { 464; CHECK-LABEL: insert_nxv4f32_nxv2f32_2: 465; CHECK: // %bb.0: 466; CHECK-NEXT: uunpklo z0.d, z0.s 467; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s 468; CHECK-NEXT: ret 469 %v0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1, i64 2) 470 ret <vscale x 4 x float> %v0 471} 472 473define <vscale x 6 x i32> @insert_nxv6i32_nxv2i32(<vscale x 2 x i32> %sv0, <vscale x 2 x i32> %sv1) nounwind { 474; CHECK-LABEL: insert_nxv6i32_nxv2i32: 475; CHECK: // %bb.0: 476; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 477; CHECK-NEXT: addvl sp, sp, #-2 478; CHECK-NEXT: ptrue p0.s 479; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s 480; CHECK-NEXT: st1w { z0.s }, p0, [sp] 481; CHECK-NEXT: ld1w { z1.s }, p0/z, [sp, #1, mul vl] 482; CHECK-NEXT: addvl sp, sp, #2 483; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 484; CHECK-NEXT: ret 485 %v0 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> undef, <vscale x 2 x i32> %sv0, i64 0) 486 %v1 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> %v0, <vscale x 2 x i32> %sv1, i64 2) 487 ret <vscale x 6 x i32> %v1 488} 489 490;; This only works because the input vector is undef and index is zero 491define <vscale x 6 x i32> @insert_nxv6i32_nxv3i32(<vscale x 3 x i32> %sv0) { 492; CHECK-LABEL: insert_nxv6i32_nxv3i32: 493; CHECK: // %bb.0: 494; CHECK-NEXT: ret 495 %v0 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32> undef, <vscale x 3 x i32> %sv0, i64 0) 496 ret <vscale x 6 x i32> %v0 497} 498 499define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vscale x 4 x i32> %sv1, <vscale x 4 x i32> %sv2) { 500; CHECK-LABEL: insert_nxv12i32_nxv4i32: 501; CHECK: // %bb.0: 502; CHECK-NEXT: ret 503 %v0 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> undef, <vscale x 4 x i32> %sv0, i64 0) 504 %v1 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v0, <vscale x 4 x i32> %sv1, i64 4) 505 %v2 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v1, <vscale x 4 x i32> %sv2, i64 8) 506 ret <vscale x 12 x i32> %v2 507} 508 509define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind { 510; CHECK-LABEL: insert_nxv2bf16_nxv2bf16: 511; CHECK: // %bb.0: 512; CHECK-NEXT: mov z0.d, z1.d 513; CHECK-NEXT: ret 514 %v0 = call <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0) 515 ret <vscale x 2 x bfloat> %v0 516} 517 518define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind { 519; CHECK-LABEL: insert_nxv4bf16_nxv4bf16: 520; CHECK: // %bb.0: 521; CHECK-NEXT: mov z0.d, z1.d 522; CHECK-NEXT: ret 523 %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0) 524 ret <vscale x 4 x bfloat> %v0 525} 526 527define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind { 528; CHECK-LABEL: insert_nxv4bf16_v4bf16: 529; CHECK: // %bb.0: 530; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 531; CHECK-NEXT: addvl sp, sp, #-1 532; CHECK-NEXT: ptrue p0.s 533; CHECK-NEXT: st1h { z0.s }, p0, [sp, #1, mul vl] 534; CHECK-NEXT: addpl x8, sp, #4 535; CHECK-NEXT: str d1, [x8] 536; CHECK-NEXT: ld1h { z0.s }, p0/z, [sp, #1, mul vl] 537; CHECK-NEXT: addvl sp, sp, #1 538; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 539; CHECK-NEXT: ret 540 %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0) 541 ret <vscale x 4 x bfloat> %v0 542} 543 544define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind { 545; CHECK-LABEL: insert_nxv8bf16_nxv8bf16: 546; CHECK: // %bb.0: 547; CHECK-NEXT: mov z0.d, z1.d 548; CHECK-NEXT: ret 549 %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0) 550 ret <vscale x 8 x bfloat> %v0 551} 552 553define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind { 554; CHECK-LABEL: insert_nxv8bf16_v8bf16: 555; CHECK: // %bb.0: 556; CHECK-NEXT: ptrue p0.h, vl8 557; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 558; CHECK-NEXT: mov z0.h, p0/m, z1.h 559; CHECK-NEXT: ret 560 %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0) 561 ret <vscale x 8 x bfloat> %v0 562} 563 564define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv4bf16_0(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind { 565; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_0: 566; CHECK: // %bb.0: 567; CHECK-NEXT: uunpkhi z0.s, z0.h 568; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h 569; CHECK-NEXT: ret 570 %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0) 571 ret <vscale x 8 x bfloat> %v0 572} 573 574define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv4bf16_4(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind { 575; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_4: 576; CHECK: // %bb.0: 577; CHECK-NEXT: uunpklo z0.s, z0.h 578; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h 579; CHECK-NEXT: ret 580 %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 4) 581 ret <vscale x 8 x bfloat> %v0 582} 583 584define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv2bf16_0(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind { 585; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_0: 586; CHECK: // %bb.0: 587; CHECK-NEXT: uunpkhi z0.d, z0.s 588; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s 589; CHECK-NEXT: ret 590 %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0) 591 ret <vscale x 4 x bfloat> %v0 592} 593 594define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv2bf16_2(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind { 595; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_2: 596; CHECK: // %bb.0: 597; CHECK-NEXT: uunpklo z0.d, z0.s 598; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s 599; CHECK-NEXT: ret 600 %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 2) 601 ret <vscale x 4 x bfloat> %v0 602} 603 604; Test predicate inserts of half size. 605define <vscale x 16 x i1> @insert_nxv16i1_nxv8i1_0(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv) { 606; CHECK-LABEL: insert_nxv16i1_nxv8i1_0: 607; CHECK: // %bb.0: 608; CHECK-NEXT: punpkhi p0.h, p0.b 609; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 610; CHECK-NEXT: ret 611 %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv, i64 0) 612 ret <vscale x 16 x i1> %v0 613} 614 615define <vscale x 16 x i1> @insert_nxv16i1_nxv8i1_8(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv) { 616; CHECK-LABEL: insert_nxv16i1_nxv8i1_8: 617; CHECK: // %bb.0: 618; CHECK-NEXT: punpklo p0.h, p0.b 619; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 620; CHECK-NEXT: ret 621 %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv, i64 8) 622 ret <vscale x 16 x i1> %v0 623} 624 625; Test predicate inserts of less than half the size. 626define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_0(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv) { 627; CHECK-LABEL: insert_nxv16i1_nxv4i1_0: 628; CHECK: // %bb.0: 629; CHECK-NEXT: punpklo p2.h, p0.b 630; CHECK-NEXT: punpkhi p0.h, p0.b 631; CHECK-NEXT: punpkhi p2.h, p2.b 632; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 633; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 634; CHECK-NEXT: ret 635 %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv, i64 0) 636 ret <vscale x 16 x i1> %v0 637} 638 639define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_12(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv) { 640; CHECK-LABEL: insert_nxv16i1_nxv4i1_12: 641; CHECK: // %bb.0: 642; CHECK-NEXT: punpkhi p2.h, p0.b 643; CHECK-NEXT: punpklo p0.h, p0.b 644; CHECK-NEXT: punpklo p2.h, p2.b 645; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 646; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 647; CHECK-NEXT: ret 648 %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv, i64 12) 649 ret <vscale x 16 x i1> %v0 650} 651 652; Test predicate insert into undef/zero 653define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_into_zero(<vscale x 4 x i1> %sv) { 654; CHECK-LABEL: insert_nxv16i1_nxv4i1_into_zero: 655; CHECK: // %bb.0: 656; CHECK-NEXT: pfalse p1.b 657; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h 658; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 659; CHECK-NEXT: ret 660 %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> zeroinitializer, <vscale x 4 x i1> %sv, i64 0) 661 ret <vscale x 16 x i1> %v0 662} 663 664define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_into_poison(<vscale x 4 x i1> %sv) { 665; CHECK-LABEL: insert_nxv16i1_nxv4i1_into_poison: 666; CHECK: // %bb.0: 667; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h 668; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b 669; CHECK-NEXT: ret 670 %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> poison, <vscale x 4 x i1> %sv, i64 0) 671 ret <vscale x 16 x i1> %v0 672} 673 674; Test constant predicate insert into undef 675define <vscale x 2 x i1> @insert_nxv2i1_v8i1_const_true_into_undef() vscale_range(4,8) { 676; CHECK-LABEL: insert_nxv2i1_v8i1_const_true_into_undef: 677; CHECK: // %bb.0: 678; CHECK-NEXT: ptrue p0.d 679; CHECK-NEXT: ret 680 %v0 = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1 (<vscale x 2 x i1> undef, <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0) 681 ret <vscale x 2 x i1> %v0 682} 683 684define <vscale x 4 x i1> @insert_nxv4i1_v16i1_const_true_into_undef() vscale_range(4,8) { 685; CHECK-LABEL: insert_nxv4i1_v16i1_const_true_into_undef: 686; CHECK: // %bb.0: 687; CHECK-NEXT: ptrue p0.s 688; CHECK-NEXT: ret 689 %v0 = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1 (<vscale x 4 x i1> undef, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0) 690 ret <vscale x 4 x i1> %v0 691} 692 693define <vscale x 8 x i1> @insert_nxv8i1_v32i1_const_true_into_undef() vscale_range(4,8) { 694; CHECK-LABEL: insert_nxv8i1_v32i1_const_true_into_undef: 695; CHECK: // %bb.0: 696; CHECK-NEXT: ptrue p0.h 697; CHECK-NEXT: ret 698 %v0 = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1 (<vscale x 8 x i1> undef, <32 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0) 699 ret <vscale x 8 x i1> %v0 700} 701 702define <vscale x 16 x i1> @insert_nxv16i1_v64i1_const_true_into_undef() vscale_range(4,8) { 703; CHECK-LABEL: insert_nxv16i1_v64i1_const_true_into_undef: 704; CHECK: // %bb.0: 705; CHECK-NEXT: ptrue p0.b 706; CHECK-NEXT: ret 707 %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1 (<vscale x 16 x i1> undef, <64 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0) 708 ret <vscale x 16 x i1> %v0 709} 710 711; 712; Insert nxv1i1 type into: nxv2i1 713; 714 715define <vscale x 2 x i1> @insert_nxv1i1_nxv2i1_0(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv) { 716; CHECK-LABEL: insert_nxv1i1_nxv2i1_0: 717; CHECK: // %bb.0: 718; CHECK-NEXT: punpkhi p0.h, p0.b 719; CHECK-NEXT: uzp1 p0.d, p1.d, p0.d 720; CHECK-NEXT: ret 721 %res = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv, i64 0) 722 ret <vscale x 2 x i1> %res 723} 724 725define <vscale x 2 x i1> @insert_nxv1i1_nxv2i1_1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv) { 726; CHECK-LABEL: insert_nxv1i1_nxv2i1_1: 727; CHECK: // %bb.0: 728; CHECK-NEXT: punpklo p0.h, p0.b 729; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d 730; CHECK-NEXT: ret 731 %res = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv, i64 1) 732 ret <vscale x 2 x i1> %res 733} 734 735; 736; Insert nxv1i1 type into: nxv4i1 737; 738 739define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_0(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) { 740; CHECK-LABEL: insert_nxv1i1_nxv4i1_0: 741; CHECK: // %bb.0: 742; CHECK-NEXT: punpklo p2.h, p0.b 743; CHECK-NEXT: punpkhi p0.h, p0.b 744; CHECK-NEXT: punpkhi p2.h, p2.b 745; CHECK-NEXT: uzp1 p1.d, p1.d, p2.d 746; CHECK-NEXT: uzp1 p0.s, p1.s, p0.s 747; CHECK-NEXT: ret 748 %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 0) 749 ret <vscale x 4 x i1> %res 750} 751 752define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) { 753; CHECK-LABEL: insert_nxv1i1_nxv4i1_1: 754; CHECK: // %bb.0: 755; CHECK-NEXT: punpklo p2.h, p0.b 756; CHECK-NEXT: punpkhi p0.h, p0.b 757; CHECK-NEXT: punpklo p2.h, p2.b 758; CHECK-NEXT: uzp1 p1.d, p2.d, p1.d 759; CHECK-NEXT: uzp1 p0.s, p1.s, p0.s 760; CHECK-NEXT: ret 761 %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 1) 762 ret <vscale x 4 x i1> %res 763} 764 765define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_2(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) { 766; CHECK-LABEL: insert_nxv1i1_nxv4i1_2: 767; CHECK: // %bb.0: 768; CHECK-NEXT: punpkhi p2.h, p0.b 769; CHECK-NEXT: punpklo p0.h, p0.b 770; CHECK-NEXT: punpkhi p2.h, p2.b 771; CHECK-NEXT: uzp1 p1.d, p1.d, p2.d 772; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s 773; CHECK-NEXT: ret 774 %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 2) 775 ret <vscale x 4 x i1> %res 776} 777 778define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_3(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) { 779; CHECK-LABEL: insert_nxv1i1_nxv4i1_3: 780; CHECK: // %bb.0: 781; CHECK-NEXT: punpkhi p2.h, p0.b 782; CHECK-NEXT: punpklo p0.h, p0.b 783; CHECK-NEXT: punpklo p2.h, p2.b 784; CHECK-NEXT: uzp1 p1.d, p2.d, p1.d 785; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s 786; CHECK-NEXT: ret 787 %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 3) 788 ret <vscale x 4 x i1> %res 789} 790 791; 792; Insert nxv1i1 type into: nxv8i1 793; 794 795define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_0(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 796; CHECK-LABEL: insert_nxv1i1_nxv8i1_0: 797; CHECK: // %bb.0: 798; CHECK-NEXT: punpklo p2.h, p0.b 799; CHECK-NEXT: punpkhi p0.h, p0.b 800; CHECK-NEXT: punpklo p3.h, p2.b 801; CHECK-NEXT: punpkhi p2.h, p2.b 802; CHECK-NEXT: punpkhi p3.h, p3.b 803; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d 804; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s 805; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h 806; CHECK-NEXT: ret 807 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 0) 808 ret <vscale x 8 x i1> %res 809} 810 811define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 812; CHECK-LABEL: insert_nxv1i1_nxv8i1_1: 813; CHECK: // %bb.0: 814; CHECK-NEXT: punpklo p2.h, p0.b 815; CHECK-NEXT: punpkhi p0.h, p0.b 816; CHECK-NEXT: punpklo p3.h, p2.b 817; CHECK-NEXT: punpkhi p2.h, p2.b 818; CHECK-NEXT: punpklo p3.h, p3.b 819; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d 820; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s 821; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h 822; CHECK-NEXT: ret 823 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 1) 824 ret <vscale x 8 x i1> %res 825} 826 827define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_2(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 828; CHECK-LABEL: insert_nxv1i1_nxv8i1_2: 829; CHECK: // %bb.0: 830; CHECK-NEXT: punpklo p2.h, p0.b 831; CHECK-NEXT: punpkhi p0.h, p0.b 832; CHECK-NEXT: punpkhi p3.h, p2.b 833; CHECK-NEXT: punpklo p2.h, p2.b 834; CHECK-NEXT: punpkhi p3.h, p3.b 835; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d 836; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s 837; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h 838; CHECK-NEXT: ret 839 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 2) 840 ret <vscale x 8 x i1> %res 841} 842 843define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_3(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 844; CHECK-LABEL: insert_nxv1i1_nxv8i1_3: 845; CHECK: // %bb.0: 846; CHECK-NEXT: punpklo p2.h, p0.b 847; CHECK-NEXT: punpkhi p0.h, p0.b 848; CHECK-NEXT: punpkhi p3.h, p2.b 849; CHECK-NEXT: punpklo p2.h, p2.b 850; CHECK-NEXT: punpklo p3.h, p3.b 851; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d 852; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s 853; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h 854; CHECK-NEXT: ret 855 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 3) 856 ret <vscale x 8 x i1> %res 857} 858 859define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_4(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 860; CHECK-LABEL: insert_nxv1i1_nxv8i1_4: 861; CHECK: // %bb.0: 862; CHECK-NEXT: punpkhi p2.h, p0.b 863; CHECK-NEXT: punpklo p0.h, p0.b 864; CHECK-NEXT: punpklo p3.h, p2.b 865; CHECK-NEXT: punpkhi p2.h, p2.b 866; CHECK-NEXT: punpkhi p3.h, p3.b 867; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d 868; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s 869; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h 870; CHECK-NEXT: ret 871 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 4) 872 ret <vscale x 8 x i1> %res 873} 874 875define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_5(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 876; CHECK-LABEL: insert_nxv1i1_nxv8i1_5: 877; CHECK: // %bb.0: 878; CHECK-NEXT: punpkhi p2.h, p0.b 879; CHECK-NEXT: punpklo p0.h, p0.b 880; CHECK-NEXT: punpklo p3.h, p2.b 881; CHECK-NEXT: punpkhi p2.h, p2.b 882; CHECK-NEXT: punpklo p3.h, p3.b 883; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d 884; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s 885; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h 886; CHECK-NEXT: ret 887 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 5) 888 ret <vscale x 8 x i1> %res 889} 890 891define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_6(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 892; CHECK-LABEL: insert_nxv1i1_nxv8i1_6: 893; CHECK: // %bb.0: 894; CHECK-NEXT: punpkhi p2.h, p0.b 895; CHECK-NEXT: punpklo p0.h, p0.b 896; CHECK-NEXT: punpkhi p3.h, p2.b 897; CHECK-NEXT: punpklo p2.h, p2.b 898; CHECK-NEXT: punpkhi p3.h, p3.b 899; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d 900; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s 901; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h 902; CHECK-NEXT: ret 903 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 6) 904 ret <vscale x 8 x i1> %res 905} 906 907define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_7(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) { 908; CHECK-LABEL: insert_nxv1i1_nxv8i1_7: 909; CHECK: // %bb.0: 910; CHECK-NEXT: punpkhi p2.h, p0.b 911; CHECK-NEXT: punpklo p0.h, p0.b 912; CHECK-NEXT: punpkhi p3.h, p2.b 913; CHECK-NEXT: punpklo p2.h, p2.b 914; CHECK-NEXT: punpklo p3.h, p3.b 915; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d 916; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s 917; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h 918; CHECK-NEXT: ret 919 %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 7) 920 ret <vscale x 8 x i1> %res 921} 922 923; 924; Insert nxv1i1 type into: nxv16i1 925; 926 927define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_0(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 928; CHECK-LABEL: insert_nxv1i1_nxv16i1_0: 929; CHECK: // %bb.0: 930; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 931; CHECK-NEXT: .cfi_def_cfa_offset 16 932; CHECK-NEXT: .cfi_offset w29, -16 933; CHECK-NEXT: addvl sp, sp, #-1 934; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 935; CHECK-NEXT: punpklo p2.h, p0.b 936; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 937; CHECK-NEXT: punpklo p3.h, p2.b 938; CHECK-NEXT: punpkhi p2.h, p2.b 939; CHECK-NEXT: punpklo p4.h, p3.b 940; CHECK-NEXT: punpkhi p3.h, p3.b 941; CHECK-NEXT: punpkhi p4.h, p4.b 942; CHECK-NEXT: punpkhi p0.h, p0.b 943; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 944; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 945; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 946; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 947; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 948; CHECK-NEXT: addvl sp, sp, #1 949; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 950; CHECK-NEXT: ret 951 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 0) 952 ret <vscale x 16 x i1> %res 953} 954 955define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 956; CHECK-LABEL: insert_nxv1i1_nxv16i1_1: 957; CHECK: // %bb.0: 958; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 959; CHECK-NEXT: .cfi_def_cfa_offset 16 960; CHECK-NEXT: .cfi_offset w29, -16 961; CHECK-NEXT: addvl sp, sp, #-1 962; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 963; CHECK-NEXT: punpklo p2.h, p0.b 964; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 965; CHECK-NEXT: punpklo p3.h, p2.b 966; CHECK-NEXT: punpkhi p2.h, p2.b 967; CHECK-NEXT: punpklo p4.h, p3.b 968; CHECK-NEXT: punpkhi p3.h, p3.b 969; CHECK-NEXT: punpklo p4.h, p4.b 970; CHECK-NEXT: punpkhi p0.h, p0.b 971; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 972; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 973; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 974; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 975; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 976; CHECK-NEXT: addvl sp, sp, #1 977; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 978; CHECK-NEXT: ret 979 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 1) 980 ret <vscale x 16 x i1> %res 981} 982 983define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_2(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 984; CHECK-LABEL: insert_nxv1i1_nxv16i1_2: 985; CHECK: // %bb.0: 986; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 987; CHECK-NEXT: .cfi_def_cfa_offset 16 988; CHECK-NEXT: .cfi_offset w29, -16 989; CHECK-NEXT: addvl sp, sp, #-1 990; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 991; CHECK-NEXT: punpklo p2.h, p0.b 992; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 993; CHECK-NEXT: punpklo p3.h, p2.b 994; CHECK-NEXT: punpkhi p2.h, p2.b 995; CHECK-NEXT: punpkhi p4.h, p3.b 996; CHECK-NEXT: punpklo p3.h, p3.b 997; CHECK-NEXT: punpkhi p4.h, p4.b 998; CHECK-NEXT: punpkhi p0.h, p0.b 999; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 1000; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1001; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1002; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 1003; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 1004; CHECK-NEXT: addvl sp, sp, #1 1005; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1006; CHECK-NEXT: ret 1007 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 2) 1008 ret <vscale x 16 x i1> %res 1009} 1010 1011define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_3(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1012; CHECK-LABEL: insert_nxv1i1_nxv16i1_3: 1013; CHECK: // %bb.0: 1014; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1015; CHECK-NEXT: .cfi_def_cfa_offset 16 1016; CHECK-NEXT: .cfi_offset w29, -16 1017; CHECK-NEXT: addvl sp, sp, #-1 1018; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1019; CHECK-NEXT: punpklo p2.h, p0.b 1020; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1021; CHECK-NEXT: punpklo p3.h, p2.b 1022; CHECK-NEXT: punpkhi p2.h, p2.b 1023; CHECK-NEXT: punpkhi p4.h, p3.b 1024; CHECK-NEXT: punpklo p3.h, p3.b 1025; CHECK-NEXT: punpklo p4.h, p4.b 1026; CHECK-NEXT: punpkhi p0.h, p0.b 1027; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 1028; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1029; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1030; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 1031; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 1032; CHECK-NEXT: addvl sp, sp, #1 1033; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1034; CHECK-NEXT: ret 1035 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 3) 1036 ret <vscale x 16 x i1> %res 1037} 1038 1039define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_4(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1040; CHECK-LABEL: insert_nxv1i1_nxv16i1_4: 1041; CHECK: // %bb.0: 1042; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1043; CHECK-NEXT: .cfi_def_cfa_offset 16 1044; CHECK-NEXT: .cfi_offset w29, -16 1045; CHECK-NEXT: addvl sp, sp, #-1 1046; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1047; CHECK-NEXT: punpklo p2.h, p0.b 1048; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1049; CHECK-NEXT: punpkhi p3.h, p2.b 1050; CHECK-NEXT: punpklo p2.h, p2.b 1051; CHECK-NEXT: punpklo p4.h, p3.b 1052; CHECK-NEXT: punpkhi p3.h, p3.b 1053; CHECK-NEXT: punpkhi p4.h, p4.b 1054; CHECK-NEXT: punpkhi p0.h, p0.b 1055; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 1056; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1057; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 1058; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1059; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 1060; CHECK-NEXT: addvl sp, sp, #1 1061; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1062; CHECK-NEXT: ret 1063 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 4) 1064 ret <vscale x 16 x i1> %res 1065} 1066 1067define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_5(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1068; CHECK-LABEL: insert_nxv1i1_nxv16i1_5: 1069; CHECK: // %bb.0: 1070; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1071; CHECK-NEXT: .cfi_def_cfa_offset 16 1072; CHECK-NEXT: .cfi_offset w29, -16 1073; CHECK-NEXT: addvl sp, sp, #-1 1074; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1075; CHECK-NEXT: punpklo p2.h, p0.b 1076; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1077; CHECK-NEXT: punpkhi p3.h, p2.b 1078; CHECK-NEXT: punpklo p2.h, p2.b 1079; CHECK-NEXT: punpklo p4.h, p3.b 1080; CHECK-NEXT: punpkhi p3.h, p3.b 1081; CHECK-NEXT: punpklo p4.h, p4.b 1082; CHECK-NEXT: punpkhi p0.h, p0.b 1083; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 1084; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1085; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 1086; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1087; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 1088; CHECK-NEXT: addvl sp, sp, #1 1089; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1090; CHECK-NEXT: ret 1091 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 5) 1092 ret <vscale x 16 x i1> %res 1093} 1094 1095define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_6(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1096; CHECK-LABEL: insert_nxv1i1_nxv16i1_6: 1097; CHECK: // %bb.0: 1098; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1099; CHECK-NEXT: .cfi_def_cfa_offset 16 1100; CHECK-NEXT: .cfi_offset w29, -16 1101; CHECK-NEXT: addvl sp, sp, #-1 1102; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1103; CHECK-NEXT: punpklo p2.h, p0.b 1104; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1105; CHECK-NEXT: punpkhi p3.h, p2.b 1106; CHECK-NEXT: punpklo p2.h, p2.b 1107; CHECK-NEXT: punpkhi p4.h, p3.b 1108; CHECK-NEXT: punpklo p3.h, p3.b 1109; CHECK-NEXT: punpkhi p4.h, p4.b 1110; CHECK-NEXT: punpkhi p0.h, p0.b 1111; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 1112; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1113; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1114; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1115; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 1116; CHECK-NEXT: addvl sp, sp, #1 1117; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1118; CHECK-NEXT: ret 1119 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 6) 1120 ret <vscale x 16 x i1> %res 1121} 1122 1123define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_7(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1124; CHECK-LABEL: insert_nxv1i1_nxv16i1_7: 1125; CHECK: // %bb.0: 1126; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1127; CHECK-NEXT: .cfi_def_cfa_offset 16 1128; CHECK-NEXT: .cfi_offset w29, -16 1129; CHECK-NEXT: addvl sp, sp, #-1 1130; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1131; CHECK-NEXT: punpklo p2.h, p0.b 1132; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1133; CHECK-NEXT: punpkhi p3.h, p2.b 1134; CHECK-NEXT: punpklo p2.h, p2.b 1135; CHECK-NEXT: punpkhi p4.h, p3.b 1136; CHECK-NEXT: punpklo p3.h, p3.b 1137; CHECK-NEXT: punpklo p4.h, p4.b 1138; CHECK-NEXT: punpkhi p0.h, p0.b 1139; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 1140; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1141; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1142; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1143; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b 1144; CHECK-NEXT: addvl sp, sp, #1 1145; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1146; CHECK-NEXT: ret 1147 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 7) 1148 ret <vscale x 16 x i1> %res 1149} 1150 1151define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_8(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1152; CHECK-LABEL: insert_nxv1i1_nxv16i1_8: 1153; CHECK: // %bb.0: 1154; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1155; CHECK-NEXT: .cfi_def_cfa_offset 16 1156; CHECK-NEXT: .cfi_offset w29, -16 1157; CHECK-NEXT: addvl sp, sp, #-1 1158; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1159; CHECK-NEXT: punpkhi p2.h, p0.b 1160; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1161; CHECK-NEXT: punpklo p3.h, p2.b 1162; CHECK-NEXT: punpkhi p2.h, p2.b 1163; CHECK-NEXT: punpklo p4.h, p3.b 1164; CHECK-NEXT: punpkhi p3.h, p3.b 1165; CHECK-NEXT: punpkhi p4.h, p4.b 1166; CHECK-NEXT: punpklo p0.h, p0.b 1167; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 1168; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1169; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 1170; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 1171; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1172; CHECK-NEXT: addvl sp, sp, #1 1173; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1174; CHECK-NEXT: ret 1175 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 8) 1176 ret <vscale x 16 x i1> %res 1177} 1178 1179define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_9(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1180; CHECK-LABEL: insert_nxv1i1_nxv16i1_9: 1181; CHECK: // %bb.0: 1182; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1183; CHECK-NEXT: .cfi_def_cfa_offset 16 1184; CHECK-NEXT: .cfi_offset w29, -16 1185; CHECK-NEXT: addvl sp, sp, #-1 1186; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1187; CHECK-NEXT: punpkhi p2.h, p0.b 1188; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1189; CHECK-NEXT: punpklo p3.h, p2.b 1190; CHECK-NEXT: punpkhi p2.h, p2.b 1191; CHECK-NEXT: punpklo p4.h, p3.b 1192; CHECK-NEXT: punpkhi p3.h, p3.b 1193; CHECK-NEXT: punpklo p4.h, p4.b 1194; CHECK-NEXT: punpklo p0.h, p0.b 1195; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 1196; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1197; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 1198; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 1199; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1200; CHECK-NEXT: addvl sp, sp, #1 1201; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1202; CHECK-NEXT: ret 1203 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 9) 1204 ret <vscale x 16 x i1> %res 1205} 1206 1207define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_10(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1208; CHECK-LABEL: insert_nxv1i1_nxv16i1_10: 1209; CHECK: // %bb.0: 1210; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1211; CHECK-NEXT: .cfi_def_cfa_offset 16 1212; CHECK-NEXT: .cfi_offset w29, -16 1213; CHECK-NEXT: addvl sp, sp, #-1 1214; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1215; CHECK-NEXT: punpkhi p2.h, p0.b 1216; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1217; CHECK-NEXT: punpklo p3.h, p2.b 1218; CHECK-NEXT: punpkhi p2.h, p2.b 1219; CHECK-NEXT: punpkhi p4.h, p3.b 1220; CHECK-NEXT: punpklo p3.h, p3.b 1221; CHECK-NEXT: punpkhi p4.h, p4.b 1222; CHECK-NEXT: punpklo p0.h, p0.b 1223; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 1224; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1225; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1226; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 1227; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1228; CHECK-NEXT: addvl sp, sp, #1 1229; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1230; CHECK-NEXT: ret 1231 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 10) 1232 ret <vscale x 16 x i1> %res 1233} 1234 1235define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_11(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1236; CHECK-LABEL: insert_nxv1i1_nxv16i1_11: 1237; CHECK: // %bb.0: 1238; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1239; CHECK-NEXT: .cfi_def_cfa_offset 16 1240; CHECK-NEXT: .cfi_offset w29, -16 1241; CHECK-NEXT: addvl sp, sp, #-1 1242; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1243; CHECK-NEXT: punpkhi p2.h, p0.b 1244; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1245; CHECK-NEXT: punpklo p3.h, p2.b 1246; CHECK-NEXT: punpkhi p2.h, p2.b 1247; CHECK-NEXT: punpkhi p4.h, p3.b 1248; CHECK-NEXT: punpklo p3.h, p3.b 1249; CHECK-NEXT: punpklo p4.h, p4.b 1250; CHECK-NEXT: punpklo p0.h, p0.b 1251; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 1252; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1253; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1254; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h 1255; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1256; CHECK-NEXT: addvl sp, sp, #1 1257; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1258; CHECK-NEXT: ret 1259 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 11) 1260 ret <vscale x 16 x i1> %res 1261} 1262 1263define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_12(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1264; CHECK-LABEL: insert_nxv1i1_nxv16i1_12: 1265; CHECK: // %bb.0: 1266; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1267; CHECK-NEXT: .cfi_def_cfa_offset 16 1268; CHECK-NEXT: .cfi_offset w29, -16 1269; CHECK-NEXT: addvl sp, sp, #-1 1270; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1271; CHECK-NEXT: punpkhi p2.h, p0.b 1272; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1273; CHECK-NEXT: punpkhi p3.h, p2.b 1274; CHECK-NEXT: punpklo p2.h, p2.b 1275; CHECK-NEXT: punpklo p4.h, p3.b 1276; CHECK-NEXT: punpkhi p3.h, p3.b 1277; CHECK-NEXT: punpkhi p4.h, p4.b 1278; CHECK-NEXT: punpklo p0.h, p0.b 1279; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 1280; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1281; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 1282; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1283; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1284; CHECK-NEXT: addvl sp, sp, #1 1285; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1286; CHECK-NEXT: ret 1287 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 12) 1288 ret <vscale x 16 x i1> %res 1289} 1290 1291define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_13(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1292; CHECK-LABEL: insert_nxv1i1_nxv16i1_13: 1293; CHECK: // %bb.0: 1294; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1295; CHECK-NEXT: .cfi_def_cfa_offset 16 1296; CHECK-NEXT: .cfi_offset w29, -16 1297; CHECK-NEXT: addvl sp, sp, #-1 1298; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1299; CHECK-NEXT: punpkhi p2.h, p0.b 1300; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1301; CHECK-NEXT: punpkhi p3.h, p2.b 1302; CHECK-NEXT: punpklo p2.h, p2.b 1303; CHECK-NEXT: punpklo p4.h, p3.b 1304; CHECK-NEXT: punpkhi p3.h, p3.b 1305; CHECK-NEXT: punpklo p4.h, p4.b 1306; CHECK-NEXT: punpklo p0.h, p0.b 1307; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 1308; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1309; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s 1310; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1311; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1312; CHECK-NEXT: addvl sp, sp, #1 1313; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1314; CHECK-NEXT: ret 1315 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 13) 1316 ret <vscale x 16 x i1> %res 1317} 1318 1319define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_14(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1320; CHECK-LABEL: insert_nxv1i1_nxv16i1_14: 1321; CHECK: // %bb.0: 1322; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1323; CHECK-NEXT: .cfi_def_cfa_offset 16 1324; CHECK-NEXT: .cfi_offset w29, -16 1325; CHECK-NEXT: addvl sp, sp, #-1 1326; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1327; CHECK-NEXT: punpkhi p2.h, p0.b 1328; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1329; CHECK-NEXT: punpkhi p3.h, p2.b 1330; CHECK-NEXT: punpklo p2.h, p2.b 1331; CHECK-NEXT: punpkhi p4.h, p3.b 1332; CHECK-NEXT: punpklo p3.h, p3.b 1333; CHECK-NEXT: punpkhi p4.h, p4.b 1334; CHECK-NEXT: punpklo p0.h, p0.b 1335; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d 1336; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1337; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1338; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1339; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1340; CHECK-NEXT: addvl sp, sp, #1 1341; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1342; CHECK-NEXT: ret 1343 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 14) 1344 ret <vscale x 16 x i1> %res 1345} 1346 1347define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) { 1348; CHECK-LABEL: insert_nxv1i1_nxv16i1_15: 1349; CHECK: // %bb.0: 1350; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 1351; CHECK-NEXT: .cfi_def_cfa_offset 16 1352; CHECK-NEXT: .cfi_offset w29, -16 1353; CHECK-NEXT: addvl sp, sp, #-1 1354; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 1355; CHECK-NEXT: punpkhi p2.h, p0.b 1356; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill 1357; CHECK-NEXT: punpkhi p3.h, p2.b 1358; CHECK-NEXT: punpklo p2.h, p2.b 1359; CHECK-NEXT: punpkhi p4.h, p3.b 1360; CHECK-NEXT: punpklo p3.h, p3.b 1361; CHECK-NEXT: punpklo p4.h, p4.b 1362; CHECK-NEXT: punpklo p0.h, p0.b 1363; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d 1364; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload 1365; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s 1366; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h 1367; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b 1368; CHECK-NEXT: addvl sp, sp, #1 1369; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 1370; CHECK-NEXT: ret 1371 %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 15) 1372 ret <vscale x 16 x i1> %res 1373} 1374 1375attributes #0 = { vscale_range(2,2) } 1376 1377declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64) 1378 1379declare <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64) 1380declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64) 1381declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64) 1382 1383declare <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64) 1384declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64) 1385declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64) 1386declare <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64) 1387declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64) 1388declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64) 1389 1390declare <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64) 1391declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat>, <vscale x 2 x bfloat>, i64) 1392declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64) 1393declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64) 1394declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64) 1395declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat>, <vscale x 4 x bfloat>, i64) 1396declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64) 1397 1398declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64) 1399declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64) 1400declare <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64) 1401declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64) 1402 1403declare <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half>, <vscale x 2 x half>, i64) 1404declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64) 1405declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half>, <vscale x 4 x half>, i64) 1406 1407declare <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64) 1408declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64) 1409declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64) 1410 1411declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1>, <8 x i1>, i64) 1412declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1>, <16 x i1>, i64) 1413declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1>, <32 x i1>, i64) 1414declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1>, <vscale x 1 x i1>, i64) 1415declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1>, <vscale x 1 x i1>, i64) 1416declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64) 1417declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1>, <vscale x 1 x i1>, i64) 1418declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64) 1419declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i64) 1420declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1>, <64 x i1>, i64) 1421