; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+bf16 < %s | FileCheck %s --check-prefixes=CHECK

define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.d, p0/m, z1.d
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
  ret <vscale x 2 x i64> %retval
}

define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64_idx2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    mov w9, #2
; CHECK-NEXT:    sub x8, x8, #2
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cmp x8, #2
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    lsl x8, x8, #3
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
  ret <vscale x 2 x i64> %retval
}

define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.s, p0/m, z1.s
; CHECK-NEXT:    ret
  %retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32_idx4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    mov w9, #4
; CHECK-NEXT:    sub x8, x8, #4
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    cmp x8, #4
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    lsl x8, x8, #2
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 4)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.h, p0/m, z1.h
; CHECK-NEXT:    ret
  %retval = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0)
  ret <vscale x 8 x i16> %retval
}

define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16_idx8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    mov w9, #8
; CHECK-NEXT:    sub x8, x8, #8
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    cmp x8, #8
; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    lsl x8, x8, #1
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 8)
  ret <vscale x 8 x i16> %retval
}

define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.b, p0/m, z1.b
; CHECK-NEXT:    ret
  %retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0)
  ret <vscale x 16 x i8> %retval
}

define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8_idx16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov x8, #-16
; CHECK-NEXT:    mov w9, #16
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
; CHECK-NEXT:    addvl x8, x8, #1
; CHECK-NEXT:    cmp x8, #16
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 16)
  ret <vscale x 16 x i8> %retval
}


; Insert subvectors into illegal vectors

define void @insert_nxv8i64_nxv16i64(<vscale x 8 x i64> %sv0, <vscale x 8 x i64> %sv1, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_nxv8i64_nxv16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z7.d }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    st1d { z6.d }, p0, [x0, #6, mul vl]
; CHECK-NEXT:    st1d { z5.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> %v0, <vscale x 8 x i64> %sv1, i64 8)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

define void @insert_nxv8i64_nxv16i64_lo(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_nxv8i64_nxv16i64_lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

define void @insert_nxv8i64_nxv16i64_hi(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_nxv8i64_nxv16i64_hi:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #6, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 8)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_v2i64_nxv16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-4
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    str q1, [sp, #32]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [sp, #3, mul vl]
; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    addvl sp, sp, #4
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

define void @insert_v2i64_nxv16i64_lo0(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %sv = load <2 x i64>, <2 x i64>* %psv
  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}

define void @insert_v2i64_nxv16i64_lo2(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-2
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    str q0, [sp, #16]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x1, #1, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    addvl sp, sp, #2
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %sv = load <2 x i64>, <2 x i64>* %psv
  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
  ret void
}


; Insert subvectors that need widening

define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_undef() nounwind {
; CHECK-LABEL: insert_nxv1i32_nxv4i32_undef:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z0.s, #1 // =0x1
; CHECK-NEXT:    ret
entry:
  %0 = insertelement <vscale x 1 x i32> undef, i32 1, i32 0
  %subvec = shufflevector <vscale x 1 x i32> %0, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer
  %retval = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> %subvec, i64 0)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 6 x i16> @insert_nxv1i16_nxv6i16_undef() nounwind {
; CHECK-LABEL: insert_nxv1i16_nxv6i16_undef:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z0.h, #1 // =0x1
; CHECK-NEXT:    ret
entry:
  %0 = insertelement <vscale x 1 x i16> undef, i16 1, i32 0
  %subvec = shufflevector <vscale x 1 x i16> %0, <vscale x 1 x i16> undef, <vscale x 1 x i32> zeroinitializer
  %retval = call <vscale x 6 x i16> @llvm.experimental.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16> undef, <vscale x 1 x i16> %subvec, i64 0)
  ret <vscale x 6 x i16> %retval
}

; This tests promotion of the input operand to INSERT_SUBVECTOR.
define <vscale x 8 x i16> @insert_nxv8i16_nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in) nounwind {
; CHECK-LABEL: insert_nxv8i16_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpklo z2.s, z0.h
; CHECK-NEXT:    uunpkhi z0.s, z0.h
; CHECK-NEXT:    uunpklo z2.d, z2.s
; CHECK-NEXT:    uzp1 z1.s, z2.s, z1.s
; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
; CHECK-NEXT:    ret
  %r = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in, i64 2)
  ret <vscale x 8 x i16> %r
}

; Test that the index is scaled by vscale if the subvector is scalable.
define <vscale x 8 x half> @insert_nxv8f16_nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in) nounwind {
; CHECK-LABEL: insert_nxv8f16_nxv2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #1, mul vl]
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %r = call <vscale x 8 x half> @llvm.experimental.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in, i64 2)
  ret <vscale x 8 x half> %r
}

; Fixed length clamping

define <vscale x 2 x i64> @insert_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind #0 {
; CHECK-LABEL: insert_fixed_v2i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    mov w9, #2
; CHECK-NEXT:    sub x8, x8, #2
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cmp x8, #2
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    lsl x8, x8, #3
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
  ret <vscale x 2 x i64> %retval
}

define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, <4 x i64>* %ptr) nounwind #0 {
; CHECK-LABEL: insert_fixed_v4i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    subs x8, x8, #4
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    csel x8, xzr, x8, lo
; CHECK-NEXT:    mov w9, #4
; CHECK-NEXT:    cmp x8, #4
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    st1d { z1.d }, p0, [x9, x8, lsl #3]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %subvec = load <4 x i64>, <4 x i64>* %ptr
  %retval = call <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> %vec, <4 x i64> %subvec, i64 4)
  ret <vscale x 2 x i64> %retval
}

attributes #0 = { vscale_range(2,2) }

declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)

declare <vscale x 2 x i64> @llvm.experimental.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)

declare <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
declare <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
declare <vscale x 6 x i16> @llvm.experimental.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64)

declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)

declare <vscale x 8 x half> @llvm.experimental.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Unpacked types that need result widening
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32(<vscale x 2 x i32> %sv0) {
; CHECK-LABEL: insert_nxv3i32_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
  ret <vscale x 3 x i32> %v0
}

;; Check that the subvector is not widened, so it does not crash.
define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1) {
; CHECK-LABEL: insert_nxv3i32_nxv2i32_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpkhi z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1, i64 0)
  ret <vscale x 3 x i32> %v0
}

define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) nounwind {
; CHECK-LABEL: insert_nxv3f32_nxv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0)
  ret <vscale x 3 x float> %v0
}

define <vscale x 6 x i32> @insert_nxv6i32_nxv2i32(<vscale x 2 x i32> %sv0, <vscale x 2 x i32> %sv1) nounwind {
; CHECK-LABEL: insert_nxv6i32_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-2
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    addvl sp, sp, #2
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %v0 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
  %v1 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> %v0, <vscale x 2 x i32> %sv1, i64 2)
  ret <vscale x 6 x i32> %v1
}

;; This only works because the input vector is undef and index is zero
define <vscale x 6 x i32> @insert_nxv6i32_nxv3i32(<vscale x 3 x i32> %sv0) {
; CHECK-LABEL: insert_nxv6i32_nxv3i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %v0 = call <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32> undef, <vscale x 3 x i32> %sv0, i64 0)
  ret <vscale x 6 x i32> %v0
}

define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vscale x 4 x i32> %sv1, <vscale x 4 x i32> %sv2) {
; CHECK-LABEL: insert_nxv12i32_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %v0 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> undef, <vscale x 4 x i32> %sv0, i64 0)
  %v1 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v0, <vscale x 4 x i32> %sv1, i64 4)
  %v2 = call <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v1, <vscale x 4 x i32> %sv2, i64 8)
  ret <vscale x 12 x i32> %v2
}

define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv2bf16_nxv2bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.d, z1.d
; CHECK-NEXT:    ret
  %v0 = call <vscale x 2 x bfloat> @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
  ret <vscale x 2 x bfloat> %v0
}

define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_nxv4bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.d, z1.d
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
  ret <vscale x 4 x bfloat> %v0
}

define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_v4bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT:    addpl x8, sp, #4
; CHECK-NEXT:    str d1, [x8]
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0)
  ret <vscale x 4 x bfloat> %v0
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_nxv8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.d, z1.d
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0)
  ret <vscale x 8 x bfloat> %v0
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_v8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.h, p0/m, z1.h
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0)
  ret <vscale x 8 x bfloat> %v0
}

declare <vscale x 3 x i32> @llvm.experimental.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
declare <vscale x 3 x float> @llvm.experimental.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
declare <vscale x 6 x i32> @llvm.experimental.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
declare <vscale x 12 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)
declare <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 8 x bfloat> @llvm.experimental.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
declare <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
declare <vscale x 4 x bfloat> @llvm.experimental.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
declare <vscale x 2 x bfloat> @llvm.experimental.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)