; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s

; Tests for the ARM MVE tail-predication pass (-tail-predication=enabled).
; Each loop below builds its predicate with @llvm.get.active.lane.mask and
; uses masked loads/stores; the autogenerated assertions capture whether the
; loop is turned into a fully tail-predicated hardware loop (dlstp/letp) or
; is left with explicit vctp predication and a plain `le` loop-end.

; Reduction loop: acc += a[i] * b[i], with a vaddv across lanes after the
; loop.  The expected output below keeps vctp.32 + vpstt inside the loop and
; ends it with `le` (no dlstp/letp is formed for this reduction).
define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
; CHECK-LABEL: mul_reduce_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB0_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r3, r2, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: .LBB0_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vmul.i32 q0, q2, q0
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the 4-lane vector width.
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %a, i32 %index

  ; The lane mask is produced by the intrinsic rather than the equivalent
  ; icmp kept (commented out) below for reference.
  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %b, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
  %6 = add nsw <4 x i32> %5, %vec.phi
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Keep only the lanes that were active on the final iteration, then
  ; horizontally reduce the accumulator.
  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
  ret i32 %res.0.lcssa
}

; Despite the *_const name, the IR below never reads %b: the loop only
; accumulates a[i] (acc += a[i]).  As with the reduction above, the expected
; output keeps vctp.32 + vpst and a plain `le` loop-end.
define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
; CHECK-LABEL: mul_reduce_add_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB1_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r1, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %a, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
  ret i32 %res.0.lcssa
}

; Identical loop body to mul_reduce_add_const (acc += a[i], %b unused);
; only the function name and basic-block labels (.LBB2_*) differ.
define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
; CHECK-LABEL: add_reduce_add_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: itt eq
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: .LBB2_1: @ %vector.ph
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: adds r1, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: bic r1, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: .LBB2_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r0], #16
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %a, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
  ret i32 %res.0.lcssa
}

; a[i] = b[i] * c over masked 4 x i32 lanes.  No cross-iteration value is
; carried, and the expected output is a fully tail-predicated hardware loop:
; dlstp.32 sets it up and letp closes it, with unpredicated-looking
; vldrw/vstrw inside.
define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
; CHECK-LABEL: vector_mul_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB3_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Splat of the scalar multiplier %c used inside the loop.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %b, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
  %4 = getelementptr inbounds i32, i32* %a, i32 %index
  %5 = bitcast i32* %4 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; a[i] = b[i] + c: same structure as vector_mul_const with vadd instead of
; vmul; also expected to become a dlstp.32/letp tail-predicated loop.
define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
; CHECK-LABEL: vector_add_const:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB4_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vadd.i32 q0, q0, r2
; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds i32, i32* %b, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %2 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
  %4 = getelementptr inbounds i32, i32* %a, i32 %index
  %5 = bitcast i32* %4 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; a[i] = b[i] * c[i] on 16 x i8 lanes (step 16, trip count rounded to a
; multiple of 16); expected to become a dlstp.8/letp tail-predicated loop.
define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) {
; CHECK-LABEL: vector_mul_vector_i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB5_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.8 lr, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #16
; CHECK-NEXT: vldrb.u8 q0, [r1], #16
; CHECK-NEXT: vldrb.u8 q1, [r2], #16
; CHECK-NEXT: vmul.i8 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 15
  %n.vec = and i32 %n.rnd.up, -16
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = getelementptr inbounds i8, i8* %b, i32 %index

  ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
  %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)

  %2 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
  %3 = getelementptr inbounds i8, i8* %c, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef)
  %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i8, i8* %a, i32 %index
  %7 = bitcast i8* %6 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1)
  %index.next = add i32 %index, 16
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; a[i] = b[i] * c[i] on 8 x i16 lanes (step 8); expected to become a
; dlstp.16/letp tail-predicated loop.
; NOTE(review): this function references attribute group #0, which is not
; defined anywhere in the visible file — confirm the `attributes #0 = {...}`
; line exists in the full file or drop the reference.
; Function Attrs: nofree norecurse nounwind
define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: vector_mul_vector_i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB6_1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB6_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add.w r12, r12, #8
; CHECK-NEXT: vldrh.u16 q0, [r1], #16
; CHECK-NEXT: vldrh.u16 q1, [r2], #16
; CHECK-NEXT: vmul.i16 q0, q1, q0
; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 7
  %n.vec = and i32 %n.rnd.up, -8
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %0 = getelementptr inbounds i16, i16* %b, i32 %index

  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

  %2 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
  %3 = getelementptr inbounds i16, i16* %c, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef)
  %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
  %6 = getelementptr inbounds i16, i16* %a, i32 %index
  %7 = bitcast i16* %6 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1)
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Intrinsic declarations used by the tests above.
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)