; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s

; Codegen test for hardware-loop lowering with MVE enabled (see the llc
; invocation above). Each function below pins the complete expected assembly:
;   @simple          - single do-loop, expected as a dlstp.32/letp loop
;   @nested          - the same do-loop inside an outer for-loop
;   @b               - long scalar while-loop, expected as a wls/le loop
;   @callinpreheader - loop whose preheader contains a call, expected as dls/le
; The assertion lines are generated by the script named on the first line;
; they should be regenerated with it rather than edited by hand.

; Tail predicated so we use DLSTP
;
; IR shape: a do-loop that runs (%n + 3) >> 2 times. Every iteration builds a
; lane predicate from the remaining element count with vctp32, loads four i32
; values from %x, and adds their predicated sum into an accumulator seeded
; with %m. The final accumulator is stored to %z; %y is never used.
; Expected assembly: dlstp.32/letp around a post-incremented vldrw.u32 and an
; accumulating vaddva.s32, with no separate vctp instruction in the loop.
define void @simple(i32* nocapture readonly %x, i32* nocapture readnone %y, i32* nocapture %z, i32 %m, i32 %n) {
; CHECK-LABEL: simple:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    ldr r1, [sp, #8]
; CHECK-NEXT:    mov r12, r3
; CHECK-NEXT:    adds r3, r1, #3
; CHECK-NEXT:    lsrs r3, r3, #2
; CHECK-NEXT:    beq .LBB0_3
; CHECK-NEXT:  @ %bb.1: @ %do.body.preheader
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB0_2: @ %do.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.s32 r12, q0
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  .LBB0_3: @ %if.end
; CHECK-NEXT:    str.w r12, [r2]
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Trip count of the do-loop is (%n + 3) >> 2. When it is zero the loop is
  ; skipped and %m reaches the store in %if.end unchanged.
  %add = add i32 %n, 3
  %div = lshr i32 %add, 2
  %cmp.not = icmp eq i32 %div, 0
  br i1 %cmp.not, label %if.end, label %do.body

do.body: ; preds = %entry, %do.body
  ; Loop-carried state: elements remaining, iterations remaining, running sum
  ; and current source pointer.
  %n.addr.0 = phi i32 [ %sub, %do.body ], [ %n, %entry ]
  %count.0 = phi i32 [ %sub3, %do.body ], [ %div, %entry ]
  %s.0 = phi i32 [ %add2, %do.body ], [ %m, %entry ]
  %x.addr.0 = phi i32* [ %add.ptr, %do.body ], [ %x, %entry ]
  ; Predicate derived from the number of elements still to process.
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
  ; The load is a plain, unpredicated <4 x i32> load; only the reduction
  ; below consumes the predicate.
  %1 = bitcast i32* %x.addr.0 to <4 x i32>*
  %2 = load <4 x i32>, <4 x i32>* %1, align 4
  ; Review note: the i32 0 operand is presumably the signedness selector (the
  ; expected assembly uses the .s32 form) - confirm against the intrinsic
  ; definition.
  %3 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %2, i32 0, <4 x i1> %0)
  %add2 = add nsw i32 %3, %s.0
  ; Advance the pointer by four elements, reduce the remaining element count
  ; by four and the iteration counter by one.
  %add.ptr = getelementptr inbounds i32, i32* %x.addr.0, i32 4
  %sub = add i32 %n.addr.0, -4
  %sub3 = add nsw i32 %count.0, -1
  %cmp4 = icmp sgt i32 %count.0, 1
  br i1 %cmp4, label %do.body, label %if.end

if.end: ; preds = %do.body, %entry
  %s.1 = phi i32 [ %m, %entry ], [ %add2, %do.body ]
  store i32 %s.1, i32* %z, align 4
  ret void
}

; Tail predicated so we use DLSTP
;
; IR shape: the do-loop from @simple placed inside an outer for-loop that runs
; %m times. The source pointer and the element count are carried across outer
; iterations: after a non-empty inner loop the pointer has advanced by
; (%n + 3) & -4 elements and the count has been reduced by the same amount.
; Each outer iteration stores its sum (seeded with %m) to %z[%a].
; Expected assembly: the inner loop uses dlstp.32/letp, while the outer loop
; is an ordinary adds/cmp/beq loop.
define void @nested(i32* nocapture readonly %x, i32* nocapture readnone %y, i32* nocapture %z, i32 %m, i32 %n) {
; CHECK-LABEL: nested:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    cbz r3, .LBB1_8
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr.w r12, [sp, #24]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    b .LBB1_4
; CHECK-NEXT:  .LBB1_2: @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT:    mov r4, r3
; CHECK-NEXT:  .LBB1_3: @ %if.end
; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT:    str.w r4, [r2, r1, lsl #2]
; CHECK-NEXT:    adds r1, #1
; CHECK-NEXT:    cmp r1, r3
; CHECK-NEXT:    beq .LBB1_8
; CHECK-NEXT:  .LBB1_4: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB1_6 Depth 2
; CHECK-NEXT:    add.w r6, r12, #3
; CHECK-NEXT:    lsrs r7, r6, #2
; CHECK-NEXT:    beq .LBB1_2
; CHECK-NEXT:  @ %bb.5: @ %do.body.preheader
; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT:    bic r5, r6, #3
; CHECK-NEXT:    mov r4, r3
; CHECK-NEXT:    add.w r8, r0, r5, lsl #2
; CHECK-NEXT:    dlstp.32 lr, r12
; CHECK-NEXT:  .LBB1_6: @ %do.body
; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.s32 r4, q0
; CHECK-NEXT:    letp lr, .LBB1_6
; CHECK-NEXT:  @ %bb.7: @ %if.end.loopexit
; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
; CHECK-NEXT:    sub.w r12, r12, r5
; CHECK-NEXT:    mov r0, r8
; CHECK-NEXT:    b .LBB1_3
; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  ; Nothing to do when the outer trip count %m is zero.
  %cmp20.not = icmp eq i32 %m, 0
  br i1 %cmp20.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %if.end, %entry
  ret void

for.body: ; preds = %entry, %if.end
  ; Outer-loop state: source pointer, output index and element count.
  %x.addr.023 = phi i32* [ %x.addr.2, %if.end ], [ %x, %entry ]
  %a.022 = phi i32 [ %inc, %if.end ], [ 0, %entry ]
  %n.addr.021 = phi i32 [ %n.addr.2, %if.end ], [ %n, %entry ]
  ; Inner trip count, computed exactly as in @simple.
  %add = add i32 %n.addr.021, 3
  %div = lshr i32 %add, 2
  %cmp1.not = icmp eq i32 %div, 0
  br i1 %cmp1.not, label %if.end, label %do.body.preheader

do.body.preheader: ; preds = %for.body
  ; %0 is (count + 3) rounded down to a multiple of four; %scevgep is the
  ; source pointer value used by the next outer iteration.
  %0 = and i32 %add, -4
  %scevgep = getelementptr i32, i32* %x.addr.023, i32 %0
  br label %do.body

do.body: ; preds = %do.body.preheader, %do.body
  %n.addr.1 = phi i32 [ %sub, %do.body ], [ %n.addr.021, %do.body.preheader ]
  %count.0 = phi i32 [ %sub4, %do.body ], [ %div, %do.body.preheader ]
  %s.0 = phi i32 [ %add3, %do.body ], [ %m, %do.body.preheader ]
  %x.addr.1 = phi i32* [ %add.ptr, %do.body ], [ %x.addr.023, %do.body.preheader ]
  ; Predicate from the remaining element count, unpredicated load, then a
  ; predicated sum added into the running total.
  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.1)
  %2 = bitcast i32* %x.addr.1 to <4 x i32>*
  %3 = load <4 x i32>, <4 x i32>* %2, align 4
  %4 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %3, i32 0, <4 x i1> %1)
  %add3 = add nsw i32 %4, %s.0
  %add.ptr = getelementptr inbounds i32, i32* %x.addr.1, i32 4
  %sub = add i32 %n.addr.1, -4
  %sub4 = add nsw i32 %count.0, -1
  %cmp5 = icmp sgt i32 %count.0, 1
  br i1 %cmp5, label %do.body, label %if.end.loopexit

if.end.loopexit: ; preds = %do.body
  ; Element count handed to the next outer iteration.
  %5 = sub i32 %n.addr.021, %0
  br label %if.end

if.end: ; preds = %if.end.loopexit, %for.body
  ; Values arriving from %for.body are the unchanged ones used when the inner
  ; loop was skipped.
  %n.addr.2 = phi i32 [ %n.addr.021, %for.body ], [ %5, %if.end.loopexit ]
  %s.1 = phi i32 [ %m, %for.body ], [ %add3, %if.end.loopexit ]
  %x.addr.2 = phi i32* [ %x.addr.023, %for.body ], [ %scevgep, %if.end.loopexit ]
  %arrayidx = getelementptr inbounds i32, i32* %z, i32 %a.022
  store i32 %s.1, i32* %arrayidx, align 4
  %inc = add nuw nsw i32 %a.022, 1
  %exitcond.not = icmp eq i32 %inc, %m
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; MVE intrinsics called by @simple and @nested.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)


; Long test that was spilling lr between t2LoopDec and End
;
; IR shape: a while-loop that runs %d times (skipped when %d is zero) and is
; built with "frame-pointer"="all". The body is a long chain of scalar 32-bit
; and 64-bit arithmetic over values loaded through %c, %n and a pointer that
; starts at the constant address 2.
; Expected assembly: wls/le with the loop counter held in lr. The spills and
; reloads inside the loop involve r0, r1 and r2 only; no spill or reload of lr
; appears anywhere in the expected output.
define dso_local i32 @b(i32* %c, i32 %d, i32 %e, i32* %n) "frame-pointer"="all" {
; CHECK-LABEL: b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .setfp r7, sp, #12
; CHECK-NEXT:    add r7, sp, #12
; CHECK-NEXT:    .save {r8, r9, r10, r11}
; CHECK-NEXT:    push.w {r8, r9, r10, r11}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    wls lr, r1, .LBB2_3
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    add.w r10, r3, #4
; CHECK-NEXT:    adds r0, #4
; CHECK-NEXT:    mvn r9, #1
; CHECK-NEXT:    @ implicit-def: $r8
; CHECK-NEXT:    @ implicit-def: $r4
; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
; CHECK-NEXT:  .LBB2_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r2, [r0]
; CHECK-NEXT:    asrs r5, r4, #31
; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    muls r2, r3, r2
; CHECK-NEXT:    adds r4, r4, r2
; CHECK-NEXT:    adc.w r2, r5, r2, asr #31
; CHECK-NEXT:    ldr.w r5, [r9, #4]
; CHECK-NEXT:    adds.w r4, r4, #-2147483648
; CHECK-NEXT:    adc r1, r2, #0
; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    smull r5, r6, r5, r8
; CHECK-NEXT:    ldr.w r2, [r9]
; CHECK-NEXT:    asrs r4, r1, #31
; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    subs r5, r1, r5
; CHECK-NEXT:    sbcs r4, r6
; CHECK-NEXT:    adds.w r6, r5, #-2147483648
; CHECK-NEXT:    adc r5, r4, #0
; CHECK-NEXT:    ldr r4, [r0, #-4]
; CHECK-NEXT:    muls r4, r3, r4
; CHECK-NEXT:    adds r3, #4
; CHECK-NEXT:    adds.w r0, r4, #-2147483648
; CHECK-NEXT:    asr.w r1, r4, #31
; CHECK-NEXT:    ldr.w r4, [r10]
; CHECK-NEXT:    adc r1, r1, #0
; CHECK-NEXT:    mul r2, r4, r12
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    add.w r2, r2, #-2147483648
; CHECK-NEXT:    asrl r0, r1, r2
; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT:    smull r0, r1, r4, r0
; CHECK-NEXT:    lsll r0, r1, #30
; CHECK-NEXT:    asr.w r11, r1, #31
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    lsll r0, r11, r4
; CHECK-NEXT:    lsrl r0, r11, #2
; CHECK-NEXT:    mul r1, r1, r8
; CHECK-NEXT:    adds r1, #2
; CHECK-NEXT:    lsll r0, r11, r1
; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    add.w r0, r0, #-2147483648
; CHECK-NEXT:    asrl r6, r5, r0
; CHECK-NEXT:    movs r0, #2
; CHECK-NEXT:    lsrl r6, r5, #2
; CHECK-NEXT:    str r6, [r0]
; CHECK-NEXT:    mov r8, r6
; CHECK-NEXT:    ldr r0, [r9], #-4
; CHECK-NEXT:    mls r0, r0, r4, r1
; CHECK-NEXT:    adds.w r4, r0, #-2147483648
; CHECK-NEXT:    asr.w r1, r0, #31
; CHECK-NEXT:    adc r1, r1, #0
; CHECK-NEXT:    lsrl r4, r1, #2
; CHECK-NEXT:    rsbs r0, r4, #0
; CHECK-NEXT:    str r0, [r2]
; CHECK-NEXT:    str r0, [r10, #-4]
; CHECK-NEXT:    add.w r10, r10, #4
; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    adds r0, #4
; CHECK-NEXT:    le lr, .LBB2_2
; CHECK-NEXT:  .LBB2_3: @ %while.end
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop.w {r8, r9, r10, r11}
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  ; %e is reinterpreted as a pointer; it is one of the store destinations at
  ; the end of the loop body.
  %0 = inttoptr i32 %e to i32*
  %tobool.not70 = icmp eq i32 %d, 0
  br i1 %tobool.not70, label %while.end, label %while.body

while.body: ; preds = %entry, %while.body
  ; Loop-carried state. %p.077 starts at the constant address 2 and moves back
  ; one element per iteration; %m.074 and %h.072 are undef on entry;
  ; %d.addr.073 is the down-counter that controls the loop.
  %p.077 = phi i32* [ %incdec.ptr22, %while.body ], [ inttoptr (i32 2 to i32*), %entry ]
  %c.addr.076 = phi i32* [ %incdec.ptr1, %while.body ], [ %c, %entry ]
  %n.075 = phi i32* [ %incdec.ptr43, %while.body ], [ %n, %entry ]
  %m.074 = phi i32 [ %conv35, %while.body ], [ undef, %entry ]
  %d.addr.073 = phi i32 [ %dec, %while.body ], [ %d, %entry ]
  %h.072 = phi i32 [ %conv41, %while.body ], [ undef, %entry ]
  ; The pointer values of %n.075 and %c.addr.076 are themselves used as
  ; integer multiplicands (%1 and %4) further down.
  %incdec.ptr43 = getelementptr inbounds i32, i32* %n.075, i32 1
  %1 = ptrtoint i32* %n.075 to i32
  %2 = load i32, i32* %incdec.ptr43, align 4
  %3 = load i32, i32* %c.addr.076, align 4
  %mul = mul nsw i32 %3, %1
  %conv = sext i32 %mul to i64
  %add = add nsw i64 %conv, 2147483648
  %incdec.ptr1 = getelementptr inbounds i32, i32* %c.addr.076, i32 1
  %4 = ptrtoint i32* %c.addr.076 to i32
  %mul2 = mul nsw i32 %2, %4
  %conv3 = sext i32 %mul2 to i64
  %add4 = add nsw i64 %conv3, 2147483648
  %shr = ashr i64 %add, %add4
  %5 = shl nuw i64 %shr, 32
  %conv6 = ashr exact i64 %5, 32
  %conv7 = sext i32 %2 to i64
  %conv11 = sext i32 %h.072 to i64
  %6 = load i32, i32* %incdec.ptr1, align 4
  %mul12 = mul nsw i32 %6, %1
  %conv13 = sext i32 %mul12 to i64
  %add14 = add nuw nsw i64 %conv11, 2147483648
  %add15 = add nsw i64 %add14, %conv13
  %shr16 = ashr i64 %add15, 32
  %conv17 = trunc i64 %shr16 to i32
  %mul8 = shl nsw i64 %conv7, 30
  %7 = mul i64 %mul8, %conv6
  %conv18 = ashr i64 %7, 32
  %sh_prom = zext i32 %2 to i64
  %shl = shl i64 %conv18, %sh_prom
  %conv21 = sext i32 %conv17 to i64
  %incdec.ptr22 = getelementptr inbounds i32, i32* %p.077, i32 -1
  %8 = load i32, i32* %p.077, align 4
  %conv23 = sext i32 %8 to i64
  %conv24 = sext i32 %m.074 to i64
  %mul25 = mul nsw i64 %conv23, %conv24
  %sub = sub nsw i64 2147483648, %mul25
  %add26 = add nsw i64 %sub, %conv21
  %9 = shl i64 %shl, 30
  %conv27 = ashr i64 %9, 32
  %10 = load i32, i32* %incdec.ptr22, align 4
  %mul28 = mul nsw i32 %10, %m.074
  %add29 = add nsw i32 %mul28, 2
  %sh_prom30 = zext i32 %add29 to i64
  %shl31 = shl i64 %conv27, %sh_prom30
  %add32 = add nsw i64 %shl31, 2147483648
  %shr33 = ashr i64 %add26, %add32
  %11 = lshr i64 %shr33, 2
  %conv35 = trunc i64 %11 to i32
  ; First result: stored to the constant address 2 and carried into the next
  ; iteration as %m.074.
  store i32 %conv35, i32* inttoptr (i32 2 to i32*), align 4
  ; %incdec.ptr22 is loaded a second time here, after the store above.
  %12 = load i32, i32* %incdec.ptr22, align 4
  %mul36 = mul nsw i32 %12, %2
  %sub37 = sub nsw i32 %conv17, %mul36
  %conv38 = sext i32 %sub37 to i64
  %add39 = add nsw i64 %conv38, 2147483648
  %13 = lshr i64 %add39, 2
  %conv41 = trunc i64 %13 to i32
  ; Second result: its negation is stored through both %0 and %n.075, while
  ; %conv41 itself is carried into the next iteration as %h.072.
  %sub42 = sub nsw i32 0, %conv41
  store i32 %sub42, i32* %0, align 4
  store i32 %sub42, i32* %n.075, align 4
  %dec = add nsw i32 %d.addr.073, -1
  %tobool.not = icmp eq i32 %dec, 0
  br i1 %tobool.not, label %while.end, label %while.body

while.end: ; preds = %while.body, %entry
  ; The return value is unspecified; the function is exercised only for the
  ; code generated for its loop.
  ret i32 undef
}

; External function called from the loop preheader of @callinpreheader.
declare void @callee()
; Loop with a call in its preheader.
;
; IR shape: sums %size i32 elements read from %pAngle and stores the total to
; %pDst (0 when %size is zero). The block %for.body.ph, which branches straight
; into the loop, calls @callee first.
; Expected assembly: %size is copied to r6 before the call, and the loop start
; "dls lr, r6" is emitted only after "bl callee"; the loop closes with le.
define void @callinpreheader(i32* noalias nocapture readonly %pAngle, i32* nocapture %pDst, i32 %size) {
; CHECK-LABEL: callinpreheader:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    mov r5, r0
; CHECK-NEXT:    mov r4, r1
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:    cbz r2, .LBB3_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.ph
; CHECK-NEXT:    mov r6, r2
; CHECK-NEXT:    bl callee
; CHECK-NEXT:    dls lr, r6
; CHECK-NEXT:    movs r0, #0
; CHECK-NEXT:  .LBB3_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r5], #4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    le lr, .LBB3_2
; CHECK-NEXT:  .LBB3_3: @ %for.cond.cleanup
; CHECK-NEXT:    str r0, [r4]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  ; Skip both the call and the loop when there are no elements.
  %cmp7.not = icmp eq i32 %size, 0
  br i1 %cmp7.not, label %for.cond.cleanup, label %for.body.ph

for.body.ph:
  ; The call sits between the zero-trip guard and the loop itself.
  call void @callee()
  br label %for.body

for.body:
  ; %i.09 is the element index and %s.08 the running sum.
  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.ph ]
  %s.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.ph ]
  %arrayidx = getelementptr inbounds i32, i32* %pAngle, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, %s.08
  %inc = add nuw nsw i32 %i.09, 1
  %exitcond.not = icmp eq i32 %inc, %size
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:
  ; 0 when the loop was skipped, otherwise the accumulated sum.
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  store i32 %s.0.lcssa, i32* %pDst, align 4
  ret void
}