1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
3
4; Tail predicated so we use DLSTP
; @simple: predicated vector sum over %x, accumulated on top of %m and stored to %z.
;   %x - source of the <4 x i32> loads; advanced by 4 elements per iteration.
;   %y - unused (marked readnone).
;   %z - receives the final sum.
;   %m - initial value of the accumulator.
;   %n - element count; it feeds both the trip count and the vctp32 predicate.
; The expected assembly below uses a dlstp.32/letp pair for the loop.
5define void @simple(i32* nocapture readonly %x, i32* nocapture readnone %y, i32* nocapture %z, i32 %m, i32 %n) {
6; CHECK-LABEL: simple:
7; CHECK:       @ %bb.0: @ %entry
8; CHECK-NEXT:    .save {r7, lr}
9; CHECK-NEXT:    push {r7, lr}
10; CHECK-NEXT:    ldr r1, [sp, #8]
11; CHECK-NEXT:    mov r12, r3
12; CHECK-NEXT:    adds r3, r1, #3
13; CHECK-NEXT:    lsrs r3, r3, #2
14; CHECK-NEXT:    beq .LBB0_3
15; CHECK-NEXT:  @ %bb.1: @ %do.body.preheader
16; CHECK-NEXT:    dlstp.32 lr, r1
17; CHECK-NEXT:  .LBB0_2: @ %do.body
18; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
19; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
20; CHECK-NEXT:    vaddva.s32 r12, q0
21; CHECK-NEXT:    letp lr, .LBB0_2
22; CHECK-NEXT:  .LBB0_3: @ %if.end
23; CHECK-NEXT:    str.w r12, [r2]
24; CHECK-NEXT:    pop {r7, pc}
25entry:
  ; %div = (%n + 3) >> 2 (logical shift) is the loop trip count; the loop is
  ; skipped entirely when it is zero.
26  %add = add i32 %n, 3
27  %div = lshr i32 %add, 2
28  %cmp.not = icmp eq i32 %div, 0
29  br i1 %cmp.not, label %if.end, label %do.body
30
31do.body:                                          ; preds = %entry, %do.body
  ; Loop-carried state: %n.addr.0 = remaining elements (-4 per iteration),
  ; %count.0 = remaining iterations (-1 per iteration), %s.0 = running sum
  ; (seeded with %m), %x.addr.0 = current load pointer (+4 elements).
32  %n.addr.0 = phi i32 [ %sub, %do.body ], [ %n, %entry ]
33  %count.0 = phi i32 [ %sub3, %do.body ], [ %div, %entry ]
34  %s.0 = phi i32 [ %add2, %do.body ], [ %m, %entry ]
35  %x.addr.0 = phi i32* [ %add.ptr, %do.body ], [ %x, %entry ]
  ; The predicate is derived from the remaining element count and is the only
  ; predicate consumer is the addv reduction below.
36  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
37  %1 = bitcast i32* %x.addr.0 to <4 x i32>*
38  %2 = load <4 x i32>, <4 x i32>* %1, align 4
39  %3 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %2, i32 0, <4 x i1> %0)
40  %add2 = add nsw i32 %3, %s.0
41  %add.ptr = getelementptr inbounds i32, i32* %x.addr.0, i32 4
42  %sub = add i32 %n.addr.0, -4
43  %sub3 = add nsw i32 %count.0, -1
  ; Continue while more than one iteration was left on entry to this pass.
44  %cmp4 = icmp sgt i32 %count.0, 1
45  br i1 %cmp4, label %do.body, label %if.end
46
47if.end:                                           ; preds = %do.body, %entry
  ; %s.1 is %m when the loop was skipped, otherwise the accumulated sum.
48  %s.1 = phi i32 [ %m, %entry ], [ %add2, %do.body ]
49  store i32 %s.1, i32* %z, align 4
50  ret void
51}
52
53; The inner loop is tail predicated, so we use DLSTP for it
; @nested: the reduction from @simple wrapped in an outer loop that runs %m times.
;   %x - source pointer; carried across outer iterations and advanced past the
;        elements consumed by each inner loop.
;   %y - unused (marked readnone).
;   %z - output array; element %a.022 is written once per outer iteration.
;   %m - outer trip count, and also the seed of every inner accumulation.
;   %n - initial element count; carried and reduced across outer iterations.
; The expected assembly below uses a dlstp.32/letp pair for the depth-2 loop and
; an ordinary compare-and-branch for the outer loop.
54define void @nested(i32* nocapture readonly %x, i32* nocapture readnone %y, i32* nocapture %z, i32 %m, i32 %n) {
55; CHECK-LABEL: nested:
56; CHECK:       @ %bb.0: @ %entry
57; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
58; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
59; CHECK-NEXT:    cbz r3, .LBB1_8
60; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
61; CHECK-NEXT:    ldr.w r12, [sp, #24]
62; CHECK-NEXT:    movs r1, #0
63; CHECK-NEXT:    b .LBB1_4
64; CHECK-NEXT:  .LBB1_2: @ in Loop: Header=BB1_4 Depth=1
65; CHECK-NEXT:    mov r4, r3
66; CHECK-NEXT:  .LBB1_3: @ %if.end
67; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
68; CHECK-NEXT:    str.w r4, [r2, r1, lsl #2]
69; CHECK-NEXT:    adds r1, #1
70; CHECK-NEXT:    cmp r1, r3
71; CHECK-NEXT:    beq .LBB1_8
72; CHECK-NEXT:  .LBB1_4: @ %for.body
73; CHECK-NEXT:    @ =>This Loop Header: Depth=1
74; CHECK-NEXT:    @ Child Loop BB1_6 Depth 2
75; CHECK-NEXT:    add.w r6, r12, #3
76; CHECK-NEXT:    lsrs r7, r6, #2
77; CHECK-NEXT:    beq .LBB1_2
78; CHECK-NEXT:  @ %bb.5: @ %do.body.preheader
79; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
80; CHECK-NEXT:    bic r5, r6, #3
81; CHECK-NEXT:    mov r4, r3
82; CHECK-NEXT:    add.w r8, r0, r5, lsl #2
83; CHECK-NEXT:    dlstp.32 lr, r12
84; CHECK-NEXT:  .LBB1_6: @ %do.body
85; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
86; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
87; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
88; CHECK-NEXT:    vaddva.s32 r4, q0
89; CHECK-NEXT:    letp lr, .LBB1_6
90; CHECK-NEXT:  @ %bb.7: @ %if.end.loopexit
91; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
92; CHECK-NEXT:    sub.w r12, r12, r5
93; CHECK-NEXT:    mov r0, r8
94; CHECK-NEXT:    b .LBB1_3
95; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
96; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
97entry:
  ; Nothing to do when the outer trip count %m is zero.
98  %cmp20.not = icmp eq i32 %m, 0
99  br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
100
101for.cond.cleanup:                                 ; preds = %if.end, %entry
102  ret void
103
104for.body:                                         ; preds = %entry, %if.end
  ; Outer-loop state: current source pointer, output index %a.022 and the
  ; element count still available to the inner loop.
105  %x.addr.023 = phi i32* [ %x.addr.2, %if.end ], [ %x, %entry ]
106  %a.022 = phi i32 [ %inc, %if.end ], [ 0, %entry ]
107  %n.addr.021 = phi i32 [ %n.addr.2, %if.end ], [ %n, %entry ]
  ; Inner trip count = (%n.addr.021 + 3) >> 2; skip the inner loop when zero.
108  %add = add i32 %n.addr.021, 3
109  %div = lshr i32 %add, 2
110  %cmp1.not = icmp eq i32 %div, 0
111  br i1 %cmp1.not, label %if.end, label %do.body.preheader
112
113do.body.preheader:                                ; preds = %for.body
  ; %0 = (%n.addr.021 + 3) & -4. It is used both to compute the source pointer
  ; for the next outer iteration (%scevgep) and to reduce the element count in
  ; %if.end.loopexit.
114  %0 = and i32 %add, -4
115  %scevgep = getelementptr i32, i32* %x.addr.023, i32 %0
116  br label %do.body
117
118do.body:                                          ; preds = %do.body.preheader, %do.body
  ; Inner-loop state: remaining elements (-4 per iteration), remaining
  ; iterations (-1 per iteration), running sum seeded with %m, load pointer.
119  %n.addr.1 = phi i32 [ %sub, %do.body ], [ %n.addr.021, %do.body.preheader ]
120  %count.0 = phi i32 [ %sub4, %do.body ], [ %div, %do.body.preheader ]
121  %s.0 = phi i32 [ %add3, %do.body ], [ %m, %do.body.preheader ]
122  %x.addr.1 = phi i32* [ %add.ptr, %do.body ], [ %x.addr.023, %do.body.preheader ]
  ; Predicate from the remaining element count; consumed only by the addv.
123  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.1)
124  %2 = bitcast i32* %x.addr.1 to <4 x i32>*
125  %3 = load <4 x i32>, <4 x i32>* %2, align 4
126  %4 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %3, i32 0, <4 x i1> %1)
127  %add3 = add nsw i32 %4, %s.0
128  %add.ptr = getelementptr inbounds i32, i32* %x.addr.1, i32 4
129  %sub = add i32 %n.addr.1, -4
130  %sub4 = add nsw i32 %count.0, -1
131  %cmp5 = icmp sgt i32 %count.0, 1
132  br i1 %cmp5, label %do.body, label %if.end.loopexit
133
134if.end.loopexit:                                  ; preds = %do.body
  ; Element count carried into the next outer iteration.
135  %5 = sub i32 %n.addr.021, %0
136  br label %if.end
137
138if.end:                                           ; preds = %if.end.loopexit, %for.body
  ; Merge the "inner loop skipped" and "inner loop ran" paths, store the sum
  ; to %z[%a.022], then advance the outer induction variable.
139  %n.addr.2 = phi i32 [ %n.addr.021, %for.body ], [ %5, %if.end.loopexit ]
140  %s.1 = phi i32 [ %m, %for.body ], [ %add3, %if.end.loopexit ]
141  %x.addr.2 = phi i32* [ %x.addr.023, %for.body ], [ %scevgep, %if.end.loopexit ]
142  %arrayidx = getelementptr inbounds i32, i32* %z, i32 %a.022
143  store i32 %s.1, i32* %arrayidx, align 4
144  %inc = add nuw nsw i32 %a.022, 1
145  %exitcond.not = icmp eq i32 %inc, %m
146  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
147}
148
; MVE intrinsics used by @simple and @nested:
;   vctp32           - takes an i32 element count and returns a <4 x i1> predicate.
;   addv.predicated  - takes a <4 x i32> vector, an i32 flag (always 0 in this
;                      file; presumably the signed/unsigned selector - confirm
;                      against the intrinsic definition) and a <4 x i1>
;                      predicate, and returns an i32.
149declare <4 x i1> @llvm.arm.mve.vctp32(i32)
150declare i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
151
152
153; Long test that was previously spilling lr between the t2LoopDec and t2LoopEnd pseudo-instructions
; @b: a single while-loop with a long, register-hungry body, compiled with
; "frame-pointer"="all".
;   %c - i32 pointer walked forwards one element per iteration; its address is
;        also used as an integer operand.
;   %d - loop counter; decremented each iteration, the loop exits when it is 0.
;   %e - integer reinterpreted as an i32 pointer and stored through each iteration.
;   %n - i32 pointer walked forwards one element per iteration; its address is
;        also used as an integer operand.
; The function returns undef; only the shape of the generated loop matters.
; The expected assembly below uses wls/le for the loop and, although it spills
; and reloads r0, r1 and r2 on the stack inside the loop, contains no spill or
; reload of lr.
154define dso_local i32 @b(i32* %c, i32 %d, i32 %e, i32* %n) "frame-pointer"="all" {
155; CHECK-LABEL: b:
156; CHECK:       @ %bb.0: @ %entry
157; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
158; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
159; CHECK-NEXT:    .setfp r7, sp, #12
160; CHECK-NEXT:    add r7, sp, #12
161; CHECK-NEXT:    .save {r8, r9, r10, r11}
162; CHECK-NEXT:    push.w {r8, r9, r10, r11}
163; CHECK-NEXT:    .pad #16
164; CHECK-NEXT:    sub sp, #16
165; CHECK-NEXT:    wls lr, r1, .LBB2_3
166; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
167; CHECK-NEXT:    mov r12, r0
168; CHECK-NEXT:    add.w r10, r3, #4
169; CHECK-NEXT:    adds r0, #4
170; CHECK-NEXT:    mvn r9, #1
171; CHECK-NEXT:    @ implicit-def: $r8
172; CHECK-NEXT:    @ implicit-def: $r4
173; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
174; CHECK-NEXT:  .LBB2_2: @ %while.body
175; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
176; CHECK-NEXT:    ldr r2, [r0]
177; CHECK-NEXT:    asrs r5, r4, #31
178; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
179; CHECK-NEXT:    muls r2, r3, r2
180; CHECK-NEXT:    adds r4, r4, r2
181; CHECK-NEXT:    adc.w r2, r5, r2, asr #31
182; CHECK-NEXT:    ldr.w r5, [r9, #4]
183; CHECK-NEXT:    adds.w r4, r4, #-2147483648
184; CHECK-NEXT:    adc r1, r2, #0
185; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
186; CHECK-NEXT:    smull r5, r6, r5, r8
187; CHECK-NEXT:    ldr.w r2, [r9]
188; CHECK-NEXT:    asrs r4, r1, #31
189; CHECK-NEXT:    str r2, [sp, #4] @ 4-byte Spill
190; CHECK-NEXT:    subs r5, r1, r5
191; CHECK-NEXT:    sbcs r4, r6
192; CHECK-NEXT:    adds.w r6, r5, #-2147483648
193; CHECK-NEXT:    adc r5, r4, #0
194; CHECK-NEXT:    ldr r4, [r0, #-4]
195; CHECK-NEXT:    muls r4, r3, r4
196; CHECK-NEXT:    adds r3, #4
197; CHECK-NEXT:    adds.w r0, r4, #-2147483648
198; CHECK-NEXT:    asr.w r1, r4, #31
199; CHECK-NEXT:    ldr.w r4, [r10]
200; CHECK-NEXT:    adc r1, r1, #0
201; CHECK-NEXT:    mul r2, r4, r12
202; CHECK-NEXT:    add.w r12, r12, #4
203; CHECK-NEXT:    add.w r2, r2, #-2147483648
204; CHECK-NEXT:    asrl r0, r1, r2
205; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
206; CHECK-NEXT:    smull r0, r1, r4, r0
207; CHECK-NEXT:    lsll r0, r1, #30
208; CHECK-NEXT:    asr.w r11, r1, #31
209; CHECK-NEXT:    mov r0, r1
210; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
211; CHECK-NEXT:    lsll r0, r11, r4
212; CHECK-NEXT:    lsrl r0, r11, #2
213; CHECK-NEXT:    mul r1, r1, r8
214; CHECK-NEXT:    adds r1, #2
215; CHECK-NEXT:    lsll r0, r11, r1
216; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
217; CHECK-NEXT:    add.w r0, r0, #-2147483648
218; CHECK-NEXT:    asrl r6, r5, r0
219; CHECK-NEXT:    movs r0, #2
220; CHECK-NEXT:    lsrl r6, r5, #2
221; CHECK-NEXT:    str r6, [r0]
222; CHECK-NEXT:    mov r8, r6
223; CHECK-NEXT:    ldr r0, [r9], #-4
224; CHECK-NEXT:    mls r0, r0, r4, r1
225; CHECK-NEXT:    adds.w r4, r0, #-2147483648
226; CHECK-NEXT:    asr.w r1, r0, #31
227; CHECK-NEXT:    adc r1, r1, #0
228; CHECK-NEXT:    lsrl r4, r1, #2
229; CHECK-NEXT:    rsbs r0, r4, #0
230; CHECK-NEXT:    str r0, [r2]
231; CHECK-NEXT:    str r0, [r10, #-4]
232; CHECK-NEXT:    add.w r10, r10, #4
233; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
234; CHECK-NEXT:    adds r0, #4
235; CHECK-NEXT:    le lr, .LBB2_2
236; CHECK-NEXT:  .LBB2_3: @ %while.end
237; CHECK-NEXT:    add sp, #16
238; CHECK-NEXT:    pop.w {r8, r9, r10, r11}
239; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
240entry:
  ; %e is only ever used as a store destination (see the end of %while.body).
241  %0 = inttoptr i32 %e to i32*
242  %tobool.not70 = icmp eq i32 %d, 0
243  br i1 %tobool.not70, label %while.end, label %while.body
244
245while.body:                                       ; preds = %entry, %while.body
  ; Loop-carried values:
  ;   %p.077      - pointer starting at the constant address 2, stepped by -1 element
  ;   %c.addr.076 - walks %c forwards by one element
  ;   %n.075      - walks %n forwards by one element
  ;   %m.074      - previous iteration's %conv35 (undef on entry)
  ;   %d.addr.073 - remaining iteration count
  ;   %h.072      - previous iteration's %conv41 (undef on entry)
246  %p.077 = phi i32* [ %incdec.ptr22, %while.body ], [ inttoptr (i32 2 to i32*), %entry ]
247  %c.addr.076 = phi i32* [ %incdec.ptr1, %while.body ], [ %c, %entry ]
248  %n.075 = phi i32* [ %incdec.ptr43, %while.body ], [ %n, %entry ]
249  %m.074 = phi i32 [ %conv35, %while.body ], [ undef, %entry ]
250  %d.addr.073 = phi i32 [ %dec, %while.body ], [ %d, %entry ]
251  %h.072 = phi i32 [ %conv41, %while.body ], [ undef, %entry ]
252  %incdec.ptr43 = getelementptr inbounds i32, i32* %n.075, i32 1
253  %1 = ptrtoint i32* %n.075 to i32
254  %2 = load i32, i32* %incdec.ptr43, align 4
255  %3 = load i32, i32* %c.addr.076, align 4
256  %mul = mul nsw i32 %3, %1
257  %conv = sext i32 %mul to i64
258  %add = add nsw i64 %conv, 2147483648
259  %incdec.ptr1 = getelementptr inbounds i32, i32* %c.addr.076, i32 1
260  %4 = ptrtoint i32* %c.addr.076 to i32
261  %mul2 = mul nsw i32 %2, %4
262  %conv3 = sext i32 %mul2 to i64
263  %add4 = add nsw i64 %conv3, 2147483648
264  %shr = ashr i64 %add, %add4
265  %5 = shl nuw i64 %shr, 32
266  %conv6 = ashr exact i64 %5, 32
267  %conv7 = sext i32 %2 to i64
268  %conv11 = sext i32 %h.072 to i64
269  %6 = load i32, i32* %incdec.ptr1, align 4
270  %mul12 = mul nsw i32 %6, %1
271  %conv13 = sext i32 %mul12 to i64
272  %add14 = add nuw nsw i64 %conv11, 2147483648
273  %add15 = add nsw i64 %add14, %conv13
274  %shr16 = ashr i64 %add15, 32
275  %conv17 = trunc i64 %shr16 to i32
276  %mul8 = shl nsw i64 %conv7, 30
277  %7 = mul i64 %mul8, %conv6
278  %conv18 = ashr i64 %7, 32
279  %sh_prom = zext i32 %2 to i64
280  %shl = shl i64 %conv18, %sh_prom
281  %conv21 = sext i32 %conv17 to i64
282  %incdec.ptr22 = getelementptr inbounds i32, i32* %p.077, i32 -1
283  %8 = load i32, i32* %p.077, align 4
284  %conv23 = sext i32 %8 to i64
285  %conv24 = sext i32 %m.074 to i64
286  %mul25 = mul nsw i64 %conv23, %conv24
287  %sub = sub nsw i64 2147483648, %mul25
288  %add26 = add nsw i64 %sub, %conv21
289  %9 = shl i64 %shl, 30
290  %conv27 = ashr i64 %9, 32
291  %10 = load i32, i32* %incdec.ptr22, align 4
292  %mul28 = mul nsw i32 %10, %m.074
293  %add29 = add nsw i32 %mul28, 2
294  %sh_prom30 = zext i32 %add29 to i64
295  %shl31 = shl i64 %conv27, %sh_prom30
296  %add32 = add nsw i64 %shl31, 2147483648
297  %shr33 = ashr i64 %add26, %add32
298  %11 = lshr i64 %shr33, 2
299  %conv35 = trunc i64 %11 to i32
  ; First of three stores per iteration: to the constant address 2.
300  store i32 %conv35, i32* inttoptr (i32 2 to i32*), align 4
301  %12 = load i32, i32* %incdec.ptr22, align 4
302  %mul36 = mul nsw i32 %12, %2
303  %sub37 = sub nsw i32 %conv17, %mul36
304  %conv38 = sext i32 %sub37 to i64
305  %add39 = add nsw i64 %conv38, 2147483648
306  %13 = lshr i64 %add39, 2
307  %conv41 = trunc i64 %13 to i32
308  %sub42 = sub nsw i32 0, %conv41
  ; The same value is stored through the %e-derived pointer and through %n.075.
309  store i32 %sub42, i32* %0, align 4
310  store i32 %sub42, i32* %n.075, align 4
  ; Count down and exit once the counter reaches zero.
311  %dec = add nsw i32 %d.addr.073, -1
312  %tobool.not = icmp eq i32 %dec, 0
313  br i1 %tobool.not, label %while.end, label %while.body
314
315while.end:                                        ; preds = %while.body, %entry
316  ret i32 undef
317}
318
; External function called from the loop preheader of @callinpreheader.
319declare void @callee()
; @callinpreheader: scalar sum of %size i32 values with a call in the preheader.
;   %pAngle - source array, read at indices 0 .. %size-1.
;   %pDst   - receives the sum (0 when %size is 0).
;   %size   - element count and loop trip count.
; @callee is called once in %for.body.ph, before the loop is entered. In the
; expected assembly below, %size is copied to r6 before the bl, and the
; dls lr, r6 that starts the loop comes after the call.
320define void @callinpreheader(i32* noalias nocapture readonly %pAngle, i32* nocapture %pDst, i32 %size) {
321; CHECK-LABEL: callinpreheader:
322; CHECK:       @ %bb.0: @ %entry
323; CHECK-NEXT:    .save {r4, r5, r6, lr}
324; CHECK-NEXT:    push {r4, r5, r6, lr}
325; CHECK-NEXT:    mov r5, r0
326; CHECK-NEXT:    mov r4, r1
327; CHECK-NEXT:    movs r0, #0
328; CHECK-NEXT:    cbz r2, .LBB3_3
329; CHECK-NEXT:  @ %bb.1: @ %for.body.ph
330; CHECK-NEXT:    mov r6, r2
331; CHECK-NEXT:    bl callee
332; CHECK-NEXT:    dls lr, r6
333; CHECK-NEXT:    movs r0, #0
334; CHECK-NEXT:  .LBB3_2: @ %for.body
335; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
336; CHECK-NEXT:    ldr r1, [r5], #4
337; CHECK-NEXT:    add r0, r1
338; CHECK-NEXT:    le lr, .LBB3_2
339; CHECK-NEXT:  .LBB3_3: @ %for.cond.cleanup
340; CHECK-NEXT:    str r0, [r4]
341; CHECK-NEXT:    pop {r4, r5, r6, pc}
342entry:
  ; Skip both the call and the loop when there is nothing to sum.
343  %cmp7.not = icmp eq i32 %size, 0
344  br i1 %cmp7.not, label %for.cond.cleanup, label %for.body.ph
345
346for.body.ph:
  ; The call sits between the zero-trip guard and the loop header.
347  call void @callee()
348  br label %for.body
349
350for.body:
  ; %i.09 is the element index, %s.08 the running sum; both start at 0.
351  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.ph ]
352  %s.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.ph ]
353  %arrayidx = getelementptr inbounds i32, i32* %pAngle, i32 %i.09
354  %0 = load i32, i32* %arrayidx, align 4
355  %add = add nsw i32 %0, %s.08
356  %inc = add nuw nsw i32 %i.09, 1
357  %exitcond.not = icmp eq i32 %inc, %size
358  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
359
360for.cond.cleanup:
  ; 0 when the loop was skipped, otherwise the final sum.
361  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
362  store i32 %s.0.lcssa, i32* %pDst, align 4
363  ret void
364}
365