1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s
3
; mul_reduce_add: returns 0 when %N == 0. Otherwise runs a 4-lane loop that
; masked-loads from %a and %b at %index, multiplies the two vectors and adds
; the product into the accumulator %vec.phi. The loop exits when %index.next
; equals %n.vec. middle.block then selects, per lane of the last mask %1,
; between the updated and the previous accumulator and add-reduces the result.
; The autogenerated assertions below expect an `le`-terminated loop with an
; explicit vctp.32 / vpstt predicate on both loads, followed by vpsel and
; vaddv.u32 after the loop.
4define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
5; CHECK-LABEL: mul_reduce_add:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    cmp r2, #0
8; CHECK-NEXT:    itt eq
9; CHECK-NEXT:    moveq r0, #0
10; CHECK-NEXT:    bxeq lr
11; CHECK-NEXT:  .LBB0_1: @ %vector.ph
12; CHECK-NEXT:    push {r7, lr}
13; CHECK-NEXT:    adds r3, r2, #3
14; CHECK-NEXT:    vmov.i32 q0, #0x0
15; CHECK-NEXT:    bic r3, r3, #3
16; CHECK-NEXT:    sub.w r12, r3, #4
17; CHECK-NEXT:    movs r3, #1
18; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
19; CHECK-NEXT:    movs r3, #0
20; CHECK-NEXT:  .LBB0_2: @ %vector.body
21; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
22; CHECK-NEXT:    vctp.32 r2
23; CHECK-NEXT:    vmov q1, q0
24; CHECK-NEXT:    vpstt
25; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
26; CHECK-NEXT:    vldrwt.u32 q2, [r1], #16
27; CHECK-NEXT:    adds r3, #4
28; CHECK-NEXT:    vmul.i32 q0, q2, q0
29; CHECK-NEXT:    subs r2, #4
30; CHECK-NEXT:    vadd.i32 q0, q0, q1
31; CHECK-NEXT:    le lr, .LBB0_2
32; CHECK-NEXT:  @ %bb.3: @ %middle.block
33; CHECK-NEXT:    vpsel q0, q0, q1
34; CHECK-NEXT:    vaddv.u32 r0, q0
35; CHECK-NEXT:    pop {r7, pc}
36entry:
37  %cmp8 = icmp eq i32 %N, 0
38  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
39
40vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 3 with the low two bits cleared, i.e. %N rounded up to a
  ; multiple of 4; it is the exit bound for %index in vector.body.
41  %n.rnd.up = add i32 %N, 3
42  %n.vec = and i32 %n.rnd.up, -4
43  %trip.count.minus.1 = add i32 %N, -1
44  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
45  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
46  br label %vector.body
47
48vector.body:                                      ; preds = %vector.body, %vector.ph
49  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
50  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
51  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
52  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
53  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
54  %0 = getelementptr inbounds i32, i32* %a, i32 %index
55
  ; Lane predicate for this iteration, produced by the intrinsic call. The
  ; icmp form is left commented out; %induction and %broadcast.splat12 are
  ; referenced only by that commented-out compare.
56  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
57  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
58
59  %2 = bitcast i32* %0 to <4 x i32>*
60  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
61  %3 = getelementptr inbounds i32, i32* %b, i32 %index
62  %4 = bitcast i32* %3 to <4 x i32>*
63  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
64  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
65  %6 = add nsw <4 x i32> %5, %vec.phi
66  %index.next = add i32 %index, 4
67  %7 = icmp eq i32 %index.next, %n.vec
68  br i1 %7, label %middle.block, label %vector.body
69
70middle.block:                                     ; preds = %vector.body
  ; Lanes that were inactive in the last iteration keep the previous
  ; accumulator value (%vec.phi) before the horizontal add.
71  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
72  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
73  br label %for.cond.cleanup
74
75for.cond.cleanup:                                 ; preds = %middle.block, %entry
76  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
77  ret i32 %res.0.lcssa
78}
79
; mul_reduce_add_const: returns 0 when %N == 0. Otherwise runs a 4-lane loop
; that masked-loads from %a at %index and adds the loaded vector into the
; accumulator %vec.phi; middle.block selects per lane of the last mask %1 and
; add-reduces the result.
; NOTE(review): despite the name, %b is never referenced and the loop body
; contains no multiply -- confirm whether that is intentional.
; The autogenerated assertions below expect an `le`-terminated loop with an
; explicit vctp.32 / vpst predicate on the single load.
80define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
81; CHECK-LABEL: mul_reduce_add_const:
82; CHECK:       @ %bb.0: @ %entry
83; CHECK-NEXT:    cmp r2, #0
84; CHECK-NEXT:    itt eq
85; CHECK-NEXT:    moveq r0, #0
86; CHECK-NEXT:    bxeq lr
87; CHECK-NEXT:  .LBB1_1: @ %vector.ph
88; CHECK-NEXT:    push {r7, lr}
89; CHECK-NEXT:    adds r1, r2, #3
90; CHECK-NEXT:    movs r3, #1
91; CHECK-NEXT:    bic r1, r1, #3
92; CHECK-NEXT:    vmov.i32 q0, #0x0
93; CHECK-NEXT:    subs r1, #4
94; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
95; CHECK-NEXT:    movs r1, #0
96; CHECK-NEXT:  .LBB1_2: @ %vector.body
97; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
98; CHECK-NEXT:    vctp.32 r2
99; CHECK-NEXT:    vmov q1, q0
100; CHECK-NEXT:    vpst
101; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
102; CHECK-NEXT:    adds r1, #4
103; CHECK-NEXT:    subs r2, #4
104; CHECK-NEXT:    vadd.i32 q0, q0, q1
105; CHECK-NEXT:    le lr, .LBB1_2
106; CHECK-NEXT:  @ %bb.3: @ %middle.block
107; CHECK-NEXT:    vpsel q0, q0, q1
108; CHECK-NEXT:    vaddv.u32 r0, q0
109; CHECK-NEXT:    pop {r7, pc}
110entry:
111  %cmp6 = icmp eq i32 %N, 0
112  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
113
114vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 3 with the low two bits cleared, i.e. %N rounded up to a
  ; multiple of 4; it is the exit bound for %index in vector.body.
115  %n.rnd.up = add i32 %N, 3
116  %n.vec = and i32 %n.rnd.up, -4
117  %trip.count.minus.1 = add i32 %N, -1
118  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
119  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
120  br label %vector.body
121
122vector.body:                                      ; preds = %vector.body, %vector.ph
123  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
124  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
125  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
126  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
127  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
128  %0 = getelementptr inbounds i32, i32* %a, i32 %index
129
  ; Lane predicate for this iteration, produced by the intrinsic call. The
  ; icmp form is left commented out; %induction and %broadcast.splat10 are
  ; referenced only by that commented-out compare.
130  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
131  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
132
133  %2 = bitcast i32* %0 to <4 x i32>*
134  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
135  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
136  %index.next = add i32 %index, 4
137  %4 = icmp eq i32 %index.next, %n.vec
138  br i1 %4, label %middle.block, label %vector.body
139
140middle.block:                                     ; preds = %vector.body
  ; Lanes that were inactive in the last iteration keep the previous
  ; accumulator value (%vec.phi) before the horizontal add.
141  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
142  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
143  br label %for.cond.cleanup
144
145for.cond.cleanup:                                 ; preds = %middle.block, %entry
146  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
147  ret i32 %res.0.lcssa
148}
149
; add_reduce_add_const: returns 0 when %N == 0. Otherwise runs a 4-lane loop
; that masked-loads from %a at %index and adds the loaded vector into the
; accumulator %vec.phi; middle.block selects per lane of the last mask %1 and
; add-reduces the result.
; NOTE(review): %b is never referenced, and the IR body of this function is
; instruction-for-instruction the same as @mul_reduce_add_const -- confirm
; whether a distinct body was intended.
; The autogenerated assertions below expect an `le`-terminated loop with an
; explicit vctp.32 / vpst predicate on the single load.
150define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
151; CHECK-LABEL: add_reduce_add_const:
152; CHECK:       @ %bb.0: @ %entry
153; CHECK-NEXT:    cmp r2, #0
154; CHECK-NEXT:    itt eq
155; CHECK-NEXT:    moveq r0, #0
156; CHECK-NEXT:    bxeq lr
157; CHECK-NEXT:  .LBB2_1: @ %vector.ph
158; CHECK-NEXT:    push {r7, lr}
159; CHECK-NEXT:    adds r1, r2, #3
160; CHECK-NEXT:    movs r3, #1
161; CHECK-NEXT:    bic r1, r1, #3
162; CHECK-NEXT:    vmov.i32 q0, #0x0
163; CHECK-NEXT:    subs r1, #4
164; CHECK-NEXT:    add.w lr, r3, r1, lsr #2
165; CHECK-NEXT:    movs r1, #0
166; CHECK-NEXT:  .LBB2_2: @ %vector.body
167; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
168; CHECK-NEXT:    vctp.32 r2
169; CHECK-NEXT:    vmov q1, q0
170; CHECK-NEXT:    vpst
171; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
172; CHECK-NEXT:    adds r1, #4
173; CHECK-NEXT:    subs r2, #4
174; CHECK-NEXT:    vadd.i32 q0, q0, q1
175; CHECK-NEXT:    le lr, .LBB2_2
176; CHECK-NEXT:  @ %bb.3: @ %middle.block
177; CHECK-NEXT:    vpsel q0, q0, q1
178; CHECK-NEXT:    vaddv.u32 r0, q0
179; CHECK-NEXT:    pop {r7, pc}
180entry:
181  %cmp6 = icmp eq i32 %N, 0
182  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
183
184vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 3 with the low two bits cleared, i.e. %N rounded up to a
  ; multiple of 4; it is the exit bound for %index in vector.body.
185  %n.rnd.up = add i32 %N, 3
186  %n.vec = and i32 %n.rnd.up, -4
187  %trip.count.minus.1 = add i32 %N, -1
188  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
189  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
190  br label %vector.body
191
192vector.body:                                      ; preds = %vector.body, %vector.ph
193  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
194  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
195  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
196  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
197  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
198  %0 = getelementptr inbounds i32, i32* %a, i32 %index
199
  ; Lane predicate for this iteration, produced by the intrinsic call. The
  ; icmp form is left commented out; %induction and %broadcast.splat10 are
  ; referenced only by that commented-out compare.
200  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
201  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
202
203  %2 = bitcast i32* %0 to <4 x i32>*
204  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
205  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
206  %index.next = add i32 %index, 4
207  %4 = icmp eq i32 %index.next, %n.vec
208  br i1 %4, label %middle.block, label %vector.body
209
210middle.block:                                     ; preds = %vector.body
  ; Lanes that were inactive in the last iteration keep the previous
  ; accumulator value (%vec.phi) before the horizontal add.
211  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
212  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
213  br label %for.cond.cleanup
214
215for.cond.cleanup:                                 ; preds = %middle.block, %entry
216  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
217  ret i32 %res.0.lcssa
218}
219
; vector_mul_const: returns immediately when %N == 0. Otherwise runs a 4-lane
; loop that masked-loads from %b at %index, multiplies by a splat of %c and
; masked-stores the product to %a at %index, using the mask %1 for both the
; load and the store. The loop exits when %index.next equals %n.vec.
; The autogenerated assertions below expect a dlstp.32 / letp loop whose body
; holds vldrw.u32, vmul.i32 (with r2 as the scalar operand) and vstrw.32.
220define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
221; CHECK-LABEL: vector_mul_const:
222; CHECK:       @ %bb.0: @ %entry
223; CHECK-NEXT:    push {r7, lr}
224; CHECK-NEXT:    cmp r3, #0
225; CHECK-NEXT:    it eq
226; CHECK-NEXT:    popeq {r7, pc}
227; CHECK-NEXT:  .LBB3_1: @ %vector.ph
228; CHECK-NEXT:    mov.w r12, #0
229; CHECK-NEXT:    dlstp.32 lr, r3
230; CHECK-NEXT:  .LBB3_2: @ %vector.body
231; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
232; CHECK-NEXT:    add.w r12, r12, #4
233; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
234; CHECK-NEXT:    vmul.i32 q0, q0, r2
235; CHECK-NEXT:    vstrw.32 q0, [r0], #16
236; CHECK-NEXT:    letp lr, .LBB3_2
237; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
238; CHECK-NEXT:    pop {r7, pc}
239entry:
240  %cmp6 = icmp eq i32 %N, 0
241  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
242
243vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 3 with the low two bits cleared, i.e. %N rounded up to a
  ; multiple of 4; it is the exit bound for %index in vector.body.
  ; %broadcast.splat11 holds %c in every lane and is the multiplier below.
244  %n.rnd.up = add i32 %N, 3
245  %n.vec = and i32 %n.rnd.up, -4
246  %trip.count.minus.1 = add i32 %N, -1
247  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
248  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
249  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
250  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
251  br label %vector.body
252
253vector.body:                                      ; preds = %vector.body, %vector.ph
254  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
255  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
256  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
257  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
258  %0 = getelementptr inbounds i32, i32* %b, i32 %index
259
  ; Lane predicate for this iteration, produced by the intrinsic call. The
  ; icmp form is left commented out; %induction and %broadcast.splat9 are
  ; referenced only by that commented-out compare.
260  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
261  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
262
263  %2 = bitcast i32* %0 to <4 x i32>*
264  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
265  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
266  %4 = getelementptr inbounds i32, i32* %a, i32 %index
267  %5 = bitcast i32* %4 to <4 x i32>*
268  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
269  %index.next = add i32 %index, 4
270  %6 = icmp eq i32 %index.next, %n.vec
271  br i1 %6, label %for.cond.cleanup, label %vector.body
272
273for.cond.cleanup:                                 ; preds = %vector.body, %entry
274  ret void
275}
276
; vector_add_const: returns immediately when %N == 0. Otherwise runs a 4-lane
; loop that masked-loads from %b at %index, adds a splat of %c and
; masked-stores the sum to %a at %index, using the mask %1 for both the load
; and the store. The loop exits when %index.next equals %n.vec.
; The autogenerated assertions below expect a dlstp.32 / letp loop whose body
; holds vldrw.u32, vadd.i32 (with r2 as the scalar operand) and vstrw.32.
277define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
278; CHECK-LABEL: vector_add_const:
279; CHECK:       @ %bb.0: @ %entry
280; CHECK-NEXT:    push {r7, lr}
281; CHECK-NEXT:    cmp r3, #0
282; CHECK-NEXT:    it eq
283; CHECK-NEXT:    popeq {r7, pc}
284; CHECK-NEXT:  .LBB4_1: @ %vector.ph
285; CHECK-NEXT:    mov.w r12, #0
286; CHECK-NEXT:    dlstp.32 lr, r3
287; CHECK-NEXT:  .LBB4_2: @ %vector.body
288; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
289; CHECK-NEXT:    add.w r12, r12, #4
290; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
291; CHECK-NEXT:    vadd.i32 q0, q0, r2
292; CHECK-NEXT:    vstrw.32 q0, [r0], #16
293; CHECK-NEXT:    letp lr, .LBB4_2
294; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
295; CHECK-NEXT:    pop {r7, pc}
296entry:
297  %cmp6 = icmp eq i32 %N, 0
298  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
299
300vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 3 with the low two bits cleared, i.e. %N rounded up to a
  ; multiple of 4; it is the exit bound for %index in vector.body.
  ; %broadcast.splat11 holds %c in every lane and is the addend below.
301  %n.rnd.up = add i32 %N, 3
302  %n.vec = and i32 %n.rnd.up, -4
303  %trip.count.minus.1 = add i32 %N, -1
304  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
305  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
306  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
307  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
308  br label %vector.body
309
310vector.body:                                      ; preds = %vector.body, %vector.ph
311  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
312  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
313  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
314  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
315  %0 = getelementptr inbounds i32, i32* %b, i32 %index
316
  ; Lane predicate for this iteration, produced by the intrinsic call. The
  ; icmp form is left commented out; %induction and %broadcast.splat9 are
  ; referenced only by that commented-out compare.
317  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
318  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
319
320  %2 = bitcast i32* %0 to <4 x i32>*
321  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
322  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
323  %4 = getelementptr inbounds i32, i32* %a, i32 %index
324  %5 = bitcast i32* %4 to <4 x i32>*
325  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
326  %index.next = add i32 %index, 4
327  %6 = icmp eq i32 %index.next, %n.vec
328  br i1 %6, label %for.cond.cleanup, label %vector.body
329
330for.cond.cleanup:                                 ; preds = %vector.body, %entry
331  ret void
332}
333
; vector_mul_vector_i8: returns immediately when %N == 0. Otherwise runs a
; 16-lane loop that masked-loads i8 vectors from %b and %c at %index,
; multiplies them and masked-stores the product to %a at %index, using the
; mask %1 for both loads and the store. The loop exits when %index.next
; equals %n.vec.
; The autogenerated assertions below expect a dlstp.8 / letp loop whose body
; holds two vldrb.u8 loads, vmul.i8 and vstrb.8.
334define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) {
335; CHECK-LABEL: vector_mul_vector_i8:
336; CHECK:       @ %bb.0: @ %entry
337; CHECK-NEXT:    push {r7, lr}
338; CHECK-NEXT:    cmp r3, #0
339; CHECK-NEXT:    it eq
340; CHECK-NEXT:    popeq {r7, pc}
341; CHECK-NEXT:  .LBB5_1: @ %vector.ph
342; CHECK-NEXT:    mov.w r12, #0
343; CHECK-NEXT:    dlstp.8 lr, r3
344; CHECK-NEXT:  .LBB5_2: @ %vector.body
345; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
346; CHECK-NEXT:    add.w r12, r12, #16
347; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
348; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
349; CHECK-NEXT:    vmul.i8 q0, q1, q0
350; CHECK-NEXT:    vstrb.8 q0, [r0], #16
351; CHECK-NEXT:    letp lr, .LBB5_2
352; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
353; CHECK-NEXT:    pop {r7, pc}
354entry:
355  %cmp10 = icmp eq i32 %N, 0
356  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
357
358vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 15 with the low four bits cleared, i.e. %N rounded up to a
  ; multiple of 16; it is the exit bound for %index in vector.body.
359  %n.rnd.up = add i32 %N, 15
360  %n.vec = and i32 %n.rnd.up, -16
361  %trip.count.minus.1 = add i32 %N, -1
362  %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
363  %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
364  br label %vector.body
365
366vector.body:                                      ; preds = %vector.body, %vector.ph
367  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
368  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
369  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
370  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
371  %0 = getelementptr inbounds i8, i8* %b, i32 %index
372
  ; Lane predicate for this iteration, produced by the intrinsic call. The
  ; icmp form is left commented out; %induction and %broadcast.splat13 are
  ; referenced only by that commented-out compare.
373  ; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
374  %1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
375
376  %2 = bitcast i8* %0 to <16 x i8>*
377  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
378  %3 = getelementptr inbounds i8, i8* %c, i32 %index
379  %4 = bitcast i8* %3 to <16 x i8>*
380  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef)
381  %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
382  %6 = getelementptr inbounds i8, i8* %a, i32 %index
383  %7 = bitcast i8* %6 to <16 x i8>*
384  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1)
385  %index.next = add i32 %index, 16
386  %8 = icmp eq i32 %index.next, %n.vec
387  br i1 %8, label %for.cond.cleanup, label %vector.body
388
389for.cond.cleanup:                                 ; preds = %vector.body, %entry
390  ret void
391}
392
393; Function Attrs: nofree norecurse nounwind
; vector_mul_vector_i16: returns immediately when %N == 0. Otherwise runs an
; 8-lane loop that masked-loads i16 vectors from %b and %c at %index,
; multiplies them and masked-stores the product to %a at %index, using the
; mask %1 for both loads and the store. The loop exits when %index.next
; equals %n.vec.
; The autogenerated assertions below expect a dlstp.16 / letp loop whose body
; holds two vldrh.u16 loads, vmul.i16 and vstrh.16.
; NOTE(review): the definition references attribute group #0, but no
; definition of that group appears in the visible part of this file -- confirm.
394define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
395; CHECK-LABEL: vector_mul_vector_i16:
396; CHECK:       @ %bb.0: @ %entry
397; CHECK-NEXT:    push {r7, lr}
398; CHECK-NEXT:    cmp r3, #0
399; CHECK-NEXT:    it eq
400; CHECK-NEXT:    popeq {r7, pc}
401; CHECK-NEXT:  .LBB6_1: @ %vector.ph
402; CHECK-NEXT:    mov.w r12, #0
403; CHECK-NEXT:    dlstp.16 lr, r3
404; CHECK-NEXT:  .LBB6_2: @ %vector.body
405; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
406; CHECK-NEXT:    add.w r12, r12, #8
407; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
408; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
409; CHECK-NEXT:    vmul.i16 q0, q1, q0
410; CHECK-NEXT:    vstrh.16 q0, [r0], #16
411; CHECK-NEXT:    letp lr, .LBB6_2
412; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
413; CHECK-NEXT:    pop {r7, pc}
414entry:
415  %cmp10 = icmp eq i32 %N, 0
416  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
417
418vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 7 with the low three bits cleared, i.e. %N rounded up to a
  ; multiple of 8; it is the exit bound for %index in vector.body.
419  %n.rnd.up = add i32 %N, 7
420  %n.vec = and i32 %n.rnd.up, -8
421  %trip.count.minus.1 = add i32 %N, -1
422  %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
423  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
424  br label %vector.body
425
426vector.body:                                      ; preds = %vector.body, %vector.ph
427  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
428  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
429  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
430  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
431  %0 = getelementptr inbounds i16, i16* %b, i32 %index
432
  ; Lane predicate for this iteration, produced by the intrinsic call. The
  ; icmp form is left commented out; %induction and %broadcast.splat13 are
  ; referenced only by that commented-out compare.
433  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
434  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
435
436  %2 = bitcast i16* %0 to <8 x i16>*
437  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
438  %3 = getelementptr inbounds i16, i16* %c, i32 %index
439  %4 = bitcast i16* %3 to <8 x i16>*
440  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef)
441  %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
442  %6 = getelementptr inbounds i16, i16* %a, i32 %index
443  %7 = bitcast i16* %6 to <8 x i16>*
444  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1)
445  %index.next = add i32 %index, 8
446  %8 = icmp eq i32 %index.next, %n.vec
447  br i1 %8, label %for.cond.cleanup, label %vector.body
448
449for.cond.cleanup:                                 ; preds = %vector.body, %entry
450  ret void
451}
452
; Declarations of the intrinsics called by the functions in this file: masked
; loads and stores for the 16 x i8, 8 x i16 and 4 x i32 vector types, the
; 4 x i32 add reduction, and the active-lane-mask generators for 4, 8 and 16
; lanes.
453declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
454declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
455declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
456declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
457declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
458declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
459declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
460declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
461declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
462declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
463