; Tests when the loop vectorizer tail-folds loops for thumbv8.1m.main, i.e.
; when vector.body uses llvm.masked.load/store. Every opt invocation below is
; verified against the common check prefix plus one configuration prefix.

; No -mattr override and default masked load/store setting; checked with the
; PREFER-FOLDING prefix.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; MVE switched off with -mattr=-mve; checked with the NO-FOLDING prefix.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; MVE on, but masked loads/stores switched off; checked with NO-FOLDING.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=false -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; MVE on, but tail-predication switched off; checked with NO-FOLDING.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -disable-mve-tail-predication=true -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; Disabling the low-overhead branch extension will make
; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
; these cases.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; MVE with floating point (+mve.fp); checked with PREFER-FOLDING.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; Same as above with -prefer-predicate-over-epilog=false; checked with
; NO-FOLDING.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=false \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; Same as above with -prefer-predicate-over-epilog=true; checked with the
; FOLDING-OPT prefix.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=true \
; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,FOLDING-OPT

; Baseline case: A[i] = B[i] + C[i] on i32 for i = 0..430 (constant trip count
; of 431). The PREFER-FOLDING configurations must produce a vector body with
; two masked loads and a masked store. The NO-FOLDING configurations must not
; contain those masked intrinsics and must still branch back to the scalar
; for.body loop.
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    prefer_folding(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
; NO-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %for.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Two independent element-wise adds in one loop: A[i] = B[i] + C[i] on i16 and
; D[i] = E[i] + F[i] on i32. Expected: both are tail-folded with 4-element
; masked loads/stores (v4i16 and v4i32).
define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
; CHECK-LABEL:        mixed_types(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING:     call void @llvm.masked.store.v4i16.p0v4i16
; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING:     call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
  %1 = load i16, i16* %arrayidx1, align 2
  %add = add i16 %1, %0
  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
  store i16 %add, i16* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
  %2 = load i32, i32* %arrayidx5, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
  %3 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 %3, %2
  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
  store i32 %add7, i32* %arrayidx8, align 4
  %add9 = add nuw nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = B[i] + C[i] on i64 elements. Expected: no vector body and no masked
; loads/stores before the scalar for.body block, i.e. the loop stays scalar.
define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        unsupported_i64_type(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     for.body:
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09
  %0 = load i64, i64* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09
  %1 = load i64, i64* %arrayidx1, align 8
  %add = add nsw i64 %1, %0
  %arrayidx2 = getelementptr inbounds i64, i64* %A, i32 %i.09
  store i64 %add, i64* %arrayidx2, align 8
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = zext(B[i]) + C[i], where B holds i8 and A/C hold i32. Expected: the
; loop is tail-folded, with a masked v4i8 load feeding the zero-extension.
define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    zero_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = sext(B[i]) + C[i], where B holds i8 and A/C hold i32. Expected: the
; loop is tail-folded, with a masked v4i8 load feeding the sign-extension.
define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    sign_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = sext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = B[i] + trunc(C[i]), where C holds i16 and A/B hold i8, so the loaded
; i16 value is narrowed before use. Expected with the PREFER-FOLDING prefix:
; vectorized, but without masked loads/stores. Expected with the FOLDING-OPT
; prefix (the -prefer-predicate-over-epilog=true invocation): 8-element masked
; loads and a masked store.
define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        narrowing_load_not_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body

; FOLDING-OPT:        vector.body:
; FOLDING-OPT:        call <8 x i16> @llvm.masked.load.v8i16.p0v8i16
; FOLDING-OPT:        call <8 x i8> @llvm.masked.load.v8i8.p0v8i8
; FOLDING-OPT:        call void @llvm.masked.store.v8i8.p0v8i8
; FOLDING-OPT:        br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012
  %1 = load i8, i8* %arrayidx1, align 1
  %conv3 = trunc i16 %0 to i8
  %add = add i8 %1, %conv3
  %arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012
  store i8 %add, i8* %arrayidx5, align 1
  %add6 = add nuw nsw i32 %i.012, 1
  %exitcond = icmp eq i32 %add6, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = trunc(B[i] + C[i]), where B/C hold i32 and A holds i8: the trunc feeds
; the store directly. Expected: the loop is tail-folded and the result is
; written with a masked v4i8 store.
define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    narrowing_store_allowed(
; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %conv = trunc i32 %add to i8
  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
  store i8 %conv, i8* %arrayidx2, align 1
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; This is a trunc not connected to a store, so we don't allow this.
; TODO: this is conservative, because the trunc is only used in the
; loop control statements, and thus not affecting element sizes, so
; we could allow this case.
;
; The loop body is the i32 A[i] = B[i] + C[i] baseline; the only difference is
; that the incremented induction variable is truncated to i16 before the exit
; compare. Expected: vectorized, but without masked loads/stores.
define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        trunc_not_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1

  %add.iv = trunc i32 %add3 to i16

  %exitcond = icmp eq i16 %add.iv, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = B[i] + C[i] on i32, and additionally D[i] = trunc(A[i]) << 1 on i16.
; The trunc result goes through a shl before it is stored, so the loop computes
; on two different element sizes. Expected: vectorized, but without masked
; loads/stores.
define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 {
; CHECK-LABEL:        trunc_not_allowed_different_vec_elemns(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.021 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.021
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.021
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.021
  store i32 %add, i32* %arrayidx2, align 4
  %add.tr = trunc i32 %add to i16
  %conv7 = shl i16 %add.tr, 1
  %arrayidx8 = getelementptr inbounds i16, i16* %D, i32 %i.021
  store i16 %conv7, i16* %arrayidx8, align 2
  %add9 = add nuw nsw i32 %i.021, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}


; Byte table updated in place by @icmp_not_allowed.
@tab = common global [32 x i8] zeroinitializer, align 1

; In-place update tab[i] = (tab[i] == 0) ? 2 : 1 on i8, i.e. the loop body
; contains an integer compare feeding a select. Expected: vectorized, but
; without masked loads/stores.
; NOTE(review): the loop runs i = 0..999 while @tab only has 32 elements, so
; the inbounds GEP leaves the object. Presumably irrelevant to what is tested
; here, but confirm before reusing this input elsewhere.
define i32 @icmp_not_allowed() #0 {
; CHECK-LABEL:        icmp_not_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 1000
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 0
}

; Float table updated in place by @fcmp_not_allowed.
@ftab = common global [32 x float] zeroinitializer, align 1

; In-place update ftab[i] = (ftab[i] == 0.0) ? 2.0 : 1.0, i.e. the loop body
; contains a floating-point compare feeding a select. Expected: vectorized, but
; without masked loads/stores.
; NOTE(review): the loop runs i = 0..998 while @ftab only has 32 elements, and
; the global is declared align 1 while the accesses claim align 4. Presumably
; irrelevant to what is tested here, but confirm before reusing this input.
define float @fcmp_not_allowed() #0 {
; CHECK-LABEL:        fcmp_not_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp oeq float %0, 0.000000e+00
  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
  store float %., float* %arrayidx, align 4
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 999
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret float 0.000000e+00
}

; Same i32 A[i] = B[i] + C[i] loop as the baseline, but the loop carries !7
; (llvm.loop.vectorize.predicate.enable = false). Expected: vectorized, but
; without the masked v4i32 loads/store.
; A single negative check covers both loads; repeating it adds nothing.
define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        pragma_vect_predicate_disable(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
}

; Test directions for array indices i and N-i, i.e. check strides 1 and -1:
; A[i] = B[i] + C[N - i]. Vectorisation is requested with a loop hint (!10,
; llvm.loop.vectorize.width = 4). Expected: vectorized, but without masked
; loads/stores.
define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
; CHECK-LABEL: strides_different_direction(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %sub = sub nsw i32 %N, %i.09
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; A[i] = B[i] + C[i] on i32 where i advances by 4 per iteration and the loop
; continues while the incremented index is unsigned-less-than 731. The loop
; carries !5 (llvm.loop.vectorize.enable = true). Expected: vectorized, but
; without masked loads/stores.
define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        stride_4(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 4
  %cmp = icmp ult i32 %add3, 731
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
}

; Same i32 A[i] = B[i] + C[i] computation as the baseline, but the loop is
; split into two blocks: for.body does the memory work and loopincr holds the
; increment and the exit test. Expected: vectorized, but without masked
; loads/stores.
define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        too_many_loop_blocks(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  br label %loopincr

loopincr:
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = B[i] + C[i] on half with fast-math flags. Expected: tail-folded with
; 8-element masked loads and a masked store (v8f16).
define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    half(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
  %1 = load half, half* %arrayidx1, align 2
  %add = fadd fast half %1, %0
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %add, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = B[i] + C[i] on float with fast-math flags; the loop carries !10
; (llvm.loop.vectorize.width = 4). Expected: tail-folded with 4-element masked
; loads and a masked store (v4f32).
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    float(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; A[i] = B[i] + C[i] on double with fast-math flags. Expected: the loop is not
; vectorized, so no vector body appears and the scalar for.body block remains.
; The negative check for vector.body comes before the positive match of
; for.body because a negative check only scans the text between its two
; neighbouring positive matches. A vector body would be printed ahead of the
; scalar loop, so the check must sit in front of for.body to catch it.
define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        double(
; PREFER-FOLDING-NOT: vector.body:
; PREFER-FOLDING:     for.body:
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %B, i32 %i.09
  %0 = load double, double* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds double, double* %C, i32 %i.09
  %1 = load double, double* %arrayidx1, align 8
  %add = fadd fast double %1, %0
  %arrayidx2 = getelementptr inbounds double, double* %A, i32 %i.09
  store double %add, double* %arrayidx2, align 8
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fpext could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
;
; A[i] = fpext(B[i]) + C[i], where B holds half and A/C hold float.
; Note: despite the "_allowed" suffix in the function name, the checks below
; currently expect the loop to be vectorized without masked loads/stores.
define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        fpext_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %conv = fpext half %0 to float
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %conv
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet,
; so reject this for now.
;
; A[i] = fptrunc(B[i] + C[i]), where B/C hold float and A holds half; the
; fptrunc feeds the store directly.
; Note: despite the "_allowed" suffix in the function name, the checks below
; currently expect the loop to be vectorized without masked loads/stores.
define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        fptrunc_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %conv = fptrunc float %add to half
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %conv, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A[i] = B[i] + C[i] on float, and additionally D[i] = fptrunc(A[i]) * 2.0 on
; half (0xH4000 is 2.0). The fptrunc result goes through an fmul before it is
; stored, so the loop computes on two different element sizes. Expected:
; vectorized, but without masked loads/stores.
define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 {
; CHECK-LABEL:        fptrunc_not_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.017 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.017
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.017
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.017
  store float %add, float* %arrayidx2, align 4
  %conv = fptrunc float %add to half
  %factor = fmul fast half %conv, 0xH4000
  %arrayidx5 = getelementptr inbounds half, half* %D, i32 %i.017
  store half %factor, half* %arrayidx5, align 2
  %add6 = add nuw nsw i32 %i.017, 1
  %exitcond = icmp eq i32 %add6, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Attribute group shared by every function in this file: Armv8.1-M Mainline
; with MVE including floating point.
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

; Loop hint used by @stride_4: explicitly enable vectorization.
!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.enable", i1 true}

; Loop hint used by @pragma_vect_predicate_disable: disable predication.
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

; Loop hint used by @strides_different_direction and @float: vector width 4.
!10 = distinct !{!10, !11}
!11 = !{!"llvm.loop.vectorize.width", i32 4}
