1; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
2; RUN:   -disable-mve-tail-predication=false -loop-vectorize -S < %s | \
3; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
4
5; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
6; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
7; RUN:   -enable-arm-maskedldst=true -S < %s | \
8; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
9
10; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
11; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
12; RUN:   -enable-arm-maskedldst=false -S < %s | \
13; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
14
15; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
16; RUN:   -disable-mve-tail-predication=true -loop-vectorize \
17; RUN:   -enable-arm-maskedldst=true -S < %s | \
18; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
19
20; Disabling the low-overhead branch extension will make
21; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
22; these cases.
23; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
24; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
25; RUN:   -enable-arm-maskedldst=true -S < %s | \
26; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
27
28; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
29; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
30; RUN:   -enable-arm-maskedldst=true -S < %s | \
31; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
32
33; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
34; RUN:   -prefer-predicate-over-epilog=false \
35; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
36; RUN:   -enable-arm-maskedldst=true -S < %s | \
37; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
38
39; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
40; RUN:   -prefer-predicate-over-epilog=true \
41; RUN:   -disable-mve-tail-predication=false -loop-vectorize \
42; RUN:   -enable-arm-maskedldst=true -S < %s | \
43; RUN:   FileCheck %s -check-prefixes=CHECK,FOLDING-OPT
44
; Baseline case: a single-block loop computing A[i] = B[i] + C[i] on i32
; elements, leaving the loop once the incremented counter equals 431.
; Runs using the PREFER-FOLDING prefix expect a vector.body whose loads and
; store are masked intrinsics guarded by %active.lane.mask; runs using the
; NO-FOLDING prefix expect no masked v4i32 load/store and a branch whose false
; target is the scalar %for.body.
; NOTE(review): the 430 passed to get.active.lane.mask is presumably the
; trip count minus one (431 - 1) -- confirm against the intrinsic's definition.
45define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
46; CHECK-LABEL:    prefer_folding(
47; PREFER-FOLDING: vector.body:
48; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
49; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
50; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
51; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
52; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
53; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
54; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
55;
56; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
57; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
58; NO-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %for.body
59entry:
60  br label %for.body
61
62for.cond.cleanup:
63  ret void
64
65for.body:
66  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
67  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
68  %0 = load i32, i32* %arrayidx, align 4
69  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
70  %1 = load i32, i32* %arrayidx1, align 4
71  %add = add nsw i32 %1, %0
72  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
73  store i32 %add, i32* %arrayidx2, align 4
74  %add3 = add nuw nsw i32 %i.09, 1
75  %exitcond = icmp eq i32 %add3, 431             ; constant trip count: i runs 0..430
76  br i1 %exitcond, label %for.cond.cleanup, label %for.body
77}
78
; One loop body containing two independent element-wise additions of
; different widths: A[i] = B[i] + C[i] on i16 and D[i] = E[i] + F[i] on i32.
; Runs using the PREFER-FOLDING prefix expect both streams to appear in
; vector.body as masked loads/stores, on <4 x i16> and <4 x i32> respectively.
79define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
80; CHECK-LABEL:        mixed_types(
81; PREFER-FOLDING:     vector.body:
82; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
83; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
84; PREFER-FOLDING:     call void @llvm.masked.store.v4i16.p0v4i16
85; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
86; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
87; PREFER-FOLDING:     call void @llvm.masked.store.v4i32.p0v4i32
88; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
89entry:
90  br label %for.body
91
92for.cond.cleanup:
93  ret void
94
95for.body:
96  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
; i16 stream: A[i] = B[i] + C[i]
97  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
98  %0 = load i16, i16* %arrayidx, align 2
99  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
100  %1 = load i16, i16* %arrayidx1, align 2
101  %add = add i16 %1, %0
102  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
103  store i16 %add, i16* %arrayidx4, align 2
; i32 stream: D[i] = E[i] + F[i]
104  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
105  %2 = load i32, i32* %arrayidx5, align 4
106  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
107  %3 = load i32, i32* %arrayidx6, align 4
108  %add7 = add nsw i32 %3, %2
109  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
110  store i32 %add7, i32* %arrayidx8, align 4
111  %add9 = add nuw nsw i32 %i.018, 1
112  %exitcond = icmp eq i32 %add9, 431
113  br i1 %exitcond, label %for.cond.cleanup, label %for.body
114}
115
; Same A[i] = B[i] + C[i] loop shape as the i32 baseline, but on i64 elements.
; Runs using the PREFER-FOLDING prefix expect neither a vector.body label nor
; any masked load/store to appear before the scalar for.body label.
116define void @unsupported_i64_type(i64* noalias nocapture %A, i64* noalias nocapture readonly %B, i64* noalias nocapture readonly %C) #0 {
117; CHECK-LABEL:        unsupported_i64_type(
118; PREFER-FOLDING-NOT: vector.body:
119; PREFER-FOLDING-NOT: llvm.masked.load
120; PREFER-FOLDING-NOT: llvm.masked.store
121; PREFER-FOLDING:     for.body:
122entry:
123  br label %for.body
124
125for.cond.cleanup:
126  ret void
127
128for.body:
129  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
130  %arrayidx = getelementptr inbounds i64, i64* %B, i32 %i.09
131  %0 = load i64, i64* %arrayidx, align 8
132  %arrayidx1 = getelementptr inbounds i64, i64* %C, i32 %i.09
133  %1 = load i64, i64* %arrayidx1, align 8
134  %add = add nsw i64 %1, %0
135  %arrayidx2 = getelementptr inbounds i64, i64* %A, i32 %i.09
136  store i64 %add, i64* %arrayidx2, align 8
137  %add3 = add nuw nsw i32 %i.09, 1
138  %exitcond = icmp eq i32 %add3, 431
139  br i1 %exitcond, label %for.cond.cleanup, label %for.body
140}
141
; A[i] = zext(B[i]) + C[i], where B holds i8 elements and A/C hold i32.
; Runs using the PREFER-FOLDING prefix expect a masked <4 x i8> load for B
; alongside the masked <4 x i32> load and store.
142define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
143; CHECK-LABEL:    zero_extending_load_allowed(
144; PREFER-FOLDING: vector.body:
145; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
146; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
147; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
148; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
149entry:
150  br label %for.body
151
152for.cond.cleanup:
153  ret void
154
155for.body:
156  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
157  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
158  %0 = load i8, i8* %arrayidx, align 1
159  %conv = zext i8 %0 to i32                      ; the extension under test
160  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
161  %1 = load i32, i32* %arrayidx1, align 4
162  %add = add nsw i32 %1, %conv
163  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
164  store i32 %add, i32* %arrayidx2, align 4
165  %add3 = add nuw nsw i32 %i.09, 1
166  %exitcond = icmp eq i32 %add3, 431
167  br i1 %exitcond, label %for.cond.cleanup, label %for.body
168}
169
; A[i] = sext(B[i]) + C[i], where B holds i8 elements and A/C hold i32.
; Identical to the zero-extending variant except for the sext; runs using the
; PREFER-FOLDING prefix expect the same masked <4 x i8> load plus masked
; <4 x i32> load and store.
170define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
171; CHECK-LABEL:    sign_extending_load_allowed(
172; PREFER-FOLDING: vector.body:
173; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
174; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
175; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
176; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
177entry:
178  br label %for.body
179
180for.cond.cleanup:
181  ret void
182
183for.body:
184  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
185  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
186  %0 = load i8, i8* %arrayidx, align 1
187  %conv = sext i8 %0 to i32                      ; the extension under test
188  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
189  %1 = load i32, i32* %arrayidx1, align 4
190  %add = add nsw i32 %1, %conv
191  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
192  store i32 %add, i32* %arrayidx2, align 4
193  %add3 = add nuw nsw i32 %i.09, 1
194  %exitcond = icmp eq i32 %add3, 431
195  br i1 %exitcond, label %for.cond.cleanup, label %for.body
196}
197
; A[i] = B[i] + trunc(C[i]), where C holds i16 elements and A/B hold i8, so
; the loaded i16 value is narrowed before use.
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store. The run using the FOLDING-OPT prefix (the one passing
; -prefer-predicate-over-epilog=true) expects masked <8 x i16> and <8 x i8>
; loads and a masked <8 x i8> store instead.
198define void @narrowing_load_not_allowed(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, i16* noalias nocapture readonly %C) #0 {
199; CHECK-LABEL:        narrowing_load_not_allowed(
200; PREFER-FOLDING:     vector.body:
201; PREFER-FOLDING-NOT: llvm.masked.load
202; PREFER-FOLDING-NOT: llvm.masked.store
203; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
204
205; FOLDING-OPT:        vector.body:
206; FOLDING-OPT:        call <8 x i16> @llvm.masked.load.v8i16.p0v8i16
207; FOLDING-OPT:        call <8 x i8> @llvm.masked.load.v8i8.p0v8i8
208; FOLDING-OPT:        call void @llvm.masked.store.v8i8.p0v8i8
209; FOLDING-OPT:        br i1 %{{.*}}, label %{{.*}}, label %vector.body
210entry:
211  br label %for.body
212
213for.cond.cleanup:                                 ; preds = %for.body
214  ret void
215
216for.body:                                         ; preds = %for.body, %entry
217  %i.012 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
218  %arrayidx = getelementptr inbounds i16, i16* %C, i32 %i.012
219  %0 = load i16, i16* %arrayidx, align 2
220  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.012
221  %1 = load i8, i8* %arrayidx1, align 1
222  %conv3 = trunc i16 %0 to i8                    ; narrows the value loaded from C
223  %add = add i8 %1, %conv3
224  %arrayidx5 = getelementptr inbounds i8, i8* %A, i32 %i.012
225  store i8 %add, i8* %arrayidx5, align 1
226  %add6 = add nuw nsw i32 %i.012, 1
227  %exitcond = icmp eq i32 %add6, 431
228  br i1 %exitcond, label %for.cond.cleanup, label %for.body
229}
230
; A[i] = trunc(B[i] + C[i]): the addition is done on i32 and the result is
; truncated to i8 immediately before being stored.
; Runs using the PREFER-FOLDING prefix expect a masked <4 x i8> store followed
; by the back-edge branch to vector.body.
231define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
232; CHECK-LABEL:    narrowing_store_allowed(
233; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
234; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
235entry:
236  br label %for.body
237
238for.cond.cleanup:
239  ret void
240
241for.body:
242  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
243  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
244  %0 = load i32, i32* %arrayidx, align 4
245  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
246  %1 = load i32, i32* %arrayidx1, align 4
247  %add = add nsw i32 %1, %0
248  %conv = trunc i32 %add to i8                   ; trunc feeds the store directly
249  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
250  store i8 %conv, i8* %arrayidx2, align 1
251  %add3 = add nuw nsw i32 %i.09, 1
252  %exitcond = icmp eq i32 %add3, 431
253  br i1 %exitcond, label %for.cond.cleanup, label %for.body
254}
255
256; This is a trunc not connected to a store, so we don't allow this.
257; TODO: this is conservative, because the trunc is only used in the
258; loop control statements, and thus not affecting element sizes, so
259; we could allow this case.
; The data path is the plain i32 A[i] = B[i] + C[i]; the only trunc is applied
; to the incremented induction variable, whose i16 result is compared against
; 431 to form the exit condition.
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
260define void @trunc_not_allowed(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
261; CHECK-LABEL:        trunc_not_allowed(
262; PREFER-FOLDING:     vector.body:
263; PREFER-FOLDING-NOT: llvm.masked.load
264; PREFER-FOLDING-NOT: llvm.masked.store
265; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
266entry:
267  br label %for.body
268
269for.cond.cleanup:
270  ret void
271
272for.body:
273  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
274  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
275  %0 = load i32, i32* %arrayidx, align 4
276  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
277  %1 = load i32, i32* %arrayidx1, align 4
278  %add = add nsw i32 %1, %0
279  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
280  store i32 %add, i32* %arrayidx2, align 4
281  %add3 = add nuw nsw i32 %i.09, 1
282
283  %add.iv = trunc i32 %add3 to i16               ; trunc used only by the loop-exit compare
284
285  %exitcond = icmp eq i16 %add.iv, 431
286  br i1 %exitcond, label %for.cond.cleanup, label %for.body
287}
288
; The i32 sum B[i] + C[i] is stored to A unchanged, and is also truncated to
; i16, shifted left by one and stored to D. The trunc result therefore passes
; through another operation (the shl) before it reaches a store.
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
289define void @trunc_not_allowed_different_vec_elemns(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i16* noalias nocapture %D) #0 {
290; CHECK-LABEL:        trunc_not_allowed_different_vec_elemns(
291; PREFER-FOLDING:     vector.body:
292; PREFER-FOLDING-NOT: llvm.masked.load
293; PREFER-FOLDING-NOT: llvm.masked.store
294; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
295entry:
296  br label %for.body
297
298for.cond.cleanup:
299  ret void
300
301for.body:
302  %i.021 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
303  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.021
304  %0 = load i32, i32* %arrayidx, align 4
305  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.021
306  %1 = load i32, i32* %arrayidx1, align 4
307  %add = add nsw i32 %1, %0
308  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.021
309  store i32 %add, i32* %arrayidx2, align 4
310  %add.tr = trunc i32 %add to i16                ; narrowed copy of the sum
311  %conv7 = shl i16 %add.tr, 1                    ; further i16 arithmetic before the store
312  %arrayidx8 = getelementptr inbounds i16, i16* %D, i32 %i.021
313  store i16 %conv7, i16* %arrayidx8, align 2
314  %add9 = add nuw nsw i32 %i.021, 1
315  %exitcond = icmp eq i32 %add9, 431
316  br i1 %exitcond, label %for.cond.cleanup, label %for.body
317}
318
319
; Global i8 table updated in place by @icmp_not_allowed.
320@tab = common global [32 x i8] zeroinitializer, align 1
321
; Loop body containing an integer compare feeding a select: each element of
; @tab is replaced by 2 if it was zero and by 1 otherwise. The loop continues
; while the incremented counter is (signed) less than 1000.
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
; NOTE(review): @tab has only 32 elements while the loop bound is 1000, so the
; inbounds GEP indexes past the array for most iterations; presumably this
; does not matter for what the test checks -- confirm.
322define i32 @icmp_not_allowed() #0 {
323; CHECK-LABEL:        icmp_not_allowed(
324; PREFER-FOLDING:     vector.body:
325; PREFER-FOLDING-NOT: llvm.masked.load
326; PREFER-FOLDING-NOT: llvm.masked.store
327; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
328entry:
329  br label %for.body
330
331for.body:
332  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
333  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
334  %0 = load i8, i8* %arrayidx, align 1
335  %cmp1 = icmp eq i8 %0, 0                       ; the in-loop compare under test
336  %. = select i1 %cmp1, i8 2, i8 1
337  store i8 %., i8* %arrayidx, align 1
338  %inc = add nsw i32 %i.08, 1
339  %exitcond = icmp slt i32 %inc, 1000
340  br i1 %exitcond, label %for.body, label %for.end
341
342for.end:
343  ret i32 0
344}
345
; Global float table updated in place by @fcmp_not_allowed.
; NOTE(review): declared with align 1 although the loop below accesses it with
; align 4 -- confirm this mismatch is intentional.
346@ftab = common global [32 x float] zeroinitializer, align 1
347
; Floating-point counterpart of the integer-compare case: each element of
; @ftab is replaced by 2.0 if it compares ordered-equal to 0.0 and by 1.0
; otherwise. The loop continues while the incremented counter is (signed)
; less than 999.
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
; NOTE(review): @ftab has only 32 elements while the loop bound is 999;
; presumably irrelevant to what the test checks -- confirm.
348define float @fcmp_not_allowed() #0 {
349; CHECK-LABEL:        fcmp_not_allowed(
350; PREFER-FOLDING:     vector.body:
351; PREFER-FOLDING-NOT: llvm.masked.load
352; PREFER-FOLDING-NOT: llvm.masked.store
353; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
354entry:
355  br label %for.body
356
357for.body:
358  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
359  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
360  %0 = load float, float* %arrayidx, align 4
361  %cmp1 = fcmp oeq float %0, 0.000000e+00        ; the in-loop compare under test
362  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
363  store float %., float* %arrayidx, align 4
364  %inc = add nsw i32 %i.08, 1
365  %exitcond = icmp slt i32 %inc, 999
366  br i1 %exitcond, label %for.body, label %for.end
367
368for.end:
369  ret float 0.000000e+00
370}
371
; Same i32 A[i] = B[i] + C[i] loop as the baseline, but the back-edge carries
; loop metadata !7, which sets llvm.loop.vectorize.predicate.enable to false.
; Runs using the PREFER-FOLDING prefix expect a vector.body that contains no
; masked v4i32 load or store.
372define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
373; CHECK-LABEL:        pragma_vect_predicate_disable(
374; PREFER-FOLDING:     vector.body:
375; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
376; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
377; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
378; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
379entry:
380  br label %for.body
381
382for.cond.cleanup:
383  ret void
384
385for.body:
386  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
387  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
388  %0 = load i32, i32* %arrayidx, align 4
389  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
390  %1 = load i32, i32* %arrayidx1, align 4
391  %add = add nsw i32 %1, %0
392  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
393  store i32 %add, i32* %arrayidx2, align 4
394  %add3 = add nuw nsw i32 %i.09, 1
395  %exitcond = icmp eq i32 %add3, 431
396  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
397}
398
399; Test directions for array indices i and N-i. I.e. check strides 1 and -1, and
400; force vectorisation with a loop hint.
; A[i] = B[i] + C[N - i]: B and A are indexed by the increasing counter while
; C is indexed by %N minus the counter, so the C accesses walk in the opposite
; direction. The back-edge carries loop metadata !10 (vectorize width 4).
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
401define void @strides_different_direction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) #0 {
402; CHECK-LABEL: strides_different_direction(
403; PREFER-FOLDING:     vector.body:
404; PREFER-FOLDING-NOT: llvm.masked.load
405; PREFER-FOLDING-NOT: llvm.masked.store
406; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
407entry:
408  br label %for.body
409
410for.cond.cleanup:
411  ret void
412
413for.body:
414  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
415  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
416  %0 = load i32, i32* %arrayidx, align 4
417  %sub = sub nsw i32 %N, %i.09                   ; decreasing index N - i
418  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %sub
419  %1 = load i32, i32* %arrayidx1, align 4
420  %add = add nsw i32 %1, %0
421  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
422  store i32 %add, i32* %arrayidx2, align 4
423  %add3 = add nuw nsw i32 %i.09, 1
424  %exitcond = icmp eq i32 %add3, 431
425  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
426}
427
; i32 A[i] = B[i] + C[i] where the counter advances by 4 per iteration and the
; loop continues while the incremented counter is (unsigned) less than 731.
; The back-edge carries loop metadata !5 (llvm.loop.vectorize.enable = true).
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
428define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
429; CHECK-LABEL:        stride_4(
430; PREFER-FOLDING:     vector.body:
431; PREFER-FOLDING-NOT: llvm.masked.load
432; PREFER-FOLDING-NOT: llvm.masked.store
433; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
434entry:
435  br label %for.body
436
437for.cond.cleanup:
438  ret void
439
440for.body:
441  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
442  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
443  %0 = load i32, i32* %arrayidx, align 4
444  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
445  %1 = load i32, i32* %arrayidx1, align 4
446  %add = add nsw i32 %1, %0
447  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
448  store i32 %add, i32* %arrayidx2, align 4
449  %add3 = add nuw nsw i32 %i.09, 4               ; step of 4 elements per iteration
450  %cmp = icmp ult i32 %add3, 731
451  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
452}
453
; Same i32 A[i] = B[i] + C[i] computation as the baseline, but the loop is
; split over two basic blocks: for.body does the loads, add and store, then
; branches unconditionally to loopincr, which increments the counter and
; holds the exit test.
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
454define void @too_many_loop_blocks(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
455; CHECK-LABEL:        too_many_loop_blocks(
456; PREFER-FOLDING:     vector.body:
457; PREFER-FOLDING-NOT: llvm.masked.load
458; PREFER-FOLDING-NOT: llvm.masked.store
459; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
460entry:
461  br label %for.body
462
463for.cond.cleanup:
464  ret void
465
466for.body:
467  %i.09 = phi i32 [ 0, %entry ], [ %add3, %loopincr ]
468  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
469  %0 = load i32, i32* %arrayidx, align 4
470  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
471  %1 = load i32, i32* %arrayidx1, align 4
472  %add = add nsw i32 %1, %0
473  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
474  store i32 %add, i32* %arrayidx2, align 4
475  br label %loopincr
476
; Second loop block: holds only the increment and the exit test.
477loopincr:
478  %add3 = add nuw nsw i32 %i.09, 1
479  %exitcond = icmp eq i32 %add3, 431
480  br i1 %exitcond, label %for.cond.cleanup, label %for.body
481}
482
; Half-precision A[i] = B[i] + C[i] using a fast-math fadd, 431 iterations.
; Runs using the PREFER-FOLDING prefix expect masked <8 x half> loads and a
; masked <8 x half> store in vector.body.
483define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
484; CHECK-LABEL:    half(
485; PREFER-FOLDING: vector.body:
486; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
487; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
488; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
489; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
490entry:
491  br label %for.body
492
493for.cond.cleanup:
494  ret void
495
496for.body:
497  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
498  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
499  %0 = load half, half* %arrayidx, align 2
500  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
501  %1 = load half, half* %arrayidx1, align 2
502  %add = fadd fast half %1, %0
503  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
504  store half %add, half* %arrayidx2, align 2
505  %add3 = add nuw nsw i32 %i.09, 1
506  %exitcond = icmp eq i32 %add3, 431
507  br i1 %exitcond, label %for.cond.cleanup, label %for.body
508}
509
; Single-precision A[i] = B[i] + C[i] using a fast-math fadd, 431 iterations.
; The back-edge carries loop metadata !10 (vectorize width 4).
; Runs using the PREFER-FOLDING prefix expect a vector.body with an
; %active.lane.mask from @llvm.get.active.lane.mask, masked <4 x float> loads
; and store that use that mask, and an index increment of 4.
510define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
511; CHECK-LABEL:    float(
512; PREFER-FOLDING: vector.body:
513; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
514; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
515; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
516; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
517; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
518; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
519; PREFER-FOLDING: %index.next = add i32 %index, 4
520; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
521entry:
522  br label %for.body
523
524for.cond.cleanup:
525  ret void
526
527for.body:
528  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
529  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
530  %0 = load float, float* %arrayidx, align 4
531  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
532  %1 = load float, float* %arrayidx1, align 4
533  %add = fadd fast float %1, %0
534  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
535  store float %add, float* %arrayidx2, align 4
536  %add3 = add nuw nsw i32 %i.09, 1
537  %exitcond = icmp eq i32 %add3, 431
538  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
539}
540
; Double-precision A[i] = B[i] + C[i] using a fast-math fadd, 431 iterations.
; Runs using the PREFER-FOLDING prefix expect the scalar for.body label to be
; present and no vector.body label to follow it within this function.
541define void @double(double* noalias nocapture %A, double* noalias nocapture readonly %B, double* noalias nocapture readonly %C) #0 {
542; CHECK-LABEL:        double(
543; PREFER-FOLDING:     for.body:
544; PREFER-FOLDING-NOT: vector.body:
545entry:
546  br label %for.body
547
548for.cond.cleanup:
549  ret void
550
551for.body:
552  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
553  %arrayidx = getelementptr inbounds double, double* %B, i32 %i.09
554  %0 = load double, double* %arrayidx, align 8
555  %arrayidx1 = getelementptr inbounds double, double* %C, i32 %i.09
556  %1 = load double, double* %arrayidx1, align 8
557  %add = fadd fast double %1, %0
558  %arrayidx2 = getelementptr inbounds double, double* %A, i32 %i.09
559  store double %add, double* %arrayidx2, align 8
560  %add3 = add nuw nsw i32 %i.09, 1
561  %exitcond = icmp eq i32 %add3, 431
562  br i1 %exitcond, label %for.cond.cleanup, label %for.body
563}
564
565; TODO: this fpext could be allowed, but we don't lower it very efficiently yet,
566; so reject this for now.
; A[i] = fpext(B[i]) + C[i], where B holds half elements and A/C hold float.
; Despite the function name, the checks currently expect rejection: runs using
; the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
567define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
568; CHECK-LABEL:        fpext_allowed(
569; PREFER-FOLDING:     vector.body:
570; PREFER-FOLDING-NOT: llvm.masked.load
571; PREFER-FOLDING-NOT: llvm.masked.store
572; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
573entry:
574  br label %for.body
575
576for.cond.cleanup:
577  ret void
578
579for.body:
580  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
581  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
582  %0 = load half, half* %arrayidx, align 2
583  %conv = fpext half %0 to float                 ; the extension under test
584  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
585  %1 = load float, float* %arrayidx1, align 4
586  %add = fadd fast float %1, %conv
587  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
588  store float %add, float* %arrayidx2, align 4
589  %add3 = add nuw nsw i32 %i.09, 1
590  %exitcond = icmp eq i32 %add3, 431
591  br i1 %exitcond, label %for.cond.cleanup, label %for.body
592}
593
594; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet,
595; so reject this for now.
; A[i] = fptrunc(B[i] + C[i]): the float sum is truncated to half immediately
; before being stored.
; Despite the function name, the checks currently expect rejection: runs using
; the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
596define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
597; CHECK-LABEL:        fptrunc_allowed(
598; PREFER-FOLDING:     vector.body:
599; PREFER-FOLDING-NOT: llvm.masked.load
600; PREFER-FOLDING-NOT: llvm.masked.store
601; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
602entry:
603  br label %for.body
604
605for.cond.cleanup:
606  ret void
607
608for.body:
609  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
610  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
611  %0 = load float, float* %arrayidx, align 4
612  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
613  %1 = load float, float* %arrayidx1, align 4
614  %add = fadd fast float %1, %0
615  %conv = fptrunc float %add to half             ; fptrunc feeds the store directly
616  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
617  store half %conv, half* %arrayidx2, align 2
618  %add3 = add nuw nsw i32 %i.09, 1
619  %exitcond = icmp eq i32 %add3, 431
620  br i1 %exitcond, label %for.cond.cleanup, label %for.body
621}
622
; The float sum B[i] + C[i] is stored to A unchanged, and is also truncated to
; half, multiplied by the half constant 0xH4000 (2.0) and stored to D. The
; fptrunc result therefore passes through another operation (the fmul) before
; it reaches a store.
; Runs using the PREFER-FOLDING prefix expect a vector.body without any masked
; load/store.
623define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 {
624; CHECK-LABEL:        fptrunc_not_allowed(
625; PREFER-FOLDING:     vector.body:
626; PREFER-FOLDING-NOT: llvm.masked.load
627; PREFER-FOLDING-NOT: llvm.masked.store
628; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
629entry:
630  br label %for.body
631
632for.cond.cleanup:
633  ret void
634
635for.body:
636  %i.017 = phi i32 [ 0, %entry ], [ %add6, %for.body ]
637  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.017
638  %0 = load float, float* %arrayidx, align 4
639  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.017
640  %1 = load float, float* %arrayidx1, align 4
641  %add = fadd fast float %1, %0
642  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.017
643  store float %add, float* %arrayidx2, align 4
644  %conv = fptrunc float %add to half             ; narrowed copy of the sum
645  %factor = fmul fast half %conv, 0xH4000        ; further half arithmetic before the store
646  %arrayidx5 = getelementptr inbounds half, half* %D, i32 %i.017
647  store half %factor, half* %arrayidx5, align 2
648  %add6 = add nuw nsw i32 %i.017, 1
649  %exitcond = icmp eq i32 %add6, 431
650  br i1 %exitcond, label %for.cond.cleanup, label %for.body
651}
652
; Attribute group shared by every function in this file; it sets the
; per-function target features to +armv8.1-m.main,+mve.fp.
653attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
654
; Loop hint used by @stride_4: explicitly enable vectorisation.
655!5 = distinct !{!5, !6}
656!6 = !{!"llvm.loop.vectorize.enable", i1 true}
657
; Loop hint used by @pragma_vect_predicate_disable: disable predication.
658!7 = distinct !{!7, !8}
659!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
660
; Loop hint used by @strides_different_direction and @float: vector width 4.
661!10 = distinct !{!10, !11}
662!11 = !{!"llvm.loop.vectorize.width", i32 4}
663