1; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
2; RUN:   -enable-arm-maskedgatscat=false \
3; RUN:   -tail-predication=enabled -loop-vectorize -S < %s | \
4; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
5
6; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
7; RUN:   -tail-predication=enabled -loop-vectorize \
8; RUN:   -enable-arm-maskedgatscat=false \
9; RUN:   -enable-arm-maskedldst=true -S < %s | \
10; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
11
12; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
13; RUN:   -tail-predication=enabled -loop-vectorize \
14; RUN:   -enable-arm-maskedgatscat=false \
15; RUN:   -enable-arm-maskedldst=false -S < %s | \
16; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
17
18; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
19; RUN:   -tail-predication=disabled -loop-vectorize \
20; RUN:   -enable-arm-maskedgatscat=false \
21; RUN:   -enable-arm-maskedldst=true -S < %s | \
22; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
23
24; Disabling the low-overhead branch extension will make
25; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
26; these cases.
27; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
28; RUN:   -tail-predication=enabled -loop-vectorize \
29; RUN:   -enable-arm-maskedgatscat=false \
30; RUN:   -enable-arm-maskedldst=true -S < %s | \
31; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
32
33; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
34; RUN:   -tail-predication=enabled -loop-vectorize \
35; RUN:   -enable-arm-maskedgatscat=false \
36; RUN:   -enable-arm-maskedldst=true -S < %s | \
37; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
38
39; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
40; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue \
41; RUN:   -tail-predication=enabled -loop-vectorize \
42; RUN:   -enable-arm-maskedgatscat=false \
43; RUN:   -enable-arm-maskedldst=true -S < %s | \
44; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
45
46; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
47; RUN:   -prefer-predicate-over-epilogue=predicate-dont-vectorize \
48; RUN:   -tail-predication=enabled -loop-vectorize \
49; RUN:   -enable-arm-maskedgatscat=false \
50; RUN:   -enable-arm-maskedldst=true -S < %s | \
51; RUN:   FileCheck %s -check-prefixes=CHECK
52
53define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
54; CHECK-LABEL:    prefer_folding(
55; PREFER-FOLDING: vector.body:
56; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
57; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
58; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431)
59; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
60; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
61; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
62; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
63;
64; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
65; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
66; NO-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %for.body
67entry:
68  br label %for.body
69
70for.cond.cleanup:
71  ret void
72
73for.body:
74  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
75  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
76  %0 = load i32, i32* %arrayidx, align 4
77  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
78  %1 = load i32, i32* %arrayidx1, align 4
79  %add = add nsw i32 %1, %0
80  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
81  store i32 %add, i32* %arrayidx2, align 4
82  %add3 = add nuw nsw i32 %i.09, 1
83  %exitcond = icmp eq i32 %add3, 431
84  br i1 %exitcond, label %for.cond.cleanup, label %for.body
85}
86
87define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
88; CHECK-LABEL:        mixed_types(
89; PREFER-FOLDING:     vector.body:
90; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
91; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
92; PREFER-FOLDING:     call void @llvm.masked.store.v4i16.p0v4i16
93; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
94; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
95; PREFER-FOLDING:     call void @llvm.masked.store.v4i32.p0v4i32
96; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
97entry:
98  br label %for.body
99
100for.cond.cleanup:
101  ret void
102
103for.body:
104  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
105  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
106  %0 = load i16, i16* %arrayidx, align 2
107  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
108  %1 = load i16, i16* %arrayidx1, align 2
109  %add = add i16 %1, %0
110  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
111  store i16 %add, i16* %arrayidx4, align 2
112  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
113  %2 = load i32, i32* %arrayidx5, align 4
114  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
115  %3 = load i32, i32* %arrayidx6, align 4
116  %add7 = add nsw i32 %3, %2
117  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
118  store i32 %add7, i32* %arrayidx8, align 4
119  %add9 = add nuw nsw i32 %i.018, 1
120  %exitcond = icmp eq i32 %add9, 431
121  br i1 %exitcond, label %for.cond.cleanup, label %for.body
122}
123
124define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
125; CHECK-LABEL:    zero_extending_load_allowed(
126; PREFER-FOLDING: vector.body:
127; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
128; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
129; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
130; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
131entry:
132  br label %for.body
133
134for.cond.cleanup:
135  ret void
136
137for.body:
138  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
139  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
140  %0 = load i8, i8* %arrayidx, align 1
141  %conv = zext i8 %0 to i32
142  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
143  %1 = load i32, i32* %arrayidx1, align 4
144  %add = add nsw i32 %1, %conv
145  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
146  store i32 %add, i32* %arrayidx2, align 4
147  %add3 = add nuw nsw i32 %i.09, 1
148  %exitcond = icmp eq i32 %add3, 431
149  br i1 %exitcond, label %for.cond.cleanup, label %for.body
150}
151
152define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
153; CHECK-LABEL:    sign_extending_load_allowed(
154; PREFER-FOLDING: vector.body:
155; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
156; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
157; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
158; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
159entry:
160  br label %for.body
161
162for.cond.cleanup:
163  ret void
164
165for.body:
166  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
167  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
168  %0 = load i8, i8* %arrayidx, align 1
169  %conv = sext i8 %0 to i32
170  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
171  %1 = load i32, i32* %arrayidx1, align 4
172  %add = add nsw i32 %1, %conv
173  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
174  store i32 %add, i32* %arrayidx2, align 4
175  %add3 = add nuw nsw i32 %i.09, 1
176  %exitcond = icmp eq i32 %add3, 431
177  br i1 %exitcond, label %for.cond.cleanup, label %for.body
178}
179
180define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
181; CHECK-LABEL:    narrowing_store_allowed(
182; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
183; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
184entry:
185  br label %for.body
186
187for.cond.cleanup:
188  ret void
189
190for.body:
191  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
192  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
193  %0 = load i32, i32* %arrayidx, align 4
194  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
195  %1 = load i32, i32* %arrayidx1, align 4
196  %add = add nsw i32 %1, %0
197  %conv = trunc i32 %add to i8
198  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
199  store i8 %conv, i8* %arrayidx2, align 1
200  %add3 = add nuw nsw i32 %i.09, 1
201  %exitcond = icmp eq i32 %add3, 431
202  br i1 %exitcond, label %for.cond.cleanup, label %for.body
203}
204
205@tab = common global [32 x i8] zeroinitializer, align 1
206
207define i32 @icmp_not_allowed() #0 {
208; CHECK-LABEL:        icmp_not_allowed(
209; PREFER-FOLDING:     vector.body:
210; PREFER-FOLDING-NOT: llvm.masked.load
211; PREFER-FOLDING-NOT: llvm.masked.store
212; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
213entry:
214  br label %for.body
215
216for.body:
217  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
218  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
219  %0 = load i8, i8* %arrayidx, align 1
220  %cmp1 = icmp eq i8 %0, 0
221  %. = select i1 %cmp1, i8 2, i8 1
222  store i8 %., i8* %arrayidx, align 1
223  %inc = add nsw i32 %i.08, 1
224  %exitcond = icmp slt i32 %inc, 1000
225  br i1 %exitcond, label %for.body, label %for.end
226
227for.end:
228  ret i32 0
229}
230
231@ftab = common global [32 x float] zeroinitializer, align 1
232
233define float @fcmp_not_allowed() #0 {
234; CHECK-LABEL:        fcmp_not_allowed(
235; PREFER-FOLDING:     vector.body:
236; PREFER-FOLDING-NOT: llvm.masked.load
237; PREFER-FOLDING-NOT: llvm.masked.store
238; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
239entry:
240  br label %for.body
241
242for.body:
243  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
244  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
245  %0 = load float, float* %arrayidx, align 4
246  %cmp1 = fcmp oeq float %0, 0.000000e+00
247  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
248  store float %., float* %arrayidx, align 4
249  %inc = add nsw i32 %i.08, 1
250  %exitcond = icmp slt i32 %inc, 999
251  br i1 %exitcond, label %for.body, label %for.end
252
253for.end:
254  ret float 0.000000e+00
255}
256
257define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
258; CHECK-LABEL:        pragma_vect_predicate_disable(
259; PREFER-FOLDING:     vector.body:
260; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
261; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
262; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
263; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
264entry:
265  br label %for.body
266
267for.cond.cleanup:
268  ret void
269
270for.body:
271  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
272  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
273  %0 = load i32, i32* %arrayidx, align 4
274  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
275  %1 = load i32, i32* %arrayidx1, align 4
276  %add = add nsw i32 %1, %0
277  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
278  store i32 %add, i32* %arrayidx2, align 4
279  %add3 = add nuw nsw i32 %i.09, 1
280  %exitcond = icmp eq i32 %add3, 431
281  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
282}
283
284define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
285; CHECK-LABEL:        stride_4(
286; PREFER-FOLDING:     vector.body:
287; PREFER-FOLDING-NOT: llvm.masked.load
288; PREFER-FOLDING-NOT: llvm.masked.store
289; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
290entry:
291  br label %for.body
292
293for.cond.cleanup:
294  ret void
295
296for.body:
297  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
298  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
299  %0 = load i32, i32* %arrayidx, align 4
300  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
301  %1 = load i32, i32* %arrayidx1, align 4
302  %add = add nsw i32 %1, %0
303  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
304  store i32 %add, i32* %arrayidx2, align 4
305  %add3 = add nuw nsw i32 %i.09, 4
306  %cmp = icmp ult i32 %add3, 731
307  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
308}
309
310define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
311; CHECK-LABEL:    half(
312; PREFER-FOLDING: vector.body:
313; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
314; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
315; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
316; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
317entry:
318  br label %for.body
319
320for.cond.cleanup:
321  ret void
322
323for.body:
324  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
325  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
326  %0 = load half, half* %arrayidx, align 2
327  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
328  %1 = load half, half* %arrayidx1, align 2
329  %add = fadd fast half %1, %0
330  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
331  store half %add, half* %arrayidx2, align 2
332  %add3 = add nuw nsw i32 %i.09, 1
333  %exitcond = icmp eq i32 %add3, 431
334  br i1 %exitcond, label %for.cond.cleanup, label %for.body
335}
336
337define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
338; CHECK-LABEL:    float(
339; PREFER-FOLDING: vector.body:
340; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
341; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
342; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431)
343; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
344; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
345; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
346; PREFER-FOLDING: %index.next = add i32 %index, 4
347; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
348entry:
349  br label %for.body
350
351for.cond.cleanup:
352  ret void
353
354for.body:
355  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
356  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
357  %0 = load float, float* %arrayidx, align 4
358  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
359  %1 = load float, float* %arrayidx1, align 4
360  %add = fadd fast float %1, %0
361  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
362  store float %add, float* %arrayidx2, align 4
363  %add3 = add nuw nsw i32 %i.09, 1
364  %exitcond = icmp eq i32 %add3, 431
365  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
366}
367
368define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
369; CHECK-LABEL:        fpext_allowed(
370; PREFER-FOLDING:     vector.body:
371; PREFER-FOLDING-NOT: llvm.masked.load
372; PREFER-FOLDING-NOT: llvm.masked.store
373; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
374entry:
375  br label %for.body
376
377for.cond.cleanup:
378  ret void
379
380for.body:
381  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
382  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
383  %0 = load half, half* %arrayidx, align 2
384  %conv = fpext half %0 to float
385  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
386  %1 = load float, float* %arrayidx1, align 4
387  %add = fadd fast float %1, %conv
388  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
389  store float %add, float* %arrayidx2, align 4
390  %add3 = add nuw nsw i32 %i.09, 1
391  %exitcond = icmp eq i32 %add3, 431
392  br i1 %exitcond, label %for.cond.cleanup, label %for.body
393}
394
395define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
396; CHECK-LABEL:        fptrunc_allowed(
397; PREFER-FOLDING:     vector.body:
398; PREFER-FOLDING-NOT: llvm.masked.load
399; PREFER-FOLDING-NOT: llvm.masked.store
400; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
401entry:
402  br label %for.body
403
404for.cond.cleanup:
405  ret void
406
407for.body:
408  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
409  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
410  %0 = load float, float* %arrayidx, align 4
411  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
412  %1 = load float, float* %arrayidx1, align 4
413  %add = fadd fast float %1, %0
414  %conv = fptrunc float %add to half
415  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
416  store half %conv, half* %arrayidx2, align 2
417  %add3 = add nuw nsw i32 %i.09, 1
418  %exitcond = icmp eq i32 %add3, 431
419  br i1 %exitcond, label %for.cond.cleanup, label %for.body
420}
421
422attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
423
424!5 = distinct !{!5, !6}
425!6 = !{!"llvm.loop.vectorize.enable", i1 true}
426
427!7 = distinct !{!7, !8}
428!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
429
430!10 = distinct !{!10, !11}
431!11 = !{!"llvm.loop.vectorize.width", i32 4}
432