; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \
; RUN:   -tail-predication=enabled -loop-vectorize -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=false -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \
; RUN:   -tail-predication=disabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; Disabling the low-overhead branch extension will make
; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
; these cases.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=false \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \
; RUN:   -prefer-predicate-over-epilog=true \
; RUN:   -tail-predication=enabled -loop-vectorize \
; RUN:   -enable-arm-maskedldst=true -S < %s | \
; RUN:   FileCheck %s -check-prefixes=CHECK,FOLDING-OPT

; Plain i32 add over two arrays, trip count 431: a straightforward candidate
; for tail-folding. When folding is preferred, the vector body masks its
; loads/stores with %active.lane.mask; otherwise no masked memory intrinsics
; appear and the scalar/epilogue loop remains.
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    prefer_folding(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
; NO-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %for.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; One loop mixing i16 and i32 element widths. Tail-folding should still be
; applied, producing masked ops for both the v4i16 and v4i32 accesses.
define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
; CHECK-LABEL:        mixed_types(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING:     call <4 x i16> @llvm.masked.load.v4i16.p0v4i16
; PREFER-FOLDING:     call void @llvm.masked.store.v4i16.p0v4i16
; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING:     call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING:     call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
  %1 = load i16, i16* %arrayidx1, align 2
  %add = add i16 %1, %0
  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
  store i16 %add, i16* %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
  %2 = load i32, i32* %arrayidx5, align 4
  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
  %3 = load i32, i32* %arrayidx6, align 4
  %add7 = add nsw i32 %3, %2
  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
  store i32 %add7, i32* %arrayidx8, align 4
  %add9 = add nuw nsw i32 %i.018, 1
  %exitcond = icmp eq i32 %add9, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A zero-extending i8 load does not block tail-folding: expect a masked v4i8
; load alongside masked v4i32 load/store in the vector body.
define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    zero_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A sign-extending i8 load does not block tail-folding either: same masked
; v4i8 / v4i32 pattern as the zero-extend case above.
define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    sign_extending_load_allowed(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0v4i8
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %conv = sext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %conv
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A truncating (i32 -> i8) store does not block tail-folding: expect a masked
; v4i8 store in the vector body.
define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    narrowing_store_allowed(
; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0v4i8
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %conv = trunc i32 %add to i8
  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
  store i8 %conv, i8* %arrayidx2, align 1
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Byte table read/written by @icmp_not_allowed.
@tab = common global [32 x i8] zeroinitializer, align 1

; Loop containing an icmp/select on the loaded value: folding is not applied
; here, so no masked load/store intrinsics may appear in the vector body.
define i32 @icmp_not_allowed() #0 {
; CHECK-LABEL:        icmp_not_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %cmp1 = icmp eq i8 %0, 0
  %. = select i1 %cmp1, i8 2, i8 1
  store i8 %., i8* %arrayidx, align 1
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 1000
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 0
}

; Float table read/written by @fcmp_not_allowed.
@ftab = common global [32 x float] zeroinitializer, align 1

; Floating-point analogue of icmp_not_allowed: an fcmp/select in the loop
; prevents folding, so no masked memory intrinsics in the vector body.
define float @fcmp_not_allowed() #0 {
; CHECK-LABEL:        fcmp_not_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.body:
  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %arrayidx = getelementptr inbounds [32 x float], [32 x float]* @ftab, i32 0, i32 %i.08
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp oeq float %0, 0.000000e+00
  %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00
  store float %., float* %arrayidx, align 4
  %inc = add nsw i32 %i.08, 1
  %exitcond = icmp slt i32 %inc, 999
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret float 0.000000e+00
}

; Same loop as @prefer_folding, but loop metadata !7 sets
; llvm.loop.vectorize.predicate.enable to false, which must suppress folding
; even when folding would otherwise be preferred.
define void @pragma_vect_predicate_disable(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        pragma_vect_predicate_disable(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
}

; Induction variable strides by 4 each iteration: no masked memory ops are
; expected even under PREFER-FOLDING (vectorization forced via !5).
define void @stride_4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        stride_4(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
  store i32 %add, i32* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 4
  %cmp = icmp ult i32 %add3, 731
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5
}

; f16 add loop: with MVE.FP, tail-folding should produce masked v8f16
; loads/stores (8 half elements per 128-bit vector).
define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    half(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0v8f16
; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0v8f16
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
  %1 = load half, half* %arrayidx1, align 2
  %add = fadd fast half %1, %0
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %add, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; f32 add loop with vectorize.width forced to 4 via !10: expect a fully
; masked v4f32 vector body under %active.lane.mask when folding is preferred.
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:    float(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: %index.next = add i32 %index, 4
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}

; A half -> float fpext in the loop: despite the name, the CHECK lines expect
; NO masked memory ops here (folding is not applied for this pattern).
define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        fpext_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
  %0 = load half, half* %arrayidx, align 2
  %conv = fpext half %0 to float
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %conv
  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
  store float %add, float* %arrayidx2, align 4
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A float -> half fptrunc before the store: as with fpext_allowed, the CHECK
; lines expect NO masked memory ops in the vector body.
define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL:        fptrunc_allowed(
; PREFER-FOLDING:     vector.body:
; PREFER-FOLDING-NOT: llvm.masked.load
; PREFER-FOLDING-NOT: llvm.masked.store
; PREFER-FOLDING:     br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
  %1 = load float, float* %arrayidx1, align 4
  %add = fadd fast float %1, %0
  %conv = fptrunc float %add to half
  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
  store half %conv, half* %arrayidx2, align 2
  %add3 = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %add3, 431
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; All test functions use #0: ARMv8.1-M mainline with the MVE.FP extension.
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

; !5/!6: force vectorization on (used by @stride_4).
!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.enable", i1 true}

; !7/!8: disable predicated (tail-folded) vectorization
; (used by @pragma_vect_predicate_disable).
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}

; !10/!11: force vectorization width 4 (used by @float).
!10 = distinct !{!10, !11}
!11 = !{!"llvm.loop.vectorize.width", i32 4}
