; RUN: opt -S < %s -basic-aa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s

; Loop-vectorizer regression test for AArch64.
; Every function in this file is a single counted loop that loads from %p,
; does integer arithmetic and stores to %q. The FileCheck patterns in front of
; each function pin the vector element type and lane count expected for the
; widened load, arithmetic and store.
; NOTE(review): -force-vector-interleave=1 presumably keeps a single vector
; copy per iteration so the patterns match one wide operation - confirm.
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"

; add_a: for i in [0, %len): q[i] = trunc_i8(zext_i32(p[i]) + 2).
; The scalar add is written in i32, but only its low 8 bits reach the store.
; The patterns below expect the add as a flag-free <16 x i8> operation
; (a pattern of "add <16 x i8>" cannot match "add nuw nsw <16 x i8>").
; CHECK-LABEL: @add_a(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  ; Widen to i32, add 2, narrow back to i8.
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv1, i8* %arrayidx3
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
; working with.
; add_a1: for i in [0, %len): q[i] = p[i] + 2, with the add already written
; in i8 (no zext/trunc pair), so the expected vector add keeps "nuw nsw".
; CHECK-LABEL: @add_a1(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  ; The arithmetic is performed directly in the element type.
  %add = add nuw nsw i8 %0, 2
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %add, i8* %arrayidx3
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; add_b: i16 variant of add_a.
; for i in [0, %len): q[i] = trunc_i16(zext_i32(p[i]) + 2).
; The patterns expect the load, add and store to use <8 x i16>, with no
; nuw/nsw flags on the add.
; CHECK-LABEL: @add_b(
; CHECK: load <8 x i16>, <8 x i16>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp9 = icmp sgt i32 %len, 0
  br i1 %cmp9, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
  %0 = load i16, i16* %arrayidx
  ; Widen to i32, add 2, narrow back to i16.
  %conv8 = zext i16 %0 to i32
  %add = add nuw nsw i32 %conv8, 2
  %conv1 = trunc i32 %add to i16
  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
  store i16 %conv1, i16* %arrayidx3
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; add_c: mixed-width variant - loads i8, stores i16.
; for i in [0, %len): q[i] = trunc_i16(zext_i32(p[i]) + 2).
; The patterns expect an <8 x i8> load, with the add and the store both done
; in <8 x i16> (the width of the stored value).
; CHECK-LABEL: @add_c(
; CHECK: load <8 x i8>, <8 x i8>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  ; Widen i8 to i32, add 2, then narrow to the i16 store type.
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i16
  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
  store i16 %conv1, i16* %arrayidx3
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; add_d: loads i16, sign-extends to i32, adds 2 and stores the full i32.
; There is no trunc before the store, so the patterns expect the add to stay
; 32 bits wide (<8 x i32>) and to keep its nsw flag.
; CHECK-LABEL: @add_d(
; CHECK: load <8 x i16>
; CHECK: add nsw <8 x i32>
; CHECK: store <8 x i32>
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp7 = icmp sgt i32 %len, 0
  br i1 %cmp7, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
  %0 = load i16, i16* %arrayidx
  ; Sign-extend and add in i32; the i32 result is stored as-is.
  %conv = sext i16 %0 to i32
  %add = add nsw i32 %conv, 2
  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; add_e: a longer chain of i32 operations (shl, add, or, mul, and, xor, mul)
; fed by a zero-extended i8 load and two zero-extended i8 arguments, with the
; final result truncated to i8 before the store. The patterns expect every
; listed operation to be emitted on <16 x i8>.
; CHECK-LABEL: @add_e(
; CHECK: load <16 x i8>
; CHECK: shl <16 x i8>
; CHECK: add <16 x i8>
; CHECK: or <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: and <16 x i8>
; CHECK: xor <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: store <16 x i8>
define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp.32 = icmp sgt i32 %len, 0
  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  ; Loop-invariant zero-extensions of the two i8 arguments, computed once
  ; before entering the loop.
  %conv11 = zext i8 %arg2 to i32
  %conv13 = zext i8 %arg1 to i32
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.lr.ph
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  %conv = zext i8 %0 to i32
  ; First operand: ((p[i] << 4) + 32) & arg1.
  %add = shl i32 %conv, 4
  %conv2 = add nuw nsw i32 %add, 32
  ; Second operand: (((p[i] | 51) * 60) & 252) ^ arg2.
  %or = or i32 %conv, 51
  %mul = mul nuw nsw i32 %or, 60
  %and = and i32 %conv2, %conv13
  %mul.masked = and i32 %mul, 252
  %conv17 = xor i32 %mul.masked, %conv11
  ; Product of the two operands; only the low 8 bits are stored.
  %mul18 = mul nuw nsw i32 %conv17, %and
  %conv19 = trunc i32 %mul18 to i8
  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv19, i8* %arrayidx21
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; add_f: like add_e, but the input is loaded as i16 and sign-extended to i32.
; The stored result is still an i8 truncation, so the patterns expect the
; <8 x i16> load to be truncated and the listed operations to run on <8 x i8>.
; CHECK-LABEL: @add_f(
; CHECK: load <8 x i16>
; CHECK: trunc <8 x i16>
; CHECK: shl <8 x i8>
; CHECK: add <8 x i8>
; CHECK: or <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: and <8 x i8>
; CHECK: xor <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: store <8 x i8>
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp.32 = icmp sgt i32 %len, 0
  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  ; Loop-invariant zero-extensions of the two i8 arguments, computed once
  ; before entering the loop.
  %conv11 = zext i8 %arg2 to i32
  %conv13 = zext i8 %arg1 to i32
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.lr.ph
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
  %0 = load i16, i16* %arrayidx
  %conv = sext i16 %0 to i32
  ; First operand: ((p[i] << 4) + 32) & arg1.
  %add = shl i32 %conv, 4
  %conv2 = add nsw i32 %add, 32
  ; Second operand: ((((p[i] & 204) | 51) * 60) & 252) ^ arg2.
  %or = and i32 %conv, 204
  %conv8 = or i32 %or, 51
  %mul = mul nuw nsw i32 %conv8, 60
  %and = and i32 %conv2, %conv13
  %mul.masked = and i32 %mul, 252
  %conv17 = xor i32 %mul.masked, %conv11
  ; Product of the two operands; only the low 8 bits are stored.
  %mul18 = mul nuw nsw i32 %conv17, %and
  %conv19 = trunc i32 %mul18 to i8
  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv19, i8* %arrayidx21
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; add_phifail: same computation as add_a, plus an i32 phi (%a_phi) that
; carries the previous iteration's zero-extended load and has no users.
; With that phi present, the patterns expect the add to stay at
; <16 x i32> with its nuw/nsw flags, while the load and store remain
; <16 x i8>.
; CHECK-LABEL: @add_phifail(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i32>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
  ; Skip the loop entirely when %len <= 0.
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; i32 recurrence on %conv: 0 on entry, previous iteration's %conv afterwards.
  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  ; Widen to i32, add 2, narrow back to i8.
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv1, i8* %arrayidx3
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; add_phifail2: same loop as add_phifail, but the i32 phi (%a_phi) is live
; out of the loop and its truncation to i8 is returned.
; When we vectorize this loop, we generate correct code even when VF evenly
; divides %len (i.e. %len is an exact multiple of VF), since we extract from
; the second-to-last vector lane and pass this to the for.cond.cleanup block.
; The vectorized loop returns the correct value a_phi = p[len - 2].
; Function Attrs: nounwind
define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
; CHECK-LABEL: @add_phifail2(
; CHECK: vector.body:
; CHECK:   %wide.load = load <16 x i8>, <16 x i8>*
; CHECK:   %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
; CHECK:   add nuw nsw <16 x i32>
; CHECK:   store <16 x i8>
; CHECK:   add nuw i64 %index, 16
; CHECK:   icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK:   %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
; CHECK:   %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
; CHECK: for.cond.cleanup:
; CHECK:   %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
; CHECK:   %ret = trunc i32 %a_phi.lcssa to i8
; CHECK:   ret i8 %ret
entry:
  ; No %len > 0 guard here: the loop body always runs at least once.
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ; %a_phi holds the zero-extended element loaded one iteration before the
  ; last one (or 0 if the loop ran only once).
  %ret = trunc i32 %a_phi to i8
  ret i8 %ret

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; i32 recurrence on %conv: 0 on entry, previous iteration's %conv afterwards.
  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  ; Widen to i32, add 2, narrow back to i8.
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv1, i8* %arrayidx3
  ; Advance the i64 induction variable; exit once its low 32 bits equal %len.
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Attribute group referenced as #0 by every function in this file.
attributes #0 = { nounwind }
