1; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
2
3target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
4
5;CHECK-LABEL: @reduction_sum(
6;CHECK: phi <4 x i32>
7;CHECK: load <4 x i32>
8;CHECK: add <4 x i32>
9;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
10;CHECK: add <4 x i32>
11;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
12;CHECK: add <4 x i32>
13;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
14;CHECK: ret i32
; Integer add reduction. For i in [0, %n) the loop computes
;   sum = sum + trunc(i) + A[i] + B[i]
; with sum starting at 0. When %n <= 0 the loop is skipped and 0 is returned.
15define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
16  %1 = icmp sgt i32 %n, 0
17  br i1 %1, label %.lr.ph, label %._crit_edge
18
19.lr.ph:                                           ; preds = %0, %.lr.ph
20  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Loop-carried accumulator, seeded with 0.
21  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
22  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
23  %3 = load i32, i32* %2, align 4
24  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
25  %5 = load i32, i32* %4, align 4
  ; The truncated induction variable is itself one of the summed terms.
26  %6 = trunc i64 %indvars.iv to i32
27  %7 = add i32 %sum.02, %6
28  %8 = add i32 %7, %3
29  %9 = add i32 %8, %5
30  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
31  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
32  %exitcond = icmp eq i32 %lftr.wideiv, %n
33  br i1 %exitcond, label %._crit_edge, label %.lr.ph
34
35._crit_edge:                                      ; preds = %.lr.ph, %0
36  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
37  ret i32 %sum.0.lcssa
38}
39
40;CHECK-LABEL: @reduction_prod(
41;CHECK: phi <4 x i32>
42;CHECK: load <4 x i32>
43;CHECK: mul <4 x i32>
44;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
45;CHECK: mul <4 x i32>
46;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
47;CHECK: mul <4 x i32>
48;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
49;CHECK: ret i32
; Integer multiply reduction. For i in [0, %n) the loop computes
;   prod = prod * trunc(i) * A[i] * B[i]
; with prod starting at 1. When %n <= 0 the loop is skipped and 1 is returned.
50define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
51  %1 = icmp sgt i32 %n, 0
52  br i1 %1, label %.lr.ph, label %._crit_edge
53
54.lr.ph:                                           ; preds = %0, %.lr.ph
55  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Loop-carried accumulator, seeded with 1.
56  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
57  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
58  %3 = load i32, i32* %2, align 4
59  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
60  %5 = load i32, i32* %4, align 4
  ; The truncated induction variable is itself one of the factors.
61  %6 = trunc i64 %indvars.iv to i32
62  %7 = mul i32 %prod.02, %6
63  %8 = mul i32 %7, %3
64  %9 = mul i32 %8, %5
65  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
66  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
67  %exitcond = icmp eq i32 %lftr.wideiv, %n
68  br i1 %exitcond, label %._crit_edge, label %.lr.ph
69
70._crit_edge:                                      ; preds = %.lr.ph, %0
71  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
72  ret i32 %prod.0.lcssa
73}
74
75;CHECK-LABEL: @reduction_mix(
76;CHECK: phi <4 x i32>
77;CHECK: load <4 x i32>
78;CHECK: mul nsw <4 x i32>
79;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
80;CHECK: add <4 x i32>
81;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
82;CHECK: add <4 x i32>
83;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
84;CHECK: ret i32
; Add reduction whose per-iteration term includes a product. For i in [0, %n):
;   sum = sum + trunc(i) + (B[i] * A[i])
; with sum starting at 0. The multiply is not part of the accumulator chain;
; only the two adds involve %sum.02. Returns 0 when %n <= 0.
85define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
86  %1 = icmp sgt i32 %n, 0
87  br i1 %1, label %.lr.ph, label %._crit_edge
88
89.lr.ph:                                           ; preds = %0, %.lr.ph
90  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Loop-carried accumulator, seeded with 0.
91  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
92  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
93  %3 = load i32, i32* %2, align 4
94  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
95  %5 = load i32, i32* %4, align 4
  ; Element-wise product of the two loaded values (independent of the accumulator).
96  %6 = mul nsw i32 %5, %3
97  %7 = trunc i64 %indvars.iv to i32
98  %8 = add i32 %sum.02, %7
99  %9 = add i32 %8, %6
100  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
101  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
102  %exitcond = icmp eq i32 %lftr.wideiv, %n
103  br i1 %exitcond, label %._crit_edge, label %.lr.ph
104
105._crit_edge:                                      ; preds = %.lr.ph, %0
106  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
107  ret i32 %sum.0.lcssa
108}
109
110;CHECK-LABEL: @reduction_mul(
111;CHECK: mul <4 x i32>
112;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
113;CHECK: mul <4 x i32>
114;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
115;CHECK: mul <4 x i32>
116;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
117;CHECK: ret i32
; Multiply reduction with a non-identity start value. For i in [0, %n):
;   acc = (A[i] + trunc(i) + B[i]) * acc
; with acc starting at 19. Note that the exit phi yields 0 (not 19) when the
; loop is skipped because %n <= 0.
118define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
119  %1 = icmp sgt i32 %n, 0
120  br i1 %1, label %.lr.ph, label %._crit_edge
121
122.lr.ph:                                           ; preds = %0, %.lr.ph
123  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Loop-carried accumulator, seeded with 19.
124  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
125  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
126  %3 = load i32, i32* %2, align 4
127  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
128  %5 = load i32, i32* %4, align 4
  ; The per-iteration factor is built with adds; only the final mul touches the accumulator.
129  %6 = trunc i64 %indvars.iv to i32
130  %7 = add i32 %3, %6
131  %8 = add i32 %7, %5
132  %9 = mul i32 %8, %sum.02
133  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
134  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
135  %exitcond = icmp eq i32 %lftr.wideiv, %n
136  br i1 %exitcond, label %._crit_edge, label %.lr.ph
137
138._crit_edge:                                      ; preds = %.lr.ph, %0
139  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
140  ret i32 %sum.0.lcssa
141}
142
143;CHECK-LABEL: @start_at_non_zero(
144;CHECK: phi <4 x i32>
145;CHECK: <i32 120, i32 0, i32 0, i32 0>
146;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
147;CHECK: add <4 x i32>
148;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
149;CHECK: add <4 x i32>
150;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
151;CHECK: ret i32
; Add reduction with a non-zero start value. For i in [0, %n):
;   sum = (coeff[i] * in[i]) + sum
; with sum starting at 120. Returns 120 when %n <= 0. The %out argument is
; not referenced by the body.
152define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
153entry:
154  %cmp7 = icmp sgt i32 %n, 0
155  br i1 %cmp7, label %for.body, label %for.end
156
157for.body:                                         ; preds = %entry, %for.body
158  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Loop-carried accumulator, seeded with 120.
159  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
160  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
161  %0 = load i32, i32* %arrayidx, align 4
162  %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
163  %1 = load i32, i32* %arrayidx2, align 4
164  %mul = mul nsw i32 %1, %0
165  %add = add nsw i32 %mul, %sum.09
166  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
167  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
168  %exitcond = icmp eq i32 %lftr.wideiv, %n
169  br i1 %exitcond, label %for.end, label %for.body
170
171for.end:                                          ; preds = %for.body, %entry
172  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
173  ret i32 %sum.0.lcssa
174}
175
176;CHECK-LABEL: @reduction_and(
177;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
178;CHECK: and <4 x i32>
179;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
180;CHECK: and <4 x i32>
181;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
182;CHECK: and <4 x i32>
183;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
184;CHECK: ret i32
; Bitwise AND reduction. For i in [0, %n):
;   result = (B[i] + A[i]) & result
; with result starting at -1 (all bits set). Returns -1 when %n <= 0.
185define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
186entry:
187  %cmp7 = icmp sgt i32 %n, 0
188  br i1 %cmp7, label %for.body, label %for.end
189
190for.body:                                         ; preds = %entry, %for.body
191  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Loop-carried accumulator, seeded with -1.
192  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
193  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
194  %0 = load i32, i32* %arrayidx, align 4
195  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
196  %1 = load i32, i32* %arrayidx2, align 4
197  %add = add nsw i32 %1, %0
198  %and = and i32 %add, %result.08
199  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
200  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
201  %exitcond = icmp eq i32 %lftr.wideiv, %n
202  br i1 %exitcond, label %for.end, label %for.body
203
204for.end:                                          ; preds = %for.body, %entry
205  %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
206  ret i32 %result.0.lcssa
207}
208
209;CHECK-LABEL: @reduction_or(
210;CHECK: or <4 x i32>
211;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
212;CHECK: or <4 x i32>
213;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
214;CHECK: or <4 x i32>
215;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
216;CHECK: ret i32
; Bitwise OR reduction. For i in [0, %n):
;   result = (B[i] + A[i]) | result
; with result starting at 0. Returns 0 when %n <= 0.
217define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
218entry:
219  %cmp7 = icmp sgt i32 %n, 0
220  br i1 %cmp7, label %for.body, label %for.end
221
222for.body:                                         ; preds = %entry, %for.body
223  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Loop-carried accumulator, seeded with 0.
224  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
225  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
226  %0 = load i32, i32* %arrayidx, align 4
227  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
228  %1 = load i32, i32* %arrayidx2, align 4
229  %add = add nsw i32 %1, %0
230  %or = or i32 %add, %result.08
231  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
232  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
233  %exitcond = icmp eq i32 %lftr.wideiv, %n
234  br i1 %exitcond, label %for.end, label %for.body
235
236for.end:                                          ; preds = %for.body, %entry
237  %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
238  ret i32 %result.0.lcssa
239}
240
241;CHECK-LABEL: @reduction_xor(
242;CHECK: xor <4 x i32>
243;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
244;CHECK: xor <4 x i32>
245;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
246;CHECK: xor <4 x i32>
247;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
248;CHECK: ret i32
; Bitwise XOR reduction. For i in [0, %n):
;   result = (B[i] + A[i]) ^ result
; with result starting at 0. Returns 0 when %n <= 0.
249define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
250entry:
251  %cmp7 = icmp sgt i32 %n, 0
252  br i1 %cmp7, label %for.body, label %for.end
253
254for.body:                                         ; preds = %entry, %for.body
255  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  ; Loop-carried accumulator, seeded with 0.
256  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
257  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
258  %0 = load i32, i32* %arrayidx, align 4
259  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
260  %1 = load i32, i32* %arrayidx2, align 4
261  %add = add nsw i32 %1, %0
262  %xor = xor i32 %add, %result.08
263  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
264  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
265  %exitcond = icmp eq i32 %lftr.wideiv, %n
266  br i1 %exitcond, label %for.end, label %for.body
267
268for.end:                                          ; preds = %for.body, %entry
269  %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
270  ret i32 %result.0.lcssa
271}
272
273; In this test the loop-carried value is the RHS (subtrahend) of the subtraction, so this is not a reduction we can vectorize.
274;CHECK-LABEL: @reduction_sub_rhs(
275;CHECK-NOT: phi <4 x i32>
276;CHECK-NOT: sub nsw <4 x i32>
277;CHECK: ret i32
; Subtraction with the loop-carried value as the right-hand operand.
; For i in [0, %n):  x = A[i] - x,  with x starting at 0.
; Returns 0 when %n <= 0.
278define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
279entry:
280  %cmp4 = icmp sgt i32 %n, 0
281  br i1 %cmp4, label %for.body, label %for.end
282
283for.body:                                         ; preds = %entry, %for.body
284  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
285  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
286  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
287  %0 = load i32, i32* %arrayidx, align 4
  ; The loop-carried value %x.05 is the subtrahend here.
288  %sub = sub nsw i32 %0, %x.05
289  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
290  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
291  %exitcond = icmp eq i32 %lftr.wideiv, %n
292  br i1 %exitcond, label %for.end, label %for.body
293
294for.end:                                          ; preds = %for.body, %entry
295  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
296  ret i32 %x.0.lcssa
297}
298
299
300; In this test the reduction variable is on the LHS and we can vectorize it.
301;CHECK-LABEL: @reduction_sub_lhs(
302;CHECK: phi <4 x i32>
303;CHECK: sub nsw <4 x i32>
304;CHECK: ret i32
; Subtraction with the loop-carried value as the left-hand operand.
; For i in [0, %n):  x = x - A[i],  with x starting at 0.
; Returns 0 when %n <= 0.
305define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
306entry:
307  %cmp4 = icmp sgt i32 %n, 0
308  br i1 %cmp4, label %for.body, label %for.end
309
310for.body:                                         ; preds = %entry, %for.body
311  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
312  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
313  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
314  %0 = load i32, i32* %arrayidx, align 4
  ; The loop-carried value %x.05 is the minuend here.
315  %sub = sub nsw i32 %x.05, %0
316  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Exit once the truncated next index equals %n.
317  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
318  %exitcond = icmp eq i32 %lftr.wideiv, %n
319  br i1 %exitcond, label %for.end, label %for.body
320
321for.end:                                          ; preds = %for.body, %entry
322  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
323  ret i32 %x.0.lcssa
324}
325
326; We can vectorize conditional reductions with multi-input phis.
327; CHECK-LABEL: @reduction_conditional(
328; CHECK: fadd <4 x float>
329
; Conditional floating-point add reduction over 128 iterations, starting
; from %S. Per iteration, with a = A[i] and b = B[i]:
;   - if a > b and b > 1.0:                     sum = sum + a
;   - else if a > b and a > 2.0:                sum = sum + b
;   - otherwise:                                sum is unchanged
; Every incoming value of the %sum.1 phi is either the accumulator itself or
; a 'fadd fast' on the accumulator. The %C argument is not referenced.
330define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
331entry:
332  br label %for.body
333
334for.body:
335  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  ; Loop-carried accumulator, seeded with %S.
336  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
337  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
338  %0 = load float, float* %arrayidx, align 4
339  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
340  %1 = load float, float* %arrayidx2, align 4
341  %cmp3 = fcmp ogt float %0, %1
342  br i1 %cmp3, label %if.then, label %for.inc
343
344if.then:
345  %cmp6 = fcmp ogt float %1, 1.000000e+00
346  br i1 %cmp6, label %if.then8, label %if.else
347
348if.then8:
349  %add = fadd fast float %sum.033, %0
350  br label %for.inc
351
352if.else:
353  %cmp14 = fcmp ogt float %0, 2.000000e+00
354  br i1 %cmp14, label %if.then16, label %for.inc
355
356if.then16:
357  %add19 = fadd fast float %sum.033, %1
358  br label %for.inc
359
360for.inc:
  ; Merge of the four paths; the two non-adding paths pass the accumulator through.
361  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
362  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Keep looping while the truncated next index differs from 128.
363  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
364  %exitcond = icmp ne i32 %lftr.wideiv, 128
365  br i1 %exitcond, label %for.body, label %for.end
366
367for.end:
368  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
369  ret float %sum.1.lcssa
370}
371
372; We can't vectorize reductions with phi inputs from outside the reduction.
373; CHECK-LABEL: @noreduction_phi(
374; CHECK-NOT: fadd <4 x float>
; Same control flow as a conditional fadd reduction over 128 iterations
; starting from %S, except that the %if.else incoming value of the %sum.1
; phi is the constant 0.0 rather than the accumulator. That path (taken when
; %cmp3 is true and both %cmp6 and %cmp14 are false) overwrites the running
; sum with a value from outside the reduction chain. %C is not referenced.
375define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) {
376entry:
377  br label %for.body
378
379for.body:
380  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  ; Loop-carried accumulator, seeded with %S.
381  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
382  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
383  %0 = load float, float* %arrayidx, align 4
384  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
385  %1 = load float, float* %arrayidx2, align 4
386  %cmp3 = fcmp ogt float %0, %1
387  br i1 %cmp3, label %if.then, label %for.inc
388
389if.then:
390  %cmp6 = fcmp ogt float %1, 1.000000e+00
391  br i1 %cmp6, label %if.then8, label %if.else
392
393if.then8:
394  %add = fadd fast float %sum.033, %0
395  br label %for.inc
396
397if.else:
398  %cmp14 = fcmp ogt float %0, 2.000000e+00
399  br i1 %cmp14, label %if.then16, label %for.inc
400
401if.then16:
402  %add19 = fadd fast float %sum.033, %1
403  br label %for.inc
404
405for.inc:
  ; The 0.0 incoming from %if.else is the value that comes from outside the reduction.
406  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ]
407  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Keep looping while the truncated next index differs from 128.
408  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
409  %exitcond = icmp ne i32 %lftr.wideiv, 128
410  br i1 %exitcond, label %for.body, label %for.end
411
412for.end:
413  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
414  ret float %sum.1.lcssa
415}
416
417; We can't vectorize reductions that feed another header PHI.
418; CHECK-LABEL: @noredux_header_phi(
419; CHECK-NOT: fadd <4 x float>
420
; Two coupled accumulators over 128 iterations:
;   add  = sum.08  + B[i]   (sum.08 starts at %S)
;   add1 = sum2.09 + add    (sum2.09 starts at 0.0)
; The first accumulator's updated value %add is an operand of the second
; accumulator's update, so it feeds the other header phi. After the loop the
; function returns add + add1. %A and %C are not referenced.
421define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S)  {
422entry:
423  br label %for.body
424
425for.body:
426  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
427  %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ]
428  %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ]
429  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
430  %0 = load float, float* %arrayidx, align 4
431  %add = fadd fast float %sum.08, %0
  ; %add is used here in addition to being carried back into %sum.08.
432  %add1 = fadd fast float %sum2.09, %add
433  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Keep looping while the truncated next index differs from 128.
434  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
435  %exitcond = icmp ne i32 %lftr.wideiv, 128
436  br i1 %exitcond, label %for.body, label %for.end
437
438for.end:
439  %add1.lcssa = phi float [ %add1, %for.body ]
440  %add.lcssa = phi float [ %add, %for.body ]
441  %add2 = fadd fast float %add.lcssa, %add1.lcssa
442  ret float %add2
443}
444
445
446; When vectorizing a reduction whose loop header phi value is used outside the
447; loop special care must be taken. Otherwise, the reduced value feeding into the
448; outside user misses a few iterations (VF-1) of the loop.
449; PR16522
450
451; CHECK-LABEL: @phivalueredux(
452; CHECK-NOT: x i32>
453
; Runs 16 iterations, each replacing the loop-carried value with its bitwise
; complement (xor with -1), starting from %p. The function returns the header
; phi %p.addr.02 itself, i.e. the value as it was at the start of the final
; iteration, not the final %xor result.
454define i32 @phivalueredux(i32 %p) {
455entry:
456  br label %for.body
457
458for.body:
  ; Iteration counter, 0-based.
459  %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
460  %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ]
461  %xor = xor i32 %p.addr.02, -1
462  %inc = add nsw i32 %t.03, 1
463  %exitcond = icmp eq i32 %inc, 16
464  br i1 %exitcond, label %for.end, label %for.body
465
466for.end:
  ; Outside use of the header phi value rather than of %xor.
467  ret i32 %p.addr.02
468}
469
470; Don't vectorize a reduction value that is not the last in a reduction cycle. We
471; would lose iterations (VF-1) on the operations after that use.
472; PR17498
473
474; CHECK-LABEL: not_last_operation
475; CHECK-NOT: x i32>
; Runs 22 iterations of a two-step add chain on a value seeded with %val:
;   t      = acc + (zext(%p == 0) xor 1)
;   acc    = t + 1
; The exit block uses the intermediate value t (%inc511.1.inc4.1), not the
; final value of the chain (%inc5.1), and returns t + 2.
476define i32 @not_last_operation(i32 %p, i32 %val) {
477entry:
478  %tobool = icmp eq i32 %p, 0
479  br label %for.body
480
481for.body:
  ; Iteration counter, 0-based.
482  %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ]
  ; Loop-carried accumulator, seeded with %val.
483  %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ]
484  %0 = zext i1 %tobool to i32
485  %inc4.1 = xor i32 %0, 1
486  %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1
487  %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1
488  %inc6.1 = add nsw i32 %inc613.1, 1
489  %exitcond.1 = icmp eq i32 %inc6.1, 22
490  br i1 %exitcond.1, label %exit, label %for.body
491
492exit:
  ; Outside use of the first add in the chain, not the last one.
493  %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
494  ret i32 %inc.2
495}
496