; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

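; Simple sum reduction of a single array.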
define i32 @reduction_sum_single(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sum_single(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !2
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l7 = add i32 %sum.02, %l3
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

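; Sum reduction that also accumulates the truncated induction variable alongside loads from two arrays.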
define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_sum(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !5
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = trunc i64 %indvars.iv to i32
  %l7 = add i32 %sum.02, %l6
  %l8 = add i32 %l7, %l3
  %l9 = add i32 %l8, %l5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

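; Sum reduction with a loop-invariant constant (3) added on every iteration.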
define i32 @reduction_sum_const(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sum_const(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[TMP2]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l7 = add i32 %sum.02, %l3
  %l9 = add i32 %l7, 3
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

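; Product reduction over the truncated induction variable and two loads, starting at 1.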
define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_prod(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP6]] = mul <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !9
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[PROD_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %prod.02 = phi i32 [ %l9, %.lr.ph ], [ 1, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = trunc i64 %indvars.iv to i32
  %l7 = mul i32 %prod.02, %l6
  %l8 = mul i32 %l7, %l3
  %l9 = mul i32 %l8, %l5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %prod.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %prod.0.lcssa
}

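; Sum reduction mixing in the product of the two loads and the truncated induction variable.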
define i32 @reduction_mix(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_mix(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !11
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = mul nsw i32 %l5, %l3
  %l7 = trunc i64 %indvars.iv to i32
  %l8 = add i32 %sum.02, %l7
  %l9 = add i32 %l8, %l6
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

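; Product reduction with the non-unit start value 19, which is placed in lane 0 of the initial reduction vector.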
define i32 @reduction_mul(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_mul(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 19, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !13
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 19, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = mul i32 %sum.02, %l3
  %l7 = mul i32 %l6, %l5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

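; Multiply-accumulate reduction that starts at 120 instead of the neutral value 0.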
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out) {
; CHECK-LABEL: @start_at_non_zero(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 120, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[COEFF:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %l1, %l0
  %add = add nsw i32 %mul, %sum.09
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %sum.0.lcssa = phi i32 [ %add, %for.body ]
  ret i32 %sum.0.lcssa
}

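; Bitwise-and reduction; the recurrence starts at -1, the neutral value for 'and'.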
define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_and(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = and <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = and i32 %result.08, %l0
  %and = and i32 %add, %l1
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %and, %for.body ]
  ret i32 %result.0.lcssa
}

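; Bitwise-or reduction of the sum of the two loads.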
define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_or(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = or <4 x i32> [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !19
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %l1, %l0
  %or = or i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %or, %for.body ]
  ret i32 %result.0.lcssa
}

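; Xor reduction of the sum of the two loads.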
define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_xor(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = xor <4 x i32> [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !21
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %l1, %l0
  %xor = xor i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}

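; Fast-math floating-point add reduction.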
define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = fadd fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !23
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %add = fadd fast float %result.08, %l0
  %fadd = fadd fast float %add, %l1
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi float [ %fadd, %for.body ]
  ret float %result.0.lcssa
}

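; Fast-math floating-point multiply reduction.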
define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fmul(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = fmul fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !25
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %add = fmul fast float %result.08, %l0
  %fmul = fmul fast float %add, %l1
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi float [ %fmul, %for.body ]
  ret float %result.0.lcssa
}

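; Signed-minimum reduction, expressed as icmp slt plus select.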
define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_min(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !26
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !27
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp slt i32 %result.08, %l0
  %v0 = select i1 %c0, i32 %result.08, i32 %l0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}

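; Unsigned-maximum reduction, expressed as icmp ugt plus select.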
define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_max(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !28
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !29
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp ugt i32 %result.08, %l0
  %v0 = select i1 %c0, i32 %result.08, i32 %l0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}

; With sub we can create a reduction, but not an in-loop one.
define i32 @reduction_sub_lhs(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sub_lhs(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !31
; CHECK:       for.end:
; CHECK-NEXT:    [[X_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %sub = sub nsw i32 %x.05, %l0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %x.0.lcssa = phi i32 [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}

; Conditional reductions with multi-input phis.
define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
; CHECK-LABEL: @reduction_conditional(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[S:%.*]], i32 0
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP13]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !32
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; CHECK:       if.then:
; CHECK-NEXT:    br i1 undef, label [[IF_THEN8:%.*]], label [[IF_ELSE:%.*]]
; CHECK:       if.then8:
; CHECK-NEXT:    br label [[FOR_INC]]
; CHECK:       if.else:
; CHECK-NEXT:    br i1 undef, label [[IF_THEN16:%.*]], label [[FOR_INC]]
; CHECK:       if.then16:
; CHECK-NEXT:    br label [[FOR_INC]]
; CHECK:       for.inc:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !33
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi float [ undef, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[SUM_1_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %l0, %l1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %l1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  %add = fadd fast float %sum.033, %l0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %l0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  %add19 = fadd fast float %sum.033, %l1
  br label %for.inc

for.inc:
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}

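; The partial sum %sum.02 has a second use inside the loop (in %l10), so this is not recognized as a reduction and the loop stays scalar.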
define i32 @reduction_sum_multiuse(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_sum_multiuse(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], [[DOTLR_PH]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    [[L2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[L3:%.*]] = load i32, i32* [[L2]], align 4
; CHECK-NEXT:    [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[L7:%.*]] = add i32 [[SUM_02]], [[L6]]
; CHECK-NEXT:    [[L8:%.*]] = add i32 [[L7]], [[L3]]
; CHECK-NEXT:    [[L10]] = add i32 [[L8]], [[SUM_02]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[DOTLR_PH]]
; CHECK:       end:
; CHECK-NEXT:    ret i32 [[L10]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l10, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = trunc i64 %indvars.iv to i32
  %l7 = add i32 %sum.02, %l6
  %l8 = add i32 %l7, %l3
  %l9 = add i32 %l8, %l5
  %l10 = add i32 %l8, %sum.02
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %end, label %.lr.ph

end:
  %f1 = phi i32 [ %l10, %.lr.ph ]
  ret i32 %f1
}

; Predicated loop, cannot (yet) use in-loop reductions.
define i32 @reduction_predicated(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_predicated(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !34
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !35
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = trunc i64 %indvars.iv to i32
  %l7 = add i32 %sum.02, %l6
  %l8 = add i32 %l7, %l3
  %l9 = add i32 %l8, %l5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !6

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

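; Add reduction of i8 values zero-extended to i32 in the loop; the vectorizer narrows the recurrence to <4 x i8>.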
define i8 @reduction_add_trunc(i8* noalias nocapture %A) {
; CHECK-LABEL: @reduction_add_trunc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i8> [ <i8 -1, i8 0, i8 0, i8 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP3]] = add <4 x i8> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !36
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !37
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i8 [ undef, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i8 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02p = phi i32 [ %l9, %.lr.ph ], [ 255, %entry ]
  %sum.02 = and i32 %sum.02p, 255
  %l2 = getelementptr inbounds i8, i8* %A, i32 %indvars.iv
  %l3 = load i8, i8* %l2, align 4
  %l3e = zext i8 %l3 to i32
  %l9 = add i32 %sum.02, %l3e
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  %ret = trunc i32 %sum.0.lcssa to i8
  ret i8 %ret
}


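; And reduction of i8 values zero-extended to i32; here the recurrence stays <4 x i32> and the result is truncated to i8 after the loop.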
define i8 @reduction_and_trunc(i8* noalias nocapture %A) {
; CHECK-LABEL: @reduction_and_trunc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 255, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT:    [[TMP4]] = and <4 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !38
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !39
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[SUM_0_LCSSA]] to i8
; CHECK-NEXT:    ret i8 [[RET]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02p = phi i32 [ %l9, %.lr.ph ], [ 255, %entry ]
  %sum.02 = and i32 %sum.02p, 255
  %l2 = getelementptr inbounds i8, i8* %A, i32 %indvars.iv
  %l3 = load i8, i8* %l2, align 4
  %l3e = zext i8 %l3 to i32
  %l9 = and i32 %sum.02, %l3e
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  %ret = trunc i32 %sum.0.lcssa to i8
  ret i8 %ret
}

!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!8 = !{!"llvm.loop.vectorize.enable", i1 true}