1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
3
4target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
5
; Integer sum reduction: on each iteration sum += i + A[i] + B[i], starting
; at 0. Expected to vectorize at VF=4 with a <4 x i32> accumulator phi that
; is finalized by @llvm.experimental.vector.reduce.add in the middle block.
define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
; CHECK-LABEL: @reduction_sum(
; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
; CHECK:       .lr.ph.preheader:
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[TMP10]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[SUM_02]], [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP15]]
; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[TMP17]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !2
; CHECK:       ._crit_edge.loopexit:
; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

; Scalar reduction loop: %9 = %sum.02 + trunc(i) + A[i] + B[i].
.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = add i32 %sum.02, %6
  %8 = add i32 %7, %3
  %9 = add i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
89
; Integer product reduction: prod *= i * A[i] * B[i], starting at the mul
; identity 1. Expected to vectorize at VF=4 with a <4 x i32> accumulator
; seeded with <1, 1, 1, 1>, finalized by
; @llvm.experimental.vector.reduce.mul in the middle block.
define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
; CHECK-LABEL: @reduction_prod(
; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
; CHECK:       .lr.ph.preheader:
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[TMP9]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP11]] = mul <4 x i32> [[TMP10]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP11]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[PROD_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[PROD_02]], [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[TMP15]]
; CHECK-NEXT:    [[TMP21]] = mul i32 [[TMP20]], [[TMP17]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !5
; CHECK:       ._crit_edge.loopexit:
; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[PROD_0_LCSSA:%.*]] = phi i32 [ 1, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[PROD_0_LCSSA]]
;
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

; Scalar reduction loop: %9 = %prod.02 * trunc(i) * A[i] * B[i].
.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = mul i32 %prod.02, %6
  %8 = mul i32 %7, %3
  %9 = mul i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
  ret i32 %prod.0.lcssa
}
173
; Mixed-operation sum reduction: sum += i + (A[i] * B[i]); only the final
; adds form the reduction chain (the mul is an ordinary widened operation).
; Expected to vectorize at VF=4 and finalize with
; @llvm.experimental.vector.reduce.add in the middle block.
define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
; CHECK-LABEL: @reduction_mix(
; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
; CHECK:       .lr.ph.preheader:
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[TMP10]], [[TMP9]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
; CHECK-NEXT:    [[TMP18:%.*]] = mul nsw i32 [[TMP17]], [[TMP15]]
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[SUM_02]], [[TMP19]]
; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[TMP18]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !7
; CHECK:       ._crit_edge.loopexit:
; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

; Scalar reduction loop: %9 = %sum.02 + trunc(i) + (B[i] * A[i]).
.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = mul nsw i32 %5, %3
  %7 = trunc i64 %indvars.iv to i32
  %8 = add i32 %sum.02, %7
  %9 = add i32 %8, %6
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
257
; Product reduction with a non-identity start value (19):
; sum *= A[i] * B[i]. The start value is placed in lane 0 of the vector
; accumulator phi (<19, 1, 1, 1>, other lanes hold the mul identity) and the
; result is finalized by @llvm.experimental.vector.reduce.mul.
define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
; CHECK-LABEL: @reduction_mul(
; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
; CHECK:       .lr.ph.preheader:
; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 19, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP10]] = mul <4 x i32> [[TMP9]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 19, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP18:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[SUM_02]], [[TMP14]]
; CHECK-NEXT:    [[TMP18]] = mul i32 [[TMP17]], [[TMP16]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !9
; CHECK:       ._crit_edge.loopexit:
; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

; Scalar reduction loop: %7 = %sum.02 * A[i] * B[i], with %sum.02 = 19 on entry.
.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %7, %.lr.ph ], [ 19, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = mul i32 %sum.02, %3
  %7 = mul i32 %6, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
334
; Dot-product style add reduction with a non-zero start value (120):
; sum += in[i] * coeff[i]. The start value is placed in lane 0 of the vector
; accumulator phi (<120, 0, 0, 0>) and the result is finalized by
; @llvm.experimental.vector.reduce.add. (%out is unused by the loop.)
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
; CHECK-LABEL: @start_at_non_zero(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 120, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[COEFF:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 120, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[SUM_09:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[COEFF]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[SUM_09]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !11
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 120, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

; Scalar reduction loop: %add = (coeff[i] * in[i]) + %sum.09, %sum.09 = 120 on entry.
for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %sum.09
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
  ret i32 %sum.0.lcssa
}
413
; Bitwise AND reduction: result &= A[i] & B[i], starting at the AND identity
; -1 (all ones). Expected to vectorize at VF=4 with a splat(-1) accumulator
; phi, finalized by @llvm.experimental.vector.reduce.and in the middle block.
define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_and(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP8]] = and <4 x i32> [[TMP7]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP8]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ -1, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ADD:%.*]] = and i32 [[RESULT_08]], [[TMP11]]
; CHECK-NEXT:    [[AND]] = and i32 [[ADD]], [[TMP12]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !13
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[AND_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

; Scalar reduction loop: %and = %result.08 & A[i] & B[i].
for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = and i32 %result.08, %0
  %and = and i32 %add, %1
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
  ret i32 %result.0.lcssa
}
492
; OR reduction: result |= (A[i] + B[i]).  The autogenerated checks verify the
; loop is vectorized with a <4 x i32> 'or' accumulator (identity 0) that is
; folded to a scalar via llvm.experimental.vector.reduce.or in middle.block.
define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_or(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP8]] = or <4 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP8]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[OR]] = or i32 [[ADD]], [[RESULT_08]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !15
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] ; OR accumulator, identity 0
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %or = or i32 %add, %result.08 ; the reduction operation being tested
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
  ret i32 %result.0.lcssa
}
571
; XOR reduction: result ^= (A[i] + B[i]).  The autogenerated checks verify the
; loop is vectorized with a <4 x i32> 'xor' accumulator (identity 0) reduced
; via llvm.experimental.vector.reduce.xor in middle.block.
define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_xor(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP8]] = xor <4 x i32> [[TMP7]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP8]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[XOR]] = xor i32 [[ADD]], [[RESULT_08]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !17
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[XOR_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] ; XOR accumulator, identity 0
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %xor = xor i32 %add, %result.08 ; the reduction operation being tested
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}
650
; Fast-math FP add reduction: result += A[i]; result += B[i].  The checks
; verify vectorization with a <4 x float> accumulator and a final
; llvm.experimental.vector.reduce.v2.fadd with a 0.0 start value; 'fast' flags
; on the scalar fadds are what permit reassociation into the vector form.
define float @reduction_fadd(i32 %n, float* nocapture %A, float* nocapture %B) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_fadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP8]] = fadd fast <4 x float> [[TMP7]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP8]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[TMP11]]
; CHECK-NEXT:    [[FADD]] = fadd fast float [[ADD]], [[TMP12]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !19
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[FADD_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ] ; FP accumulator, starts at 0.0
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4
  %add = fadd fast float %result.08, %0
  %fadd = fadd fast float %add, %1 ; two chained fast fadds form the reduction
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi float [ 0.0, %entry ], [ %fadd, %for.body ]
  ret float %result.0.lcssa
}
729
; Fast-math FP multiply reduction: result *= A[i]; result *= B[i].  The checks
; verify a <4 x float> accumulator whose start vector is <0.0, 1.0, 1.0, 1.0>
; (the scalar init 0.0 seeded into lane 0, identity 1.0 elsewhere) and a final
; llvm.experimental.vector.reduce.v2.fmul with a 1.0 start value.
define float @reduction_fmul(i32 %n, float* nocapture %A, float* nocapture %B) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_fmul(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP8]] = fmul fast <4 x float> [[TMP7]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP8]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[TMP11]]
; CHECK-NEXT:    [[FMUL]] = fmul fast float [[ADD]], [[TMP12]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !21
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[FMUL_LCSSA:%.*]] = phi float [ [[FMUL]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[FMUL_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] ; product accumulator, starts at 0.0
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4
  %add = fmul fast float %result.08, %0
  %fmul = fmul fast float %add, %1 ; two chained fast fmuls form the reduction
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi float [ 0.0, %entry ], [ %fmul, %for.body ]
  ret float %result.0.lcssa
}
808
; Signed-min reduction expressed as icmp slt + select, starting from 1000.
; The checks verify the pattern is recognized and reduced via
; llvm.experimental.vector.reduce.smin in middle.block.  %B is unused; it only
; keeps the signature parallel with the other tests in this file.
define i32 @reduction_min(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_min(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 1000, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 [[RESULT_08]], [[TMP9]]
; CHECK-NEXT:    [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[TMP9]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !23
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[V0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[V0_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ] ; running minimum, seeded with 1000
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp slt i32 %result.08, %0
  %v0 = select i1 %c0, i32 %result.08, i32 %0 ; signed-min idiom: min(result, A[i])
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}
880
; Unsigned-max reduction expressed as icmp ugt + select, starting from 1000.
; The checks verify the pattern is recognized and reduced via
; llvm.experimental.vector.reduce.umax in middle.block.  %B is unused; it only
; keeps the signature parallel with the other tests in this file.
define i32 @reduction_max(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_max(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[TMP5]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 1000, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[C0:%.*]] = icmp ugt i32 [[RESULT_08]], [[TMP9]]
; CHECK-NEXT:    [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[TMP9]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !25
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[V0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[V0_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ] ; running maximum, seeded with 1000
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp ugt i32 %result.08, %0
  %v0 = select i1 %c0, i32 %result.08, i32 %0 ; unsigned-max idiom: max(result, A[i])
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}
952
; For 'sub' with the accumulator on the LHS we can create a reduction, but not
; an in-loop reduction: the checks verify the loop keeps a <4 x i32> 'sub'
; accumulator and the final value is formed with
; llvm.experimental.vector.reduce.add in middle.block.
define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
; CHECK-LABEL: @reduction_sub_lhs(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.preheader:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !26
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[SUB]] = sub nsw i32 [[X_05]], [[TMP8]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !27
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[X_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUB_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
;
entry:
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ] ; difference accumulator, starts at 0
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %sub = sub nsw i32 %x.05, %0 ; accumulator on LHS of the sub
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}
1022
1023; Conditional reductions with multi-input phis.
; Scalar loop below conditionally accumulates into a fast-math fadd
; reduction: A[i] when B[i] > 1.0 (if.then8), B[i] when A[i] > 2.0
; (if.then16), and otherwise passes %sum.033 through unchanged, so the
; reduction phi %sum.1 has four incoming values.  The CHECK lines verify
; the vectorizer flattens that control flow into mask computations
; ([[TMP5]]..[[TMP13]]) plus selects feeding a single vector accumulator
; [[PREDPHI3]], reduced after the loop by one
; llvm.experimental.vector.reduce.v2.fadd call.  The trip count is the
; constant 128, so the scalar remainder is dead (br i1 true in
; middle.block).
1024define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
1025; CHECK-LABEL: @reduction_conditional(
1026; CHECK-NEXT:  entry:
1027; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1028; CHECK:       vector.ph:
1029; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[S:%.*]], i32 0
1030; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1031; CHECK:       vector.body:
1032; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1033; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
1034; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
1035; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>*
1036; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
1037; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
1038; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
1039; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
1040; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
1041; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
1042; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
1043; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
1044; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]]
1045; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
1046; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
1047; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
1048; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
1049; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
1050; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP12]]
1051; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP13]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
1052; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1053; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
1054; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !28
1055; CHECK:       middle.block:
1056; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
1057; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
1058; CHECK:       scalar.ph:
1059; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1060; CHECK:       for.body:
1061; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
1062; CHECK:       if.then:
1063; CHECK-NEXT:    br i1 undef, label [[IF_THEN8:%.*]], label [[IF_ELSE:%.*]]
1064; CHECK:       if.then8:
1065; CHECK-NEXT:    br label [[FOR_INC]]
1066; CHECK:       if.else:
1067; CHECK-NEXT:    br i1 undef, label [[IF_THEN16:%.*]], label [[FOR_INC]]
1068; CHECK:       if.then16:
1069; CHECK-NEXT:    br label [[FOR_INC]]
1070; CHECK:       for.inc:
1071; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !29
1072; CHECK:       for.end:
1073; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi float [ undef, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
1074; CHECK-NEXT:    ret float [[SUM_1_LCSSA]]
1075;
1076entry:
1077  br label %for.body
1078
1079for.body:
1080  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
1081  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
1082  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
1083  %0 = load float, float* %arrayidx, align 4
1084  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
1085  %1 = load float, float* %arrayidx2, align 4
1086  %cmp3 = fcmp ogt float %0, %1
1087  br i1 %cmp3, label %if.then, label %for.inc
1088
1089if.then:
1090  %cmp6 = fcmp ogt float %1, 1.000000e+00
1091  br i1 %cmp6, label %if.then8, label %if.else
1092
1093if.then8:
1094  %add = fadd fast float %sum.033, %0
1095  br label %for.inc
1096
1097if.else:
1098  %cmp14 = fcmp ogt float %0, 2.000000e+00
1099  br i1 %cmp14, label %if.then16, label %for.inc
1100
1101if.then16:
1102  %add19 = fadd fast float %sum.033, %1
1103  br label %for.inc
1104
; The multi-input reduction phi: four predecessors, two of which merely
; forward the unmodified accumulator.
1105for.inc:
1106  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
1107  %indvars.iv.next = add i64 %indvars.iv, 1
1108  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
1109  %exitcond = icmp ne i32 %lftr.wideiv, 128
1110  br i1 %exitcond, label %for.body, label %for.end
1111
1112for.end:
1113  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
1114  ret float %sum.1.lcssa
1115}
1116
; Integer add reduction whose result is used twice after the loop: two
; LCSSA phis (%sum.lcssa and %sum.copy) both take %9, and the exit block
; adds them.  The CHECK lines verify both exit users are fed from one
; vector accumulator [[TMP11]] and a single
; llvm.experimental.vector.reduce.add call.  The reduction also folds in
; the truncated induction variable, which the vectorizer materializes as
; the <i32 0, i32 1, i32 2, i32 3> step vector [[VEC_IND2]].
1117define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) {
1118; CHECK-LABEL: @reduction_sum_multiuse(
1119; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
1120; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[END:%.*]]
1121; CHECK:       .lr.ph.preheader:
1122; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
1123; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
1124; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
1125; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 3
1126; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1127; CHECK:       vector.ph:
1128; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP4]], 8589934588
1129; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1130; CHECK:       vector.body:
1131; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1132; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
1133; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
1134; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
1135; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
1136; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
1137; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
1138; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
1139; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
1140; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
1141; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[WIDE_LOAD]]
1142; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[TMP10]], [[WIDE_LOAD1]]
1143; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1144; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
1145; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1146; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30
1147; CHECK:       middle.block:
1148; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
1149; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
1150; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
1151; CHECK:       scalar.ph:
1152; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
1153; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
1154; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
1155; CHECK:       .lr.ph:
1156; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1157; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1158; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
1159; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
1160; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
1161; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
1162; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
1163; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[SUM_02]], [[TMP18]]
1164; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP15]]
1165; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[TMP17]]
1166; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
1167; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
1168; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
1169; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !31
1170; CHECK:       ._crit_edge:
1171; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
1172; CHECK-NEXT:    [[SUM_COPY:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
1173; CHECK-NEXT:    br label [[END]]
1174; CHECK:       end:
1175; CHECK-NEXT:    [[F1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[SUM_LCSSA]], [[DOT_CRIT_EDGE]] ]
1176; CHECK-NEXT:    [[F2:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[SUM_COPY]], [[DOT_CRIT_EDGE]] ]
1177; CHECK-NEXT:    [[FINAL:%.*]] = add i32 [[F1]], [[F2]]
1178; CHECK-NEXT:    ret i32 [[FINAL]]
1179;
1180  %1 = icmp sgt i32 %n, 0
1181  br i1 %1, label %.lr.ph.preheader, label %end
1182.lr.ph.preheader:                                 ; preds = %0
1183  br label %.lr.ph
1184
1185.lr.ph:                                           ; preds = %0, %.lr.ph
1186  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
1187  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ]
1188  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
1189  %3 = load i32, i32* %2, align 4
1190  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
1191  %5 = load i32, i32* %4, align 4
1192  %6 = trunc i64 %indvars.iv to i32
1193  %7 = add i32 %sum.02, %6
1194  %8 = add i32 %7, %3
1195  %9 = add i32 %8, %5
1196  %indvars.iv.next = add i64 %indvars.iv, 1
1197  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
1198  %exitcond = icmp eq i32 %lftr.wideiv, %n
1199  br i1 %exitcond, label %._crit_edge, label %.lr.ph
1200
; Both phis below carry the same reduction value %9 out of the loop,
; giving the reduction two distinct post-loop users.
1201._crit_edge:                                      ; preds = %.lr.ph, %0
1202  %sum.lcssa = phi i32 [ %9, %.lr.ph ]
1203  %sum.copy = phi i32 [ %9, %.lr.ph ]
1204  br label %end
1205
1206end:
1207  %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ]
1208  %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ]
1209  %final = add i32 %f1, %f2
1210  ret i32 %final
1211}
1212
1213; Predicated loop, cannot (yet) use in-loop reductions.
; Same add reduction as @reduction_sum_multiuse's loop, but the loop carries
; !llvm.loop !6 (llvm.loop.vectorize.predicate.enable), so the vectorizer
; folds the tail into the vector loop: the trip count is rounded up
; ([[N_RND_UP]]), each lane's load is guarded by an extracted mask bit
; (the pred.load.if / pred.load.continue chains below), and the middle
; block selects only the active lanes ([[TMP52]]) before the final
; reduce.add.  The CHECK lines pin this scalarized-predicated form.
1214define i32 @reduction_predicated(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
1215; CHECK-LABEL: @reduction_predicated(
1216; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
1217; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
1218; CHECK:       .lr.ph.preheader:
1219; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
1220; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
1221; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1222; CHECK:       vector.ph:
1223; CHECK-NEXT:    [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
1224; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
1225; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0
1226; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
1227; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1228; CHECK:       vector.body:
1229; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
1230; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ]
1231; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[PRED_LOAD_CONTINUE14]] ]
1232; CHECK-NEXT:    [[VEC_IND15:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[PRED_LOAD_CONTINUE14]] ]
1233; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 1
1234; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 2
1235; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 3
1236; CHECK-NEXT:    [[TMP7:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
1237; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
1238; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
1239; CHECK:       pred.load.if:
1240; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
1241; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
1242; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> undef, i32 [[TMP10]], i32 0
1243; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
1244; CHECK:       pred.load.continue:
1245; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_LOAD_IF]] ]
1246; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
1247; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
1248; CHECK:       pred.load.if1:
1249; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
1250; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
1251; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP15]], i32 1
1252; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
1253; CHECK:       pred.load.continue2:
1254; CHECK-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF1]] ]
1255; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
1256; CHECK-NEXT:    br i1 [[TMP18]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
1257; CHECK:       pred.load.if3:
1258; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
1259; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
1260; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i32 2
1261; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
1262; CHECK:       pred.load.continue4:
1263; CHECK-NEXT:    [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP21]], [[PRED_LOAD_IF3]] ]
1264; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
1265; CHECK-NEXT:    br i1 [[TMP23]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
1266; CHECK:       pred.load.if5:
1267; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
1268; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
1269; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP25]], i32 3
1270; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
1271; CHECK:       pred.load.continue6:
1272; CHECK-NEXT:    [[TMP27:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ]
1273; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
1274; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
1275; CHECK:       pred.load.if7:
1276; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
1277; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4
1278; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> undef, i32 [[TMP30]], i32 0
1279; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
1280; CHECK:       pred.load.continue8:
1281; CHECK-NEXT:    [[TMP32:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP31]], [[PRED_LOAD_IF7]] ]
1282; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1
1283; CHECK-NEXT:    br i1 [[TMP33]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
1284; CHECK:       pred.load.if9:
1285; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
1286; CHECK-NEXT:    [[TMP35:%.*]] = load i32, i32* [[TMP34]], align 4
1287; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP35]], i32 1
1288; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
1289; CHECK:       pred.load.continue10:
1290; CHECK-NEXT:    [[TMP37:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ]
1291; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2
1292; CHECK-NEXT:    br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
1293; CHECK:       pred.load.if11:
1294; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP5]]
1295; CHECK-NEXT:    [[TMP40:%.*]] = load i32, i32* [[TMP39]], align 4
1296; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP40]], i32 2
1297; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
1298; CHECK:       pred.load.continue12:
1299; CHECK-NEXT:    [[TMP42:%.*]] = phi <4 x i32> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP41]], [[PRED_LOAD_IF11]] ]
1300; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3
1301; CHECK-NEXT:    br i1 [[TMP43]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]]
1302; CHECK:       pred.load.if13:
1303; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP6]]
1304; CHECK-NEXT:    [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4
1305; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP45]], i32 3
1306; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
1307; CHECK:       pred.load.continue14:
1308; CHECK-NEXT:    [[TMP47:%.*]] = phi <4 x i32> [ [[TMP42]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP46]], [[PRED_LOAD_IF13]] ]
1309; CHECK-NEXT:    [[TMP48:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND15]]
1310; CHECK-NEXT:    [[TMP49:%.*]] = add <4 x i32> [[TMP48]], [[TMP27]]
1311; CHECK-NEXT:    [[TMP50]] = add <4 x i32> [[TMP49]], [[TMP47]]
1312; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1313; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
1314; CHECK-NEXT:    [[VEC_IND_NEXT16]] = add <4 x i32> [[VEC_IND15]], <i32 4, i32 4, i32 4, i32 4>
1315; CHECK-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1316; CHECK-NEXT:    br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !32
1317; CHECK:       middle.block:
1318; CHECK-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> [[TMP50]], <4 x i32> [[VEC_PHI]]
1319; CHECK-NEXT:    [[TMP53:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP52]])
1320; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
1321; CHECK:       scalar.ph:
1322; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
1323; CHECK:       .lr.ph:
1324; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !33
1325; CHECK:       ._crit_edge.loopexit:
1326; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
1327; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
1328; CHECK:       ._crit_edge:
1329; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
1330; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
1331;
1332  %1 = icmp sgt i32 %n, 0
1333  br i1 %1, label %.lr.ph, label %._crit_edge
1334
1335.lr.ph:                                           ; preds = %0, %.lr.ph
1336  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
1337  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
1338  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
1339  %3 = load i32, i32* %2, align 4
1340  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
1341  %5 = load i32, i32* %4, align 4
1342  %6 = trunc i64 %indvars.iv to i32
1343  %7 = add i32 %sum.02, %6
1344  %8 = add i32 %7, %3
1345  %9 = add i32 %8, %5
1346  %indvars.iv.next = add i64 %indvars.iv, 1
1347  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
1348  %exitcond = icmp eq i32 %lftr.wideiv, %n
1349  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !6
1350
1351._crit_edge:                                      ; preds = %.lr.ph, %0
1352  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
1353  ret i32 %sum.0.lcssa
1354}
1355
; Loop hints attached (via !llvm.loop !6) to the loop in @reduction_predicated.
1356!6 = distinct !{!6, !7, !8} ; hint list: predication + vectorization enable
1357!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} ; request predicated (tail-folded) vectorization
1358!8 = !{!"llvm.loop.vectorize.enable", i1 true} ; force vectorization of the annotated loop
1359