1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -dce -instcombine -S | FileCheck %s
3
4target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
5target triple = "thumbv8.1m.main-none-none-eabi"
6
;; Scalar loop: sum += A[i] for i in [0, 257).
;; The trip count (257) is not a multiple of VF=4, so with
;; -tail-predication=enabled the vectorizer folds the tail into the vector
;; loop: get.active.lane.mask guards a masked load, and the add reduction is
;; done in-loop (masked lanes selected to 0, then vector.reduce.add).
define i32 @reduction_sum_single(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sum_single(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT:    [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], [[LOOP2:!llvm.loop !.*]]
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l7 = add i32 %sum.02, %l3                      ; sum += A[i]
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257   ; 257 iterations (non-multiple of VF)
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
53
;; Scalar loop: sum += i + A[i] + B[i] for i in [0, 257).
;; Same tail-folded shape as @reduction_sum_single, but the reduction chain
;; has three add operands, so the vector body performs three masked
;; select + vector.reduce.add steps per iteration (induction vector, then
;; each of the two masked loads).
define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_sum(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[VEC_IND]], <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP6]]
; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD1]], <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT:    [[TMP12]] = add i32 [[TMP11]], [[TMP9]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], [[LOOP5:!llvm.loop !.*]]
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l7 = add i32 %sum.02, %indvars.iv              ; sum += i
  %l8 = add i32 %l7, %l3                          ; sum += A[i]
  %l9 = add i32 %l8, %l5                          ; sum += B[i]
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}
115
;; Scalar loop: prod = prod * A[i] * B[i] for i in [0, 257), start value 1.
;; Unlike the add reductions above, the mul reduction stays in vector form
;; inside the loop (vector phi starting at <1,1,1,1>); inactive tail lanes
;; keep the previous accumulator via select, and the final
;; vector.reduce.mul happens once in the middle block.
define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_prod(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], [[LOOP7:!llvm.loop !.*]]
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[PROD_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %prod.02 = phi i32 [ %l9, %.lr.ph ], [ 1, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l8 = mul i32 %prod.02, %l3                     ; prod *= A[i]
  %l9 = mul i32 %l8, %l5                          ; prod *= B[i]
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %prod.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %prod.0.lcssa
}
169
;; Scalar loop: result = result & A[i] & B[i] for i in [0, 257), start -1
;; (the identity for AND, reflected in the <-1,-1,-1,-1> vector phi start).
;; Vector-form in-loop reduction: tail lanes keep the previous accumulator
;; via select; final vector.reduce.and in the middle block.
define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_and(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = and i32 %result.08, %l0                  ; result &= A[i]
  %and = and i32 %add, %l1                        ; result &= B[i]
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %and, %for.body ]
  ret i32 %result.0.lcssa
}
223
;; Scalar loop: result |= (A[i] + B[i]) for i in [0, 257), start 0.
;; Because 0 is the identity for OR, the tail lanes are simply selected
;; to zero before or-ing into the vector accumulator (no select back to
;; the old phi is needed); final vector.reduce.or in the middle block.
define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_or(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP6]] = or <4 x i32> [[VEC_PHI]], [[TMP5]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %l1, %l0                     ; A[i] + B[i]
  %or = or i32 %add, %result.08                   ; result |= (A[i] + B[i])
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %or, %for.body ]
  ret i32 %result.0.lcssa
}
277
;; Scalar loop: result ^= (A[i] + B[i]) for i in [0, 257), start 0.
;; Mirrors @reduction_or: 0 is the identity for XOR, so masked-off tail
;; lanes are selected to zero before xor-ing into the vector accumulator;
;; final vector.reduce.xor in the middle block.
define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_xor(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP6]] = xor <4 x i32> [[VEC_PHI]], [[TMP5]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP13:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %l1, %l0                     ; A[i] + B[i]
  %xor = xor i32 %add, %result.08                 ; result ^= (A[i] + B[i])
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}
331
;; Scalar loop: result += A[i] + B[i] (float, fast-math) for i in [0, 257).
;; The `fast` flags permit reassociation, so the fadd reduction is kept in
;; vector form in-loop (tail lanes keep the previous accumulator via
;; select) and finalized with vector.reduce.fadd in the middle block.
define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %add = fadd fast float %result.08, %l0          ; result += A[i]
  %fadd = fadd fast float %add, %l1               ; result += B[i]
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi float [ %fadd, %for.body ]
  ret float %result.0.lcssa
}
385
;; Scalar loop: result = result * A[i] * B[i] (float, fast-math) for
;; i in [0, 257). The start value is deliberately 0.0 (not the fmul
;; identity), which is why the CHECK start vector is
;; <0.0, 1.0, 1.0, 1.0>: lane 0 carries the initial value, the other
;; lanes hold the identity 1.0; vector.reduce.fmul finalizes it.
define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fmul(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %add = fmul fast float %result.08, %l0          ; result *= A[i]
  %fmul = fmul fast float %add, %l1               ; result *= B[i]
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi float [ %fmul, %for.body ]
  ret float %result.0.lcssa
}
439
;; Scalar loop: result = smin(result, A[i]) for i in [0, 257), start 1000,
;; expressed as icmp slt + select.
;; Note: unlike the earlier tests in this file, the CHECK lines show this
;; loop is NOT tail-folded — plain (unmasked) wide loads, a vector trip
;; count of 256, and a scalar epilogue resuming at i=256 with the partial
;; result merged through bc.merge.rdx.
define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_min(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT:    [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 [[RESULT_08]], [[L0]]
; CHECK-NEXT:    [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[L0]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp slt i32 %result.08, %l0              ; signed min pattern
  %v0 = select i1 %c0, i32 %result.08, i32 %l0    ; result = smin(result, A[i])
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}
496
; Unsigned integer max reduction (icmp ugt + select form) over 257 i32
; elements of %A, seeded with 1000. The autogenerated checks below expect a
; plain (non-tail-folded) <4 x i32> vector loop of 256 elements — no active
; lane mask — followed by an llvm.vector.reduce.umax in the middle block and
; a scalar epilogue for the one remaining iteration.
define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_max(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
; CHECK-NEXT:    [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[C0:%.*]] = icmp ugt i32 [[RESULT_08]], [[L0]]
; CHECK-NEXT:    [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[L0]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP21:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]   ; running unsigned max, seeded with 1000
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp ugt i32 %result.08, %l0
  %v0 = select i1 %c0, i32 %result.08, i32 %l0    ; v0 = umax(result.08, l0)
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257   ; trip count 257 (not a multiple of VF 4)
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}
553
; Floating-point max-style reduction (fcmp ogt + select form) over 257 float
; elements of %A, seeded with 1000.0. The autogenerated checks below expect
; this loop to stay scalar — no vector body is emitted under this
; configuration (presumably the fcmp/select FP reduction is rejected by the
; cost model or legality here; confirm against the vectorizer if updating).
define float @reduction_fmax(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fmax(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[RESULT_08:%.*]] = phi float [ [[V0:%.*]], [[FOR_BODY]] ], [ 1.000000e+03, [[ENTRY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDVARS_IV]]
; CHECK-NEXT:    [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[C0:%.*]] = fcmp ogt float [[RESULT_08]], [[L0]]
; CHECK-NEXT:    [[V0]] = select i1 [[C0]], float [[RESULT_08]], float [[L0]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret float [[V0]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %v0, %for.body ], [ 1000.0, %entry ]   ; running maximum, seeded with 1000.0
  %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %c0 = fcmp ogt float %result.08, %l0
  %v0 = select i1 %c0, float %result.08, float %l0                ; v0 = max(result.08, l0), ogt/select form
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257                   ; 257 iterations
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi float [ %v0, %for.body ]
  ret float %result.0.lcssa
}
589