; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefixes=ALL,CHECK
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefixes=ALL,STORE
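; The first RUN line exercises the default SLP pipeline (CHECK prefix); the
; second also enables horizontal-reduction vectorization and reductions that
; feed a store via -slp-vectorize-hor and -slp-vectorize-hor-store (STORE
; prefix). Lines common to both configurations use the ALL prefix.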

; #include <stdint.h>
;
; int foo(float *A, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += 7*A[i*4  ] +
;            7*A[i*4+1] +
;            7*A[i*4+2] +
;            7*A[i*4+3];
;   }
;   return sum;
; }
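; The four scalar loads of A[i*4 .. i*4+3] are expected to be combined into one
; <4 x float> load, multiplied by a splat of 7.0, and summed with
; @llvm.vector.reduce.fadd, as the ALL check lines below show for both RUN
; configurations.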

define i32 @add_red(float* %A, i32 %n) {
; ALL-LABEL: @add_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
; ALL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
; ALL-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
; ALL-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; ALL-NEXT:    [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_033]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  %1 = load float, float* %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00
  %add28 = or i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  %3 = load float, float* %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  %4 = load float, float* %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00
  %add16 = fadd fast float %add11, %mul15
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum *= B[0]*A[i*4  ] +
;       B[1]*A[i*4+1] +
;       B[2]*A[i*4+2] +
;       B[3]*A[i*4+3];
;   }
;   return sum;
; }
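; Same shape as @add_red, but the loop-invariant B[0..3] loads should become a
; single <4 x float> load in the preheader that is multiplied element-wise with
; the A vector before the fadd reduction.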

define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
; ALL-LABEL: @mul_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
; ALL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; ALL-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; ALL-NEXT:    [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
; ALL-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; ALL-NEXT:    [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_040]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %0, %5
  %add35 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  %6 = load float, float* %arrayidx6, align 4
  %mul7 = fmul float %1, %6
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  %7 = load float, float* %arrayidx12, align 4
  %mul13 = fmul float %2, %7
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul float %3, %8
  %add20 = fadd fast float %add14, %mul19
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*6  ] +
;            B[1]*A[i*6+1] +
;            B[2]*A[i*6+2] +
;            B[3]*A[i*6+3] +
;            B[4]*A[i*6+4] +
;            B[5]*A[i*6+5] +
;            B[6]*A[i*6+6] +
;            B[7]*A[i*6+7] +
;            B[8]*A[i*6+8];
;   }
;   return sum;
; }
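; With nine products per iteration, only the first eight lanes are expected to
; be vectorized (an <8 x float> multiply plus @llvm.vector.reduce.fadd.v8f32);
; the ninth product stays scalar and is folded into the reduction result
; afterwards.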

define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
; ALL-LABEL: @long_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <8 x float>*
; ALL-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; ALL-NEXT:    [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
; ALL-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
; ALL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
; ALL-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
; ALL-NEXT:    [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
; ALL-NEXT:    [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
; ALL-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
; ALL-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
; ALL-NEXT:    [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
; ALL-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP6]])
; ALL-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
; ALL-NEXT:    [[ADD51]] = fadd fast float [[SUM_082]], [[OP_RDX]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_083]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  %4 = load float, float* %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  %5 = load float, float* %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  %6 = load float, float* %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  %7 = load float, float* %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  %8 = load float, float* %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10
  %add80 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  %11 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  %12 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  %13 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  %14 = load float, float* %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  %15 = load float, float* %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  %16 = load float, float* %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  %17 = load float, float* %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  %18 = load float, float* %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18
  %add50 = fadd fast float %add44, %mul49
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; int foo(float * restrict A, float * restrict B, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     sum += B[0]*A[i*4  ];
;     sum += B[1]*A[i*4+1];
;     sum += B[2]*A[i*4+2];
;     sum += B[3]*A[i*4+3];
;   }
;   return sum;
; }
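; The chained sum += B[k]*A[i*4+k] accumulation should be recognized as a single
; horizontal reduction: a <4 x float> multiply, one
; @llvm.vector.reduce.fadd.v4f32 call, and a final fadd into the running sum.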

define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
; ALL-LABEL: @chain_red(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
; ALL-NEXT:    br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; ALL:       for.body.lr.ph:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.body:
; ALL-NEXT:    [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ]
; ALL-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
; ALL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; ALL-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; ALL-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; ALL-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
; ALL-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; ALL-NEXT:    [[OP_RDX]] = fadd fast float [[TMP6]], [[SUM_042]]
; ALL-NEXT:    [[INC]] = add nsw i64 [[I_043]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; ALL:       for.cond.for.end_crit_edge:
; ALL-NEXT:    [[PHITMP:%.*]] = fptosi float [[OP_RDX]] to i32
; ALL-NEXT:    br label [[FOR_END]]
; ALL:       for.end:
; ALL-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
; ALL-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5
  %add = fadd fast float %sum.042, %mul3
  %add638 = or i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  %6 = load float, float* %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6
  %add9 = fadd fast float %add, %mul8
  %add1239 = or i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  %7 = load float, float* %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  %8 = load float, float* %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8
  %add21 = fadd fast float %add15, %mul20
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}

; void foo(const float *arg_A, unsigned arg_B, float *array) {
;   for (uint32_t i = 0; i < 6; ++i) {
;     const float *ptr = arg_A + i;
;     float w0 = array[i * 4 + 0];
;     float w1 = array[i * 4 + 1];
;     float w2 = array[i * 4 + 2];
;     float w3 = array[i * 4 + 3];
;
;     for (unsigned j = 0; j < arg_B; ++j) {
;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
;       const float x4 = x3 + (-4.0f * w2) + w3;
;       w1 = w0;
;       w0 = x1;
;       w3 = w2;
;       w2 = x3;
;     }
;
;     array[i * 4 + 0] = w0;
;     array[i * 4 + 1] = w1;
;     array[i * 4 + 2] = w2;
;     array[i * 4 + 3] = w3;
;   }
; }
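; The inner loop rotates w0..w3 across iterations, so no horizontal reduction is
; expected to form here; the check lines below require the scalar fmul/fadd
; chain to be left untouched in both configurations.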

define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
; ALL-LABEL: @foo(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
; ALL-NEXT:    br label [[FOR_BODY:%.*]]
; ALL:       for.cond.cleanup:
; ALL-NEXT:    ret void
; ALL:       for.body:
; ALL-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
; ALL-NEXT:    [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
; ALL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
; ALL-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = or i64 [[TMP0]], 1
; ALL-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
; ALL-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; ALL-NEXT:    [[TMP4:%.*]] = or i64 [[TMP0]], 2
; ALL-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
; ALL-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
; ALL-NEXT:    [[TMP6:%.*]] = or i64 [[TMP0]], 3
; ALL-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
; ALL-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; ALL-NEXT:    br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
; ALL:       for.body16.lr.ph:
; ALL-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
; ALL-NEXT:    [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
; ALL-NEXT:    br label [[FOR_BODY16:%.*]]
; ALL:       for.cond.cleanup15:
; ALL-NEXT:    [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
; ALL-NEXT:    store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
; ALL-NEXT:    store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
; ALL-NEXT:    store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
; ALL-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; ALL-NEXT:    [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
; ALL-NEXT:    br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; ALL:       for.body16:
; ALL-NEXT:    [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
; ALL-NEXT:    [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
; ALL-NEXT:    [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
; ALL-NEXT:    [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
; ALL-NEXT:    [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
; ALL-NEXT:    [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
; ALL-NEXT:    [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
; ALL-NEXT:    [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
; ALL-NEXT:    [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
; ALL-NEXT:    [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
; ALL-NEXT:    [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
; ALL-NEXT:    [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
; ALL-NEXT:    [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
; ALL-NEXT:    [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
; ALL-NEXT:    [[INC]] = add nuw i32 [[J_098]], 1
; ALL-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
; ALL-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
;
entry:
  %cmp1495 = icmp eq i32 %arg_B, 0
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
  ret void

for.body:                                         ; preds = %for.cond.cleanup15, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
  %0 = shl i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
  %1 = load float, float* %arrayidx, align 4
  %2 = or i64 %0, 1
  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
  %3 = load float, float* %arrayidx4, align 4
  %4 = or i64 %0, 2
  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
  %5 = load float, float* %arrayidx8, align 4
  %6 = or i64 %0, 3
  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
  %7 = load float, float* %arrayidx12, align 4
  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph

for.body16.lr.ph:                                 ; preds = %for.body
  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
  %8 = load float, float* %add.ptr, align 4
  br label %for.body16

for.cond.cleanup15:                               ; preds = %for.body16, %for.body
  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
  store float %w0.0.lcssa, float* %arrayidx, align 4
  store float %w1.0.lcssa, float* %arrayidx4, align 4
  store float %w2.0.lcssa, float* %arrayidx8, align 4
  store float %w3.0.lcssa, float* %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond109 = icmp eq i64 %indvars.iv.next, 6
  br i1 %exitcond109, label %for.cond.cleanup, label %for.body

for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
  %sub92 = fadd fast float %mul17, %mul18.neg
  %sub19 = fadd fast float %sub92, %8
  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
  %mul23 = fmul fast float %w1.099, 0x4002666660000000
  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
  %add2293 = fadd fast float %mul27.neg, %mul25
  %add24 = fadd fast float %add2293, %mul23
  %sub2694 = fadd fast float %add24, %mul21.neg
  %sub28 = fadd fast float %sub2694, %mul20
  %inc = add nuw i32 %j.098, 1
  %exitcond = icmp eq i32 %inc, %arg_B
  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
}


; void foo(double * restrict A, double * restrict B, double * restrict C,
;          int n) {
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
;   }
; }
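; A two-element dot product stored to C[i] is only expected to be vectorized
; when reductions feeding stores are enabled: the default run keeps the scalar
; loads and multiplies, while the STORE run forms <2 x double> loads, a vector
; multiply, and extractelements feeding the final fadd.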

define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red_double(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8
; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]]
; CHECK-NEXT:    [[ADD16:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
; CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8
; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]]
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]]
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; CHECK-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @store_red_double(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[TMP0:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
; STORE-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; STORE-NEXT:    [[TMP2:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
; STORE-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; STORE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; STORE-NEXT:    [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
; STORE-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
; STORE-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_018]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE:       for.end:
; STORE-NEXT:    ret void
;
entry:
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %0 = load double, double* %B, align 8
  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  %1 = load double, double* %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  %3 = load double, double* %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3
  %add16 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  %4 = load double, double* %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4
  %add8 = fadd fast double %mul3, %mul7
  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  store double %add8, double* %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
;   float sum = 0;
;   for (intptr_t i=0; i < n; ++i) {
;     C[i] = B[0] *A[i*4  ] +
;          B[1] *A[i*4+1] +
;          B[2] *A[i*4+2] +
;          B[3] *A[i*4+3];
;   }
;   return sum;
; }
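; The same pattern with four floats per element of C: the default run stays
; scalar, whereas the STORE run is expected to load B and A[i*4..] as
; <4 x float>, multiply them, and reduce with @llvm.vector.reduce.fadd.v4f32
; before the store.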

define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
; CHECK-LABEL: @store_red(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[B]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast float [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT:    [[ADD34:%.*]] = or i64 [[MUL]], 1
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[MUL7:%.*]] = fmul fast float [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[ADD8:%.*]] = fadd fast float [[MUL3]], [[MUL7]]
; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX9]], align 4
; CHECK-NEXT:    [[ADD1135:%.*]] = or i64 [[MUL]], 2
; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX12]], align 4
; CHECK-NEXT:    [[MUL13:%.*]] = fmul fast float [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[ADD14:%.*]] = fadd fast float [[ADD8]], [[MUL13]]
; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX15]], align 4
; CHECK-NEXT:    [[ADD1736:%.*]] = or i64 [[MUL]], 3
; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX18]], align 4
; CHECK-NEXT:    [[MUL19:%.*]] = fmul fast float [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[ADD20:%.*]] = fadd fast float [[ADD14]], [[MUL19]]
; CHECK-NEXT:    store float [[ADD20]], float* [[C_ADDR_038]], align 4
; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; CHECK-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK:       for.end:
; CHECK-NEXT:    ret i32 0
;
; STORE-LABEL: @store_red(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
; STORE-NEXT:    br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
; STORE:       for.body.lr.ph:
; STORE-NEXT:    [[TMP0:%.*]] = sext i32 [[N]] to i64
; STORE-NEXT:    br label [[FOR_BODY:%.*]]
; STORE:       for.body:
; STORE-NEXT:    [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; STORE-NEXT:    [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
; STORE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
; STORE-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; STORE-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
; STORE-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; STORE-NEXT:    store float [[TMP6]], float* [[C_ADDR_038]], align 4
; STORE-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
; STORE-NEXT:    [[INC]] = add nsw i64 [[I_039]], 1
; STORE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; STORE-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
; STORE:       for.end:
; STORE-NEXT:    ret i32 0
;
entry:
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, float* %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %2 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2
  %3 = load float, float* %arrayidx4, align 4
  %add34 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  %4 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, float* %arrayidx9, align 4
  %add1135 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  %6 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, float* %arrayidx15, align 4
  %add1736 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8
  %add20 = fadd fast float %add14, %mul19
  store float %add20, float* %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}

@arr_i32 = global [32 x i32] zeroinitializer, align 16
@arr_float = global [32 x float] zeroinitializer, align 16
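; The *_red_example* functions below reduce the leading 4, 8, or 16 elements of
; these global arrays directly into a store of the result. Without
; -slp-vectorize-hor-store the scalar chains are expected to remain, while the
; STORE configuration should turn each chain into a single vector load plus an
; @llvm.vector.reduce.{fadd,add} call.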
793
794define void @float_red_example4(float* %res) {
795; CHECK-LABEL: @float_red_example4(
796; CHECK-NEXT:  entry:
797; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
798; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
799; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
800; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
801; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
802; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
803; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
804; CHECK-NEXT:    store float [[ADD_2]], float* [[RES:%.*]], align 16
805; CHECK-NEXT:    ret void
806;
807; STORE-LABEL: @float_red_example4(
808; STORE-NEXT:  entry:
809; STORE-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
810; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
811; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
812; STORE-NEXT:    ret void
813;
814entry:
815  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
816  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
817  %add = fadd fast float %1, %0
818  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
819  %add.1 = fadd fast float %2, %add
820  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
821  %add.2 = fadd fast float %3, %add.1
822  store float %add.2, float* %res, align 16
823  ret void
824}
825
826define void @float_red_example8(float* %res) {
827; CHECK-LABEL: @float_red_example8(
828; CHECK-NEXT:  entry:
829; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
830; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
831; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
832; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
833; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
834; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
835; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
836; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
837; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
838; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
839; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
840; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
841; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
842; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
843; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
844; CHECK-NEXT:    store float [[ADD_6]], float* [[RES:%.*]], align 16
845; CHECK-NEXT:    ret void
846;
847; STORE-LABEL: @float_red_example8(
848; STORE-NEXT:  entry:
849; STORE-NEXT:    [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
850; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
851; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
852; STORE-NEXT:    ret void
853;
854entry:
855  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
856  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
857  %add = fadd fast float %1, %0
858  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
859  %add.1 = fadd fast float %2, %add
860  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
861  %add.2 = fadd fast float %3, %add.1
862  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
863  %add.3 = fadd fast float %4, %add.2
864  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
865  %add.4 = fadd fast float %5, %add.3
866  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
867  %add.5 = fadd fast float %6, %add.4
868  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
869  %add.6 = fadd fast float %7, %add.5
870  store float %add.6, float* %res, align 16
871  ret void
872}
873
874define void @float_red_example16(float* %res) {
875; CHECK-LABEL: @float_red_example16(
876; CHECK-NEXT:  entry:
877; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
878; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
879; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
880; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
881; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
882; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
883; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
884; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
885; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
886; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
887; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
888; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
889; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
890; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
891; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
892; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
893; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP8]], [[ADD_6]]
894; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
895; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP9]], [[ADD_7]]
896; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
897; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP10]], [[ADD_8]]
898; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
899; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP11]], [[ADD_9]]
900; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
901; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP12]], [[ADD_10]]
902; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
903; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP13]], [[ADD_11]]
904; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
905; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP14]], [[ADD_12]]
906; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
907; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float [[TMP15]], [[ADD_13]]
908; CHECK-NEXT:    store float [[ADD_14]], float* [[RES:%.*]], align 16
909; CHECK-NEXT:    ret void
910;
911; STORE-LABEL: @float_red_example16(
912; STORE-NEXT:  entry:
913; STORE-NEXT:    [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
914; STORE-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]])
915; STORE-NEXT:    store float [[TMP1]], float* [[RES:%.*]], align 16
916; STORE-NEXT:    ret void
917;
918entry:
919  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
920  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
921  %add = fadd fast float %1, %0
922  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
923  %add.1 = fadd fast float %2, %add
924  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
925  %add.2 = fadd fast float %3, %add.1
926  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
927  %add.3 = fadd fast float %4, %add.2
928  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
929  %add.4 = fadd fast float %5, %add.3
930  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
931  %add.5 = fadd fast float %6, %add.4
932  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
933  %add.6 = fadd fast float %7, %add.5
934  %8 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
935  %add.7 = fadd fast float %8, %add.6
936  %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
937  %add.8 = fadd fast float %9, %add.7
938  %10 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
939  %add.9 = fadd fast float %10, %add.8
940  %11 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
941  %add.10 = fadd fast float %11, %add.9
942  %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
943  %add.11 = fadd fast float %12, %add.10
944  %13 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
945  %add.12 = fadd fast float %13, %add.11
946  %14 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
947  %add.13 = fadd fast float %14, %add.12
948  %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
949  %add.14 = fadd fast float %15, %add.13
950  store float %add.14, float* %res, align 16
951  ret void
952}
953
954define void @i32_red_example4(i32* %res) {
955; CHECK-LABEL: @i32_red_example4(
956; CHECK-NEXT:  entry:
957; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
958; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
959; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
960; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
961; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
962; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
963; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
964; CHECK-NEXT:    store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
965; CHECK-NEXT:    ret void
966;
967; STORE-LABEL: @i32_red_example4(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  store i32 %add.2, i32* %res, align 16
  ret void
}

define void @i32_red_example8(i32* %res) {
; CHECK-LABEL: @i32_red_example8(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    store i32 [[ADD_6]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example8(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  store i32 %add.6, i32* %res, align 16
  ret void
}

define void @i32_red_example16(i32* %res) {
; CHECK-LABEL: @i32_red_example16(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
; CHECK-NEXT:    [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
; CHECK-NEXT:    [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
; CHECK-NEXT:    [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
; CHECK-NEXT:    [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
; CHECK-NEXT:    [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
; CHECK-NEXT:    [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
; CHECK-NEXT:    [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
; CHECK-NEXT:    [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
; CHECK-NEXT:    store i32 [[ADD_14]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example16(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  store i32 %add.14, i32* %res, align 16
  ret void
}

define void @i32_red_example32(i32* %res) {
; CHECK-LABEL: @i32_red_example32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
; CHECK-NEXT:    [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
; CHECK-NEXT:    [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
; CHECK-NEXT:    [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
; CHECK-NEXT:    [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
; CHECK-NEXT:    [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
; CHECK-NEXT:    [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
; CHECK-NEXT:    [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
; CHECK-NEXT:    [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
; CHECK-NEXT:    [[ADD_15:%.*]] = add nsw i32 [[TMP16]], [[ADD_14]]
; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
; CHECK-NEXT:    [[ADD_16:%.*]] = add nsw i32 [[TMP17]], [[ADD_15]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
; CHECK-NEXT:    [[ADD_17:%.*]] = add nsw i32 [[TMP18]], [[ADD_16]]
; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
; CHECK-NEXT:    [[ADD_18:%.*]] = add nsw i32 [[TMP19]], [[ADD_17]]
; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
; CHECK-NEXT:    [[ADD_19:%.*]] = add nsw i32 [[TMP20]], [[ADD_18]]
; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
; CHECK-NEXT:    [[ADD_20:%.*]] = add nsw i32 [[TMP21]], [[ADD_19]]
; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
; CHECK-NEXT:    [[ADD_21:%.*]] = add nsw i32 [[TMP22]], [[ADD_20]]
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
; CHECK-NEXT:    [[ADD_22:%.*]] = add nsw i32 [[TMP23]], [[ADD_21]]
; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
; CHECK-NEXT:    [[ADD_23:%.*]] = add nsw i32 [[TMP24]], [[ADD_22]]
; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
; CHECK-NEXT:    [[ADD_24:%.*]] = add nsw i32 [[TMP25]], [[ADD_23]]
; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
; CHECK-NEXT:    [[ADD_25:%.*]] = add nsw i32 [[TMP26]], [[ADD_24]]
; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
; CHECK-NEXT:    [[ADD_26:%.*]] = add nsw i32 [[TMP27]], [[ADD_25]]
; CHECK-NEXT:    [[TMP28:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
; CHECK-NEXT:    [[ADD_27:%.*]] = add nsw i32 [[TMP28]], [[ADD_26]]
; CHECK-NEXT:    [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
; CHECK-NEXT:    [[ADD_28:%.*]] = add nsw i32 [[TMP29]], [[ADD_27]]
; CHECK-NEXT:    [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
; CHECK-NEXT:    [[ADD_29:%.*]] = add nsw i32 [[TMP30]], [[ADD_28]]
; CHECK-NEXT:    [[TMP31:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
; CHECK-NEXT:    [[ADD_30:%.*]] = add nsw i32 [[TMP31]], [[ADD_29]]
; CHECK-NEXT:    store i32 [[ADD_30]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @i32_red_example32(
; STORE-NEXT:  entry:
; STORE-NEXT:    [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
; STORE-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP0]])
; STORE-NEXT:    store i32 [[TMP1]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  %16 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
  %add.15 = add nsw i32 %16, %add.14
  %17 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
  %add.16 = add nsw i32 %17, %add.15
  %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
  %add.17 = add nsw i32 %18, %add.16
  %19 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
  %add.18 = add nsw i32 %19, %add.17
  %20 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
  %add.19 = add nsw i32 %20, %add.18
  %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
  %add.20 = add nsw i32 %21, %add.19
  %22 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
  %add.21 = add nsw i32 %22, %add.20
  %23 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
  %add.22 = add nsw i32 %23, %add.21
  %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
  %add.23 = add nsw i32 %24, %add.22
  %25 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
  %add.24 = add nsw i32 %25, %add.23
  %26 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
  %add.25 = add nsw i32 %26, %add.24
  %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
  %add.26 = add nsw i32 %27, %add.25
  %28 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
  %add.27 = add nsw i32 %28, %add.26
  %29 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
  %add.28 = add nsw i32 %29, %add.27
  %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
  %add.29 = add nsw i32 %30, %add.28
  %31 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
  %add.30 = add nsw i32 %31, %add.29
  store i32 %add.30, i32* %res, align 16
  ret void
}

declare i32 @foobar(i32)

define void @i32_red_call(i32 %val) {
; ALL-LABEL: @i32_red_call(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; ALL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; ALL-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
; ALL-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = call i32 @foobar(i32 %add.6)
  ret void
}

define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
; ALL-LABEL: @i32_red_invoke(
; ALL-NEXT:  entry:
; ALL-NEXT:    [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; ALL-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP0]])
; ALL-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
; ALL-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
; ALL:       exception:
; ALL-NEXT:    [[CLEANUP:%.*]] = landingpad i8
; ALL-NEXT:    cleanup
; ALL-NEXT:    br label [[NORMAL]]
; ALL:       normal:
; ALL-NEXT:    ret void
;
entry:
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
exception:
  %cleanup = landingpad i8 cleanup
  br label %normal
normal:
  ret void
}

; Test case from PR47670. The reduction result is used as an incoming value in a phi.
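; A rough C sketch of the pattern being tested, in the style of the example at
; the top of this file (the signature and names are illustrative only and are
; not taken from PR47670):
;
; int reduction_result_used_in_phi(int *data, _Bool b) {
;   int sum = 0;
;   if (b)
;     sum = data[0] + data[1] + data[2] + data[3];
;   return sum;
; }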
define i32 @reduction_result_used_in_phi(i32* nocapture readonly %data, i1 zeroext %b) {
; ALL-LABEL: @reduction_result_used_in_phi(
; ALL-NEXT:  entry:
; ALL-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; ALL:       bb:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; ALL-NEXT:    br label [[EXIT]]
; ALL:       exit:
; ALL-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; ALL-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, i32* %data, align 4
  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
  %l.1 = load i32, i32* %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
  %l.2 = load i32, i32* %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
  %l.3 = load i32, i32* %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
  ret i32 %sum.1
}

define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
; ALL-LABEL: @reduction_result_used_in_phi_loop(
; ALL-NEXT:  entry:
; ALL-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
; ALL:       bb:
; ALL-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA:%.*]] to <4 x i32>*
; ALL-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; ALL-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; ALL-NEXT:    br label [[EXIT]]
; ALL:       exit:
; ALL-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
; ALL-NEXT:    ret i32 [[SUM_1]]
;
entry:
  br i1 %b, label %bb, label %exit

bb:
  %l.0 = load i32, i32* %data, align 4
  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
  %l.1 = load i32, i32* %idx.1, align 4
  %add.1 = add i32 %l.1, %l.0
  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
  %l.2 = load i32, i32* %idx.2, align 4
  %add.2 = add i32 %l.2, %add.1
  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
  %l.3 = load i32, i32* %idx.3, align 4
  %add.3 = add i32 %l.3, %add.2
  br label %exit

exit:
  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
  ret i32 %sum.1
}

; Make sure we do not crash or loop infinitely on ill-formed IR.
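; (The hazard is presumably the operand walk: %t0 below depends on itself, so a
; reduction matcher that blindly follows operands through the dead block could
; recurse forever.)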

define void @unreachable_block() {
; ALL-LABEL: @unreachable_block(
; ALL-NEXT:  bb.0:
; ALL-NEXT:    br label [[BB_1:%.*]]
; ALL:       dead:
; ALL-NEXT:    [[T0:%.*]] = add i16 [[T0]], undef
; ALL-NEXT:    br label [[BB_1]]
; ALL:       bb.1:
; ALL-NEXT:    [[T1:%.*]] = phi i16 [ undef, [[BB_0:%.*]] ], [ [[T0]], [[DEAD:%.*]] ]
; ALL-NEXT:    ret void
;
bb.0:
  br label %bb.1

dead:
  %t0 = add i16 %t0, undef ; unreachable IR may depend on itself
  br label %bb.1

bb.1:
  %t1 = phi i16 [ undef, %bb.0 ], [ %t0, %dead ]
  ret void
}

; The fast-math flags (FMF) on the reduction should match those of the incoming instructions.
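; In @fadd_v4f32_fmf below, every scalar fadd carries exactly "reassoc nsz", so
; the llvm.vector.reduce.fadd call in the checks is expected to carry "reassoc
; nsz" and nothing more.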

define float @fadd_v4f32_fmf(float* %p) {
; ALL-LABEL: @fadd_v4f32_fmf(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; ALL-NEXT:    ret float [[TMP3]]
;
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %p2 = getelementptr inbounds float, float* %p, i64 2
  %p3 = getelementptr inbounds float, float* %p, i64 3
  %t0 = load float, float* %p, align 4
  %t1 = load float, float* %p1, align 4
  %t2 = load float, float* %p2, align 4
  %t3 = load float, float* %p3, align 4
  %add1 = fadd reassoc nsz float %t1, %t0
  %add2 = fadd reassoc nsz float %t2, %add1
  %add3 = fadd reassoc nsz float %t3, %add2
  ret float %add3
}

; The minimal FMF required for an fadd reduction are "reassoc nsz".
; Only the FMF common to all operations in the reduction propagate to the result.
; In this example, "contract", "nnan", and "arcp" are dropped, but "ninf" transfers along with the required flags.
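; Worked intersection for the three fadds below:
;   (ninf reassoc nsz nnan) & (ninf reassoc nsz nnan arcp) & (ninf reassoc nsz contract)
;     = ninf reassoc nsz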

define float @fadd_v4f32_fmf_intersect(float* %p) {
; ALL-LABEL: @fadd_v4f32_fmf_intersect(
; ALL-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; ALL-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; ALL-NEXT:    [[TMP3:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]])
; ALL-NEXT:    ret float [[TMP3]]
;
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %p2 = getelementptr inbounds float, float* %p, i64 2
  %p3 = getelementptr inbounds float, float* %p, i64 3
  %t0 = load float, float* %p, align 4
  %t1 = load float, float* %p1, align 4
  %t2 = load float, float* %p2, align 4
  %t3 = load float, float* %p3, align 4
  %add1 = fadd ninf reassoc nsz nnan float %t1, %t0
  %add2 = fadd ninf reassoc nsz nnan arcp float %t2, %add1
  %add3 = fadd ninf reassoc nsz contract float %t3, %add2
  ret float %add3
}

; This must not propagate 'nsw' to a new add instruction.
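; (Rationale: the reduction reassociates the scalar adds, and 'nsw' is only
; guaranteed for the original evaluation order, so the vectorized STORE form
; below uses a plain 'add' for the final [[OP_RDX]] value.)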

define void @nsw_propagation_v4i32(i32* %res, i32 %start) {
; CHECK-LABEL: @nsw_propagation_v4i32(
; CHECK-NEXT:    [[T0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
; CHECK-NEXT:    [[T1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
; CHECK-NEXT:    [[T3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
; CHECK-NEXT:    [[S:%.*]] = add nsw i32 [[START:%.*]], [[T0]]
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[T1]], [[S]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[T2]], [[ADD]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[T3]], [[ADD_1]]
; CHECK-NEXT:    store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
; CHECK-NEXT:    ret void
;
; STORE-LABEL: @nsw_propagation_v4i32(
; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
; STORE-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP2]], [[START:%.*]]
; STORE-NEXT:    store i32 [[OP_RDX]], i32* [[RES:%.*]], align 16
; STORE-NEXT:    ret void
;
  %t0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %t1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %t2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %t3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %s = add nsw i32 %start, %t0
  %add = add nsw i32 %t1, %s
  %add.1 = add nsw i32 %t2, %add
  %add.2 = add nsw i32 %t3, %add.1
  store i32 %add.2, i32* %res, align 16
  ret void
}

declare i32 @__gxx_personality_v0(...)
