1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
3
4; #include <stdint.h>
5;
6; int foo(float *A, int n) {
7;   float sum = 0;
8;   for (intptr_t i=0; i < n; ++i) {
9;     sum += 7*A[i*4  ] +
10;            7*A[i*4+1] +
11;            7*A[i*4+2] +
12;            7*A[i*4+3];
13;   }
14;   return sum;
15; }
16
17; CHECK-LABEL: add_red
18; CHECK: fmul <4 x float>
19; CHECK: shufflevector <4 x float>
20
; Horizontal reduction: sum += 7*A[i*4] + 7*A[i*4+1] + 7*A[i*4+2] + 7*A[i*4+3].
; SLP should turn the four scalar fmul/fadd chains into one <4 x float> fmul
; plus a shuffle-based reduction (see CHECK lines above).
define i32 @add_red(float* %A, i32 %n) {
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Widen the trip count to i64 for the exit comparison.
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
  ; Base index of the 4-wide group: mul = i*4. The low two bits are zero,
  ; so the +1/+2/+3 offsets below can be formed with `or`.
  %mul = shl nsw i64 %i.033, 2
  %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
  %1 = load float, float* %arrayidx, align 4
  %mul2 = fmul float %1, 7.000000e+00             ; 7 * A[i*4]
  %add28 = or i64 %mul, 1
  %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
  %2 = load float, float* %arrayidx4, align 4
  %mul5 = fmul float %2, 7.000000e+00             ; 7 * A[i*4+1]
  %add6 = fadd fast float %mul2, %mul5
  %add829 = or i64 %mul, 2
  %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
  %3 = load float, float* %arrayidx9, align 4
  %mul10 = fmul float %3, 7.000000e+00            ; 7 * A[i*4+2]
  %add11 = fadd fast float %add6, %mul10
  %add1330 = or i64 %mul, 3
  %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
  %4 = load float, float* %arrayidx14, align 4
  %mul15 = fmul float %4, 7.000000e+00            ; 7 * A[i*4+3]
  %add16 = fadd fast float %add11, %mul15
  ; Accumulate the 4-element partial sum into the loop-carried total.
  %add17 = fadd fast float %sum.032, %add16
  %inc = add nsw i64 %i.033, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  ; Convert the float accumulator to the i32 return value.
  %phitmp = fptosi float %add17 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
65
66; int foo(float * restrict A, float * restrict B, int n) {
67;   float sum = 0;
68;   for (intptr_t i=0; i < n; ++i) {
69;     sum *= B[0]*A[i*4  ] +
70;       B[1]*A[i*4+1] +
71;       B[2]*A[i*4+2] +
72;       B[3]*A[i*4+3];
73;   }
74;   return sum;
75; }
76
77; CHECK-LABEL: mul_red
78; CHECK: fmul <4 x float>
79; CHECK: shufflevector <4 x float>
80
; Same 4-wide dot-product pattern as add_red, but with per-lane multipliers
; loaded from B[0..3] (hoisted to the preheader) and the loop-carried value
; combined with fmul instead of fadd. The inner expression should still
; vectorize to a <4 x float> fmul + shuffle reduction.
define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  ; Skip the loop when n <= 0.
  %cmp38 = icmp sgt i32 %n, 0
  br i1 %cmp38, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Loop-invariant coefficients B[0..3].
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
  ; mul = i*4; +1/+2/+3 offsets formed with `or` since the low bits are zero.
  %mul = shl nsw i64 %i.040, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul float %0, %5                        ; B[0] * A[i*4]
  %add35 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
  %6 = load float, float* %arrayidx6, align 4
  %mul7 = fmul float %1, %6                        ; B[1] * A[i*4+1]
  %add8 = fadd fast float %mul3, %mul7
  %add1136 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
  %7 = load float, float* %arrayidx12, align 4
  %mul13 = fmul float %2, %7                       ; B[2] * A[i*4+2]
  %add14 = fadd fast float %add8, %mul13
  %add1737 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul float %3, %8                       ; B[3] * A[i*4+3]
  %add20 = fadd fast float %add14, %mul19
  ; Loop-carried *multiplicative* accumulation (sum *= ...).
  %mul21 = fmul float %sum.039, %add20
  %inc = add nsw i64 %i.040, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %mul21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
132
133; int foo(float * restrict A, float * restrict B, int n) {
134;   float sum = 0;
135;   for (intptr_t i=0; i < n; ++i) {
136;     sum += B[0]*A[i*6  ] +
137;            B[1]*A[i*6+1] +
138;            B[2]*A[i*6+2] +
139;            B[3]*A[i*6+3] +
140;            B[4]*A[i*6+4] +
141;            B[5]*A[i*6+5] +
142;            B[6]*A[i*6+6] +
143;            B[7]*A[i*6+7] +
144;            B[8]*A[i*6+8];
145;   }
146;   return sum;
147; }
148
149; CHECK-LABEL: long_red
150; CHECK: fmul fast <8 x float>
151; CHECK: shufflevector <8 x float>
152
; Longer reduction: nine B[k]*A[i*6+k] products per iteration with a stride-6
; base (groups overlap between iterations). The CHECK lines expect an 8-wide
; <8 x float> fmul to be formed from the first eight lanes.
define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  ; Skip the loop when n <= 0.
  %cmp81 = icmp sgt i32 %n, 0
  br i1 %cmp81, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Loop-invariant coefficients B[0..8].
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx9, align 4
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx15, align 4
  %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
  %4 = load float, float* %arrayidx21, align 4
  %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
  %5 = load float, float* %arrayidx27, align 4
  %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
  %6 = load float, float* %arrayidx33, align 4
  %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
  %7 = load float, float* %arrayidx39, align 4
  %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
  %8 = load float, float* %arrayidx45, align 4
  %9 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
  ; Base index mul = i*6. Only bit 0 is known zero, so just the +1 offset can
  ; use `or`; the rest use `add nsw`.
  %mul = mul nsw i64 %i.083, 6
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %10 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %10                 ; B[0] * A[i*6]
  %add80 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
  %11 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %1, %11                 ; B[1] * A[i*6+1]
  %add8 = fadd fast float %mul3, %mul7
  %add11 = add nsw i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
  %12 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %2, %12                ; B[2] * A[i*6+2]
  %add14 = fadd fast float %add8, %mul13
  %add17 = add nsw i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
  %13 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %3, %13                ; B[3] * A[i*6+3]
  %add20 = fadd fast float %add14, %mul19
  %add23 = add nsw i64 %mul, 4
  %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
  %14 = load float, float* %arrayidx24, align 4
  %mul25 = fmul fast float %4, %14                ; B[4] * A[i*6+4]
  %add26 = fadd fast float %add20, %mul25
  %add29 = add nsw i64 %mul, 5
  %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
  %15 = load float, float* %arrayidx30, align 4
  %mul31 = fmul fast float %5, %15                ; B[5] * A[i*6+5]
  %add32 = fadd fast float %add26, %mul31
  %add35 = add nsw i64 %mul, 6
  %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
  %16 = load float, float* %arrayidx36, align 4
  %mul37 = fmul fast float %6, %16                ; B[6] * A[i*6+6]
  %add38 = fadd fast float %add32, %mul37
  %add41 = add nsw i64 %mul, 7
  %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
  %17 = load float, float* %arrayidx42, align 4
  %mul43 = fmul fast float %7, %17                ; B[7] * A[i*6+7]
  %add44 = fadd fast float %add38, %mul43
  %add47 = add nsw i64 %mul, 8
  %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
  %18 = load float, float* %arrayidx48, align 4
  %mul49 = fmul fast float %8, %18                ; B[8] * A[i*6+8] (9th, scalar tail)
  %add50 = fadd fast float %add44, %mul49
  ; Fold the per-iteration sum into the loop-carried accumulator.
  %add51 = fadd fast float %sum.082, %add50
  %inc = add nsw i64 %i.083, 1
  %exitcond = icmp eq i64 %inc, %9
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add51 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
239
240; int foo(float * restrict A, float * restrict B, int n) {
241;   float sum = 0;
242;   for (intptr_t i=0; i < n; ++i) {
243;     sum += B[0]*A[i*4  ];
244;     sum += B[1]*A[i*4+1];
245;     sum += B[2]*A[i*4+2];
246;     sum += B[3]*A[i*4+3];
247;   }
248;   return sum;
249; }
250
251; CHECK-LABEL: chain_red
252; CHECK: fmul fast <4 x float>
253; CHECK: shufflevector <4 x float>
254
; Chained reduction: the accumulator is updated after *each* product
; (sum += B[k]*A[i*4+k] as four separate statements) rather than once per
; iteration. SLP must recognize the fadd chain threaded through %add/%add9/
; %add15/%add21 as a reduction and still emit a <4 x float> fmul + shuffle.
define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
entry:
  ; Skip the loop when n <= 0.
  %cmp41 = icmp sgt i32 %n, 0
  br i1 %cmp41, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Loop-invariant coefficients B[0..3].
  %0 = load float, float* %B, align 4
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %1 = load float, float* %arrayidx4, align 4
  %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
  %2 = load float, float* %arrayidx10, align 4
  %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
  %3 = load float, float* %arrayidx16, align 4
  %4 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
  ; mul = i*4; low bits zero, so offsets use `or`.
  %mul = shl nsw i64 %i.043, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %5 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %0, %5                   ; B[0] * A[i*4]
  %add = fadd fast float %sum.042, %mul3           ; sum += ... (1st link in chain)
  %add638 = or i64 %mul, 1
  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
  %6 = load float, float* %arrayidx7, align 4
  %mul8 = fmul fast float %1, %6                   ; B[1] * A[i*4+1]
  %add9 = fadd fast float %add, %mul8
  %add1239 = or i64 %mul, 2
  %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
  %7 = load float, float* %arrayidx13, align 4
  %mul14 = fmul fast float %2, %7                  ; B[2] * A[i*4+2]
  %add15 = fadd fast float %add9, %mul14
  %add1840 = or i64 %mul, 3
  %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
  %8 = load float, float* %arrayidx19, align 4
  %mul20 = fmul fast float %3, %8                  ; B[3] * A[i*4+3]
  %add21 = fadd fast float %add15, %mul20          ; final link, feeds the phi
  %inc = add nsw i64 %i.043, 1
  %exitcond = icmp eq i64 %inc, %4
  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body

for.cond.for.end_crit_edge:
  %phitmp = fptosi float %add21 to i32
  br label %for.end

for.end:
  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
  ret i32 %sum.0.lcssa
}
306
307; void foo(const float *arg_A, unsigned arg_B, float *array) {
308;   for (uint32_t i = 0; i < 6; ++i) {
309;     const float *ptr = arg_A + i;
310;     float w0 = array[i * 4 + 0];
311;     float w1 = array[i * 4 + 1];
312;     float w2 = array[i * 4 + 2];
313;     float w3 = array[i * 4 + 3];
314;
315;     for (unsigned j = 0; j < arg_B; ++j) {
316;       const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
317;       const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
318;       const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
319;       const float x4 = x3 + (-4.0f * w2) + w3;
320;       w1 = w0;
321;       w0 = x1;
322;       w3 = w2;
323;       w2 = x3;
324;     }
325;
326;     array[i * 4 + 0] = w0;
327;     array[i * 4 + 1] = w1;
328;     array[i * 4 + 2] = w2;
329;     array[i * 4 + 3] = w3;
330;   }
331; }
332
; Nested-loop recurrence over four scalars w0..w3 (see pseudo-C above).
; The outer loop loads/stores a 4-wide group of %array; the inner loop runs
; a filter-like update where each iteration permutes and recombines the four
; values. The inner-loop fmuls should vectorize to a <4 x float> fmul plus a
; shufflevector (per the CHECK lines).
define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
; CHECK-LABEL: @foo(
; CHECK: fmul fast <4 x float>
; CHECK: shufflevector <4 x float>
;
entry:
  ; Hoisted inner-loop guard: arg_B == 0 means the inner loop never runs.
  %cmp1495 = icmp eq i32 %arg_B, 0
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.cond.cleanup15
  ret void

for.body:                                         ; preds = %for.cond.cleanup15, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
  ; Load w0..w3 = array[i*4 + 0..3]; offsets formed with `or` (low bits zero).
  %0 = shl i64 %indvars.iv, 2
  %arrayidx = getelementptr inbounds float, float* %array, i64 %0
  %1 = load float, float* %arrayidx, align 4      ; w0
  %2 = or i64 %0, 1
  %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
  %3 = load float, float* %arrayidx4, align 4     ; w1
  %4 = or i64 %0, 2
  %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
  %5 = load float, float* %arrayidx8, align 4     ; w2
  %6 = or i64 %0, 3
  %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
  %7 = load float, float* %arrayidx12, align 4    ; w3
  br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph

for.body16.lr.ph:                                 ; preds = %for.body
  ; Loop-invariant input sample *ptr = arg_A[i].
  %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
  %8 = load float, float* %add.ptr, align 4
  br label %for.body16

for.cond.cleanup15:                               ; preds = %for.body16, %for.body
  ; Merge the final w0..w3 (original values if the inner loop was skipped)
  ; and store them back to the same array slots.
  %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
  %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
  %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
  %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
  store float %w0.0.lcssa, float* %arrayidx, align 4
  store float %w1.0.lcssa, float* %arrayidx4, align 4
  store float %w2.0.lcssa, float* %arrayidx8, align 4
  store float %w3.0.lcssa, float* %arrayidx12, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond109 = icmp eq i64 %indvars.iv.next, 6  ; outer loop runs exactly 6 times
  br i1 %exitcond109, label %for.cond.cleanup, label %for.body

for.body16:                                       ; preds = %for.body16, %for.body16.lr.ph
  ; Cross-iteration state: (w0,w1,w2,w3) <- (x1, old w0, x3, old w2).
  %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
  %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
  %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
  %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
  %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
  ; Hex float constants are the 1.1f/1.2f/2.1f/... coefficients from the
  ; pseudo-C, with subtractions folded into fadds of negated products.
  %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
  %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
  %sub92 = fadd fast float %mul17, %mul18.neg
  %sub19 = fadd fast float %sub92, %8             ; x1
  %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
  %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
  %mul23 = fmul fast float %w1.099, 0x4002666660000000
  %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
  %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
  %add2293 = fadd fast float %mul27.neg, %mul25
  %add24 = fadd fast float %add2293, %mul23
  %sub2694 = fadd fast float %add24, %mul21.neg
  %sub28 = fadd fast float %sub2694, %mul20       ; x3
  %inc = add nuw i32 %j.098, 1
  %exitcond = icmp eq i32 %inc, %arg_B
  br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
}
402
403; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
404
405; void foo(double * restrict A, double * restrict B, double * restrict C,
406;          int n) {
407;   for (intptr_t i=0; i < n; ++i) {
408;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
409;   }
410; }
411
412; STORE-LABEL: store_red_double
413; STORE: fmul fast <2 x double>
414; STORE: extractelement <2 x double>
415; STORE: extractelement <2 x double>
416
; 2-wide double dot product stored per iteration:
;   C[i] = B[0]*A[i*4] + B[1]*A[i*4+1]
; With -slp-vectorize-hor-store the STORE prefix expects a <2 x double> fmul
; followed by two extractelements for the horizontal add.
define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
entry:
  ; Skip the loop when n <= 0.
  %cmp17 = icmp sgt i32 %n, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Loop-invariant coefficients B[0..1].
  %0 = load double, double* %B, align 8
  %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
  %1 = load double, double* %arrayidx4, align 8
  %2 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  ; mul = i*4; the +1 offset uses `or` since the low bits are zero.
  %mul = shl nsw i64 %i.018, 2
  %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
  %3 = load double, double* %arrayidx2, align 8
  %mul3 = fmul fast double %0, %3                  ; B[0] * A[i*4]
  %add16 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
  %4 = load double, double* %arrayidx6, align 8
  %mul7 = fmul fast double %1, %4                  ; B[1] * A[i*4+1]
  %add8 = fadd fast double %mul3, %mul7
  ; The reduction result is stored, not accumulated -- this is what makes it
  ; reachable only via -slp-vectorize-hor-store.
  %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
  store double %add8, double* %arrayidx9, align 8
  %inc = add nsw i64 %i.018, 1
  %exitcond = icmp eq i64 %inc, %2
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}
449
450; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
451;   float sum = 0;
452;   for (intptr_t i=0; i < n; ++i) {
453;     C[i] = B[0] *A[i*4  ] +
454;          B[1] *A[i*4+1] +
455;          B[2] *A[i*4+2] +
456;          B[3] *A[i*4+3];
457;   }
458;   return sum;
459; }
460
461; STORE-LABEL: store_red
462; STORE: fmul fast <4 x float>
463; STORE: shufflevector <4 x float>
464
; 4-wide float dot product stored per iteration through a moving pointer:
;   C[i] = B[0]*A[i*4] + B[1]*A[i*4+1] + B[2]*A[i*4+2] + B[3]*A[i*4+3]
; Unlike mul_red, the B[k] loads sit inside the loop body, interleaved with
; the A loads. STORE expects a <4 x float> fmul + shuffle reduction.
define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
entry:
  ; Skip the loop when n <= 0.
  %cmp37 = icmp sgt i32 %n, 0
  br i1 %cmp37, label %for.body.lr.ph, label %for.end

for.body.lr.ph:
  ; Only the B addresses are hoisted; the loads themselves stay in the loop.
  %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
  %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
  %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
  %0 = sext i32 %n to i64
  br label %for.body

for.body:
  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  ; C is walked with a pointer-increment phi rather than an indexed GEP.
  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %1 = load float, float* %B, align 4
  %mul = shl nsw i64 %i.039, 2
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
  %2 = load float, float* %arrayidx2, align 4
  %mul3 = fmul fast float %1, %2                   ; B[0] * A[i*4]
  %3 = load float, float* %arrayidx4, align 4
  %add34 = or i64 %mul, 1
  %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
  %4 = load float, float* %arrayidx6, align 4
  %mul7 = fmul fast float %3, %4                   ; B[1] * A[i*4+1]
  %add8 = fadd fast float %mul3, %mul7
  %5 = load float, float* %arrayidx9, align 4
  %add1135 = or i64 %mul, 2
  %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
  %6 = load float, float* %arrayidx12, align 4
  %mul13 = fmul fast float %5, %6                  ; B[2] * A[i*4+2]
  %add14 = fadd fast float %add8, %mul13
  %7 = load float, float* %arrayidx15, align 4
  %add1736 = or i64 %mul, 3
  %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
  %8 = load float, float* %arrayidx18, align 4
  %mul19 = fmul fast float %7, %8                  ; B[3] * A[i*4+3]
  %add20 = fadd fast float %add14, %mul19
  ; Store the reduction result and advance the output pointer.
  store float %add20, float* %C.addr.038, align 4
  %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
  %inc = add nsw i64 %i.039, 1
  %exitcond = icmp eq i64 %inc, %0
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 0
}
512
; Zero-initialized global arrays used by the *_red_example* functions below;
; the 16-byte alignment permits aligned vector loads of the leading elements.
@arr_i32 = global [32 x i32] zeroinitializer, align 16
@arr_float = global [32 x float] zeroinitializer, align 16
515
; Straight-line fast-math fadd chain over @arr_float[0..3], result stored to
; %res. With -slp-vectorize-hor-store this should become one <4 x float>
; load plus a log2-step shuffle reduction (autogenerated STORE lines below).
define void @float_red_example4(float* %res) {
; STORE-LABEL: @float_red_example4(
; STORE:         [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
; STORE:         [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; STORE-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; STORE:         store float [[TMP1]], float* %res, align 16
; STORE-NEXT:    ret void
;
entry:
  ; Scalar chain: ((arr[1]+arr[0]) + arr[2]) + arr[3].
  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  store float %add.2, float* %res, align 16
  ret void
}
538
; 8-element variant of float_red_example4: fadd chain over @arr_float[0..7],
; expected to become a <8 x float> load + 3-step shuffle reduction.
define void @float_red_example8(float* %res) {
; STORE-LABEL: @float_red_example8(
; STORE:         [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
; STORE:         [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP0]], [[RDX_SHUF]]
; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; STORE-NEXT:    [[TMP1:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; STORE:         store float [[TMP1]], float* %res, align 16
; STORE-NEXT:    ret void
;
entry:
  ; Scalar chain over arr_float[0..7].
  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  store float %add.6, float* %res, align 16
  ret void
}
571
; 16-element variant: fadd chain over @arr_float[0..15], expected to become a
; <16 x float> load + 4-step shuffle reduction.
define void @float_red_example16(float* %res) {
; STORE-LABEL: @float_red_example16(
; STORE:         [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
; STORE:         [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP0]], [[RDX_SHUF]]
; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; STORE-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; STORE-NEXT:    [[TMP1:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
; STORE:         store float [[TMP1]], float* %res, align 16
; STORE-NEXT:    ret void
;
entry:
  ; Scalar chain over arr_float[0..15].
  %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
  %add = fadd fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
  %add.1 = fadd fast float %2, %add
  %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
  %add.2 = fadd fast float %3, %add.1
  %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
  %add.3 = fadd fast float %4, %add.2
  %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
  %add.4 = fadd fast float %5, %add.3
  %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
  %add.5 = fadd fast float %6, %add.4
  %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
  %add.6 = fadd fast float %7, %add.5
  %8 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
  %add.7 = fadd fast float %8, %add.6
  %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
  %add.8 = fadd fast float %9, %add.7
  %10 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
  %add.9 = fadd fast float %10, %add.8
  %11 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
  %add.10 = fadd fast float %11, %add.9
  %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
  %add.11 = fadd fast float %12, %add.10
  %13 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
  %add.12 = fadd fast float %13, %add.11
  %14 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
  %add.13 = fadd fast float %14, %add.12
  %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
  %add.14 = fadd fast float %15, %add.13
  store float %add.14, float* %res, align 16
  ret void
}
622
; Integer counterpart of float_red_example4: `add nsw` chain over
; @arr_i32[0..3]. Note the vectorized reduction drops nsw (plain `add` in
; the STORE lines), which is the conservative legal choice.
define void @i32_red_example4(i32* %res) {
; STORE-LABEL: @i32_red_example4(
; STORE:         [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
; STORE:         [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP0]], [[RDX_SHUF]]
; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; STORE-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
; STORE:         store i32 [[TMP1]], i32* %res, align 16
; STORE-NEXT:    ret void
;
entry:
  ; Scalar chain over arr_i32[0..3].
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  store i32 %add.2, i32* %res, align 16
  ret void
}
645
; 8-element integer reduction over @arr_i32[0..7]; expected to become an
; <8 x i32> load + 3-step shuffle reduction (nsw dropped in vector form).
define void @i32_red_example8(i32* %res) {
; STORE-LABEL: @i32_red_example8(
; STORE:         [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
; STORE:         [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP0]], [[RDX_SHUF]]
; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; STORE-NEXT:    [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; STORE:         store i32 [[TMP1]], i32* %res, align 16
; STORE-NEXT:    ret void
;
entry:
  ; Scalar chain over arr_i32[0..7].
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  store i32 %add.6, i32* %res, align 16
  ret void
}
678
; Same pattern as @i32_red_example8 widened to 16 elements: a scalar add
; chain over @arr_i32[0..15] stored to %res.  Expected STORE output is one
; <16 x i32> load plus a 4-step (log2 16) shuffle/add reduction tree ending
; in an extractelement of lane 0.
define void @i32_red_example16(i32* %res) {
; STORE-LABEL: @i32_red_example16(
; STORE:         [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
; STORE:         [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP0]], [[RDX_SHUF]]
; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX4:%.*]] = add <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; STORE-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX6:%.*]] = add <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; STORE-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
; STORE:         store i32 [[TMP1]], i32* %res, align 16
; STORE-NEXT:    ret void
;
entry:
  ; Sixteen consecutive scalar loads feeding a single nsw add chain; the
  ; final accumulator is %add.14.
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  store i32 %add.14, i32* %res, align 16
  ret void
}
729
; Widest variant: a scalar add chain over all 32 elements of @arr_i32,
; stored to %res.  Expected STORE output is one <32 x i32> load plus a
; 5-step (log2 32) shuffle/add reduction tree; this stresses the
; vectorizer's handling of reductions wider than the target's native
; vector registers (corei7-avx from the RUN line).
define void @i32_red_example32(i32* %res) {
; STORE-LABEL: @i32_red_example32(
; STORE:         [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
; STORE:         [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX:%.*]] = add <32 x i32> [[TMP0]], [[RDX_SHUF]]
; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX4:%.*]] = add <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; STORE-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX6:%.*]] = add <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
; STORE-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; STORE-NEXT:    [[BIN_RDX8:%.*]] = add <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
; STORE-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
; STORE:         store i32 [[TMP1]], i32* %res, align 16
; STORE-NEXT:    ret void
;
entry:
  ; Thirty-two consecutive scalar loads feeding one nsw add chain; the
  ; final accumulator is %add.30.
  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
  %add = add nsw i32 %1, %0
  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
  %add.1 = add nsw i32 %2, %add
  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
  %add.2 = add nsw i32 %3, %add.1
  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
  %add.3 = add nsw i32 %4, %add.2
  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
  %add.4 = add nsw i32 %5, %add.3
  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
  %add.5 = add nsw i32 %6, %add.4
  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
  %add.6 = add nsw i32 %7, %add.5
  %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
  %add.7 = add nsw i32 %8, %add.6
  %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
  %add.8 = add nsw i32 %9, %add.7
  %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
  %add.9 = add nsw i32 %10, %add.8
  %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
  %add.10 = add nsw i32 %11, %add.9
  %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
  %add.11 = add nsw i32 %12, %add.10
  %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
  %add.12 = add nsw i32 %13, %add.11
  %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
  %add.13 = add nsw i32 %14, %add.12
  %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
  %add.14 = add nsw i32 %15, %add.13
  %16 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
  %add.15 = add nsw i32 %16, %add.14
  %17 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
  %add.16 = add nsw i32 %17, %add.15
  %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
  %add.17 = add nsw i32 %18, %add.16
  %19 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
  %add.18 = add nsw i32 %19, %add.17
  %20 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
  %add.19 = add nsw i32 %20, %add.18
  %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
  %add.20 = add nsw i32 %21, %add.19
  %22 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
  %add.21 = add nsw i32 %22, %add.20
  %23 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
  %add.22 = add nsw i32 %23, %add.21
  %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
  %add.23 = add nsw i32 %24, %add.22
  %25 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
  %add.24 = add nsw i32 %25, %add.23
  %26 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
  %add.25 = add nsw i32 %26, %add.24
  %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
  %add.26 = add nsw i32 %27, %add.25
  %28 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
  %add.27 = add nsw i32 %28, %add.26
  %29 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
  %add.28 = add nsw i32 %29, %add.27
  %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
  %add.29 = add nsw i32 %30, %add.28
  %31 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
  %add.30 = add nsw i32 %31, %add.29
  store i32 %add.30, i32* %res, align 16
  ret void
}
814
815