; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=scalar-epilogue -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \
; RUN:   -pass-remarks-missed=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S 2>%t | FileCheck %s -check-prefix=CHECK
; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK
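
; The first RUN line checks the vectorized IR; the pass remarks are printed to
; stderr, which '2>%t' captures into %t, and the second FileCheck invocation
; verifies them against the CHECK-REMARK lines.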

; Reduction can be vectorized

; ADD
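
; A rough C equivalent of @add, for orientation (an illustrative sketch only,
; not necessarily the exact source this IR was generated from):
;
;   int add(int *a, int *b, long n) { /* b is unused, kept to match the IR */
;     int sum = 2;                    /* matches the phi's initial value     */
;     for (long i = 0; i < n; ++i)
;       sum += a[i];                  /* reduction recognised and vectorized */
;     return sum;
;   }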

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @add
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[ADD1:.*]] = add <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %[[ADD]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                 ; preds = %for.body, %entry
  ret i32 %add
}

; OR

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @or
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[OR1:.*]] = or <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[OR2:.*]] = or <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR2]], %[[OR1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> %[[OR]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %or, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
  %0 = load i32, i32* %arrayidx, align 4
  %or = or i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                 ; preds = %for.body, %entry
  ret i32 %or
}

; AND

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @and
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[AND1:.*]] = and <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[AND2:.*]] = and <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[AND:.*]] = and <vscale x 8 x i32> %[[AND2]], %[[AND1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %[[AND]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %and, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
  %0 = load i32, i32* %arrayidx, align 4
  %and = and i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                 ; preds = %for.body, %entry
  ret i32 %and
}

; XOR

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @xor
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[XOR1:.*]] = xor <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[XOR2:.*]] = xor <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR2]], %[[XOR1]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %[[XOR]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %xor, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
  %0 = load i32, i32* %arrayidx, align 4
  %xor = xor i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                 ; preds = %for.body, %entry
  ret i32 %xor
}

; SMIN

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @smin
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[ICMP1:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[ICMP2:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ICMP:.*]] = icmp slt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> %[[SEL]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.010 = phi i32 [ 2, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp.i = icmp slt i32 %0, %sum.010
  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret i32 %.sroa.speculated
}

; UMAX

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @umax
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
; CHECK: %[[ICMP1:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[ICMP2:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ICMP:.*]] = icmp ugt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> %[[SEL]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.010 = phi i32 [ 2, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp.i = icmp ugt i32 %0, %sum.010
  %.sroa.speculated = select i1 %cmp.i, i32 %0, i32 %sum.010
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret i32 %.sroa.speculated
}

; FADD (FAST)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fadd_fast(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
; CHECK: %[[ADD1:.*]] = fadd fast <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
  %0 = load float, float* %arrayidx, align 4
  %add = fadd fast float %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %add
}

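; FADD (FAST) on bfloat. As the remarks below show, scalable vectorization is
; not supported for the bf16 reduction, so the vectorizer falls back to a
; fixed-width VF of 8.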
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast_bfloat
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <8 x bfloat>
; CHECK: %[[LOAD2:.*]] = load <8 x bfloat>
; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds bfloat, bfloat* %a, i64 %iv
  %0 = load bfloat, bfloat* %arrayidx, align 4
  %add = fadd fast bfloat %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret bfloat %add
}

; FMIN (FAST)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmin_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-LABEL: @fmin_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
  %0 = load float, float* %arrayidx, align 4
  %cmp.i = fcmp olt float %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %.sroa.speculated
}

; FMAX (FAST)

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmax_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-LABEL: @fmax_fast
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD2]]
; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %.sroa.speculated, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
  %0 = load float, float* %arrayidx, align 4
  %cmp.i = fcmp fast ogt float %0, %sum.07
  %.sroa.speculated = select i1 %cmp.i, float %0, float %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret float %.sroa.speculated
}

; ADD (with reduction stored in invariant address)
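; The scalar loop stores the running sum to the same loop-invariant address on
; every iteration; after vectorization only the final reduced value is stored,
; as checked in middle.block below.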

; CHECK-REMARK: vectorized loop (vectorization width: vscale x 4, interleaved count: 2)
define void @invariant_store(i32* %dst, i32* readonly %src) {
; CHECK-LABEL: @invariant_store
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>
; CHECK: %[[LOAD2:.*]] = load <vscale x 4 x i32>
; CHECK: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[ADD:.*]] = add <vscale x 4 x i32> %[[ADD2]], %[[ADD1]]
; CHECK-NEXT: %[[SUM:.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %[[ADD]])
; CHECK-NEXT: store i32 %[[SUM]], i32* %gep.dst, align 4
entry:
  %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
  store i32 0, i32* %gep.dst, align 4
  br label %for.body
for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
  %0 = load i32, i32* %gep.src, align 4
  %add = add nsw i32 %sum, %0
  store i32 %add, i32* %gep.dst, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1000
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:
  ret void
}

; Reduction cannot be vectorized with scalable vectors; fixed-width
; vectorization is used instead

; MUL

; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mul
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum.07 = phi i32 [ 2, %entry ], [ %mul, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
  %0 = load i32, i32* %arrayidx, align 4
  %mul = mul nsw i32 %0, %sum.07
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                 ; preds = %for.body, %entry
  ret i32 %mul
}

; Note: This test was added to ensure we always check the legality of
; reductions (and emit a warning if necessary) before checking for memory
; dependencies.
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
; CHECK: %[[LOAD3:.*]] = load <4 x i32>
; CHECK: %[[LOAD4:.*]] = load <4 x i32>
; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
; CHECK: middle.block:
; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum = phi i32 [ %mul, %for.body ], [ 2, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 %i
  %1 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %add2 = add nuw nsw i64 %i, 32
  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 %add2
  store i32 %add, i32* %arrayidx3, align 4
  %mul = mul nsw i32 %1, %sum
  %inc = add nuw nsw i64 %i, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:
  ret i32 %mul
}

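; #0 marks @fmin_fast and @fmax_fast as having no NaNs and no signed zeros,
; which lets their fcmp/select patterns be recognised as min/max reductions.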
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }

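; Loop hints: request a vectorization width of 8 with scalable vectors
; enabled, an interleave count of 2, and explicitly enable vectorization.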
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.vectorize.width", i32 8}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = !{!"llvm.loop.interleave.count", i32 2}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}