1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -loop-vectorize -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -tail-predication=enabled < %s -S -o - | FileCheck %s
3
4target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
5target triple = "thumbv8.1m.main-arm-none-eabi"
6
7; Should not be vectorized
; Sums the i64 elements x[0..n-1] into an i64 accumulator and returns the
; total; when n <= 0 the entry guard branches straight to for.cond.cleanup
; and 0 is returned. The assertions below expect the scalar for.body loop
; to come out unchanged, with no vector.ph or vector.body blocks.
8define i64 @add_i64_i64(i64* nocapture readonly %x, i32 %n) #0 {
9; CHECK-LABEL: @add_i64_i64(
10; CHECK-NEXT:  entry:
11; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
12; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
13; CHECK:       for.body:
14; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
15; CHECK-NEXT:    [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
16; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_08]]
17; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
18; CHECK-NEXT:    [[ADD]] = add nsw i64 [[TMP0]], [[R_07]]
19; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
20; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
21; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
22; CHECK:       for.cond.cleanup:
23; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
24; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
25;
26entry:
27  %cmp6 = icmp sgt i32 %n, 0
28  br i1 %cmp6, label %for.body, label %for.cond.cleanup
29
30for.body:                                         ; preds = %entry, %for.body
31  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
32  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
33  %arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.08
34  %0 = load i64, i64* %arrayidx, align 8
35  %add = add nsw i64 %0, %r.07
36  %inc = add nuw nsw i32 %i.08, 1
37  %exitcond = icmp eq i32 %inc, %n
38  br i1 %exitcond, label %for.cond.cleanup, label %for.body
39
40for.cond.cleanup:                                 ; preds = %for.body, %entry
41  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
42  ret i64 %r.0.lcssa
43}
44
45; 4x to use VADDLV
46; FIXME: TailPredicate
; Sums x[0..n-1], sign-extending each i32 element to i64 before adding it to
; the i64 accumulator; returns 0 when n <= 0.
; The assertions below expect a vector factor of 4: a plain <4 x i32> load,
; a sext to <4 x i64>, and an in-loop llvm.vector.reduce.add.v4i64 whose
; result is added to the scalar i64 phi. No active-lane mask is expected, so
; a min-iteration check and a scalar remainder loop (scalar.ph / for.body)
; are still present.
47define i64 @add_i32_i64(i32* nocapture readonly %x, i32 %n) #0 {
48; CHECK-LABEL: @add_i32_i64(
49; CHECK-NEXT:  entry:
50; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
51; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
52; CHECK:       for.body.preheader:
53; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
54; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
55; CHECK:       vector.ph:
56; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
57; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
58; CHECK:       vector.body:
59; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
60; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
61; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
62; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
63; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
64; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
65; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
66; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
67; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
68; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
69; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
70; CHECK:       middle.block:
71; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
72; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
73; CHECK:       scalar.ph:
74; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
75; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
76; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
77; CHECK:       for.body:
78; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
79; CHECK-NEXT:    [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
80; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
81; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
82; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP6]] to i64
83; CHECK-NEXT:    [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
84; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
85; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
86; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
87; CHECK:       for.cond.cleanup:
88; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
89; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
90;
91entry:
92  %cmp6 = icmp sgt i32 %n, 0
93  br i1 %cmp6, label %for.body, label %for.cond.cleanup
94
95for.body:                                         ; preds = %entry, %for.body
96  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
97  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
98  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
99  %0 = load i32, i32* %arrayidx, align 4
100  %conv = sext i32 %0 to i64
101  %add = add nsw i64 %r.07, %conv
102  %inc = add nuw nsw i32 %i.08, 1
103  %exitcond = icmp eq i32 %inc, %n
104  br i1 %exitcond, label %for.cond.cleanup, label %for.body
105
106for.cond.cleanup:                                 ; preds = %for.body, %entry
107  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
108  ret i64 %r.0.lcssa
109}
110
111; 4x to use VADDLV
112; FIXME: TailPredicate
; Sums x[0..n-1], sign-extending each i16 element to i64 before adding it to
; the i64 accumulator; returns 0 when n <= 0.
; The assertions below expect a vector factor of 4: a plain <4 x i16> load,
; a sext to <4 x i64>, and an in-loop llvm.vector.reduce.add.v4i64. No
; active-lane mask is expected, so a min-iteration check and a scalar
; remainder loop (scalar.ph / for.body) are still present.
113define i64 @add_i16_i64(i16* nocapture readonly %x, i32 %n) #0 {
114; CHECK-LABEL: @add_i16_i64(
115; CHECK-NEXT:  entry:
116; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
117; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
118; CHECK:       for.body.preheader:
119; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
120; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
121; CHECK:       vector.ph:
122; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
123; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
124; CHECK:       vector.body:
125; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
126; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
127; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
128; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
129; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
130; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
131; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
132; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
133; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
134; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
135; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
136; CHECK:       middle.block:
137; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
138; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
139; CHECK:       scalar.ph:
140; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
141; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
142; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
143; CHECK:       for.body:
144; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
145; CHECK-NEXT:    [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
146; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]]
147; CHECK-NEXT:    [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
148; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP6]] to i64
149; CHECK-NEXT:    [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
150; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
151; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
152; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
153; CHECK:       for.cond.cleanup:
154; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
155; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
156;
157entry:
158  %cmp6 = icmp sgt i32 %n, 0
159  br i1 %cmp6, label %for.body, label %for.cond.cleanup
160
161for.body:                                         ; preds = %entry, %for.body
162  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
163  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
164  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
165  %0 = load i16, i16* %arrayidx, align 2
166  %conv = sext i16 %0 to i64
167  %add = add nsw i64 %r.07, %conv
168  %inc = add nuw nsw i32 %i.08, 1
169  %exitcond = icmp eq i32 %inc, %n
170  br i1 %exitcond, label %for.cond.cleanup, label %for.body
171
172for.cond.cleanup:                                 ; preds = %for.body, %entry
173  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
174  ret i64 %r.0.lcssa
175}
176
177; 4x to use VADDLV
178; FIXME: TailPredicate
; Sums x[0..n-1], zero-extending each i8 element to i64 before adding it to
; the i64 accumulator; returns 0 when n <= 0.
; The assertions below expect a vector factor of 4: a plain <4 x i8> load,
; a zext to <4 x i64>, and an in-loop llvm.vector.reduce.add.v4i64. No
; active-lane mask is expected, so a min-iteration check and a scalar
; remainder loop (scalar.ph / for.body) are still present.
179define i64 @add_i8_i64(i8* nocapture readonly %x, i32 %n) #0 {
180; CHECK-LABEL: @add_i8_i64(
181; CHECK-NEXT:  entry:
182; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
183; CHECK-NEXT:    br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
184; CHECK:       for.body.preheader:
185; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
186; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
187; CHECK:       vector.ph:
188; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
189; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
190; CHECK:       vector.body:
191; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
192; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
193; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
194; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
195; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
196; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
197; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
198; CHECK-NEXT:    [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
199; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
200; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
201; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
202; CHECK:       middle.block:
203; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
204; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
205; CHECK:       scalar.ph:
206; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
207; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
208; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
209; CHECK:       for.body:
210; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
211; CHECK-NEXT:    [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
212; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_08]]
213; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
214; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP6]] to i64
215; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]]
216; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
217; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
218; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
219; CHECK:       for.cond.cleanup:
220; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
221; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
222;
223entry:
224  %cmp6 = icmp sgt i32 %n, 0
225  br i1 %cmp6, label %for.body, label %for.cond.cleanup
226
227for.body:                                         ; preds = %entry, %for.body
228  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
229  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
230  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
231  %0 = load i8, i8* %arrayidx, align 1
232  %conv = zext i8 %0 to i64
233  %add = add nuw nsw i64 %r.07, %conv
234  %inc = add nuw nsw i32 %i.08, 1
235  %exitcond = icmp eq i32 %inc, %n
236  br i1 %exitcond, label %for.cond.cleanup, label %for.body
237
238for.cond.cleanup:                                 ; preds = %for.body, %entry
239  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
240  ret i64 %r.0.lcssa
241}
242
243; 4x to use VADDV.u32
; Sums the i32 elements x[0..n-1] into an i32 accumulator; returns 0 when
; n <= 0.
; The assertions below expect a tail-folded loop with a vector factor of 4:
; llvm.get.active.lane.mask predicates a masked <4 x i32> load whose
; inactive lanes are zeroinitializer, the loaded vector goes straight into
; llvm.vector.reduce.add.v4i32, and no scalar remainder loop is left (the
; trip count is rounded up to a multiple of 4 in vector.ph).
244define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 {
245; CHECK-LABEL: @add_i32_i32(
246; CHECK-NEXT:  entry:
247; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
248; CHECK-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
249; CHECK:       vector.ph:
250; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
251; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
252; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
253; CHECK:       vector.body:
254; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
255; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
256; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
257; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
258; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
259; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
260; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
261; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
262; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
263; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
264; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
265; CHECK:       for.cond.cleanup:
266; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
267; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
268;
269entry:
270  %cmp6 = icmp sgt i32 %n, 0
271  br i1 %cmp6, label %for.body, label %for.cond.cleanup
272
273for.body:                                         ; preds = %entry, %for.body
274  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
275  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
276  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
277  %0 = load i32, i32* %arrayidx, align 4
278  %add = add nsw i32 %0, %r.07
279  %inc = add nuw nsw i32 %i.08, 1
280  %exitcond = icmp eq i32 %inc, %n
281  br i1 %exitcond, label %for.cond.cleanup, label %for.body
282
283for.cond.cleanup:                                 ; preds = %for.body, %entry
284  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
285  ret i32 %r.0.lcssa
286}
287
288; 8x to use VADDV.u16
; Sums x[0..n-1], sign-extending each i16 element to i32 before adding it to
; the i32 accumulator; returns 0 when n <= 0.
; The assertions below expect a tail-folded loop with a vector factor of 8:
; llvm.get.active.lane.mask predicates a masked <8 x i16> load (poison in
; inactive lanes), the result is sign-extended to <8 x i32>, a select on the
; same mask zeroes the inactive lanes, and llvm.vector.reduce.add.v8i32
; feeds the accumulator. No scalar remainder loop is left.
289define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 {
290; CHECK-LABEL: @add_i16_i32(
291; CHECK-NEXT:  entry:
292; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
293; CHECK-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
294; CHECK:       vector.ph:
295; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
296; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
297; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
298; CHECK:       vector.body:
299; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
300; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
301; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
302; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
303; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
304; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
305; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
306; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer
307; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
308; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
309; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
310; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
311; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
312; CHECK:       for.cond.cleanup:
313; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
314; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
315;
316entry:
317  %cmp6 = icmp sgt i32 %n, 0
318  br i1 %cmp6, label %for.body, label %for.cond.cleanup
319
320for.body:                                         ; preds = %entry, %for.body
321  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
322  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
323  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
324  %0 = load i16, i16* %arrayidx, align 2
325  %conv = sext i16 %0 to i32
326  %add = add nsw i32 %r.07, %conv
327  %inc = add nuw nsw i32 %i.08, 1
328  %exitcond = icmp eq i32 %inc, %n
329  br i1 %exitcond, label %for.cond.cleanup, label %for.body
330
331for.cond.cleanup:                                 ; preds = %for.body, %entry
332  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
333  ret i32 %r.0.lcssa
334}
335
336; 16x to use VADDV.u8
; Sums x[0..n-1], zero-extending each i8 element to i32 before adding it to
; the i32 accumulator; returns 0 when n <= 0.
; The assertions below expect a tail-folded loop with a vector factor of 16:
; llvm.get.active.lane.mask predicates a masked <16 x i8> load (poison in
; inactive lanes), the result is zero-extended to <16 x i32>, a select on
; the same mask zeroes the inactive lanes, and
; llvm.vector.reduce.add.v16i32 feeds the accumulator. No scalar remainder
; loop is left.
337define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 {
338; CHECK-LABEL: @add_i8_i32(
339; CHECK-NEXT:  entry:
340; CHECK-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
341; CHECK-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
342; CHECK:       vector.ph:
343; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
344; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
345; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
346; CHECK:       vector.body:
347; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
348; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
349; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
350; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
351; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
352; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
353; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
354; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
355; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
356; CHECK-NEXT:    [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
357; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
358; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
359; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
360; CHECK:       for.cond.cleanup:
361; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
362; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
363;
364entry:
365  %cmp6 = icmp sgt i32 %n, 0
366  br i1 %cmp6, label %for.body, label %for.cond.cleanup
367
368for.body:                                         ; preds = %entry, %for.body
369  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
370  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
371  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
372  %0 = load i8, i8* %arrayidx, align 1
373  %conv = zext i8 %0 to i32
374  %add = add nuw nsw i32 %r.07, %conv
375  %inc = add nuw nsw i32 %i.08, 1
376  %exitcond = icmp eq i32 %inc, %n
377  br i1 %exitcond, label %for.cond.cleanup, label %for.body
378
379for.cond.cleanup:                                 ; preds = %for.body, %entry
380  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
381  ret i32 %r.0.lcssa
382}
383
384; 8x to use VADDV.u16
; Sums the i16 elements x[0..n-1] into an i16 accumulator (the add carries
; no nsw/nuw flags); returns 0 when n <= 0.
; The assertions below expect a tail-folded loop with a vector factor of 8:
; llvm.get.active.lane.mask predicates a masked <8 x i16> load whose
; inactive lanes are zeroinitializer, the loaded vector goes straight into
; llvm.vector.reduce.add.v8i16, and no scalar remainder loop is left.
385define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 {
386; CHECK-LABEL: @add_i16_i16(
387; CHECK-NEXT:  entry:
388; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
389; CHECK-NEXT:    br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
390; CHECK:       vector.ph:
391; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
392; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
393; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
394; CHECK:       vector.body:
395; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
396; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
397; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
398; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
399; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
400; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> zeroinitializer)
401; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]])
402; CHECK-NEXT:    [[TMP3]] = add i16 [[TMP2]], [[VEC_PHI]]
403; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
404; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
405; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
406; CHECK:       for.cond.cleanup:
407; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
408; CHECK-NEXT:    ret i16 [[R_0_LCSSA]]
409;
410entry:
411  %cmp8 = icmp sgt i32 %n, 0
412  br i1 %cmp8, label %for.body, label %for.cond.cleanup
413
414for.body:                                         ; preds = %entry, %for.body
415  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
416  %r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
417  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
418  %0 = load i16, i16* %arrayidx, align 2
419  %add = add i16 %0, %r.09
420  %inc = add nuw nsw i32 %i.010, 1
421  %exitcond = icmp eq i32 %inc, %n
422  br i1 %exitcond, label %for.cond.cleanup, label %for.body
423
424for.cond.cleanup:                                 ; preds = %for.body, %entry
425  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
426  ret i16 %r.0.lcssa
427}
428
429; 16x to use VADDV.u8
; Sums x[0..n-1], zero-extending each i8 element to i16 before adding it to
; the i16 accumulator (the add carries no nsw/nuw flags); returns 0 when
; n <= 0.
; The assertions below expect a tail-folded loop with a vector factor of 16:
; llvm.get.active.lane.mask predicates a masked <16 x i8> load (poison in
; inactive lanes), the result is zero-extended to <16 x i16>, a select on
; the same mask zeroes the inactive lanes, and
; llvm.vector.reduce.add.v16i16 feeds the accumulator. No scalar remainder
; loop is left.
430define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 {
431; CHECK-LABEL: @add_i8_i16(
432; CHECK-NEXT:  entry:
433; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
434; CHECK-NEXT:    br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
435; CHECK:       vector.ph:
436; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
437; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
438; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
439; CHECK:       vector.body:
440; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
441; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
442; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
443; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
444; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
445; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
446; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
447; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP2]], <16 x i16> zeroinitializer
448; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP3]])
449; CHECK-NEXT:    [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]]
450; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
451; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
452; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
453; CHECK:       for.cond.cleanup:
454; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
455; CHECK-NEXT:    ret i16 [[R_0_LCSSA]]
456;
457entry:
458  %cmp8 = icmp sgt i32 %n, 0
459  br i1 %cmp8, label %for.body, label %for.cond.cleanup
460
461for.body:                                         ; preds = %entry, %for.body
462  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
463  %r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
464  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.010
465  %0 = load i8, i8* %arrayidx, align 1
466  %conv = zext i8 %0 to i16
467  %add = add i16 %r.09, %conv
468  %inc = add nuw nsw i32 %i.010, 1
469  %exitcond = icmp eq i32 %inc, %n
470  br i1 %exitcond, label %for.cond.cleanup, label %for.body
471
472for.cond.cleanup:                                 ; preds = %for.body, %entry
473  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
474  ret i16 %r.0.lcssa
475}
476
477; 16x to use VADDV.u8
; Sums the i8 elements x[0..n-1] into an i8 accumulator (the add carries no
; nsw/nuw flags); returns 0 when n <= 0.
; The assertions below expect a tail-folded loop with a vector factor of 16:
; llvm.get.active.lane.mask predicates a masked <16 x i8> load whose
; inactive lanes are zeroinitializer, the loaded vector goes straight into
; llvm.vector.reduce.add.v16i8, and no scalar remainder loop is left.
478define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 {
479; CHECK-LABEL: @add_i8_i8(
480; CHECK-NEXT:  entry:
481; CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
482; CHECK-NEXT:    br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
483; CHECK:       vector.ph:
484; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
485; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
486; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
487; CHECK:       vector.body:
488; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
489; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
490; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
491; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
492; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
493; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> zeroinitializer)
494; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]])
495; CHECK-NEXT:    [[TMP3]] = add i8 [[TMP2]], [[VEC_PHI]]
496; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
497; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
498; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
499; CHECK:       for.cond.cleanup:
500; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
501; CHECK-NEXT:    ret i8 [[R_0_LCSSA]]
502;
503entry:
504  %cmp7 = icmp sgt i32 %n, 0
505  br i1 %cmp7, label %for.body, label %for.cond.cleanup
506
507for.body:                                         ; preds = %entry, %for.body
508  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
509  %r.08 = phi i8 [ %add, %for.body ], [ 0, %entry ]
510  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.09
511  %0 = load i8, i8* %arrayidx, align 1
512  %add = add i8 %0, %r.08
513  %inc = add nuw nsw i32 %i.09, 1
514  %exitcond = icmp eq i32 %inc, %n
515  br i1 %exitcond, label %for.cond.cleanup, label %for.body
516
517for.cond.cleanup:                                 ; preds = %for.body, %entry
518  %r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
519  ret i8 %r.0.lcssa
520}
521
522; Not vectorized
; Multiply-accumulate: sums the i64 products x[i] * y[i] for i in 0..n-1
; into an i64 accumulator; returns 0 when n <= 0. The assertions below
; expect the scalar for.body loop to come out unchanged, with no vector.ph
; or vector.body blocks.
523define i64 @mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, i32 %n) #0 {
524; CHECK-LABEL: @mla_i64_i64(
525; CHECK-NEXT:  entry:
526; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
527; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
528; CHECK:       for.body:
529; CHECK-NEXT:    [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
530; CHECK-NEXT:    [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
531; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_010]]
532; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
533; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[Y:%.*]], i32 [[I_010]]
534; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
535; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
536; CHECK-NEXT:    [[ADD]] = add nsw i64 [[MUL]], [[R_09]]
537; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_010]], 1
538; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
539; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
540; CHECK:       for.cond.cleanup:
541; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
542; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
543;
544entry:
545  %cmp8 = icmp sgt i32 %n, 0
546  br i1 %cmp8, label %for.body, label %for.cond.cleanup
547
548for.body:                                         ; preds = %entry, %for.body
549  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
550  %r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
551  %arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.010
552  %0 = load i64, i64* %arrayidx, align 8
553  %arrayidx1 = getelementptr inbounds i64, i64* %y, i32 %i.010
554  %1 = load i64, i64* %arrayidx1, align 8
555  %mul = mul nsw i64 %1, %0
556  %add = add nsw i64 %mul, %r.09
557  %inc = add nuw nsw i32 %i.010, 1
558  %exitcond = icmp eq i32 %inc, %n
559  br i1 %exitcond, label %for.cond.cleanup, label %for.body
560
561for.cond.cleanup:                                 ; preds = %for.body, %entry
562  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
563  ret i64 %r.0.lcssa
564}
565
566; 4x to use VMLAL.u32. Scalar input: r += sext(x[i] * y[i]) with i32 elements, an i32 multiply and an i64 accumulator.
567; FIXME: TailPredicate. The expected output below still has an unpredicated VF=4 vector body followed by a scalar remainder loop.
568define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
569; CHECK-LABEL: @mla_i32_i64(
570; CHECK-NEXT:  entry:
571; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
572; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
573; CHECK:       for.body.preheader:
574; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
575; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
576; CHECK:       vector.ph:
577; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
578; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
579; CHECK:       vector.body:
580; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
581; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
582; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
583; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
584; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
585; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]]
586; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
587; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
588; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
589; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i32> [[TMP4]] to <4 x i64>
590; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]])
591; CHECK-NEXT:    [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]]
592; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
593; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
594; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
595; CHECK:       middle.block:
596; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
597; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
598; CHECK:       scalar.ph:
599; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
600; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
601; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
602; CHECK:       for.body:
603; CHECK-NEXT:    [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
604; CHECK-NEXT:    [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
605; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_010]]
606; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
607; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y]], i32 [[I_010]]
608; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
609; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]]
610; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[MUL]] to i64
611; CHECK-NEXT:    [[ADD]] = add nsw i64 [[R_09]], [[CONV]]
612; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_010]], 1
613; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
614; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
615; CHECK:       for.cond.cleanup:
616; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
617; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
618;
619entry:
620  %cmp8 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
621  br i1 %cmp8, label %for.body, label %for.cond.cleanup
622
623for.body:                                         ; preds = %entry, %for.body
624  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
625  %r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ] ; running i64 sum, starts at 0
626  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
627  %0 = load i32, i32* %arrayidx, align 4 ; x[i]
628  %arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
629  %1 = load i32, i32* %arrayidx1, align 4 ; y[i]
630  %mul = mul nsw i32 %1, %0 ; the product is formed in i32 ...
631  %conv = sext i32 %mul to i64 ; ... and only then sign-extended to the accumulator width
632  %add = add nsw i64 %r.09, %conv
633  %inc = add nuw nsw i32 %i.010, 1
634  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
635  br i1 %exitcond, label %for.cond.cleanup, label %for.body
636
637for.cond.cleanup:                                 ; preds = %for.body, %entry
638  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
639  ret i64 %r.0.lcssa
640}
641
642; 8x to use VMLAL.u16. Scalar input: r += sext(sext(x[i]) * sext(y[i])) with i16 elements, an i32 multiply and an i64 accumulator.
643; FIXME: TailPredicate. The expected output below still has an unpredicated VF=8 vector body followed by a scalar remainder loop.
644define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
645; CHECK-LABEL: @mla_i16_i64(
646; CHECK-NEXT:  entry:
647; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
648; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
649; CHECK:       for.body.preheader:
650; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
651; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
652; CHECK:       vector.ph:
653; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -8
654; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
655; CHECK:       vector.body:
656; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
657; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
658; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
659; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
660; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
661; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
662; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
663; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
664; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2
665; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
666; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
667; CHECK-NEXT:    [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
668; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
669; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
670; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
671; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
672; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
673; CHECK:       middle.block:
674; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
675; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
676; CHECK:       scalar.ph:
677; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
678; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
679; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
680; CHECK:       for.body:
681; CHECK-NEXT:    [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
682; CHECK-NEXT:    [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
683; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_012]]
684; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
685; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP11]] to i32
686; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y]], i32 [[I_012]]
687; CHECK-NEXT:    [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2
688; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[TMP12]] to i32
689; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
690; CHECK-NEXT:    [[CONV3:%.*]] = sext i32 [[MUL]] to i64
691; CHECK-NEXT:    [[ADD]] = add nsw i64 [[R_011]], [[CONV3]]
692; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_012]], 1
693; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
694; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
695; CHECK:       for.cond.cleanup:
696; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
697; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
698;
699entry:
700  %cmp10 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
701  br i1 %cmp10, label %for.body, label %for.cond.cleanup
702
703for.body:                                         ; preds = %entry, %for.body
704  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
705  %r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ] ; running i64 sum, starts at 0
706  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.012
707  %0 = load i16, i16* %arrayidx, align 2 ; x[i]
708  %conv = sext i16 %0 to i32 ; operands are sign-extended to i32 before multiplying
709  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.012
710  %1 = load i16, i16* %arrayidx1, align 2 ; y[i]
711  %conv2 = sext i16 %1 to i32
712  %mul = mul nsw i32 %conv2, %conv ; i32 product
713  %conv3 = sext i32 %mul to i64 ; second extension, up to the accumulator width
714  %add = add nsw i64 %r.011, %conv3
715  %inc = add nuw nsw i32 %i.012, 1
716  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
717  br i1 %exitcond, label %for.cond.cleanup, label %for.body
718
719for.cond.cleanup:                                 ; preds = %for.body, %entry
720  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
721  ret i64 %r.0.lcssa
722}
723
724; 8x to use VMLAL.u16. Scalar input: r += zext(zext(x[i]) * zext(y[i])) with i8 elements, an i32 multiply and an i64 accumulator.
725; FIXME: TailPredicate. The expected output below still has an unpredicated VF=8 vector body followed by a scalar remainder loop.
726define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
727; CHECK-LABEL: @mla_i8_i64(
728; CHECK-NEXT:  entry:
729; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
730; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
731; CHECK:       for.body.preheader:
732; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
733; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
734; CHECK:       vector.ph:
735; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -8
736; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
737; CHECK:       vector.body:
738; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
739; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
740; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
741; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
742; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
743; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
744; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
745; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
746; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
747; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32>
748; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <8 x i32> [[TMP5]], [[TMP2]]
749; CHECK-NEXT:    [[TMP7:%.*]] = zext <8 x i32> [[TMP6]] to <8 x i64>
750; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
751; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
752; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
753; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
754; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
755; CHECK:       middle.block:
756; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
757; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
758; CHECK:       scalar.ph:
759; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
760; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
761; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
762; CHECK:       for.body:
763; CHECK-NEXT:    [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
764; CHECK-NEXT:    [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
765; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_012]]
766; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
767; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP11]] to i32
768; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y]], i32 [[I_012]]
769; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
770; CHECK-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
771; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]]
772; CHECK-NEXT:    [[CONV3:%.*]] = zext i32 [[MUL]] to i64
773; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]]
774; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_012]], 1
775; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
776; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
777; CHECK:       for.cond.cleanup:
778; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
779; CHECK-NEXT:    ret i64 [[R_0_LCSSA]]
780;
781entry:
782  %cmp10 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
783  br i1 %cmp10, label %for.body, label %for.cond.cleanup
784
785for.body:                                         ; preds = %entry, %for.body
786  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
787  %r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ] ; running i64 sum, starts at 0
788  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
789  %0 = load i8, i8* %arrayidx, align 1 ; x[i]
790  %conv = zext i8 %0 to i32 ; operands are zero-extended to i32 before multiplying
791  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
792  %1 = load i8, i8* %arrayidx1, align 1 ; y[i]
793  %conv2 = zext i8 %1 to i32
794  %mul = mul nuw nsw i32 %conv2, %conv ; i32 product
795  %conv3 = zext i32 %mul to i64 ; second extension, up to the accumulator width
796  %add = add nuw nsw i64 %r.011, %conv3
797  %inc = add nuw nsw i32 %i.012, 1
798  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
799  br i1 %exitcond, label %for.cond.cleanup, label %for.body
800
801for.cond.cleanup:                                 ; preds = %for.body, %entry
802  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
803  ret i64 %r.0.lcssa
804}
805
806; 4x to use VMLA.u32. Scalar input: r += x[i] * y[i] with i32 elements and an i32 accumulator; the expected output is a VF=4 loop predicated by llvm.get.active.lane.mask, with no scalar remainder loop.
807define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
808; CHECK-LABEL: @mla_i32_i32(
809; CHECK-NEXT:  entry:
810; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
811; CHECK-NEXT:    br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
812; CHECK:       vector.ph:
813; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
814; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
815; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
816; CHECK:       vector.body:
817; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
818; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
819; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
820; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
821; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
822; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
823; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]]
824; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
825; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
826; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
827; CHECK-NEXT:    [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
828; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
829; CHECK-NEXT:    [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
830; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
831; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
832; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
833; CHECK:       for.cond.cleanup:
834; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
835; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
836;
837entry:
838  %cmp8 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
839  br i1 %cmp8, label %for.body, label %for.cond.cleanup
840
841for.body:                                         ; preds = %entry, %for.body
842  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
843  %r.09 = phi i32 [ %add, %for.body ], [ 0, %entry ] ; running i32 sum, starts at 0
844  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
845  %0 = load i32, i32* %arrayidx, align 4 ; x[i]
846  %arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
847  %1 = load i32, i32* %arrayidx1, align 4 ; y[i]
848  %mul = mul nsw i32 %1, %0 ; no extension: product and accumulator are both i32
849  %add = add nsw i32 %mul, %r.09
850  %inc = add nuw nsw i32 %i.010, 1
851  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
852  br i1 %exitcond, label %for.cond.cleanup, label %for.body
853
854for.cond.cleanup:                                 ; preds = %for.body, %entry
855  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
856  ret i32 %r.0.lcssa
857}
858
859; 8x to use VMLA.u16. Scalar input: r += sext(x[i]) * sext(y[i]) with i16 elements and an i32 accumulator; the expected output is a VF=8 loop predicated by llvm.get.active.lane.mask, with no scalar remainder loop.
860define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
861; CHECK-LABEL: @mla_i16_i32(
862; CHECK-NEXT:  entry:
863; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
864; CHECK-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
865; CHECK:       vector.ph:
866; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
867; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
868; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
869; CHECK:       vector.body:
870; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
871; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
872; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
873; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
874; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
875; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
876; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
877; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
878; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
879; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
880; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
881; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
882; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer
883; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
884; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
885; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
886; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
887; CHECK-NEXT:    br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
888; CHECK:       for.cond.cleanup:
889; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
890; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
891;
892entry:
893  %cmp9 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
894  br i1 %cmp9, label %for.body, label %for.cond.cleanup
895
896for.body:                                         ; preds = %entry, %for.body
897  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
898  %r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ] ; running i32 sum, starts at 0
899  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.011
900  %0 = load i16, i16* %arrayidx, align 2 ; x[i]
901  %conv = sext i16 %0 to i32 ; operands are sign-extended to the accumulator width first
902  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.011
903  %1 = load i16, i16* %arrayidx1, align 2 ; y[i]
904  %conv2 = sext i16 %1 to i32
905  %mul = mul nsw i32 %conv2, %conv ; i32 product
906  %add = add nsw i32 %mul, %r.010
907  %inc = add nuw nsw i32 %i.011, 1
908  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
909  br i1 %exitcond, label %for.cond.cleanup, label %for.body
910
911for.cond.cleanup:                                 ; preds = %for.body, %entry
912  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
913  ret i32 %r.0.lcssa
914}
915
916; 16x to use VMLA.u8. Scalar input: r += zext(x[i]) * zext(y[i]) with i8 elements and an i32 accumulator; the expected output is a VF=16 loop predicated by llvm.get.active.lane.mask, with no scalar remainder loop.
917define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
918; CHECK-LABEL: @mla_i8_i32(
919; CHECK-NEXT:  entry:
920; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
921; CHECK-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
922; CHECK:       vector.ph:
923; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
924; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
925; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
926; CHECK:       vector.body:
927; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
928; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
929; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
930; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
931; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
932; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
933; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
934; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
935; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
936; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
937; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
938; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]]
939; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
940; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP7]])
941; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
942; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
943; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
944; CHECK-NEXT:    br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
945; CHECK:       for.cond.cleanup:
946; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
947; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
948;
949entry:
950  %cmp9 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
951  br i1 %cmp9, label %for.body, label %for.cond.cleanup
952
953for.body:                                         ; preds = %entry, %for.body
954  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
955  %r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ] ; running i32 sum, starts at 0
956  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.011
957  %0 = load i8, i8* %arrayidx, align 1 ; x[i]
958  %conv = zext i8 %0 to i32 ; operands are zero-extended to the accumulator width first
959  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.011
960  %1 = load i8, i8* %arrayidx1, align 1 ; y[i]
961  %conv2 = zext i8 %1 to i32
962  %mul = mul nuw nsw i32 %conv2, %conv ; i32 product
963  %add = add nuw nsw i32 %mul, %r.010
964  %inc = add nuw nsw i32 %i.011, 1
965  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
966  br i1 %exitcond, label %for.cond.cleanup, label %for.body
967
968for.cond.cleanup:                                 ; preds = %for.body, %entry
969  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
970  ret i32 %r.0.lcssa
971}
972
973; 8x to use VMLA.u16. Scalar input: r += x[i] * y[i] with i16 elements and an i16 accumulator; the expected output is a VF=8 loop predicated by llvm.get.active.lane.mask, with no scalar remainder loop.
974define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
975; CHECK-LABEL: @mla_i16_i16(
976; CHECK-NEXT:  entry:
977; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
978; CHECK-NEXT:    br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
979; CHECK:       vector.ph:
980; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
981; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
982; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
983; CHECK:       vector.body:
984; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
985; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
986; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
987; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
988; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
989; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
990; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
991; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>*
992; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP3]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
993; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
994; CHECK-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP4]], <8 x i16> zeroinitializer
995; CHECK-NEXT:    [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP5]])
996; CHECK-NEXT:    [[TMP7]] = add i16 [[TMP6]], [[VEC_PHI]]
997; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
998; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
999; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
1000; CHECK:       for.cond.cleanup:
1001; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
1002; CHECK-NEXT:    ret i16 [[R_0_LCSSA]]
1003;
1004entry:
1005  %cmp11 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
1006  br i1 %cmp11, label %for.body, label %for.cond.cleanup
1007
1008for.body:                                         ; preds = %entry, %for.body
1009  %i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
1010  %r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ] ; running i16 sum, starts at 0
1011  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.013
1012  %0 = load i16, i16* %arrayidx, align 2 ; x[i]
1013  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.013
1014  %1 = load i16, i16* %arrayidx1, align 2 ; y[i]
1015  %mul = mul i16 %1, %0 ; no extension and no nsw/nuw flags: plain i16 multiply
1016  %add = add i16 %mul, %r.012 ; accumulate in i16, also without wrap flags
1017  %inc = add nuw nsw i32 %i.013, 1
1018  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
1019  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1020
1021for.cond.cleanup:                                 ; preds = %for.body, %entry
1022  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
1023  ret i16 %r.0.lcssa
1024}
1025
1026; 16x to use VMLA.u8. Scalar input: r += zext(x[i]) * zext(y[i]) with i8 elements and an i16 accumulator; the expected output is a VF=16 loop predicated by llvm.get.active.lane.mask, with no scalar remainder loop.
1027define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
1028; CHECK-LABEL: @mla_i8_i16(
1029; CHECK-NEXT:  entry:
1030; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
1031; CHECK-NEXT:    br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
1032; CHECK:       vector.ph:
1033; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
1034; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
1035; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1036; CHECK:       vector.body:
1037; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1038; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
1039; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
1040; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
1041; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
1042; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
1043; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
1044; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
1045; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
1046; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
1047; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16>
1048; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw <16 x i16> [[TMP5]], [[TMP2]]
1049; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP6]], <16 x i16> zeroinitializer
1050; CHECK-NEXT:    [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]])
1051; CHECK-NEXT:    [[TMP9]] = add i16 [[TMP8]], [[VEC_PHI]]
1052; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
1053; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
1054; CHECK-NEXT:    br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
1055; CHECK:       for.cond.cleanup:
1056; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
1057; CHECK-NEXT:    ret i16 [[R_0_LCSSA]]
1058;
1059entry:
1060  %cmp11 = icmp sgt i32 %n, 0 ; skip the loop entirely (result 0) when n <= 0
1061  br i1 %cmp11, label %for.body, label %for.cond.cleanup
1062
1063for.body:                                         ; preds = %entry, %for.body
1064  %i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ] ; element index, starts at 0
1065  %r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ] ; running i16 sum, starts at 0
1066  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.013
1067  %0 = load i8, i8* %arrayidx, align 1 ; x[i]
1068  %conv = zext i8 %0 to i16 ; operands are zero-extended to the accumulator width first
1069  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.013
1070  %1 = load i8, i8* %arrayidx1, align 1 ; y[i]
1071  %conv2 = zext i8 %1 to i16
1072  %mul = mul nuw i16 %conv2, %conv ; i16 product (nuw only)
1073  %add = add i16 %mul, %r.012 ; accumulate in i16 without wrap flags
1074  %inc = add nuw nsw i32 %i.013, 1
1075  %exitcond = icmp eq i32 %inc, %n ; exit after n elements
1076  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1077
1078for.cond.cleanup:                                 ; preds = %for.body, %entry
1079  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
1080  ret i16 %r.0.lcssa
1081}
1082
1083; 16x to use VMLA.u8
; Multiply-accumulate reduction carried entirely in i8: the loop sums
; x[i] * y[i] for i in [0, n) without widening the loaded bytes.
; The expected output is vectorized at VF 16 with tail folding: both loads
; become masked <16 x i8> loads predicated on llvm.get.active.lane.mask, the
; product's inactive lanes are zeroed by a select, and the partial sum is
; formed in-loop with llvm.vector.reduce.add.v16i8. No scalar remainder loop
; is expected; the vector loop branches straight to for.cond.cleanup.
1084define zeroext i8 @mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
1085; CHECK-LABEL: @mla_i8_i8(
1086; CHECK-NEXT:  entry:
1087; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
1088; CHECK-NEXT:    br i1 [[CMP10]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
1089; CHECK:       vector.ph:
1090; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
1091; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
1092; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1093; CHECK:       vector.body:
1094; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1095; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
1096; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
1097; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
1098; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
1099; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
1100; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
1101; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
1102; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
1103; CHECK-NEXT:    [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
1104; CHECK-NEXT:    [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP4]], <16 x i8> zeroinitializer
1105; CHECK-NEXT:    [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP5]])
1106; CHECK-NEXT:    [[TMP7]] = add i8 [[TMP6]], [[VEC_PHI]]
1107; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
1108; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
1109; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
1110; CHECK:       for.cond.cleanup:
1111; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
1112; CHECK-NEXT:    ret i8 [[R_0_LCSSA]]
1113;
1114entry:
  ; Skip the loop entirely when n <= 0; the result is then 0.
1115  %cmp10 = icmp sgt i32 %n, 0
1116  br i1 %cmp10, label %for.body, label %for.cond.cleanup
1117
1118for.body:                                         ; preds = %entry, %for.body
1119  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
1120  %r.011 = phi i8 [ %add, %for.body ], [ 0, %entry ]
1121  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
1122  %0 = load i8, i8* %arrayidx, align 1
1123  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
1124  %1 = load i8, i8* %arrayidx1, align 1
  ; Product and accumulation both stay in i8 and carry no nsw/nuw flags.
1125  %mul = mul i8 %1, %0
1126  %add = add i8 %mul, %r.011
1127  %inc = add nuw nsw i32 %i.012, 1
1128  %exitcond = icmp eq i32 %inc, %n
1129  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1130
1131for.cond.cleanup:                                 ; preds = %for.body, %entry
1132  %r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
1133  ret i8 %r.0.lcssa
1134}
1135
1136; 8x as different types
; Multiply-accumulate reduction with operands of different source widths:
; A[i] is an i8 and B[i] an i16, both sign-extended to i32 before the
; multiply, and the products are summed in i32.
; The expected output is vectorized at VF 8 with tail folding: masked
; <8 x i8> and <8 x i16> loads predicated on llvm.get.active.lane.mask, a
; select that zeroes inactive lanes, and an in-loop
; llvm.vector.reduce.add.v8i32. No scalar remainder loop is expected.
1137define i32 @red_mla_ext_s8_s16_s32(i8* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i32 %n) #0 {
1138; CHECK-LABEL: @red_mla_ext_s8_s16_s32(
1139; CHECK-NEXT:  entry:
1140; CHECK-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
1141; CHECK-NEXT:    br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
1142; CHECK:       vector.ph:
1143; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
1144; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
1145; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1146; CHECK:       vector.body:
1147; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1148; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
1149; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
1150; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[INDEX]]
1151; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
1152; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
1153; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
1154; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
1155; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
1156; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
1157; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
1158; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
1159; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer
1160; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
1161; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
1162; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
1163; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
1164; CHECK-NEXT:    br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
1165; CHECK:       for.cond.cleanup:
1166; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
1167; CHECK-NEXT:    ret i32 [[S_0_LCSSA]]
1168;
1169entry:
  ; Only n == 0 bypasses the loop; the comparison is an equality, not a signed test.
1170  %cmp9.not = icmp eq i32 %n, 0
1171  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
1172
1173for.body.preheader:                               ; preds = %entry
1174  br label %for.body
1175
1176for.body:                                         ; preds = %for.body.preheader, %for.body
1177  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
1178  %s.010 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
1179  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011
1180  %0 = load i8, i8* %arrayidx, align 1
  ; Signed 8-bit operand widened to i32.
1181  %conv = sext i8 %0 to i32
1182  %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011
1183  %1 = load i16, i16* %arrayidx1, align 2
  ; Signed 16-bit operand widened to i32.
1184  %conv2 = sext i16 %1 to i32
1185  %mul = mul nsw i32 %conv2, %conv
1186  %add = add nsw i32 %mul, %s.010
1187  %inc = add nuw i32 %i.011, 1
1188  %exitcond.not = icmp eq i32 %inc, %n
1189  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
1190
1191for.cond.cleanup.loopexit:                        ; preds = %for.body
1192  %add.lcssa = phi i32 [ %add, %for.body ]
1193  br label %for.cond.cleanup
1194
1195for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
1196  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
1197  ret i32 %s.0.lcssa
1198}
1199
1200; 4x as different sext vs zext
; Multiply-accumulate reduction with mixed extension kinds and a 64-bit
; accumulator: A[i] is sign-extended and B[i] zero-extended from i16 to i32,
; the i32 product is zero-extended to i64, and the sum is kept in i64.
; The expected output is vectorized at VF 4 but, unlike the neighbouring
; tests, is not tail-folded: it uses plain <4 x i16> wide loads, a
; minimum-iteration guard, a middle block, and a scalar remainder loop.
; NOTE(review): the load from %A is marked align 1 while the load from %B is
; align 2, and the expected output carries the same alignments. This may be
; what keeps the loop from using masked loads, but that is not established
; by anything visible here - confirm before changing the alignment.
1201define i64 @red_mla_ext_s16_u16_s64(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i32 %n) #0 {
1202; CHECK-LABEL: @red_mla_ext_s16_u16_s64(
1203; CHECK-NEXT:  entry:
1204; CHECK-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
1205; CHECK-NEXT:    br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
1206; CHECK:       for.body.preheader:
1207; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
1208; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1209; CHECK:       vector.ph:
1210; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
1211; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1212; CHECK:       vector.body:
1213; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1214; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
1215; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[INDEX]]
1216; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
1217; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
1218; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
1219; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
1220; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
1221; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
1222; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[WIDE_LOAD1]] to <4 x i32>
1223; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
1224; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i32> [[TMP6]] to <4 x i64>
1225; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP7]])
1226; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
1227; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
1228; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
1229; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
1230; CHECK:       middle.block:
1231; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
1232; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
1233; CHECK:       scalar.ph:
1234; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
1235; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
1236; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1237; CHECK:       for.body:
1238; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1239; CHECK-NEXT:    [[S_010:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1240; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A]], i32 [[I_011]]
1241; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 1
1242; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP11]] to i32
1243; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[I_011]]
1244; CHECK-NEXT:    [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2
1245; CHECK-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP12]] to i32
1246; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
1247; CHECK-NEXT:    [[MUL2:%.*]] = zext i32 [[MUL]] to i64
1248; CHECK-NEXT:    [[ADD]] = add nsw i64 [[S_010]], [[MUL2]]
1249; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_011]], 1
1250; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
1251; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1252; CHECK:       for.cond.cleanup:
1253; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
1254; CHECK-NEXT:    ret i64 [[S_0_LCSSA]]
1255;
1256entry:
1257  %cmp9.not = icmp eq i32 %n, 0
1258  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
1259
1260for.body.preheader:                               ; preds = %entry
1261  br label %for.body
1262
1263for.body:                                         ; preds = %for.body.preheader, %for.body
1264  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
1265  %s.010 = phi i64 [ %add, %for.body ], [ 0, %for.body.preheader ]
1266  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.011
  ; Under-aligned i16 load (align 1); see the NOTE(review) above the function.
1267  %0 = load i16, i16* %arrayidx, align 1
  ; A is treated as signed ...
1268  %conv = sext i16 %0 to i32
1269  %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011
1270  %1 = load i16, i16* %arrayidx1, align 2
  ; ... while B is treated as unsigned.
1271  %conv2 = zext i16 %1 to i32
1272  %mul = mul nsw i32 %conv2, %conv
  ; The i32 product is zero-extended (not sign-extended) into the i64 sum.
1273  %mul2 = zext i32 %mul to i64
1274  %add = add nsw i64 %mul2, %s.010
1275  %inc = add nuw i32 %i.011, 1
1276  %exitcond.not = icmp eq i32 %inc, %n
1277  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
1278
1279for.cond.cleanup.loopexit:                        ; preds = %for.body
1280  %add.lcssa = phi i64 [ %add, %for.body ]
1281  br label %for.cond.cleanup
1282
1283for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
1284  %s.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
1285  ret i64 %s.0.lcssa
1286}
1287
1288; 4x as different sext vs zext
; Multiply-accumulate reduction with mixed extension kinds on byte inputs:
; A[i] is zero-extended and B[i] sign-extended from i8 to i32, and the
; products are summed in i32.
; The expected output is vectorized at VF 4 with tail folding: masked
; <4 x i8> loads predicated on llvm.get.active.lane.mask, a select that
; zeroes inactive lanes, and an in-loop llvm.vector.reduce.add.v4i32.
; No scalar remainder loop is expected.
1289define i32 @red_mla_u8_s8_u32(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 {
1290; CHECK-LABEL: @red_mla_u8_s8_u32(
1291; CHECK-NEXT:  entry:
1292; CHECK-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
1293; CHECK-NEXT:    br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
1294; CHECK:       vector.ph:
1295; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
1296; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
1297; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1298; CHECK:       vector.body:
1299; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1300; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
1301; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
1302; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[INDEX]]
1303; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
1304; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
1305; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
1306; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[INDEX]]
1307; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
1308; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
1309; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
1310; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
1311; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
1312; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
1313; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
1314; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
1315; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
1316; CHECK-NEXT:    br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
1317; CHECK:       for.cond.cleanup:
1318; CHECK-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
1319; CHECK-NEXT:    ret i32 [[S_0_LCSSA]]
1320;
1321entry:
1322  %cmp9.not = icmp eq i32 %n, 0
1323  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader
1324
1325for.body.preheader:                               ; preds = %entry
1326  br label %for.body
1327
1328for.body:                                         ; preds = %for.body.preheader, %for.body
1329  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
1330  %s.010 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
1331  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011
1332  %0 = load i8, i8* %arrayidx, align 1
  ; A is treated as unsigned ...
1333  %conv = zext i8 %0 to i32
1334  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011
1335  %1 = load i8, i8* %arrayidx1, align 1
  ; ... while B is treated as signed.
1336  %conv2 = sext i8 %1 to i32
1337  %mul = mul nsw i32 %conv2, %conv
  ; The accumulating add carries no wrap flags.
1338  %add = add i32 %mul, %s.010
1339  %inc = add nuw i32 %i.011, 1
1340  %exitcond.not = icmp eq i32 %inc, %n
1341  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
1342
1343for.cond.cleanup.loopexit:                        ; preds = %for.body
1344  %add.lcssa = phi i32 [ %add, %for.body ]
1345  br label %for.cond.cleanup
1346
1347for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
1348  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
1349  ret i32 %s.0.lcssa
1350}
1351
1352; Make sure interleave group members feeding in-loop reductions can be handled.
; The loop walks %arr two elements at a time (the induction variable starts
; at 0 and steps by 2), loads arr[iv | 1] and arr[iv], and adds both into a
; single running sum through two chained adds.
; The expected output loads each group of pairs with one <8 x i32> wide load,
; splits it into even and odd lanes with two shufflevectors, and feeds each
; half to its own llvm.vector.reduce.add.v4i32, chained into the accumulator.
; The loop is not tail-folded: a minimum-iteration guard, a middle block, and
; a scalar remainder loop are all expected.
1353define i32 @reduction_interleave_group(i32 %n, i32* %arr) #0 {
1354; CHECK-LABEL: @reduction_interleave_group(
1355; CHECK-NEXT:  entry:
1356; CHECK-NEXT:    [[GUARD:%.*]] = icmp sgt i32 [[N:%.*]], 0
1357; CHECK-NEXT:    br i1 [[GUARD]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
1358; CHECK:       for.body.preheader:
1359; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
1360; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
1361; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
1362; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 6
1363; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1364; CHECK:       vector.ph:
1365; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[TMP2]], -4
1366; CHECK-NEXT:    [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
1367; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1368; CHECK:       vector.body:
1369; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1370; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
1371; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
1372; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[OFFSET_IDX]], 1
1373; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[ARR:%.*]], i32 -1
1374; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 [[TMP3]]
1375; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
1376; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
1377; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1378; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1379; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC1]])
1380; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]]
1381; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC]])
1382; CHECK-NEXT:    [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
1383; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
1384; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
1385; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1386; CHECK:       middle.block:
1387; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
1388; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT]], label [[SCALAR_PH]]
1389; CHECK:       scalar.ph:
1390; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
1391; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
1392; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1393; CHECK:       for.body:
1394; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1395; CHECK-NEXT:    [[RED_PHI:%.*]] = phi i32 [ [[RED_2:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1396; CHECK-NEXT:    [[ADD:%.*]] = or i32 [[IV]], 1
1397; CHECK-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[ADD]]
1398; CHECK-NEXT:    [[L_0:%.*]] = load i32, i32* [[GEP_0]], align 4
1399; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[IV]]
1400; CHECK-NEXT:    [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
1401; CHECK-NEXT:    [[RED_1:%.*]] = add i32 [[L_0]], [[RED_PHI]]
1402; CHECK-NEXT:    [[RED_2]] = add i32 [[RED_1]], [[L_1]]
1403; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 2
1404; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
1405; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP31:![0-9]+]]
1406; CHECK:       exit:
1407; CHECK-NEXT:    [[RET_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RED_2]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
1408; CHECK-NEXT:    ret i32 [[RET_LCSSA]]
1409;
1410entry:
  ; Skip the loop when n <= 0; the result is then 0.
1411  %guard = icmp sgt i32 %n, 0
1412  br i1 %guard , label %for.body, label %exit
1413
1414for.body:                                         ; preds = %entry, %for.body
1415  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
1416  %red.phi = phi i32 [ %red.2, %for.body ], [ 0, %entry ]
  ; %iv is always even (starts at 0, steps by 2), so or-ing in 1 yields iv + 1.
1417  %add = or i32 %iv, 1
1418  %gep.0 = getelementptr inbounds i32, i32* %arr, i32 %add
1419  %l.0 = load i32, i32* %gep.0, align 4
1420  %gep.1 = getelementptr inbounds i32, i32* %arr, i32 %iv
1421  %l.1 = load i32, i32* %gep.1, align 4
  ; Both loaded elements are folded into the same reduction chain.
1422  %red.1 = add i32 %l.0, %red.phi
1423  %red.2 = add i32 %red.1, %l.1
1424  %iv.next = add nuw nsw i32 %iv, 2
1425  %cmp = icmp slt i32 %iv.next, %n
1426  br i1 %cmp, label %for.body, label %exit
1427
1428exit:                                             ; preds = %for.body, %entry
1429  %ret.lcssa = phi i32 [ 0, %entry ], [ %red.2, %for.body ]
1430  ret i32 %ret.lcssa
1431}
1432
; Attribute group referenced as #0 by the function definitions in this file;
; it sets the "target-features" string attribute to "+mve".
1433attributes #0 = { "target-features"="+mve" }
1434