; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -tail-predication=enabled < %s -S -o - | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-none-eabi"

; Should not be vectorized
define i64 @add_i64_i64(i64* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i64_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_08]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ADD]] = add nsw i64 [[TMP0]], [[R_07]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.08
  %0 = load i64, i64* %arrayidx, align 8
  %add = add nsw i64 %0, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

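; The add_* functions below sum loads of progressively narrower element types
; into a wider accumulator; the comment above each one notes the vector factor
; and the MVE reduction (VADDV/VADDLV) it is expected to use.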
; 4x to use VADDLV
; FIXME: TailPredicate
define i64 @add_i32_i64(i32* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i32_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP6]] to i64
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %0 = load i32, i32* %arrayidx, align 4
  %conv = sext i32 %0 to i64
  %add = add nsw i64 %r.07, %conv
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

; 4x to use VADDLV
; FIXME: TailPredicate
define i64 @add_i16_i64(i16* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i16_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]]
; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i64
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
  %0 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %0 to i64
  %add = add nsw i64 %r.07, %conv
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

; 4x to use VADDLV
; FIXME: TailPredicate
define i64 @add_i8_i64(i8* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i8_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]])
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_08]]
; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP6]] to i64
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.07 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i64
  %add = add nuw nsw i64 %r.07, %conv
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

; 4x to use VADDV.u32
define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i32_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; 8x to use VADDV.u16
define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i16_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.08
  %0 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %0 to i32
  %add = add nsw i32 %r.07, %conv
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; 16x to use VADDV.u8
define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i8_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.08
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %r.07, %conv
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; 8x to use VADDV.u16
define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i16_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]])
; CHECK-NEXT: [[TMP3]] = add i16 [[TMP2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
;
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.010
  %0 = load i16, i16* %arrayidx, align 2
  %add = add i16 %0, %r.09
  %inc = add nuw nsw i32 %i.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
  ret i16 %r.0.lcssa
}

; 16x to use VADDV.u8
define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i8_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP2]], <16 x i16> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP3]])
; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
;
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.09 = phi i16 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.010
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i16
  %add = add i16 %r.09, %conv
  %inc = add nuw nsw i32 %i.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
  ret i16 %r.0.lcssa
}

; 16x to use VADDV.u8
define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: @add_i8_i8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP7]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> zeroinitializer)
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]])
; CHECK-NEXT: [[TMP3]] = add i8 [[TMP2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.08 = phi i8 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.09
  %0 = load i8, i8* %arrayidx, align 1
  %add = add i8 %0, %r.08
  %inc = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
  ret i8 %r.0.lcssa
}

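; The mla_* functions below follow the same pattern but multiply two loaded
; values before accumulating, so they are expected to use the VMLA/VMLAL
; multiply-accumulate reductions noted above each function.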
; Not vectorized
define i64 @mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i64_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[X:%.*]], i32 [[I_010]]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[Y:%.*]], i32 [[I_010]]
; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[R_09]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i64, i64* %x, i32 %i.010
  %0 = load i64, i64* %arrayidx, align 8
  %arrayidx1 = getelementptr inbounds i64, i64* %y, i32 %i.010
  %1 = load i64, i64* %arrayidx1, align 8
  %mul = mul nsw i64 %1, %0
  %add = add nsw i64 %mul, %r.09
  %inc = add nuw nsw i32 %i.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

; 4x to use VMLAL.u32
; FIXME: TailPredicate
define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i32_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP4]] to <4 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_010]]
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y]], i32 [[I_010]]
; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]]
; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[MUL]] to i64
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_09]], [[CONV]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.09 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %conv = sext i32 %mul to i64
  %add = add nsw i64 %r.09, %conv
  %inc = add nuw nsw i32 %i.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

; 8x to use VMLAL.u16
; FIXME: TailPredicate
define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i16_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_012]]
; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y]], i32 [[I_012]]
; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP12]] to i32
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64
; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_011]], [[CONV3]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.012
  %0 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %0 to i32
  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.012
  %1 = load i16, i16* %arrayidx1, align 2
  %conv2 = sext i16 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %conv3 = sext i32 %mul to i64
  %add = add nsw i64 %r.011, %conv3
  %inc = add nuw nsw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

; 8x to use VMLAL.u16
; FIXME: TailPredicate
define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i8_i64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <8 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i32> [[TMP6]] to <8 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_012]]
; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y]], i32 [[I_012]]
; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]]
; CHECK-NEXT: [[CONV3:%.*]] = zext i32 [[MUL]] to i64
; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]]
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
;
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.011 = phi i64 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
  %1 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %1 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %conv3 = zext i32 %mul to i64
  %add = add nuw nsw i64 %r.011, %conv3
  %inc = add nuw nsw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ret i64 %r.0.lcssa
}

; 4x to use VMLA.u32
define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i32_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.09 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.010
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %y, i32 %i.010
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %r.09
  %inc = add nuw nsw i32 %i.010, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; 8x to use VMLA.u16
define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i16_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.011
  %0 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %0 to i32
  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.011
  %1 = load i16, i16* %arrayidx1, align 2
  %conv2 = sext i16 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %add = add nsw i32 %mul, %r.010
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; 16x to use VMLA.u8
define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i8_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
;
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.010 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.011
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.011
  %1 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %1 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %add = add nuw nsw i32 %mul, %r.010
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; 8x to use VMLA.u16
define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i16_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP3]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP4]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i16 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
;
entry:
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %x, i32 %i.013
  %0 = load i16, i16* %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, i16* %y, i32 %i.013
  %1 = load i16, i16* %arrayidx1, align 2
  %mul = mul i16 %1, %0
  %add = add i16 %mul, %r.012
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
  ret i16 %r.0.lcssa
}

; 16x to use VMLA.u8
define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i8_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <16 x i16> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP6]], <16 x i16> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i16 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
;
entry:
  %cmp11 = icmp sgt i32 %n, 0
  br i1 %cmp11, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.013 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.012 = phi i16 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.013
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i16
  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.013
  %1 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %1 to i16
  %mul = mul nuw i16 %conv2, %conv
  %add = add i16 %mul, %r.012
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i16 [ 0, %entry ], [ %add, %for.body ]
  ret i16 %r.0.lcssa
}

; 16x to use VMLA.u8

; 16x to use VMLA.u8
define zeroext i8 @mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
; CHECK-LABEL: @mla_i8_i8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP10]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP4]], <16 x i8> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP5]])
; CHECK-NEXT: [[TMP7]] = add i8 [[TMP6]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
;
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %for.body, label %for.cond.cleanup

for.body: ; preds = %entry, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %r.011 = phi i8 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %x, i32 %i.012
  %0 = load i8, i8* %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, i8* %y, i32 %i.012
  %1 = load i8, i8* %arrayidx1, align 1
  %mul = mul i8 %1, %0
  %add = add i8 %mul, %r.011
  %inc = add nuw nsw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  %r.0.lcssa = phi i8 [ 0, %entry ], [ %add, %for.body ]
  ret i8 %r.0.lcssa
}
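
; A rough C-level sketch of @mla_i8_i8 above (assumed source, for illustration
; only): the whole multiply-accumulate stays in 8 bits.
;
;   #include <stdint.h>
;
;   uint8_t mla_i8_i8(const uint8_t *x, const uint8_t *y, int n) {
;     uint8_t r = 0;
;     for (int i = 0; i < n; i++)
;       r += x[i] * y[i];
;     return r;
;   }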

; 8x as different types
define i32 @red_mla_ext_s8_s16_s32(i8* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i32 %n) #0 {
; CHECK-LABEL: @red_mla_ext_s8_s16_s32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
;
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %s.010 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011
  %0 = load i8, i8* %arrayidx, align 1
  %conv = sext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011
  %1 = load i16, i16* %arrayidx1, align 2
  %conv2 = sext i16 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %add = add nsw i32 %mul, %s.010
  %inc = add nuw i32 %i.011, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
  ret i32 %s.0.lcssa
}
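
; A rough C-level sketch of @red_mla_ext_s8_s16_s32 above (assumed source, for
; illustration only): the two inputs have different element widths, both
; sign-extended to i32 before the multiply-accumulate.
;
;   #include <stdint.h>
;
;   int32_t red_mla_ext_s8_s16_s32(const int8_t *A, const int16_t *B, uint32_t n) {
;     int32_t s = 0;
;     for (uint32_t i = 0; i < n; i++)
;       s += A[i] * B[i];
;     return s;
;   }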

; 4x as different sext vs zext
define i64 @red_mla_ext_s16_u16_s64(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i32 %n) #0 {
; CHECK-LABEL: @red_mla_ext_s16_u16_s64(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i16> [[WIDE_LOAD1]] to <4 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i32> [[TMP6]] to <4 x i64>
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[S_010:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A]], i32 [[I_011]]
; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[I_011]]
; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2
; CHECK-NEXT: [[CONV2:%.*]] = zext i16 [[TMP12]] to i32
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
; CHECK-NEXT: [[MUL2:%.*]] = zext i32 [[MUL]] to i64
; CHECK-NEXT: [[ADD]] = add nsw i64 [[S_010]], [[MUL2]]
; CHECK-NEXT: [[INC]] = add nuw i32 [[I_011]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[S_0_LCSSA]]
;
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %s.010 = phi i64 [ %add, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.011
  %0 = load i16, i16* %arrayidx, align 1
  %conv = sext i16 %0 to i32
  %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011
  %1 = load i16, i16* %arrayidx1, align 2
  %conv2 = zext i16 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %mul2 = zext i32 %mul to i64
  %add = add nsw i64 %mul2, %s.010
  %inc = add nuw i32 %i.011, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
  %add.lcssa = phi i64 [ %add, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
  ret i64 %s.0.lcssa
}
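
; A rough C-level sketch of @red_mla_ext_s16_u16_s64 above (assumed source, for
; illustration only): one operand is sign-extended and the other zero-extended,
; and the i32 product is zero-extended into the i64 accumulator.
;
;   #include <stdint.h>
;
;   int64_t red_mla_ext_s16_u16_s64(const int16_t *A, const uint16_t *B, uint32_t n) {
;     int64_t s = 0;
;     for (uint32_t i = 0; i < n; i++)
;       s += (uint32_t)(A[i] * B[i]);
;     return s;
;   }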

; 4x as different sext vs zext
define i32 @red_mla_u8_s8_u32(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 {
; CHECK-LABEL: @red_mla_u8_s8_u32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
;
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  br label %for.body

for.body: ; preds = %for.body.preheader, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %s.010 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011
  %0 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011
  %1 = load i8, i8* %arrayidx1, align 1
  %conv2 = sext i8 %1 to i32
  %mul = mul nsw i32 %conv2, %conv
  %add = add i32 %mul, %s.010
  %inc = add nuw i32 %i.011, 1
  %exitcond.not = icmp eq i32 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
  ret i32 %s.0.lcssa
}
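
; A rough C-level sketch of @red_mla_u8_s8_u32 above (assumed source, for
; illustration only): an unsigned and a signed i8 operand are widened to i32
; before the multiply-accumulate.
;
;   #include <stdint.h>
;
;   uint32_t red_mla_u8_s8_u32(const uint8_t *A, const int8_t *B, uint32_t n) {
;     uint32_t s = 0;
;     for (uint32_t i = 0; i < n; i++)
;       s += (uint32_t)(A[i] * B[i]);
;     return s;
;   }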

; Make sure interleave group members feeding in-loop reductions can be handled.
define i32 @reduction_interleave_group(i32 %n, i32* %arr) #0 {
; CHECK-LABEL: @reduction_interleave_group(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GUARD:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[GUARD]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[ARR:%.*]], i32 -1
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC1]])
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC]])
; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[RED_PHI:%.*]] = phi i32 [ [[RED_2:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ADD:%.*]] = or i32 [[IV]], 1
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[ADD]]
; CHECK-NEXT: [[L_0:%.*]] = load i32, i32* [[GEP_0]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[IV]]
; CHECK-NEXT: [[L_1:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[RED_1:%.*]] = add i32 [[L_0]], [[RED_PHI]]
; CHECK-NEXT: [[RED_2]] = add i32 [[RED_1]], [[L_1]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: [[RET_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RED_2]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RET_LCSSA]]
;
entry:
  %guard = icmp sgt i32 %n, 0
  br i1 %guard , label %for.body, label %exit

for.body: ; preds = %for.body.preheader, %for.body
  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
  %red.phi = phi i32 [ %red.2, %for.body ], [ 0, %entry ]
  %add = or i32 %iv, 1
  %gep.0 = getelementptr inbounds i32, i32* %arr, i32 %add
  %l.0 = load i32, i32* %gep.0, align 4
  %gep.1 = getelementptr inbounds i32, i32* %arr, i32 %iv
  %l.1 = load i32, i32* %gep.1, align 4
  %red.1 = add i32 %l.0, %red.phi
  %red.2 = add i32 %red.1, %l.1
  %iv.next = add nuw nsw i32 %iv, 2
  %cmp = icmp slt i32 %iv.next, %n
  br i1 %cmp, label %for.body, label %exit

exit:
  %ret.lcssa = phi i32 [ 0, %entry ], [ %red.2, %for.body ]
  ret i32 %ret.lcssa
}

attributes #0 = { "target-features"="+mve" }
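
; A rough C-level sketch of @reduction_interleave_group above (assumed source,
; for illustration only): both members of the stride-2 interleave group feed
; the same scalar reduction.
;
;   int reduction_interleave_group(int n, const int *arr) {
;     int red = 0;
;     for (int i = 0; i < n; i += 2) {
;       red += arr[i + 1];
;       red += arr[i];
;     }
;     return red;
;   }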