; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s

; Loop-vectorizer reduction tests. The opt invocation above forces a vector
; width of 4 with an interleave count of 1 and passes
; -force-reduction-intrinsics; the assertions in each function expect the
; horizontal reduction to appear as one llvm.experimental.vector.reduce.* call
; in middle.block. -dce and -instcombine run afterwards, which is why the
; leftover scalar loop is matched only as a branch on undef.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Integer add reduction with a single addend: sum starts at 0 and accumulates
; A[i] until the truncated induction variable reaches 256. The assertions
; expect a <4 x i32> accumulator seeded with zeroinitializer and a final
; vector.reduce.add over it.
define i32 @reduction_sum_single(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sum_single(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !2
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l7 = add i32 %sum.02, %l3
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Integer add reduction with three addends per iteration: the induction
; variable truncated to i32, A[i] and B[i]. The assertions expect the
; truncated induction variable to appear as a <4 x i32> vector induction
; starting at <0, 1, 2, 3> and stepping by 4.
define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_sum(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !5
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = trunc i64 %indvars.iv to i32
  %l7 = add i32 %sum.02, %l6
  %l8 = add i32 %l7, %l3
  %l9 = add i32 %l8, %l5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Integer add reduction where each iteration adds A[i] and the constant 3.
; The assertions expect the accumulator to start at zeroinitializer and the
; constant to be added inside vector.body as a <3, 3, 3, 3> splat.
define i32 @reduction_sum_const(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sum_const(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[TMP2]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l7 = add i32 %sum.02, %l3
  %l9 = add i32 %l7, 3
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

158define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B) { 159; CHECK-LABEL: @reduction_prod( 160; CHECK-NEXT: entry: 161; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 162; CHECK: vector.ph: 163; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 164; CHECK: vector.body: 165; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 166; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] 167; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ] 168; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] 169; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* 170; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 171; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] 172; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* 173; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 174; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[VEC_IND2]] 175; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD]] 176; CHECK-NEXT: [[TMP6]] = mul <4 x i32> [[TMP5]], [[WIDE_LOAD1]] 177; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 178; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4> 179; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 180; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 181; CHECK: middle.block: 182; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]]) 183; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] 184; CHECK: scalar.ph: 185; CHECK-NEXT: br label [[DOTLR_PH:%.*]] 186; CHECK: .lr.ph: 187; 
CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !9 188; CHECK: ._crit_edge: 189; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] 190; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]] 191; 192entry: 193 br label %.lr.ph 194 195.lr.ph: ; preds = %entry, %.lr.ph 196 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ] 197 %prod.02 = phi i32 [ %l9, %.lr.ph ], [ 1, %entry ] 198 %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 199 %l3 = load i32, i32* %l2, align 4 200 %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv 201 %l5 = load i32, i32* %l4, align 4 202 %l6 = trunc i64 %indvars.iv to i32 203 %l7 = mul i32 %prod.02, %l6 204 %l8 = mul i32 %l7, %l3 205 %l9 = mul i32 %l8, %l5 206 %indvars.iv.next = add i64 %indvars.iv, 1 207 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 208 %exitcond = icmp eq i32 %lftr.wideiv, 256 209 br i1 %exitcond, label %._crit_edge, label %.lr.ph 210 211._crit_edge: ; preds = %.lr.ph 212 %prod.0.lcssa = phi i32 [ %l9, %.lr.ph ] 213 ret i32 %prod.0.lcssa 214} 215 216define i32 @reduction_mix(i32* noalias nocapture %A, i32* noalias nocapture %B) { 217; CHECK-LABEL: @reduction_mix( 218; CHECK-NEXT: entry: 219; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 220; CHECK: vector.ph: 221; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 222; CHECK: vector.body: 223; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 224; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] 225; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ] 226; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] 227; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* 228; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x 
i32>* [[TMP1]], align 4 229; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] 230; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* 231; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 232; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] 233; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]] 234; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[TMP4]] 235; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 236; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4> 237; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 238; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 239; CHECK: middle.block: 240; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) 241; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] 242; CHECK: scalar.ph: 243; CHECK-NEXT: br label [[DOTLR_PH:%.*]] 244; CHECK: .lr.ph: 245; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !11 246; CHECK: ._crit_edge: 247; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] 248; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] 249; 250entry: 251 br label %.lr.ph 252 253.lr.ph: ; preds = %entry, %.lr.ph 254 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ] 255 %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ] 256 %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 257 %l3 = load i32, i32* %l2, align 4 258 %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv 259 %l5 = load i32, i32* %l4, align 4 260 %l6 = mul nsw i32 %l5, %l3 261 %l7 = trunc i64 %indvars.iv to i32 262 %l8 = add i32 %sum.02, %l7 263 %l9 = add i32 %l8, %l6 264 %indvars.iv.next = add i64 %indvars.iv, 1 265 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 266 %exitcond = icmp eq 
i32 %lftr.wideiv, 256 267 br i1 %exitcond, label %._crit_edge, label %.lr.ph 268 269._crit_edge: ; preds = %.lr.ph 270 %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ] 271 ret i32 %sum.0.lcssa 272} 273 274define i32 @reduction_mul(i32* noalias nocapture %A, i32* noalias nocapture %B) { 275; CHECK-LABEL: @reduction_mul( 276; CHECK-NEXT: entry: 277; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 278; CHECK: vector.ph: 279; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 280; CHECK: vector.body: 281; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 282; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 19, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] 283; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] 284; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* 285; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 286; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] 287; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* 288; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 289; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] 290; CHECK-NEXT: [[TMP5]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD1]] 291; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 292; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 293; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 294; CHECK: middle.block: 295; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) 296; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] 297; CHECK: scalar.ph: 298; CHECK-NEXT: br label [[DOTLR_PH:%.*]] 299; CHECK: .lr.ph: 300; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !13 301; CHECK: ._crit_edge: 302; 
CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] 303; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] 304; 305entry: 306 br label %.lr.ph 307 308.lr.ph: ; preds = %entry, %.lr.ph 309 %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ] 310 %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 19, %entry ] 311 %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 312 %l3 = load i32, i32* %l2, align 4 313 %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv 314 %l5 = load i32, i32* %l4, align 4 315 %l6 = mul i32 %sum.02, %l3 316 %l7 = mul i32 %l6, %l5 317 %indvars.iv.next = add i64 %indvars.iv, 1 318 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 319 %exitcond = icmp eq i32 %lftr.wideiv, 256 320 br i1 %exitcond, label %._crit_edge, label %.lr.ph 321 322._crit_edge: ; preds = %.lr.ph 323 %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ] 324 ret i32 %sum.0.lcssa 325} 326 327define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out) { 328; CHECK-LABEL: @start_at_non_zero( 329; CHECK-NEXT: entry: 330; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 331; CHECK: vector.ph: 332; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 333; CHECK: vector.body: 334; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 335; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 120, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] 336; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[INDEX]] 337; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* 338; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 339; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[COEFF:%.*]], i64 [[INDEX]] 340; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* 341; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 342; CHECK-NEXT: 
[[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] 343; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[TMP4]], [[VEC_PHI]] 344; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 345; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 346; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14 347; CHECK: middle.block: 348; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) 349; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 350; CHECK: scalar.ph: 351; CHECK-NEXT: br label [[FOR_BODY:%.*]] 352; CHECK: for.body: 353; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15 354; CHECK: for.end: 355; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] 356; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] 357; 358entry: 359 br label %for.body 360 361for.body: ; preds = %entry, %for.body 362 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] 363 %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] 364 %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv 365 %l0 = load i32, i32* %arrayidx, align 4 366 %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv 367 %l1 = load i32, i32* %arrayidx2, align 4 368 %mul = mul nsw i32 %l1, %l0 369 %add = add nsw i32 %mul, %sum.09 370 %indvars.iv.next = add i64 %indvars.iv, 1 371 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 372 %exitcond = icmp eq i32 %lftr.wideiv, 256 373 br i1 %exitcond, label %for.end, label %for.body 374 375for.end: ; preds = %for.body, %entry 376 %sum.0.lcssa = phi i32 [ %add, %for.body ] 377 ret i32 %sum.0.lcssa 378} 379 380define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) { 381; CHECK-LABEL: @reduction_and( 382; CHECK-NEXT: entry: 383; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 384; CHECK: vector.ph: 385; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 
386; CHECK: vector.body: 387; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 388; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] 389; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] 390; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* 391; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 392; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] 393; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* 394; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 395; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] 396; CHECK-NEXT: [[TMP5]] = and <4 x i32> [[TMP4]], [[WIDE_LOAD1]] 397; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 398; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 399; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16 400; CHECK: middle.block: 401; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) 402; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 403; CHECK: scalar.ph: 404; CHECK-NEXT: br label [[FOR_BODY:%.*]] 405; CHECK: for.body: 406; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17 407; CHECK: for.end: 408; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] 409; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] 410; 411entry: 412 br label %for.body 413 414for.body: ; preds = %entry, %for.body 415 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] 416 %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ] 417 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 418 %l0 = load i32, i32* %arrayidx, align 4 419 %arrayidx2 = 
getelementptr inbounds i32, i32* %B, i64 %indvars.iv 420 %l1 = load i32, i32* %arrayidx2, align 4 421 %add = and i32 %result.08, %l0 422 %and = and i32 %add, %l1 423 %indvars.iv.next = add i64 %indvars.iv, 1 424 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 425 %exitcond = icmp eq i32 %lftr.wideiv, 256 426 br i1 %exitcond, label %for.end, label %for.body 427 428for.end: ; preds = %for.body, %entry 429 %result.0.lcssa = phi i32 [ %and, %for.body ] 430 ret i32 %result.0.lcssa 431} 432 433define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) { 434; CHECK-LABEL: @reduction_or( 435; CHECK-NEXT: entry: 436; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 437; CHECK: vector.ph: 438; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 439; CHECK: vector.body: 440; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 441; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] 442; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] 443; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* 444; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 445; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] 446; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* 447; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 448; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] 449; CHECK-NEXT: [[TMP5]] = or <4 x i32> [[TMP4]], [[VEC_PHI]] 450; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 451; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 452; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18 453; CHECK: middle.block: 454; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) 455; CHECK-NEXT: 
br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 456; CHECK: scalar.ph: 457; CHECK-NEXT: br label [[FOR_BODY:%.*]] 458; CHECK: for.body: 459; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !19 460; CHECK: for.end: 461; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] 462; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] 463; 464entry: 465 br label %for.body 466 467for.body: ; preds = %entry, %for.body 468 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] 469 %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] 470 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 471 %l0 = load i32, i32* %arrayidx, align 4 472 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv 473 %l1 = load i32, i32* %arrayidx2, align 4 474 %add = add nsw i32 %l1, %l0 475 %or = or i32 %add, %result.08 476 %indvars.iv.next = add i64 %indvars.iv, 1 477 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 478 %exitcond = icmp eq i32 %lftr.wideiv, 256 479 br i1 %exitcond, label %for.end, label %for.body 480 481for.end: ; preds = %for.body, %entry 482 %result.0.lcssa = phi i32 [ %or, %for.body ] 483 ret i32 %result.0.lcssa 484} 485 486define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) { 487; CHECK-LABEL: @reduction_xor( 488; CHECK-NEXT: entry: 489; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 490; CHECK: vector.ph: 491; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 492; CHECK: vector.body: 493; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 494; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] 495; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] 496; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* 497; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = xor <4 x i32> [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !21
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

; Scalar input loop: %result.08 starts at 0 and is XORed with (B[i] + A[i]) on
; each of the 256 iterations.
for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %l1, %l0
  %xor = xor i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

; The only predecessor is %for.body (entry branches straight into the loop).
for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}

; Floating-point add reduction. The scalar accumulator starts at 0.0 and each
; of the 256 iterations adds A[i] and then B[i] using 'fast' fadds. The expected
; output keeps a <4 x float> accumulator in vector.body and folds it with
; llvm.experimental.vector.reduce.v2.fadd in middle.block.
define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fadd(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = fadd fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !22
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !23
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %add = fadd fast float %result.08, %l0
  %fadd = fadd fast float %add, %l1
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

; The only predecessor is %for.body (entry branches straight into the loop).
for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi float [ %fadd, %for.body ]
  ret float %result.0.lcssa
}

; Floating-point multiply reduction over A[i] and B[i] using 'fast' fmuls,
; 256 iterations.
; NOTE: the scalar accumulator starts at 0.0 (not 1.0); accordingly the expected
; vector phi start value is <0.0, 1.0, 1.0, 1.0>, and middle.block folds the
; lanes with llvm.experimental.vector.reduce.v2.fmul.
define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fmul(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP5]] = fmul fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !24
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !25
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  ; NOTE: despite its name, %add is a multiply.
  %add = fmul fast float %result.08, %l0
  %fmul = fmul fast float %add, %l1
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

; The only predecessor is %for.body (entry branches straight into the loop).
for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi float [ %fmul, %for.body ]
  ret float %result.0.lcssa
}

; Signed-minimum reduction (icmp slt + select) over A[i], starting at 1000,
; 256 iterations. %B is not used by the loop. The expected output splats 1000
; as the vector start value and reduces with
; llvm.experimental.vector.reduce.smin in middle.block.
define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_min(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !26
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !27
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp slt i32 %result.08, %l0
  %v0 = select i1 %c0, i32 %result.08, i32 %l0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

; The only predecessor is %for.body (entry branches straight into the loop).
for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}

; Unsigned-maximum reduction (icmp ugt + select) over A[i], starting at 1000,
; 256 iterations. %B is not used by the loop. The expected output splats 1000
; as the vector start value and reduces with
; llvm.experimental.vector.reduce.umax in middle.block.
define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_max(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !28
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !29
; CHECK:       for.end:
; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp ugt i32 %result.08, %l0
  %v0 = select i1 %c0, i32 %result.08, i32 %l0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

; The only predecessor is %for.body (entry branches straight into the loop).
for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}

; Sub: we can create a reduction, but not an in-loop one.
; The scalar loop computes x = x - A[i] (accumulator on the left-hand side),
; starting at 0, for 256 iterations. The expected output subtracts lane-wise in
; vector.body and combines the lanes with
; llvm.experimental.vector.reduce.add in middle.block.
define i32 @reduction_sub_lhs(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sub_lhs(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !31
; CHECK:       for.end:
; CHECK-NEXT:    [[X_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[X_0_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %sub = sub nsw i32 %x.05, %l0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %for.end, label %for.body

; The only predecessor is %for.body (entry branches straight into the loop).
for.end:                                          ; preds = %for.body
  %x.0.lcssa = phi i32 [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}

; Conditional reductions with multi-input phis.
; The scalar loop runs 128 iterations with the float accumulator starting at %S.
; Per iteration, with l0 = A[i] and l1 = B[i]:
;   - l0 > l1 and l1 > 1.0                      -> sum += l0   (if.then8)
;   - l0 > l1, not (l1 > 1.0), and l0 > 2.0     -> sum += l1   (if.then16)
;   - otherwise                                 -> sum unchanged
; The four-input phi in for.inc merges these paths. The expected output
; flattens the control flow into vector selects and reduces with
; llvm.experimental.vector.reduce.v2.fadd in middle.block. %C is not used.
define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
; CHECK-LABEL: @reduction_conditional(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[S:%.*]], i32 0
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP0]], [[VECTOR_PH]] ], [ [[PREDPHI3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ule <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
; CHECK-NEXT:    [[TMP8:%.*]] = and <4 x i1> [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i1> [[TMP8]], [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT:    [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT:    [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[PREDPHI_V]]
; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[PREDPHI3]] = select <4 x i1> [[TMP13]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !32
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
; CHECK:       if.then:
; CHECK-NEXT:    br i1 undef, label [[IF_THEN8:%.*]], label [[IF_ELSE:%.*]]
; CHECK:       if.then8:
; CHECK-NEXT:    br label [[FOR_INC]]
; CHECK:       if.else:
; CHECK-NEXT:    br i1 undef, label [[IF_THEN16:%.*]], label [[FOR_INC]]
; CHECK:       if.then16:
; CHECK-NEXT:    br label [[FOR_INC]]
; CHECK:       for.inc:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !33
; CHECK:       for.end:
; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi float [ undef, [[FOR_INC]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret float [[SUM_1_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %l0, %l1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %l1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  %add = fadd fast float %sum.033, %l0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %l0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  %add19 = fadd fast float %sum.033, %l1
  br label %for.inc

for.inc:
  ; Merge point: two updated sums plus two pass-through paths.
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}

; The accumulator %sum.02 is used twice inside the add chain (by %l7 and again
; by %l10). The expected output is the original scalar loop: entry branches
; directly to .lr.ph and no vector.body is produced. %l9 (and the %B load
; feeding it) has no users and does not appear in the expected output.
define i32 @reduction_sum_multiuse(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_sum_multiuse(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], [[DOTLR_PH]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    [[L2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[L3:%.*]] = load i32, i32* [[L2]], align 4
; CHECK-NEXT:    [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT:    [[L7:%.*]] = add i32 [[SUM_02]], [[L6]]
; CHECK-NEXT:    [[L8:%.*]] = add i32 [[L7]], [[L3]]
; CHECK-NEXT:    [[L10]] = add i32 [[L8]], [[SUM_02]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[DOTLR_PH]]
; CHECK:       end:
; CHECK-NEXT:    ret i32 [[L10]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l10, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = trunc i64 %indvars.iv to i32
  %l7 = add i32 %sum.02, %l6
  %l8 = add i32 %l7, %l3
  %l9 = add i32 %l8, %l5
  ; Second use of %sum.02 within the same iteration.
  %l10 = add i32 %l8, %sum.02
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %end, label %.lr.ph

end:
  %f1 = phi i32 [ %l10, %.lr.ph ]
  ret i32 %f1
}

; Predicated loop, cannot (yet) use in-loop reductions.
; The latch branch carries !llvm.loop !6 (defined at the end of the file). The
; scalar loop sums the truncated induction variable, A[i] and B[i] for 256
; iterations. The expected output uses wide loads in vector.body and an
; llvm.experimental.vector.reduce.add call in middle.block.
define i32 @reduction_predicated(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_predicated(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !34
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !35
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l6 = trunc i64 %indvars.iv to i32
  %l7 = add i32 %sum.02, %l6
  %l8 = add i32 %l7, %l3
  %l9 = add i32 %l8, %l5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !6

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Add reduction carried in i32 but masked to its low 8 bits every iteration
; (and with 255), starting at 255, with the result truncated to i8 after the
; loop. The expected output performs the whole reduction on <4 x i8> (start
; value <i8 -1, 0, 0, 0>) and returns the i8 reduce.add result directly.
define i8 @reduction_add_trunc(i8* noalias nocapture %A) {
; CHECK-LABEL: @reduction_add_trunc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i8> [ <i8 -1, i8 0, i8 0, i8 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP3]] = add <4 x i8> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !36
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> [[TMP3]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !37
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i8 [ undef, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i8 [[SUM_0_LCSSA]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02p = phi i32 [ %l9, %.lr.ph ], [ 255, %entry ]
  ; Keep only the low 8 bits of the running sum.
  %sum.02 = and i32 %sum.02p, 255
  %l2 = getelementptr inbounds i8, i8* %A, i32 %indvars.iv
  %l3 = load i8, i8* %l2, align 4
  %l3e = zext i8 %l3 to i32
  %l9 = add i32 %sum.02, %l3e
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  %ret = trunc i32 %sum.0.lcssa to i8
  ret i8 %ret
}


; Same loop shape as @reduction_add_trunc but with 'and' as the reduction
; operation. Here the expected output keeps the reduction at <4 x i32> (start
; value <255, -1, -1, -1>), zero-extends the loaded bytes, and truncates the
; i32 reduce.and result to i8 in the exit block.
define i8 @reduction_and_trunc(i8* noalias nocapture %A) {
; CHECK-LABEL: @reduction_and_trunc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 255, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 4
; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT:    [[TMP4]] = and <4 x i32> [[VEC_PHI]], [[TMP3]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !38
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
; CHECK:       .lr.ph:
; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !39
; CHECK:       ._crit_edge:
; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[RET:%.*]] = trunc i32 [[SUM_0_LCSSA]] to i8
; CHECK-NEXT:    ret i8 [[RET]]
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02p = phi i32 [ %l9, %.lr.ph ], [ 255, %entry ]
  ; Keep only the low 8 bits of the running value.
  %sum.02 = and i32 %sum.02p, 255
  %l2 = getelementptr inbounds i8, i8* %A, i32 %indvars.iv
  %l3 = load i8, i8* %l2, align 4
  %l3e = zext i8 %l3 to i32
  %l9 = and i32 %sum.02, %l3e
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 256
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  %ret = trunc i32 %sum.0.lcssa to i8
  ret i8 %ret
}

; Loop metadata referenced by the latch branch of @reduction_predicated:
; !7 sets llvm.loop.vectorize.predicate.enable to true and !8 sets
; llvm.loop.vectorize.enable to true.
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!8 = !{!"llvm.loop.vectorize.enable", i1 true}
