; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilog -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s

; Reduction loops vectorized with a forced vector width of 4 and predication
; preferred over a scalar epilogue. Every loop below runs 257 iterations
; (induction variable starts at 0, exits when the incremented value equals 257),
; which is not a multiple of 4. For each function the checks expect:
;   * a vector phi for the reduction in vector.body,
;   * the vector reduction operation(s) feeding that phi,
;   * in middle.block, a select between the updated reduction vector and the
;     phi value, followed by a call to the matching reduction intrinsic.
; Every FileCheck variable is captured inside the function that uses it, so the
; checks do not depend on values captured while matching an earlier function.

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

; Integer add reduction: sum += A[i], starting from 0.
define i32 @reduction_sum_single(i32* noalias nocapture %A) {
; CHECK-LABEL: @reduction_sum_single(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP24:%.*]], %pred.load.continue6 ]
; CHECK:         [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23:%.*]]
; CHECK:       middle.block:
; CHECK:         [[TMP26:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP27:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP26]])
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l7 = add i32 %sum.02, %l3
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Integer add reduction with three addends per iteration:
; sum += i + A[i] + B[i], starting from 0.
define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_sum(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ]
; CHECK:         [[TMP44:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND:%.*]]
; CHECK:         [[TMP45:%.*]] = add <4 x i32> [[TMP44]], [[TMP23:%.*]]
; CHECK:         [[TMP46]] = add <4 x i32> [[TMP45]], [[TMP43:%.*]]
; CHECK:       middle.block:
; CHECK:         [[TMP48:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP49:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP48]])
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l7 = add i32 %sum.02, %indvars.iv
  %l8 = add i32 %l7, %l3
  %l9 = add i32 %l8, %l5
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Integer multiply reduction: prod *= A[i] * B[i], starting from 1.
define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @reduction_prod(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
; CHECK:         [[TMP44:%.*]] = mul <4 x i32> [[VEC_PHI]], [[TMP23:%.*]]
; CHECK:         [[TMP45]] = mul <4 x i32> [[TMP44]], [[TMP43:%.*]]
; CHECK:       middle.block:
; CHECK:         [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP47]])
;
entry:
  br label %.lr.ph

.lr.ph:                                           ; preds = %entry, %.lr.ph
  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
  %prod.02 = phi i32 [ %l9, %.lr.ph ], [ 1, %entry ]
  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l3 = load i32, i32* %l2, align 4
  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l5 = load i32, i32* %l4, align 4
  %l8 = mul i32 %prod.02, %l3
  %l9 = mul i32 %l8, %l5
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  %prod.0.lcssa = phi i32 [ %l9, %.lr.ph ]
  ret i32 %prod.0.lcssa
}

; Bitwise and reduction: result &= A[i] & B[i], starting from -1 (all ones).
define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_and(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
; CHECK:         [[TMP44:%.*]] = and <4 x i32> [[VEC_PHI]], [[TMP42:%.*]]
; CHECK:         [[TMP45]] = and <4 x i32> [[TMP44]], [[TMP43:%.*]]
; CHECK:       middle.block:
; CHECK:         [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP47]])
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = and i32 %result.08, %l0
  %and = and i32 %add, %l1
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %and, %for.body ]
  ret i32 %result.0.lcssa
}

; Bitwise or reduction: result |= (A[i] + B[i]), starting from 0.
define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_or(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
; CHECK:         [[TMP45]] = or <4 x i32> [[TMP44:%.*]], [[VEC_PHI]]
; CHECK:       middle.block:
; CHECK:         [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP47]])
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %l1, %l0
  %or = or i32 %add, %result.08
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %or, %for.body ]
  ret i32 %result.0.lcssa
}

; Bitwise xor reduction: result ^= (A[i] + B[i]), starting from 0.
define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_xor(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
; CHECK:         [[TMP45]] = xor <4 x i32> [[TMP44:%.*]], [[VEC_PHI]]
; CHECK:       middle.block:
; CHECK:         [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP47]])
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
  %l1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %l1, %l0
  %xor = xor i32 %add, %result.08
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}

; Fast-math float add reduction: result += A[i] + B[i], starting from 0.0.
define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fadd(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
; CHECK:         [[TMP44:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP23:%.*]]
; CHECK:         [[TMP45]] = fadd fast <4 x float> [[TMP44]], [[TMP43:%.*]]
; CHECK:       middle.block:
; CHECK:         [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]]
; CHECK:         [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP47]])
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %add = fadd fast float %result.08, %l0
  %fadd = fadd fast float %add, %l1
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi float [ %fadd, %for.body ]
  ret float %result.0.lcssa
}

; Fast-math float multiply reduction: result *= A[i] * B[i].
; The scalar start value is 0.0 (not 1.0); the checks expect the vector phi to
; start as <0.0, 1.0, 1.0, 1.0>.
define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
; CHECK-LABEL: @reduction_fmul(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
; CHECK:         [[TMP44:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[TMP23:%.*]]
; CHECK:         [[TMP45]] = fmul fast <4 x float> [[TMP44]], [[TMP43:%.*]]
; CHECK:       middle.block:
; CHECK:         [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]]
; CHECK:         [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP47]])
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv
  %l0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv
  %l1 = load float, float* %arrayidx2, align 4
  %add = fmul fast float %result.08, %l0
  %fmul = fmul fast float %add, %l1
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi float [ %fmul, %for.body ]
  ret float %result.0.lcssa
}

; Signed minimum reduction over A (icmp slt + select), starting from 1000.
; %B is not used by the loop.
define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_min(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, %vector.ph ], [ [[TMP25:%.*]], %pred.load.continue6 ]
; CHECK:         [[TMP24:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[TMP23:%.*]]
; CHECK:         [[TMP25]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]
; CHECK:       middle.block:
; CHECK:         [[TMP27:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP27]])
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp slt i32 %result.08, %l0
  %v0 = select i1 %c0, i32 %result.08, i32 %l0
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}

; Unsigned maximum reduction over A (icmp ugt + select), starting from 1000.
; %B is not used by the loop.
define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
; CHECK-LABEL: @reduction_max(
; CHECK:       vector.body:
; CHECK:         [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, %vector.ph ], [ [[TMP25:%.*]], %pred.load.continue6 ]
; CHECK:         [[TMP24:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[TMP23:%.*]]
; CHECK:         [[TMP25]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]
; CHECK:       middle.block:
; CHECK:         [[TMP27:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]]
; CHECK:         [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP27]])
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %l0 = load i32, i32* %arrayidx, align 4
  %c0 = icmp ugt i32 %result.08, %l0
  %v0 = select i1 %c0, i32 %result.08, i32 %l0
  %indvars.iv.next = add i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next, 257
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %result.0.lcssa = phi i32 [ %v0, %for.body ]
  ret i32 %result.0.lcssa
}