; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Integer add reduction: each iteration accumulates the truncated induction
; variable, A[i] and B[i] into the running sum, which starts at 0.
;CHECK-LABEL: @reduction_sum(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  ; Skip the loop entirely when %n is not positive.
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  ; sum += i; sum += A[i]; sum += B[i]
  %7 = add i32 %sum.02, %6
  %8 = add i32 %7, %3
  %9 = add i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Integer multiply reduction: same loop shape as reduction_sum, but the
; accumulator starts at 1 and every update is a mul.
;CHECK-LABEL: @reduction_prod(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  ; prod *= i; prod *= A[i]; prod *= B[i]
  %7 = mul i32 %prod.02, %6
  %8 = mul i32 %7, %3
  %9 = mul i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
  ret i32 %prod.0.lcssa
}

; Add reduction whose addend is itself a product: sum += i + A[i] * B[i].
; The mul is not part of the reduction chain; the reduction is still an add.
;CHECK-LABEL: @reduction_mix(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul nsw <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = mul nsw i32 %5, %3
  %7 = trunc i64 %indvars.iv to i32
  %8 = add i32 %sum.02, %7
  %9 = add i32 %8, %6
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Multiply reduction with a non-identity start value (19) where the
; multiplicand is a sum: acc *= (A[i] + i + B[i]).
;CHECK-LABEL: @reduction_mul(
;CHECK: mul <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = add i32 %3, %6
  %8 = add i32 %7, %5
  %9 = mul i32 %8, %sum.02
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Add reduction that starts at 120 rather than 0: the start value is expected
; in lane 0 of the initial vector, with the remaining lanes zero.
;CHECK-LABEL: @start_at_non_zero(
;CHECK: phi <4 x i32>
;CHECK: <i32 120, i32 0, i32 0, i32 0>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  ; sum += coeff[i] * in[i]
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %sum.09
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
  ret i32 %sum.0.lcssa
}

; Bitwise AND reduction seeded with -1: result &= (A[i] + B[i]).
; The start vector is expected to be all-ones.
;CHECK-LABEL: @reduction_and(
;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
;CHECK: and <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %and = and i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
  ret i32 %result.0.lcssa
}

; Bitwise OR reduction seeded with 0: result |= (A[i] + B[i]).
;CHECK-LABEL: @reduction_or(
;CHECK: or <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %or = or i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
  ret i32 %result.0.lcssa
}

; Bitwise XOR reduction seeded with 0: result ^= (A[i] + B[i]).
;CHECK-LABEL: @reduction_xor(
;CHECK: xor <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %xor = xor i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}

; In this code the subtracted variable is on the RHS and this is not an induction variable.
; x = A[i] - x: the running value is the subtrahend, so this must stay scalar.
; The negative sub pattern accepts any (or no) instruction flags: a vectorized
; reduction op is emitted without nsw (the reduction_sub_lhs expectations in
; this file match a flag-less vector sub for a scalar "sub nsw"), so a pattern
; that insisted on nsw could never fire.
;CHECK-LABEL: @reduction_sub_rhs(
;CHECK-NOT: phi <4 x i32>
;CHECK-NOT: sub{{.*}}<4 x i32>
;CHECK: ret i32
define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
entry:
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  ; The loaded value is the minuend and the accumulator the subtrahend.
  %sub = sub nsw i32 %0, %x.05
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}


; In this test the reduction variable is on the LHS and we can vectorize it.
; x = x - A[i]: the accumulator is the minuend.
;CHECK-LABEL: @reduction_sub_lhs(
;CHECK: phi <4 x i32>
;CHECK: sub <4 x i32>
;CHECK: ret i32
define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
entry:
  ; Guard: only enter the loop for a positive trip count.
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %sub = sub nsw i32 %x.05, %0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}

; We can vectorize conditional reductions with multi-input phis.
; The label check is anchored on "@name(" like the other tests in this file so
; that the following patterns are scoped to this function's output.
; CHECK-LABEL: @reduction_conditional(
; CHECK: fadd fast <4 x float>

define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %0, %1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  %add = fadd fast float %sum.033, %0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  %add19 = fadd fast float %sum.033, %1
  br label %for.inc

for.inc:
  ; Every incoming value is either an updated sum or the unchanged sum.
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}

; We can't vectorize reductions with phi inputs from outside the reduction.
; The negative pattern includes the "fast" flag: the scalar adds below are
; "fadd fast", so a vectorized add would print with that flag and a flag-less
; pattern would never match it.
; CHECK-LABEL: @noreduction_phi(
; CHECK-NOT: fadd fast <4 x float>
define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %0, %1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  %add = fadd fast float %sum.033, %0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  %add19 = fadd fast float %sum.033, %1
  br label %for.inc

for.inc:
  ; The constant 0.0 arriving from %if.else is the input from outside the
  ; reduction chain.
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}

; We can't vectorize reductions that feed another header PHI.
; The label check is anchored on "@name(" like the other tests in this file.
; The negative pattern includes the "fast" flag because the scalar adds are
; "fadd fast"; a flag-less pattern would never match a vectorized form.
; CHECK-LABEL: @noredux_header_phi(
; CHECK-NOT: fadd fast <4 x float>

define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ]
  %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %add = fadd fast float %sum.08, %0
  ; %add feeds both its own header phi (%sum.08) and the %sum2.09 chain.
  %add1 = fadd fast float %sum2.09, %add
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %add1.lcssa = phi float [ %add1, %for.body ]
  %add.lcssa = phi float [ %add, %for.body ]
  %add2 = fadd fast float %add.lcssa, %add1.lcssa
  ret float %add2
}


; When vectorizing a reduction whose loop header phi value is used outside the
; loop special care must be taken. Otherwise, the reduced value feeding into the
; outside user misses a few iterations (VF-1) of the loop.
; PR16522

; CHECK-LABEL: @phivalueredux(
; CHECK-NOT: x i32>

define i32 @phivalueredux(i32 %p) {
entry:
  br label %for.body

for.body:
  %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ]
  %xor = xor i32 %p.addr.02, -1
  %inc = add nsw i32 %t.03, 1
  %exitcond = icmp eq i32 %inc, 16
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ; Returns the header phi itself, not the updated value %xor.
  ret i32 %p.addr.02
}

; Don't vectorize a reduction value that is not the last in a reduction cycle. We
; would lose iterations (VF-1) on the operations after that use.
; PR17498

; The label is anchored on "@name(" like the other tests in this file so it
; can only match the function definition line.
; CHECK-LABEL: @not_last_operation(
; CHECK-NOT: x i32>
define i32 @not_last_operation(i32 %p, i32 %val) {
entry:
  %tobool = icmp eq i32 %p, 0
  br label %for.body

for.body:
  %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ]
  %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ]
  %0 = zext i1 %tobool to i32
  %inc4.1 = xor i32 %0, 1
  %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1
  %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1
  %inc6.1 = add nsw i32 %inc613.1, 1
  %exitcond.1 = icmp eq i32 %inc6.1, 22
  br i1 %exitcond.1, label %exit, label %for.body

exit:
  ; The value used after the loop is the intermediate add, not %inc5.1, which
  ; is the value fed back into the header phi.
  %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
  ret i32 %inc.2
}

; Add reduction whose final value reaches two separate phis in the exit block.
;CHECK-LABEL: @reduction_sum_multiuse(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: %sum.copy = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ]
;CHECK: ret i32
define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph.preheader, label %end
.lr.ph.preheader:                                 ; preds = %0
  br label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = add i32 %sum.02, %6
  %8 = add i32 %7, %3
  %9 = add i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  ; Two phis carry the same reduction result out of the loop.
  %sum.lcssa = phi i32 [ %9, %.lr.ph ]
  %sum.copy = phi i32 [ %9, %.lr.ph ]
  br label %end

end:
  %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ]
  %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ]
  %final = add i32 %f1, %f2
  ret i32 %final
}

; This looks like a predicated reduction, but it is a reset of
; the reduction variable. We cannot vectorize this.
;
; The loop-carried value %.017 starts at 100. On every iteration it becomes
; either %c7 + %.017 (when the loaded element %c7 is > 0) or the constant 0,
; so the select discards the running value instead of merely skipping an
; update. The checks below require that no <4 x i32> appears in the output
; for this function.
; CHECK-LABEL: @reduction_reset(
; CHECK-NOT: <4 x i32>
define void @reduction_reset(i32 %N, i32* nocapture readonly %arrayA, i32* nocapture %arrayB) {
entry:
  %c4 = icmp sgt i32 %N, 0
  br i1 %c4, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %entry
  %c5 = add i32 %N, -1
  %wide.trip.count = zext i32 %N to i64
  br label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ 0, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]
  %.017 = phi i32 [ 100, %.lr.ph.preheader ], [ %csel, %.lr.ph ]
  %c6 = getelementptr inbounds i32, i32* %arrayA, i64 %indvars.iv
  %c7 = load i32, i32* %c6, align 4
  %c8 = icmp sgt i32 %c7, 0
  %c9 = add nsw i32 %c7, %.017
  ; The false arm is the constant 0, not %.017: this is the "reset".
  %csel = select i1 %c8, i32 %c9, i32 0
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  %csel.lcssa = phi i32 [ %csel, %.lr.ph ]
  %phitmp19 = sext i32 %c5 to i64
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %entry
  %.015.lcssa = phi i64 [ -1, %entry ], [ %phitmp19, %._crit_edge.loopexit ]
  %.0.lcssa = phi i32 [ 100, %entry ], [ %csel.lcssa, %._crit_edge.loopexit ]
  %c10 = getelementptr inbounds i32, i32* %arrayB, i64 %.015.lcssa
  store i32 %.0.lcssa, i32* %c10, align 4
  ret void
}

; Can vectorize reduction with redundant single-operand phi input.
;
; The backedge value of the %sum header phi is %phi.sum.next, a phi in
; %loop.bb whose only incoming value is %sum.next. The same phi also feeds
; the exit phi %lcssa.exit.
define i64 @reduction_with_phi_with_one_incoming_on_backedge(i16 %n, i64* %A) {
; CHECK-LABEL: @reduction_with_phi_with_one_incoming_on_backedge(
; CHECK: add <4 x i64>
;
entry:
  br label %loop.header

loop.header:
  %iv = phi i16 [ 1, %entry ], [ %iv.next, %loop.latch ]
  %sum = phi i64 [ 0, %entry ], [ %phi.sum.next, %loop.latch ]
  %gep.A = getelementptr i64, i64* %A, i16 %iv
  %lv.A = load i64, i64* %gep.A
  %sum.next = add nsw i64 %sum, %lv.A
  br label %loop.bb

loop.bb:
  %phi.sum.next = phi i64 [ %sum.next, %loop.header ]
  br label %loop.latch

loop.latch:
  %iv.next = add nsw i16 %iv, 1
  %cond = icmp slt i16 %iv.next, %n
  br i1 %cond, label %loop.header, label %exit

exit:
  %lcssa.exit = phi i64 [ %phi.sum.next, %loop.latch ]
  ret i64 %lcssa.exit
}

; Can vectorize reduction with redundant two-operand phi input.
;
; %loop.header branches conditionally to %loop.bb or straight to %loop.latch.
; The latch phi %phi.sum.next receives %sum.next from both predecessors, so
; both of its incoming values are the same.
define i64 @reduction_with_phi_with_two_incoming_on_backedge(i16 %n, i64* %A) {
; CHECK-LABEL: @reduction_with_phi_with_two_incoming_on_backedge(
; CHECK: add <4 x i64>
;
entry:
  br label %loop.header

loop.header:
  %iv = phi i16 [ 1, %entry ], [ %iv.next, %loop.latch ]
  %sum = phi i64 [ 0, %entry ], [ %phi.sum.next, %loop.latch ]
  %gep.A = getelementptr i64, i64* %A, i16 %iv
  %lv.A = load i64, i64* %gep.A
  %sum.next = add nsw i64 %sum, %lv.A
  %cmp.0 = icmp eq i64 %lv.A, 29
  br i1 %cmp.0, label %loop.bb, label %loop.latch

loop.bb:
  br label %loop.latch

loop.latch:
  %phi.sum.next = phi i64 [ %sum.next, %loop.bb ], [ %sum.next, %loop.header ]
  %iv.next = add nsw i16 %iv, 1
  %cond = icmp slt i16 %iv.next, %n
  br i1 %cond, label %loop.header, label %exit

exit:
  %lcssa.exit = phi i64 [ %phi.sum.next, %loop.latch ]
  ret i64 %lcssa.exit
}

; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare