; RUN: opt -S < %s -basic-aa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s

; Loop-vectorizer tests for an aarch64 triple. Each function is a simple
; counted loop that loads a narrow integer, does arithmetic on it (usually
; after extending to i32) and stores the result. The FileCheck patterns pin
; the vector element type and width the vectorized arithmetic is expected to
; use, and which nuw/nsw flags are expected on it.

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"

; q[i] = trunc_i8(zext_i32(p[i]) + 2). The i32 add is only consumed through a
; truncate to i8; the expected output performs the add on <16 x i8>.
; CHECK-LABEL: @add_a(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv1, i8* %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
; working with.
; Here the scalar add is already done in i8, with no extend/truncate pair.
; CHECK-LABEL: @add_a1(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  %add = add nuw nsw i8 %0, 2
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %add, i8* %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Same shape as @add_a but with i16 elements: zext to i32, add 2, trunc to
; i16. The expected output performs the add on <8 x i16>.
; CHECK-LABEL: @add_b(
; CHECK: load <8 x i16>, <8 x i16>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
  %cmp9 = icmp sgt i32 %len, 0
  br i1 %cmp9, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
  %0 = load i16, i16* %arrayidx
  %conv8 = zext i16 %0 to i32
  %add = add nuw nsw i32 %conv8, 2
  %conv1 = trunc i32 %add to i16
  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
  store i16 %conv1, i16* %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Mixed widths: i8 source, i16 destination, arithmetic written in i32. The
; expected output loads <8 x i8> and performs the add and store on <8 x i16>.
; CHECK-LABEL: @add_c(
; CHECK: load <8 x i8>, <8 x i8>*
; CHECK: add <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i16
  %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
  store i16 %conv1, i16* %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; i16 source sign-extended to i32 and stored as i32 (no truncate). The
; expected output keeps the add at <8 x i32> with its nsw flag.
; CHECK-LABEL: @add_d(
; CHECK: load <8 x i16>
; CHECK: add nsw <8 x i32>
; CHECK: store <8 x i32>
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
  %cmp7 = icmp sgt i32 %len, 0
  br i1 %cmp7, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
  %0 = load i16, i16* %arrayidx
  %conv = sext i16 %0 to i32
  %add = add nsw i32 %conv, 2
  %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; A longer chain (shl, add, or, mul, and, xor, mul) written in i32 on a
; zero-extended i8 load and on the zero-extended i8 arguments; only the final
; value is truncated to i8. The expected output performs every listed
; operation on <16 x i8>.
; CHECK-LABEL: @add_e(
; CHECK: load <16 x i8>
; CHECK: shl <16 x i8>
; CHECK: add <16 x i8>
; CHECK: or <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: and <16 x i8>
; CHECK: xor <16 x i8>
; CHECK: mul <16 x i8>
; CHECK: store <16 x i8>
define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
  %cmp.32 = icmp sgt i32 %len, 0
  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  %conv11 = zext i8 %arg2 to i32
  %conv13 = zext i8 %arg1 to i32
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.lr.ph
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  %conv = zext i8 %0 to i32
  %add = shl i32 %conv, 4
  %conv2 = add nuw nsw i32 %add, 32
  %or = or i32 %conv, 51
  %mul = mul nuw nsw i32 %or, 60
  %and = and i32 %conv2, %conv13
  %mul.masked = and i32 %mul, 252
  %conv17 = xor i32 %mul.masked, %conv11
  %mul18 = mul nuw nsw i32 %conv17, %and
  %conv19 = trunc i32 %mul18 to i8
  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv19, i8* %arrayidx21
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Like @add_e, but the source elements are i16 (sign-extended to i32) while
; the stored result is i8. The expected output loads <8 x i16>, truncates the
; vector, and performs the listed operations on <8 x i8>.
; CHECK-LABEL: @add_f(
; CHECK: load <8 x i16>
; CHECK: trunc <8 x i16>
; CHECK: shl <8 x i8>
; CHECK: add <8 x i8>
; CHECK: or <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: and <8 x i8>
; CHECK: xor <8 x i8>
; CHECK: mul <8 x i8>
; CHECK: store <8 x i8>
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
  %cmp.32 = icmp sgt i32 %len, 0
  br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  %conv11 = zext i8 %arg2 to i32
  %conv13 = zext i8 %arg1 to i32
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body, %for.body.lr.ph
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
  %0 = load i16, i16* %arrayidx
  %conv = sext i16 %0 to i32
  %add = shl i32 %conv, 4
  %conv2 = add nsw i32 %add, 32
  %or = and i32 %conv, 204
  %conv8 = or i32 %or, 51
  %mul = mul nuw nsw i32 %conv8, 60
  %and = and i32 %conv2, %conv13
  %mul.masked = and i32 %mul, 252
  %conv17 = xor i32 %mul.masked, %conv11
  %mul18 = mul nuw nsw i32 %conv17, %and
  %conv19 = trunc i32 %mul18 to i8
  %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv19, i8* %arrayidx21
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Same arithmetic as @add_a, but the loop additionally carries %a_phi, an i32
; phi fed by %conv from the previous iteration (it has no other users in this
; function). With that phi present, the expected output keeps the add at
; <16 x i32> with its nuw/nsw flags; only the store is <16 x i8>.
; CHECK-LABEL: @add_phifail(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i32>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
  %cmp8 = icmp sgt i32 %len, 0
  br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv1, i8* %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; When we vectorize this loop, we generate correct code even when %len is an
; exact multiple of the VF (since we extract from the second last index and
; pass this to the for.cond.cleanup block). The vectorized loop returns the
; correct value a_phi = p[len - 2].
; Unlike @add_phifail, %a_phi is used after the loop (truncated and returned),
; and the entry block branches to the loop unconditionally.
; Function Attrs: nounwind
define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
; CHECK-LABEL: @add_phifail2(
; CHECK: vector.body:
; CHECK:   %wide.load = load <16 x i8>, <16 x i8>*
; CHECK:   %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
; CHECK:   add nuw nsw <16 x i32>
; CHECK:   store <16 x i8>
; CHECK:   add nuw i64 %index, 16
; CHECK:   icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK:   %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
; CHECK:   %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
; CHECK: for.cond.cleanup:
; CHECK:   %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
; CHECK:   %ret = trunc i32 %a_phi.lcssa to i8
; CHECK:   ret i8 %ret
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  %ret = trunc i32 %a_phi to i8
  ret i8 %ret

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  %0 = load i8, i8* %arrayidx
  %conv = zext i8 %0 to i32
  %add = add nuw nsw i32 %conv, 2
  %conv1 = trunc i32 %add to i8
  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  store i8 %conv1, i8* %arrayidx3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %len
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

attributes #0 = { nounwind }