; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
; RUN: < %s | FileCheck %s

; RUN: opt -aa-pipeline=basic-aa -passes='loop-distribute,loop-vectorize' -enable-loop-distribute -force-vector-width=4 \
; RUN: -verify-loop-info -verify-dom-info -S < %s | \
; RUN: FileCheck --check-prefix=VECTORIZE %s

; RUN: opt -aa-pipeline=basic-aa -passes='loop-distribute,print-access-info' -enable-loop-distribute \
; RUN: -verify-loop-info -verify-dom-info -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS

; The memcheck version of basic.ll. We should distribute and vectorize the
; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
;
;   for (i = 0; i < n; i++) {
;     A[i + 1] = A[i] * B[i];
; -------------------------------
;     C[i] = D[i] * E[i];
;   }

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"

@B = common global i32* null, align 8
@A = common global i32* null, align 8
@C = common global i32* null, align 8
@D = common global i32* null, align 8
@E = common global i32* null, align 8

; CHECK-LABEL: @f(
define void @f() {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

; We have two compares for each array overlap check.
; Since the checks to A and A + 4 get merged, this will give us a
; total of 8 compares.
;
; CHECK: for.body.lver.check:
; CHECK: = icmp
; CHECK: = icmp

; CHECK: = icmp
; CHECK: = icmp

; CHECK: = icmp
; CHECK: = icmp

; CHECK: = icmp
; CHECK: = icmp

; CHECK-NOT: = icmp
; CHECK: br i1 %conflict.rdx25, label %for.body.ph.lver.orig, label %for.body.ph.ldist1

; The non-distributed loop that the memchecks fall back on.

; CHECK: for.body.ph.lver.orig:
; CHECK: br label %for.body.lver.orig
; CHECK: for.body.lver.orig:
; CHECK: br i1 %exitcond.lver.orig, label %for.end.loopexit, label %for.body.lver.orig

; Verify the two distributed loops.
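; The first loop (for.body.ldist1) keeps the A[i + 1] = A[i] * B[i]
; recurrence; the second (for.body) keeps the independent C[i] = D[i] * E[i]
; computation, which is what the VECTORIZE run expects to see vectorized.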

; CHECK: for.body.ph.ldist1:
; CHECK: br label %for.body.ldist1
; CHECK: for.body.ldist1:
; CHECK: %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
; CHECK: br i1 %exitcond.ldist1, label %for.body.ph, label %for.body.ldist1

; CHECK: for.body.ph:
; CHECK: br label %for.body
; CHECK: for.body:
; CHECK: %mulC = mul i32 %loadD, %loadE
; CHECK: for.end:


; VECTORIZE: mul <4 x i32>

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %mulC = mul i32 %loadD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Make sure there's no "Multiple reports generated" assert with a
; volatile load, and no distribution.

; TODO: Distribution of volatile accesses may be possible under some
; circumstances, but the current implementation does not touch them.

; CHECK-LABEL: @f_volatile_load(
; CHECK: br label %for.body{{$}}

; CHECK-NOT: load

; CHECK: {{^}}for.body:
; CHECK: load i32
; CHECK: load i32
; CHECK: load volatile i32
; CHECK: load i32
; CHECK: br i1 %exitcond, label %for.end, label %for.body{{$}}

; CHECK-NOT: load

; VECTORIZE-NOT: load <4 x i32>
; VECTORIZE-NOT: mul <4 x i32>
define void @f_volatile_load() {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load volatile i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %mulC = mul i32 %loadD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

declare i32 @llvm.convergent(i32) #0

; This is the same as f, and would require the same bounds
; check. However, it is not OK to introduce new control dependencies
; on the convergent call.

; CHECK-LABEL: @f_with_convergent(
; CHECK: call i32 @llvm.convergent
; CHECK-NOT: call i32 @llvm.convergent

; ANALYSIS: for.body:
; ANALYSIS: Report: cannot add control dependency to convergent operation
define void @f_with_convergent() #1 {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %convergentD = call i32 @llvm.convergent(i32 %loadD)
  %mulC = mul i32 %convergentD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Make sure an explicit request for distribution is ignored if it
; requires possibly illegal checks.
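; The request comes from the !llvm.loop metadata on the loop's backedge branch
; below (!0 -> !"llvm.loop.distribute.enable"); it still has to be ignored
; because the required runtime checks would add a control dependency on the
; convergent call.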

; CHECK-LABEL: @f_with_convergent_forced_distribute(
; CHECK: call i32 @llvm.convergent
; CHECK-NOT: call i32 @llvm.convergent
define void @f_with_convergent_forced_distribute() #1 {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %convergentD = call i32 @llvm.convergent(i32 %loadD)
  %mulC = mul i32 %convergentD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body
  ret void
}

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind convergent }

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}