; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \
; RUN:   < %s | FileCheck %s

; RUN: opt -aa-pipeline=basic-aa -passes='loop-distribute,loop-vectorize' -enable-loop-distribute -force-vector-width=4 \
; RUN:   -verify-loop-info -verify-dom-info -S < %s | \
; RUN:   FileCheck --check-prefix=VECTORIZE %s

; RUN: opt -aa-pipeline=basic-aa -passes='loop-distribute,print-access-info' -enable-loop-distribute \
; RUN:   -verify-loop-info -verify-dom-info -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS

; The memcheck version of basic.ll.  We should distribute and vectorize the
; second part of this loop with 5 memchecks (A+1 x {C, D, E} + C x {A, B})
;
;   for (i = 0; i < n; i++) {
;     A[i + 1] = A[i] * B[i];
; -------------------------------
;     C[i] = D[i] * E[i];
;   }
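;
; Roughly, distribution versions the loop on the runtime checks and, on the
; no-conflict path, splits it in two (a sketch, not the exact output IR):
;
;   for (i = 0; i < n; i++)
;     A[i + 1] = A[i] * B[i];
;   for (i = 0; i < n; i++)
;     C[i] = D[i] * E[i];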

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.10.0"

@B = common global i32* null, align 8
@A = common global i32* null, align 8
@C = common global i32* null, align 8
@D = common global i32* null, align 8
@E = common global i32* null, align 8

; CHECK-LABEL: @f(
define void @f() {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

; We have two compares for each array overlap check.
; Since the checks to A and A + 4 get merged, this will give us a
; total of 8 compares.
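;
; (Roughly: five checks at two compares each would be 10, but with A and
; A + 4 merged into one pointer group the two checks between that group
; and C collapse into one, dropping two compares: 10 - 2 = 8.)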
;
; CHECK: for.body.lver.check:
; CHECK:     = icmp
; CHECK:     = icmp

; CHECK:     = icmp
; CHECK:     = icmp

; CHECK:     = icmp
; CHECK:     = icmp

; CHECK:     = icmp
; CHECK:     = icmp

; CHECK-NOT: = icmp
; CHECK:     br i1 %conflict.rdx25, label %for.body.ph.lver.orig, label %for.body.ph.ldist1

; The non-distributed loop that the memchecks fall back on.

; CHECK: for.body.ph.lver.orig:
; CHECK:     br label %for.body.lver.orig
; CHECK: for.body.lver.orig:
; CHECK:    br i1 %exitcond.lver.orig, label %for.end.loopexit, label %for.body.lver.orig

; Verify the two distributed loops.

; CHECK: for.body.ph.ldist1:
; CHECK:     br label %for.body.ldist1
; CHECK: for.body.ldist1:
; CHECK:    %mulA.ldist1 = mul i32 %loadB.ldist1, %loadA.ldist1
; CHECK:    br i1 %exitcond.ldist1, label %for.body.ph, label %for.body.ldist1

; CHECK: for.body.ph:
; CHECK:    br label %for.body
; CHECK: for.body:
; CHECK:    %mulC = mul i32 %loadD, %loadE
; CHECK: for.end:


; VECTORIZE: mul <4 x i32>

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %mulC = mul i32 %loadD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Make sure there's no "Multiple reports generated" assert with a
; volatile load, and that no distribution takes place.

; TODO: Distribution of volatile accesses may be possible under some
; circumstances, but the current implementation does not touch them.
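;
; The loop is the same as in @f except that the load of D[i] is volatile,
; roughly:
;
;   for (i = 0; i < n; i++) {
;     A[i + 1] = A[i] * B[i];
;     C[i] = /* volatile load */ D[i] * E[i];
;   }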

; CHECK-LABEL: @f_volatile_load(
; CHECK: br label %for.body{{$}}

; CHECK-NOT: load

; CHECK: {{^}}for.body:
; CHECK: load i32
; CHECK: load i32
; CHECK: load volatile i32
; CHECK: load i32
; CHECK: br i1 %exitcond, label %for.end, label %for.body{{$}}

; CHECK-NOT: load

; VECTORIZE-NOT: load <4 x i32>
; VECTORIZE-NOT: mul <4 x i32>
define void @f_volatile_load() {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load volatile i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %mulC = mul i32 %loadD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

declare i32 @llvm.convergent(i32) #0

; This is the same as f, and would require the same bounds
; check. However, it is not OK to introduce new control dependencies
; on the convergent call.
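;
; (Roughly: versioning the loop guards both copies with the runtime
; memchecks, which would make the convergent call control-dependent on the
; check result, so the distribution has to be rejected.)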

; CHECK-LABEL: @f_with_convergent(
; CHECK: call i32 @llvm.convergent
; CHECK-NOT: call i32 @llvm.convergent

; ANALYSIS: for.body:
; ANALYSIS: Report: cannot add control dependency to convergent operation
define void @f_with_convergent() #1 {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %convergentD = call i32 @llvm.convergent(i32 %loadD)
  %mulC = mul i32 %convergentD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Make sure an explicit request for distribution is ignored if it
; requires possibly illegal checks.
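;
; The explicit request comes from the llvm.loop.distribute.enable metadata
; (!0, defined at the end of this file) attached to the loop backedge branch.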

; CHECK-LABEL: @f_with_convergent_forced_distribute(
; CHECK: call i32 @llvm.convergent
; CHECK-NOT: call i32 @llvm.convergent
define void @f_with_convergent_forced_distribute() #1 {
entry:
  %a = load i32*, i32** @A, align 8
  %b = load i32*, i32** @B, align 8
  %c = load i32*, i32** @C, align 8
  %d = load i32*, i32** @D, align 8
  %e = load i32*, i32** @E, align 8
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %ind = phi i64 [ 0, %entry ], [ %add, %for.body ]

  %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
  %loadA = load i32, i32* %arrayidxA, align 4

  %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
  %loadB = load i32, i32* %arrayidxB, align 4

  %mulA = mul i32 %loadB, %loadA

  %add = add nuw nsw i64 %ind, 1
  %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add
  store i32 %mulA, i32* %arrayidxA_plus_4, align 4

  %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind
  %loadD = load i32, i32* %arrayidxD, align 4

  %arrayidxE = getelementptr inbounds i32, i32* %e, i64 %ind
  %loadE = load i32, i32* %arrayidxE, align 4

  %convergentD = call i32 @llvm.convergent(i32 %loadD)
  %mulC = mul i32 %convergentD, %loadE

  %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
  store i32 %mulC, i32* %arrayidxC, align 4

  %exitcond = icmp eq i64 %add, 20
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body
  ret void
}

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind convergent }

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.distribute.enable", i1 true}