; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s

; The first LICM run is there to hoist/sink invariant stores if possible. Today
; LICM does not hoist/sink invariant stores. Even if that changes, we should
; still vectorize this loop in case LICM is not run.

; The LICM run after vectorization hoists/sinks loop-invariant instructions.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; All tests check that it is legal to vectorize the stores to an invariant
; address.


; The memory check is: found.conflict = (&b[smax(n,1)] > a) && (&a[1] > b)
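; A rough C equivalent of the first test below (an illustrative sketch, not
; part of the original source):
;   int sum = 0;
;   for (long i = 0; i < n; i++) {
;     sum += b[i];   // variant load feeding a reduction
;     *a = (int)n;   // invariant value stored to an invariant address
;   }
;   return sum;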


define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
; CHECK-LABEL: @inv_val_store_to_inv_address_with_reduction(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !0
; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I0:%.*]] = phi i32 [ [[I3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
; CHECK-NEXT:    [[I3]] = add i32 [[I0]], [[I2]]
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    [[I3_LCSSA:%.*]] = phi i32 [ [[I3]], [[FOR_BODY]] ]
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    [[I4:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[I3_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[I4]]
;
entry:
  %ntrunc = trunc i64 %n to i32
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i0 = phi i32 [ %i3, %for.body ], [ 0, %entry ]
  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
  %i2 = load i32, i32* %i1, align 8
  %i3 = add i32 %i0, %i2
  store i32 %ntrunc, i32* %a
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  %i4 = phi i32 [ %i3, %for.body ]
  ret i32 %i4
}

define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
; CHECK-LABEL: @inv_val_store_to_inv_address(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !8, !noalias !11
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP1]], align 4, !alias.scope !11
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  %ntrunc = trunc i64 %n to i32
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
  %i2 = load i32, i32* %i1, align 8
  store i32 %ntrunc, i32* %a
  store i32 %ntrunc, i32* %i1
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}


; Both of the tests below are handled as predicated stores.

; Conditional store:
;   if (b[i] == k) a = ntrunc
; TODO: We can do better codegen for the first test: a single scalar store
; suffices whenever vector.reduce.or(vector_cmp(b[i] == k)) is 1.
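; A sketch of the codegen the TODO describes (hypothetical IR, not what the
; vectorizer currently emits): reduce the lane mask once and guard a single
; scalar store with it.
;   %any = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %cmp)
;   br i1 %any, label %pred.store.once, label %continue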



define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
; CHECK-NEXT:    [[SMAX6:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX6]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[SMAX]]
; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[B]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX6]], 9223372036854775804
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[NTRUNC]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15, !noalias !18
; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP3]], align 4, !alias.scope !15, !noalias !18
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK:       pred.store.if:
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
; CHECK:       pred.store.continue:
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
; CHECK:       pred.store.if9:
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
; CHECK:       pred.store.continue10:
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
; CHECK:       pred.store.if11:
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE12]]
; CHECK:       pred.store.continue12:
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
; CHECK:       pred.store.if13:
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !18
; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE14]]
; CHECK:       pred.store.continue14:
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I2]], [[K]]
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
; CHECK:       cond_store:
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[A]], align 4
; CHECK-NEXT:    br label [[LATCH]]
; CHECK:       latch:
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK:       for.end.loopexit:
; CHECK-NEXT:    br label [[FOR_END]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  %ntrunc = trunc i64 %n to i32
  br label %for.body

for.body:                                         ; preds = %latch, %entry
  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
  %i2 = load i32, i32* %i1, align 8
  %cmp = icmp eq i32 %i2, %k
  store i32 %ntrunc, i32* %i1
  br i1 %cmp, label %cond_store, label %latch

cond_store:
  store i32 %ntrunc, i32* %a
  br label %latch

latch:
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:                                          ; preds = %latch
  ret void
}

; if (b[i] == k)
;   a = ntrunc;
; else
;   a = k;
; TODO: We could vectorize this once we support multiple uniform stores to the
; same address.
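; As the CHECK lines below show, by the time the output is checked the two
; conditional stores have been merged into one unconditional store of a phi
; in the latch:
;   %storemerge = phi i32 [ %k, %cond_store_k ], [ %ntrunc, %cond_store ]
;   store i32 %storemerge, i32* %a, align 4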
define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
; CHECK-LABEL: @inv_val_store_to_inv_address_conditional_diff_values(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I]]
; CHECK-NEXT:    [[I2:%.*]] = load i32, i32* [[I1]], align 8
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I2]], [[K:%.*]]
; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[I1]], align 4
; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
; CHECK:       cond_store:
; CHECK-NEXT:    br label [[LATCH]]
; CHECK:       cond_store_k:
; CHECK-NEXT:    br label [[LATCH]]
; CHECK:       latch:
; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[K]], [[COND_STORE_K]] ], [ [[NTRUNC]], [[COND_STORE]] ]
; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[A:%.*]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  %ntrunc = trunc i64 %n to i32
  br label %for.body

for.body:                                         ; preds = %latch, %entry
  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
  %i1 = getelementptr inbounds i32, i32* %b, i64 %i
  %i2 = load i32, i32* %i1, align 8
  %cmp = icmp eq i32 %i2, %k
  store i32 %ntrunc, i32* %i1
  br i1 %cmp, label %cond_store, label %cond_store_k

cond_store:
  store i32 %ntrunc, i32* %a
  br label %latch

cond_store_k:
  store i32 %k, i32* %a
  br label %latch

latch:
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:                                          ; preds = %latch
  ret void
}

; Multiple variant stores to the same uniform address.
; We do not vectorize such loops currently.
;   for (; i < itr; i++) {
;     for (; j < itr; j++) {
;       var1[i] = var2[j] + var1[i];
;       var1[i]++;
;     }
;   }
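; In the checked output below, the load/increment/store round trip has been
; folded into the first store, roughly:
;   var1[i] = var2[j] + var1[i] + 1;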

define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
; CHECK-LABEL: @multiple_uniform_stores(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP20]], label [[FOR_END10:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]]
; CHECK:       for.cond1.preheader.preheader:
; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK:       for.cond1.preheader:
; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
; CHECK-NEXT:    [[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
; CHECK-NEXT:    [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
; CHECK-NEXT:    br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]]
; CHECK:       for.body3.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]]
; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[J_022]] to i64
; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
; CHECK:       for.body3:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i32 [[ADD]], 1
; CHECK-NEXT:    store i32 [[TMP3]], i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[FOR_BODY3]]
; CHECK:       for.inc8.loopexit:
; CHECK-NEXT:    br label [[FOR_INC8]]
; CHECK:       for.inc8:
; CHECK-NEXT:    [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ]
; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32
; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]]
; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
; CHECK:       for.end10.loopexit:
; CHECK-NEXT:    br label [[FOR_END10]]
; CHECK:       for.end10:
; CHECK-NEXT:    ret i32 undef
;
entry:
  %cmp20 = icmp eq i32 %itr, 0
  br i1 %cmp20, label %for.end10, label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %entry, %for.inc8
  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
  %cmp218 = icmp ult i32 %j.022, %itr
  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8

for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
  %0 = zext i32 %j.022 to i64
  br label %for.body3

for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx, align 4
  %2 = load i32, i32* %arrayidx5, align 4
  %add = add nsw i32 %2, %1
  store i32 %add, i32* %arrayidx5, align 4
  %3 = load i32, i32* %arrayidx5, align 4
  %4 = add nsw i32 %3, 1
  store i32 %4, i32* %arrayidx5, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %itr
  br i1 %exitcond, label %for.inc8, label %for.body3

for.inc8:                                         ; preds = %for.body3, %for.cond1.preheader
  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %for.body3 ]
  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader

for.end10:                                        ; preds = %for.inc8, %entry
  ret i32 undef
}

; The second uniform store to the same address is conditional.
; We do not vectorize this.
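; A rough C sketch of the inner loop body (illustrative; the comparison is
; unsigned in the IR):
;   var1[i] = var2[j] + var1[i];
;   if (var1[i] > 42) var1[i]++;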
define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
; CHECK-LABEL: @multiple_uniform_stores_conditional(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP20:%.*]] = icmp eq i32 [[ITR:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP20]], label [[FOR_END10:%.*]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]]
; CHECK:       for.cond1.preheader.preheader:
; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK:       for.cond1.preheader:
; CHECK-NEXT:    [[INDVARS_IV23:%.*]] = phi i64 [ [[INDVARS_IV_NEXT24:%.*]], [[FOR_INC8:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
; CHECK-NEXT:    [[J_022:%.*]] = phi i32 [ [[J_1_LCSSA:%.*]], [[FOR_INC8]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
; CHECK-NEXT:    [[CMP218:%.*]] = icmp ult i32 [[J_022]], [[ITR]]
; CHECK-NEXT:    br i1 [[CMP218]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_INC8]]
; CHECK:       for.body3.lr.ph:
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[VAR1:%.*]], i64 [[INDVARS_IV23]]
; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[J_022]] to i64
; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
; CHECK:       for.body3:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY3_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LATCH:%.*]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[VAR2:%.*]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i32 [[ADD]], 42
; CHECK-NEXT:    br i1 [[TMP3]], label [[COND_STORE:%.*]], label [[LATCH]]
; CHECK:       cond_store:
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i32 [[ADD]], 1
; CHECK-NEXT:    br label [[LATCH]]
; CHECK:       latch:
; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i32 [ [[TMP4]], [[COND_STORE]] ], [ [[ADD]], [[FOR_BODY3]] ]
; CHECK-NEXT:    store i32 [[STOREMERGE]], i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[ITR]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[FOR_BODY3]]
; CHECK:       for.inc8.loopexit:
; CHECK-NEXT:    br label [[FOR_INC8]]
; CHECK:       for.inc8:
; CHECK-NEXT:    [[J_1_LCSSA]] = phi i32 [ [[J_022]], [[FOR_COND1_PREHEADER]] ], [ [[ITR]], [[FOR_INC8_LOOPEXIT]] ]
; CHECK-NEXT:    [[INDVARS_IV_NEXT24]] = add nuw nsw i64 [[INDVARS_IV23]], 1
; CHECK-NEXT:    [[LFTR_WIDEIV25:%.*]] = trunc i64 [[INDVARS_IV_NEXT24]] to i32
; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[LFTR_WIDEIV25]], [[ITR]]
; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_END10_LOOPEXIT:%.*]], label [[FOR_COND1_PREHEADER]]
; CHECK:       for.end10.loopexit:
; CHECK-NEXT:    br label [[FOR_END10]]
; CHECK:       for.end10:
; CHECK-NEXT:    ret i32 undef
;
entry:
  %cmp20 = icmp eq i32 %itr, 0
  br i1 %cmp20, label %for.end10, label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %entry, %for.inc8
  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
  %cmp218 = icmp ult i32 %j.022, %itr
  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8

for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
  %0 = zext i32 %j.022 to i64
  br label %for.body3

for.body3:                                        ; preds = %latch, %for.body3.lr.ph
  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %latch ]
  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx, align 4
  %2 = load i32, i32* %arrayidx5, align 4
  %add = add nsw i32 %2, %1
  store i32 %add, i32* %arrayidx5, align 4
  %3 = load i32, i32* %arrayidx5, align 4
  %4 = add nsw i32 %3, 1
  %5 = icmp ugt i32 %3, 42
  br i1 %5, label %cond_store, label %latch

cond_store:
  store i32 %4, i32* %arrayidx5, align 4
  br label %latch

latch:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %itr
  br i1 %exitcond, label %for.inc8, label %for.body3

for.inc8:                                         ; preds = %latch, %for.cond1.preheader
  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %latch ]
  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader

for.end10:                                        ; preds = %for.inc8, %entry
  ret i32 undef
}

; We cannot vectorize this loop: there is an unsafe dependency between the
; uniform load (%i10) and the store (%i12) to the same address.
; PR39653
; Note: %i10 could be replaced by phi(%arg4, %i12), a potentially vectorizable
; first-order recurrence.
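; In the checked output below, the alloca %i has already been promoted and
; exactly this recurrence has been formed (the i121 phi in the CHECK lines):
;   %i10 = phi i32 [ %arg4, %bb ], [ %i12, %bb7 ]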
define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
; CHECK-LABEL: @unsafe_dep_uniform_load_store(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I6:%.*]] = getelementptr inbounds i16, i16* [[ARG3:%.*]], i64 [[ARG5:%.*]]
; CHECK-NEXT:    br label [[BB7:%.*]]
; CHECK:       bb7:
; CHECK-NEXT:    [[I121:%.*]] = phi i32 [ [[ARG4:%.*]], [[BB:%.*]] ], [ [[I12:%.*]], [[BB7]] ]
; CHECK-NEXT:    [[I8:%.*]] = phi i64 [ 0, [[BB]] ], [ [[I24:%.*]], [[BB7]] ]
; CHECK-NEXT:    [[I9:%.*]] = phi i32 [ [[ARG1:%.*]], [[BB]] ], [ [[I23:%.*]], [[BB7]] ]
; CHECK-NEXT:    [[I11:%.*]] = mul nsw i32 [[I9]], [[I121]]
; CHECK-NEXT:    [[I12]] = srem i32 [[I11]], 65536
; CHECK-NEXT:    [[I13:%.*]] = add nsw i32 [[I12]], [[I9]]
; CHECK-NEXT:    [[I14:%.*]] = trunc i32 [[I13]] to i16
; CHECK-NEXT:    [[I15:%.*]] = trunc i64 [[I8]] to i32
; CHECK-NEXT:    [[I16:%.*]] = add i32 [[I15]], [[ARG:%.*]]
; CHECK-NEXT:    [[I17:%.*]] = zext i32 [[I16]] to i64
; CHECK-NEXT:    [[I18:%.*]] = getelementptr inbounds i16, i16* [[I6]], i64 [[I17]]
; CHECK-NEXT:    store i16 [[I14]], i16* [[I18]], align 2
; CHECK-NEXT:    [[I19:%.*]] = add i32 [[I13]], [[I9]]
; CHECK-NEXT:    [[I20:%.*]] = trunc i32 [[I19]] to i16
; CHECK-NEXT:    [[I21:%.*]] = and i16 [[I20]], 255
; CHECK-NEXT:    [[I22:%.*]] = getelementptr inbounds i16, i16* [[ARG3]], i64 [[I17]]
; CHECK-NEXT:    store i16 [[I21]], i16* [[I22]], align 2
; CHECK-NEXT:    [[I23]] = add nsw i32 [[I9]], 1
; CHECK-NEXT:    [[I24]] = add nuw nsw i64 [[I8]], 1
; CHECK-NEXT:    [[I25:%.*]] = icmp eq i64 [[I24]], [[ARG2:%.*]]
; CHECK-NEXT:    br i1 [[I25]], label [[BB26:%.*]], label [[BB7]]
; CHECK:       bb26:
; CHECK-NEXT:    ret void
;
bb:
  %i = alloca i32
  store i32 %arg4, i32* %i
  %i6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
  br label %bb7

bb7:
  %i8 = phi i64 [ 0, %bb ], [ %i24, %bb7 ]
  %i9 = phi i32 [ %arg1, %bb ], [ %i23, %bb7 ]
  %i10 = load i32, i32* %i
  %i11 = mul nsw i32 %i9, %i10
  %i12 = srem i32 %i11, 65536
  %i13 = add nsw i32 %i12, %i9
  %i14 = trunc i32 %i13 to i16
  %i15 = trunc i64 %i8 to i32
  %i16 = add i32 %arg, %i15
  %i17 = zext i32 %i16 to i64
  %i18 = getelementptr inbounds i16, i16* %i6, i64 %i17
  store i16 %i14, i16* %i18, align 2
  %i19 = add i32 %i13, %i9
  %i20 = trunc i32 %i19 to i16
  %i21 = and i16 %i20, 255
  %i22 = getelementptr inbounds i16, i16* %arg3, i64 %i17
  store i16 %i21, i16* %i22, align 2
  %i23 = add nsw i32 %i9, 1
  %i24 = add nuw nsw i64 %i8, 1
  %i25 = icmp eq i64 %i24, %arg2
  store i32 %i12, i32* %i
  br i1 %i25, label %bb26, label %bb7

bb26:
  ret void
}

; Make sure any check-not directives are not triggered by function declarations.
