; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -load-store-vectorizer -dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; To prove that the vectorization is safe, the pass tries to match nested
; adds and find an expression that adds a constant value to an existing
; index without the result overflowing.
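;
; As an illustration (not part of the checked output): in the first test the
; adds carry the nsw flag, so the four indices
;   v1 + (v0 - 1), v1 + v0, v1 + (v0 + 1), v1 + (v0 + 2)
; are provably consecutive after the sext, and the four i8 loads can be
; combined into a single <4 x i8> load.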

target triple = "x86_64--"

define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v1, %v0
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %v1, %tmp9
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v0, %v1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %tmp9, %v1
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nuw_operand_orders(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v0, %v1
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %tmp9, %v1
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

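; In the two known-bits tests below the indices are multiples of 4, so their
; low two bits are known to be zero and offsets of +1, +2 (and +3) provably
; cannot overflow even without nsw/nuw flags. The -1 offset in the first test
; has no such guarantee; this is an informal summary of why the checks show a
; <3 x i8> load there and a full <4 x i8> load in the second test.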
define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <3 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, <3 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

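; The assume-based tests below establish the same known-bits facts through
; llvm.assume: asserting that (x & 3) == 0 tells the vectorizer the low two
; bits of the index are zero, just as the multiplications by 4 did above
; (an informal note, not part of the checked output).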
define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits_by_assume(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 3
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 3
  %v1 = mul i32 %ind1, 3
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

declare void @llvm.assume(i1)

define void @ld_v4i8_add_assume_on_arg(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <3 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, <3 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

define void @ld_v4i8_add_assume_on_arg1(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Address computations are partly separated by control flow, with llvm.assume
; placed in the second basic block.

define void @ld_v2i8_add_different_contexts(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  br label %bb.skip

bb.skip:
  ret void
}

; Same as ld_v2i8_add_different_contexts but with llvm.assume placed between loads

define void @ld_v2i8_add_different_contexts1(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  br label %bb.skip

bb.skip:
  ret void
}

; llvm.assume is placed between loads in a single basic block

define void @ld_v2i8_add_context(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_context(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  ret void
}

; Placing llvm.assume after all the loads and stores in the basic block still works

define void @ld_v2i8_add_context1(i32 %ind0, i32 %ind1, i8* %src, <2 x i8>* %dst) {
; CHECK-LABEL: @ld_v2i8_add_context1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], <2 x i8>* [[DST:%.*]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, <2 x i8>* %dst
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  ret void
}

; Make sure we don't vectorize the loads below: the sources of the sext
; instructions have neither the nsw flag nor known bits that would prove the
; indices are consecutive, so the vectorization cannot be applied.

define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}
