; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Check vectorization on an interleaved load group of factor 2 and an
; interleaved store group of factor 2.

; int AB[1024];
; int CD[1024];
; void test_array_load2_store2(int C, int D) {
;   for (int i = 0; i < 1024; i+=2) {
;     int A = AB[i];
;     int B = AB[i+1];
;     CD[i] = A + C;
;     CD[i+1] = B * D;
;   }
; }
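
; With VF = 4 and factor 2, the two strided loads below become a single
; <8 x i32> load covering AB[i..i+7]: the shuffle mask <0, 2, 4, 6> extracts
; the even lanes (the A values) and <1, 3, 5, 7> the odd lanes (the B values).
; The store side inverts this: the mask <0, 4, 1, 5, 2, 6, 3, 7> interleaves
; the two <4 x i32> results back into one <8 x i32> covering CD[i..i+7].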

; CHECK-LABEL: @test_array_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: add nsw <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4

@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4

define void @test_array_load2_store2(i32 %C, i32 %D) {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx0, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  store i32 %mul, i32* %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 3 and an
; interleaved store group of factor 3.

; int A[3072];
; struct ST3 S[1024];
; void test_struct_array_load3_store3() {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X1 = *ptr++;
;     int X2 = *ptr++;
;     int X3 = *ptr++;
;     S[i].x = X1 + 1;
;     S[i].y = X2 + 2;
;     S[i].z = X3 + 3;
;   }
; }
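
; Four iterations are loaded as one <12 x i32> (3 fields x VF 4); the masks
; <0, 3, 6, 9>, <1, 4, 7, 10> and <2, 5, 8, 11> pick out the X1, X2 and X3
; lanes. Since shufflevector takes only two operands, the three <4 x i32>
; results are concatenated pairwise into <8 x i32> vectors (padding the second
; concat with undef lanes) before the final 12-wide mask restores the field
; order for the store.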

; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4

%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4

define void @test_struct_array_load3_store3() {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  %tmp = load i32, i32* %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, i32* %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, i32* %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 4.

; struct ST4 {
;   int x;
;   int y;
;   int z;
;   int w;
; };
; int test_struct_load4(struct ST4 *S) {
;   int r = 0;
;   for (int i = 0; i < 1024; i++) {
;      r += S[i].x;
;      r -= S[i].y;
;      r += S[i].z;
;      r -= S[i].w;
;   }
;   return r;
; }
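
; Note the reduction is reassociated in the vector loop: each iteration
; computes (acc + x + z) - (y + w) on whole <4 x i32> lanes, and the middle
; block then collapses the accumulator with the usual log2(VF) shuffle-and-add
; steps before extracting lane 0.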

%struct.ST4 = type { i32, i32, i32, i32 }

define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
; CHECK-LABEL: @test_struct_load4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK:       middle.block:
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !7
; CHECK:       for.end:
; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, i32* %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, i32* %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, i32* %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 %sub8
}

; Check vectorization on an interleaved store group of factor 4.

; void test_struct_store4(int *A, struct ST4 *B) {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X = *ptr++;
;     B[i].x = X + 1;
;     B[i].y = X * 2;
;     B[i].z = X + 3;
;     B[i].w = X + 4;
;   }
; }
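
; All four stored values derive from one unit-stride <4 x i32> load of A
; (X * 2 appears as a shift after instcombine). As in the factor-3 case, the
; four <4 x i32> operands are first concatenated pairwise into <8 x i32>
; vectors, and the final <16 x i32> mask <0, 4, 8, 12, ...> interleaves them
; into B's x/y/z/w field order.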

; CHECK-LABEL: @test_struct_store4(
; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4

define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
  %tmp = load i32, i32* %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
  store i32 %mul, i32* %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
  store i32 %add3, i32* %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
  store i32 %add6, i32* %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on a reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.

; struct ST2 {
;  int x;
;  int y;
; };
;
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = A[i].x + i;  // interleaved load of index 0
;     int b = A[i].y - i;  // interleaved load of index 1
;     B[i].x = a;          // interleaved store of index 0
;     B[i].y = b;          // interleaved store of index 1
;   }
; }
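
; A reverse group must be loaded starting at the lowest address it covers.
; With VF = 4 the <8 x i32> load spans the four structs A[i-3..i], whose first
; i32 lies 6 elements below &A[i].x, hence the GEP offset of -6; the store
; spans B[i-3..i], 7 i32s below &B[i].y, hence -7. Each de-interleaved
; <4 x i32> is also reversed with the <3, 2, 1, 0> mask, and the computed
; results are reversed again before being re-interleaved for the store.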

; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: add nsw <4 x i32>
; CHECK: sub nsw <4 x i32>
; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %[[B1]], align 4

%struct.ST2 = type { i32, i32 }

define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, i32* %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
  store i32 %sub, i32* %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_static_tc(int *A, int *B) {
;  for (unsigned i = 0; i < 1024; i+=2)
;     B[i/2] = A[i] * 2;
; }
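
; The scalar loop runs 512 iterations. A full vector tail would make the last
; wide load touch A[1023], an element the scalar loop never reads and which
; may be out of bounds, so at least one scalar iteration must remain. Since
; 512 mod 4 is zero, a whole VF (4 iterations) is left to the scalar loop and
; the vector trip count becomes 508, as checked below.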

; CHECK-LABEL: @even_load_static_tc(
; CHECK: vector.body:
; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK:   icmp eq i64 %index.next, 508
; CHECK: middle.block:
; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
;  for (unsigned i = 0; i < N; i+=2)
;     B[i/2] = A[i] * 2;
; }
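
; With a runtime trip count, the same guarantee is enforced in vector.ph: the
; remainder "trip count mod VF" is computed by masking with 3, and when it is
; zero a full VF is subtracted instead, so the scalar loop always executes at
; least one iteration.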

; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK: vector.ph:
; CHECK:   %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK:   %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK:   %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK:   %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK:   icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK:   br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.

; struct pair {
;  long x;
;  long y;
; };
;
; void load_gap_reverse(struct pair *P1, struct pair *P2, long X) {
;   for (int i = 1023; i >= 0; i--) {
;     long a = X + i;
;     long b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
;   }
; }

; CHECK-LABEL: @load_gap_reverse(
; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture %P1, %pair* noalias nocapture %P2, i64 %X) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  %0 = add nsw i64 %X, %i
  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
  %3 = load i64, i64* %2, align 8
  %4 = sub nsw i64 %3, %i
  store i64 %0, i64* %1, align 8
  store i64 %4, i64* %2, align 8
  %i.next = add nsw i64 %i, -1
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit

for.exit:
  ret void
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load2_store2(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2) {
;     B[i] = A[i] * A[i+1];
;     B[i+1] = A[i] + A[i+1];
;   }
; }
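
; The loads of A[i] and A[i+1] form one factor-2 load group and the stores of
; B[i] and B[i+1] one factor-2 store group: the products end up in the even
; lanes and the sums in the odd lanes of the interleaved store, per the final
; <0, 4, 1, 5, 2, 6, 3, 7> mask below.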

; CHECK-LABEL: @mixed_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec

define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %tmp2, %tmp
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %mul, i32* %arrayidx4, align 4
  %tmp3 = load i32, i32* %arrayidx, align 4
  %tmp4 = load i32, i32* %arrayidx2, align 4
  %add10 = add nsw i32 %tmp4, %tmp3
  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %add10, i32* %arrayidx13, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load3_store3(int *A) {
;   for (unsigned i = 0; i < 1024; i++) {
;     *A++ += i;
;     *A++ += i;
;     *A++ += i;
;   }
; }
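
; Each iteration reads and then writes the same three consecutive ints, so the
; loads and the stores form matching factor-3 groups over the same <12 x i32>
; block of memory: one wide load, three de-interleaving shuffles, an add of
; the induction vector <i, i+1, i+2, i+3> to each part, and one re-interleaved
; wide store.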

; CHECK-LABEL: @mixed_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4

define void @mixed_load3_store3(i32* nocapture %A) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
  %tmp = load i32, i32* %A.addr.012, align 4
  %add = add i32 %tmp, %i.013
  store i32 %add, i32* %A.addr.012, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %add2 = add i32 %tmp1, %i.013
  store i32 %add2, i32* %incdec.ptr, align 4
  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add4 = add i32 %tmp2, %i.013
  store i32 %add4, i32* %incdec.ptr1, align 4
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on interleaved access groups whose members have
; different types.

; struct IntFloat {
;   int a;
;   float b;
; };
;
; int SA;
; float SB;
;
; void int_float_struct(struct IntFloat *A) {
;   int SumA;
;   float SumB;
;   for (unsigned i = 0; i < 1024; i++) {
;     SumA += A[i].a;
;     SumB += A[i].b;
;   }
;   SA = SumA;
;   SB = SumB;
; }
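
; Although the members have different types, both are 32 bits wide, so the
; group is loaded as a single <8 x i32>; the float lanes (mask <1, 3, 5, 7>)
; are then bitcast to <4 x float> before feeding the fadd reduction.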

; CHECK-LABEL: @int_float_struct(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
; CHECK: add <4 x i32>
; CHECK: fadd fast <4 x float>

%struct.IntFloat = type { i32, float }

@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4

define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  store i32 %add, i32* @SA, align 4
  store float %add3, float* @SB, align 4
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %a, align 4
  %add = add nsw i32 %tmp, %SumA.013
  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
  %tmp1 = load float, float* %b, align 4
  %add3 = fadd fast float %SumB.014, %tmp1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization of interleaved access groups in the presence of
; dependences (PR27626). The following tests check that we don't reorder
; dependent loads and stores when generating code for interleaved access
; groups. Stores should be scalarized because the required code motion would
; break dependences, and the remaining interleaved load groups should have
; gaps.

; PR27626_0: Ensure a strided store is not moved after a dependent (zero
;            distance) strided load.

; void PR27626_0(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i].x;
;   }
; }
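
; Here the load of p[i].x remains an interleaved group with a gap over the y
; field (one <8 x i32> load whose even lanes are used), while both stores are
; scalarized: the CHECK lines extract lanes 0, 2, 4 and 6 and store each one
; individually, preserving the store/load/store order of every iteration.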

; CHECK-LABEL: @PR27626_0(
; CHECK: vector.ph:
; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK:   store i32 %[[X1]], {{.*}}
; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK:   store i32 %[[X2]], {{.*}}
; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK:   store i32 %[[X3]], {{.*}}
; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK:   store i32 %[[X4]], {{.*}}

%pair.i32 = type { i32, i32 }
define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_1: Ensure a strided load is not moved before a dependent (zero
;            distance) strided store.

; int PR27626_1(struct pair *p, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }
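
; The store to p[i].y is scalarized from the x-lane extracts, and p[i].y is
; then re-read by a second wide load (seemingly a gap group starting at the y
; field, whose even lanes feed the reduction); hoisting that load above the
; stores would read stale values.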

; CHECK-LABEL: @PR27626_1(
; CHECK: vector.ph:
; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK:   store i32 %[[X1]], {{.*}}
; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK:   store i32 %[[X2]], {{.*}}
; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK:   store i32 %[[X3]], {{.*}}
; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK:   store i32 %[[X4]], {{.*}}
; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK:   add <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_2: Ensure a strided store is not moved after a dependent (negative
;            distance) strided load.

; void PR27626_2(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i - 1].x;
;   }
; }

; CHECK-LABEL: @PR27626_2(
; CHECK: vector.ph:
; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK:   store i32 %[[X1]], {{.*}}
; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK:   store i32 %[[X2]], {{.*}}
; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK:   store i32 %[[X3]], {{.*}}
; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK:   store i32 %[[X4]], {{.*}}

define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_minus_1 = add nuw nsw i64 %i, -1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i_minus_1.x, align 4
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_3: Ensure a strided load is not moved before a dependent (negative
;            distance) strided store.

; int PR27626_3(struct pair *p, int z, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }

; CHECK-LABEL: @PR27626_3(
; CHECK: vector.ph:
; CHECK:   %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK:   %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK:   %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK:   %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK:   %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK:   %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK:   %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK:   store i32 %[[X1]], {{.*}}
; CHECK:   %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK:   store i32 %[[X2]], {{.*}}
; CHECK:   %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK:   store i32 %[[X3]], {{.*}}
; CHECK:   %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK:   store i32 %[[X4]], {{.*}}
; CHECK:   %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK:   %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK:   add <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_4: Ensure we form an interleaved group for strided stores in the
;            presence of a write-after-write dependence. We create a group for
;            (2) and (3) while excluding (1).

; void PR27626_4(int *a, int x, int y, int z, int n) {
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
; }
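
; The stores of y and z are adjacent and last in program order, so they form a
; factor-2 store group: splats of y and z are interleaved into one <8 x i32>
; store. The earlier store of x to the same even addresses is kept out of the
; group and emitted as four scalar stores before it, so y still overwrites x.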

; CHECK-LABEL: @PR27626_4(
; CHECK: vector.ph:
; CHECK:   %[[INS_Y:.+]] = insertelement <4 x i32> poison, i32 %y, i32 0
; CHECK:   %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK:   %[[INS_Z:.+]] = insertelement <4 x i32> poison, i32 %z, i32 0
; CHECK:   %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK: vector.body:
; CHECK:   store i32 %x, {{.*}}
; CHECK:   store i32 %x, {{.*}}
; CHECK:   store i32 %x, {{.*}}
; CHECK:   store i32 %x, {{.*}}
; CHECK:   %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK:   store <8 x i32> %[[VEC]], {{.*}}

define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  store i32 %x, i32* %a_i, align 4
  store i32 %y, i32* %a_i, align 4
  store i32 %z, i32* %a_i_plus_1, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_5: Ensure we do not form an interleaved group for strided stores in
;            the presence of a write-after-write dependence.

; void PR27626_5(int *a, int x, int y, int z, int n) {
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }
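
; a[i - 1] and a[i - 3] hit even addresses while a[i] hits odd ones, and
; iteration i + 2 stores y to the address iteration i stored x to, so grouping
; any pair of these stores would have to break that cross-iteration
; write-after-write order. All twelve stores per vector iteration therefore
; stay scalar, as checked below.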

; CHECK-LABEL: @PR27626_5(
; CHECK: vector.body:
; CHECK:   store i32 %x, {{.*}}
; CHECK:   store i32 %x, {{.*}}
; CHECK:   store i32 %x, {{.*}}
; CHECK:   store i32 %x, {{.*}}
; CHECK:   store i32 %y, {{.*}}
; CHECK:   store i32 %y, {{.*}}
; CHECK:   store i32 %y, {{.*}}
; CHECK:   store i32 %y, {{.*}}
; CHECK:   store i32 %z, {{.*}}
; CHECK:   store i32 %z, {{.*}}
; CHECK:   store i32 %z, {{.*}}
; CHECK:   store i32 %z, {{.*}}

define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR34743: Ensure that a cast that needs to sink after a load belonging to an
; interleaved group indeed gets sunk.

; void PR34743(short *a, int *b, int n) {
;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
;     b[i] = a[iv] * a[iv+1] * a[iv+2];
;   }
; }
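
; The a[iv + 2] read in one iteration is the a[iv] of the next, i.e. a
; first-order recurrence: the vector loop carries it in %vector.recur and
; splices it with the current odd lanes via the <3, 4, 5, 6> shuffle. In the
; scalar body the sext of the recurrence occurs before the loads, so the cast
; must sink below the wide load and its de-interleaving shuffles; the CHECK
; lines verify the sexts appear only after the shuffles.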

; CHECK-LABEL: @PR34743(
; CHECK: vector.body:
; CHECK:   %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[VSHUF1:.+]], %vector.body ]
; CHECK:   %wide.vec = load <8 x i16>
; CHECK:   %[[VSHUF0:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK:   %[[VSHUF1:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK:   %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK:   sext <4 x i16> %[[VSHUF0]] to <4 x i32>
; CHECK:   sext <4 x i16> %[[VSHUF]] to <4 x i32>
; CHECK:   sext <4 x i16> %[[VSHUF1]] to <4 x i32>
; CHECK:   mul nsw <4 x i32>
; CHECK:   mul nsw <4 x i32>

define void @PR34743(i16* %a, i32* %b, i64 %n) {
entry:
  %.pre = load i16, i16* %a
  br label %loop

loop:
  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
  %conv = sext i16 %0 to i32
  %i1 = add nuw nsw i64 %i, 1
  %iv1 = add nuw nsw i64 %iv, 1
  %iv2 = add nuw nsw i64 %iv, 2
  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
  %load1 = load i16, i16* %gep1, align 4
  %conv1 = sext i16 %load1 to i32
  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
  %load2 = load i16, i16* %gep2, align 4
  %conv2 = sext i16 %load2 to i32
  %mul01 = mul nsw i32 %conv, %conv1
  %mul012 = mul nsw i32 %mul01, %conv2
  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
  store i32 %mul012, i32* %arrayidx5
  %exitcond = icmp eq i64 %iv, %n
  br i1 %exitcond, label %end, label %loop

end:
  ret void
}

attributes #0 = { "unsafe-fp-math"="true" }