1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
5
6; Check vectorization on an interleaved load group of factor 2 and an interleaved
7; store group of factor 2.
8
9; int AB[1024];
10; int CD[1024];
11;  void test_array_load2_store2(int C, int D) {
12;   for (int i = 0; i < 1024; i+=2) {
13;     int A = AB[i];
14;     int B = AB[i+1];
15;     CD[i] = A + C;
16;     CD[i+1] = B * D;
17;   }
18; }
19
20
21@AB = common global [1024 x i32] zeroinitializer, align 4
22@CD = common global [1024 x i32] zeroinitializer, align 4
23
24define void @test_array_load2_store2(i32 %C, i32 %D) {
25; CHECK-LABEL: @test_array_load2_store2(
26; CHECK-NEXT:  entry:
27; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
28; CHECK:       vector.ph:
29; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
30; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
31; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i32 0
32; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
33; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
34; CHECK:       vector.body:
35; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
36; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
37; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]]
38; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
39; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
40; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
41; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
42; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
43; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
44; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
45; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP2]]
46; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -1
47; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
48; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
49; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
50; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
51; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
52; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
53; CHECK:       middle.block:
54; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
55; CHECK:       scalar.ph:
56; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
57; CHECK:       for.body:
58; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END]], [[LOOP2:!llvm.loop !.*]]
59; CHECK:       for.end:
60; CHECK-NEXT:    ret void
61;
; Scalar input loop: %indvars.iv starts at 0 and steps by 2 while the
; incremented value is < 1024. Each iteration loads AB[iv] and AB[iv|1], then
; stores AB[iv] + %C to CD[iv] and AB[iv|1] * %D to CD[iv|1]. The CHECK lines
; above expect one <8 x i32> load and one <8 x i32> store per vector iteration.
62entry:
63  br label %for.body
64
65for.body:                                         ; preds = %for.body, %entry
66  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
67  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
68  %tmp = load i32, i32* %arrayidx0, align 4
  ; %indvars.iv is always even here (0, 2, 4, ...), so or-ing in 1 yields the
  ; odd neighbour index; it is reused for both the second load and second store.
69  %tmp1 = or i64 %indvars.iv, 1
70  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
71  %tmp2 = load i32, i32* %arrayidx1, align 4
72  %add = add nsw i32 %tmp, %C
73  %mul = mul nsw i32 %tmp2, %D
74  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
75  store i32 %add, i32* %arrayidx2, align 4
76  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
77  store i32 %mul, i32* %arrayidx3, align 4
78  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
79  %cmp = icmp slt i64 %indvars.iv.next, 1024
80  br i1 %cmp, label %for.body, label %for.end
81
82for.end:                                          ; preds = %for.body
83  ret void
84}
85
86; int A[3072];
87; struct ST3 S[1024];
88; void test_struct_array_load3_store3() {
89;   int *ptr = A;
90;   for (int i = 0; i < 1024; i++) {
91;     int X1 = *ptr++;
92;     int X2 = *ptr++;
93;     int X3 = *ptr++;
94;     S[i].x = X1 + 1;
95;     S[i].y = X2 + 2;
96;     S[i].z = X3 + 3;
97;   }
98; }
99
100
101%struct.ST3 = type { i32, i32, i32 }
102@A = common global [3072 x i32] zeroinitializer, align 4
103@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
104
105define void @test_struct_array_load3_store3() {
106; CHECK-LABEL: @test_struct_array_load3_store3(
107; CHECK-NEXT:  entry:
108; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
109; CHECK:       vector.ph:
110; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
111; CHECK:       vector.body:
112; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
113; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
114; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP0]]
115; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
116; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
117; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
118; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
119; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
120; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
121; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
122; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
123; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
124; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
125; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
126; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
127; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
128; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
129; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
130; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
131; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
132; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
133; CHECK:       middle.block:
134; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
135; CHECK:       scalar.ph:
136; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
137; CHECK:       for.body:
138; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
139; CHECK:       for.end:
140; CHECK-NEXT:    ret void
141;
; Scalar input loop: 1024 iterations. %ptr.016 starts at &A[0] and advances by
; three i32 elements per iteration. The three loaded values get 1, 2 and 3
; added and are stored to fields 0, 1 and 2 of S[%indvars.iv]. The CHECK lines
; above expect one <12 x i32> load and one <12 x i32> store per vector iteration.
142entry:
143  br label %for.body
144
145for.body:                                         ; preds = %for.body, %entry
146  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
147  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
148  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
149  %tmp = load i32, i32* %ptr.016, align 4
150  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
151  %tmp1 = load i32, i32* %incdec.ptr, align 4
  ; %incdec.ptr2 is not dereferenced in this iteration; it feeds the %ptr.016
  ; phi as the pointer for the next iteration.
152  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
153  %tmp2 = load i32, i32* %incdec.ptr1, align 4
154  %add = add nsw i32 %tmp, 1
155  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
156  store i32 %add, i32* %x, align 4
157  %add3 = add nsw i32 %tmp1, 2
158  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
159  store i32 %add3, i32* %y, align 4
160  %add6 = add nsw i32 %tmp2, 3
161  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
162  store i32 %add6, i32* %z, align 4
163  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
164  %exitcond = icmp eq i64 %indvars.iv.next, 1024
165  br i1 %exitcond, label %for.end, label %for.body
166
167for.end:                                          ; preds = %for.body
168  ret void
169}
170
171; Check vectorization on an interleaved load group of factor 4.
172
173; struct ST4{
174;   int x;
175;   int y;
176;   int z;
177;   int w;
178; };
179; int test_struct_load4(struct ST4 *S) {
180;   int r = 0;
181;   for (int i = 0; i < 1024; i++) {
182;      r += S[i].x;
183;      r -= S[i].y;
184;      r += S[i].z;
185;      r -= S[i].w;
186;   }
187;   return r;
188; }
189
190%struct.ST4 = type { i32, i32, i32, i32 }
191
192define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
193;
194; CHECK-LABEL: @test_struct_load4(
195; CHECK-NEXT:  entry:
196; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
197; CHECK:       vector.ph:
198; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
199; CHECK:       vector.body:
200; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
201; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
202; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
203; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
204; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
205; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
206; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
207; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
208; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
209; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
210; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
211; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
212; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
213; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
214; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
215; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
216; CHECK:       middle.block:
217; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
218; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
219; CHECK:       scalar.ph:
220; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
221; CHECK:       for.body:
222; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
223; CHECK:       for.end:
224; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
225; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
226;
; Scalar input loop: 1024 iterations that fold the four i32 fields of
; %S[%indvars.iv] into the running value %r.022 (initially 0): field 0 is
; added, field 1 subtracted, field 2 added, field 3 subtracted. The final
; %sub8 is returned. The CHECK lines above expect a single <16 x i32> load per
; vector iteration and an llvm.vector.reduce.add in middle.block.
227entry:
228  br label %for.body
229
230for.body:                                         ; preds = %for.body, %entry
231  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
232  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
233  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
234  %tmp = load i32, i32* %x, align 4
235  %add = add nsw i32 %tmp, %r.022
236  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
237  %tmp1 = load i32, i32* %y, align 4
238  %sub = sub i32 %add, %tmp1
239  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
240  %tmp2 = load i32, i32* %z, align 4
241  %add5 = add nsw i32 %sub, %tmp2
242  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
243  %tmp3 = load i32, i32* %w, align 4
244  %sub8 = sub i32 %add5, %tmp3
245  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
246  %exitcond = icmp eq i64 %indvars.iv.next, 1024
247  br i1 %exitcond, label %for.end, label %for.body
248
249for.end:                                          ; preds = %for.body
250  ret i32 %sub8
251}
252
253; Check vectorization on an interleaved store group of factor 4.
254
255; void test_struct_store4(int *A, struct ST4 *B) {
256;   int *ptr = A;
257;   for (int i = 0; i < 1024; i++) {
258;     int X = *ptr++;
259;     B[i].x = X + 1;
260;     B[i].y = X * 2;
261;     B[i].z = X + 3;
262;     B[i].w = X + 4;
263;   }
264; }
265
266
267define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
268; CHECK-LABEL: @test_struct_store4(
269; CHECK-NEXT:  entry:
270; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
271; CHECK:       vector.ph:
272; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
273; CHECK:       vector.body:
274; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
275; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
276; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
277; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
278; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
279; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
280; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
281; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
282; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3
283; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
284; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
285; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
286; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
287; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
288; CHECK-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[TMP7]], align 4
289; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
290; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
291; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
292; CHECK:       middle.block:
293; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
294; CHECK:       scalar.ph:
295; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
296; CHECK:       for.cond.cleanup:
297; CHECK-NEXT:    ret void
298; CHECK:       for.body:
299; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
300;
; Scalar input loop: 1024 iterations. One i32 is loaded through %ptr.024, which
; starts at %A and advances by one element per iteration. Four values derived
; from it are stored to fields 0..3 of %B[%indvars.iv]. The CHECK lines above
; expect a plain <4 x i32> load and one <16 x i32> store per vector iteration.
301entry:
302  br label %for.body
303
304for.cond.cleanup:                                 ; preds = %for.body
305  ret void
306
307for.body:                                         ; preds = %for.body, %entry
308  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
309  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
310  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
311  %tmp = load i32, i32* %ptr.024, align 4
312  %add = add nsw i32 %tmp, 1
313  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
314  store i32 %add, i32* %x, align 4
  ; The "X * 2" of the C pseudo-code above is written as a left shift by one.
315  %mul = shl nsw i32 %tmp, 1
316  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
317  store i32 %mul, i32* %y, align 4
318  %add3 = add nsw i32 %tmp, 3
319  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
320  store i32 %add3, i32* %z, align 4
321  %add6 = add nsw i32 %tmp, 4
322  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
323  store i32 %add6, i32* %w, align 4
324  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
325  %exitcond = icmp eq i64 %indvars.iv.next, 1024
326  br i1 %exitcond, label %for.cond.cleanup, label %for.body
327}
328
329; Check vectorization on a reverse interleaved load group of factor 2 and
330; a reverse interleaved store group of factor 2.
331
332; struct ST2 {
333;  int x;
334;  int y;
335; };
336;
337; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
338;   for (int i = 1023; i >= 0; i--) {
339;     int a = A[i].x + i;  // interleaved load of index 0
340;     int b = A[i].y - i;  // interleaved load of index 1
341;     B[i].x = a;          // interleaved store of index 0
342;     B[i].y = b;          // interleaved store of index 1
343;   }
344; }
345
346
347%struct.ST2 = type { i32, i32 }
348
349define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
350; CHECK-LABEL: @test_reversed_load2_store2(
351; CHECK-NEXT:  entry:
352; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
353; CHECK:       vector.ph:
354; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
355; CHECK:       vector.body:
356; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
357; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
358; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
359; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
360; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
361; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
362; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
363; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
364; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
365; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
366; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
367; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND3]]
368; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND3]]
369; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
370; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
371; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
372; CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
373; CHECK-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
374; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
375; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
376; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
377; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <4 x i32> [[VEC_IND3]], <i32 -4, i32 -4, i32 -4, i32 -4>
378; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
379; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
380; CHECK:       middle.block:
381; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
382; CHECK:       scalar.ph:
383; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
384; CHECK:       for.cond.cleanup:
385; CHECK-NEXT:    ret void
386; CHECK:       for.body:
387; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP11:!llvm.loop !.*]]
388;
; Scalar input loop: %indvars.iv counts down from 1023 by 1. Each iteration
; stores A[iv].field0 + iv to B[iv].field0 and A[iv].field1 - iv to
; B[iv].field1, using iv truncated to i32. The CHECK lines above expect one
; <8 x i32> load and one <8 x i32> store per vector iteration, with reverse
; shuffles on both the loaded and the stored lanes.
389entry:
390  br label %for.body
391
392for.cond.cleanup:                                 ; preds = %for.body
393  ret void
394
395for.body:                                         ; preds = %for.body, %entry
396  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
397  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
398  %tmp = load i32, i32* %x, align 4
399  %tmp1 = trunc i64 %indvars.iv to i32
400  %add = add nsw i32 %tmp, %tmp1
401  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
402  %tmp2 = load i32, i32* %y, align 4
403  %sub = sub nsw i32 %tmp2, %tmp1
404  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
405  store i32 %add, i32* %x5, align 4
406  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
407  store i32 %sub, i32* %y8, align 4
408  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  ; The exit test uses the pre-decrement %indvars.iv, so the iteration with
  ; iv == 0 is the last one executed.
409  %cmp = icmp sgt i64 %indvars.iv, 0
410  br i1 %cmp, label %for.body, label %for.cond.cleanup
411}
412
413; Check vectorization on an interleaved load group of factor 2 with 1 gap
414; (missing the load of odd elements). Because the vectorized loop would
415; speculatively access memory out-of-bounds, we must execute at least one
416; iteration of the scalar loop.
417
418; void even_load_static_tc(int *A, int *B) {
419;  for (unsigned i = 0; i < 1024; i+=2)
420;     B[i/2] = A[i] * 2;
421; }
422
423
424define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
425; CHECK-LABEL: @even_load_static_tc(
426; CHECK-NEXT:  entry:
427; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
428; CHECK:       vector.ph:
429; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
430; CHECK:       vector.body:
431; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
432; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
433; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
434; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
435; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
436; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
437; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
438; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[INDEX]], 9223372036854775804
439; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
440; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
441; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP5]], align 4
442; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
443; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
444; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
445; CHECK:       middle.block:
446; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
447; CHECK:       scalar.ph:
448; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
449; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
450; CHECK:       for.cond.cleanup:
451; CHECK-NEXT:    ret void
452; CHECK:       for.body:
453; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
454; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
455; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
456; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
457; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
458; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
459; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
460; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
461; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
462; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP13:!llvm.loop !.*]]
463;
; Scalar input loop: %indvars.iv starts at 0 and steps by 2 while the
; incremented value is (unsigned) < 1024. Only A[iv] is loaded; A[iv + 1] is
; never read. The loaded value is shifted left by one and stored to B[iv >> 1].
; The CHECK lines above expect the vector loop to exit at index 508, an
; unconditional branch from middle.block to scalar.ph, and the scalar loop to
; resume at iv = 1016.
464entry:
465  br label %for.body
466
467for.cond.cleanup:                                 ; preds = %for.body
468  ret void
469
470for.body:                                         ; preds = %for.body, %entry
471  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
472  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
473  %tmp = load i32, i32* %arrayidx, align 4
474  %mul = shl nsw i32 %tmp, 1
  ; iv is always even here, which is why the halving shift carries `exact`.
475  %tmp1 = lshr exact i64 %indvars.iv, 1
476  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
477  store i32 %mul, i32* %arrayidx2, align 4
478  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
479  %cmp = icmp ult i64 %indvars.iv.next, 1024
480  br i1 %cmp, label %for.body, label %for.cond.cleanup
481}
482
483; Check vectorization on an interleaved load group of factor 2 with 1 gap
484; (missing the load of odd elements). Because the vectorized loop would
485; speculatively access memory out-of-bounds, we must execute at least one
486; iteration of the scalar loop.
487
488; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
489;  for (unsigned i = 0; i < N; i+=2)
490;     B[i/2] = A[i] * 2;
491; }
492
493
494define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
495; CHECK-LABEL: @even_load_dynamic_tc(
496; CHECK-NEXT:  entry:
497; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[N:%.*]], 2
498; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 2
499; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[UMAX]], -1
500; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 1
501; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
502; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
503; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
504; CHECK:       vector.ph:
505; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[TMP3]], 3
506; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
507; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i64 4, i64 [[N_MOD_VF]]
508; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]]
509; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
510; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
511; CHECK:       vector.body:
512; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
513; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
514; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
515; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
516; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
517; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
518; CHECK-NEXT:    [[TMP8:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
519; CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[INDEX]], 9223372036854775804
520; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP9]]
521; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
522; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
523; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
524; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
525; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
526; CHECK:       middle.block:
527; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
528; CHECK:       scalar.ph:
529; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
530; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
531; CHECK:       for.cond.cleanup:
532; CHECK-NEXT:    ret void
533; CHECK:       for.body:
534; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
535; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
536; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
537; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
538; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
539; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
540; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
541; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
542; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
543; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP15:!llvm.loop !.*]]
544;
; Scalar input loop: %indvars.iv starts at 0 and steps by 2 while the
; incremented value is (unsigned) < %N, so the trip count is only known at run
; time. Only A[iv] is loaded; the value is shifted left by one and stored to
; B[iv >> 1]. The CHECK lines above expect the vector trip count to be reduced
; by a full 4 when the iteration count is a multiple of 4 (the select in
; vector.ph), and an unconditional branch from middle.block to scalar.ph.
545entry:
546  br label %for.body
547
548for.cond.cleanup:                                 ; preds = %for.body
549  ret void
550
551for.body:                                         ; preds = %for.body, %entry
552  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
553  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
554  %tmp = load i32, i32* %arrayidx, align 4
555  %mul = shl nsw i32 %tmp, 1
  ; iv is always even here, which is why the halving shift carries `exact`.
556  %tmp1 = lshr exact i64 %indvars.iv, 1
557  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
558  store i32 %mul, i32* %arrayidx2, align 4
559  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
560  %cmp = icmp ult i64 %indvars.iv.next, %N
561  br i1 %cmp, label %for.body, label %for.cond.cleanup
562}
563
564; Check vectorization on a reverse interleaved load group of factor 2 with 1
565; gap and a reverse interleaved store group of factor 2. The interleaved load
566; group should be removed since it has a gap and is reverse.
567
568; struct pair {
569;  int x;
570;  int y;
571; };
572;
573; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
574;   for (int i = 1023; i >= 0; i--) {
575;     int a = X + i;
576;     int b = P2[i].y - i;
577;     P1[i].x = a;
578;     P2[i].y = b;
579;   }
580; }
581
582
; Two-field aggregate indexed by @load_gap_reverse. Both fields are i64 here,
; whereas the C sketch above declares them as int.
583%pair = type { i64, i64 }
; Scalar input loop: %i counts down from 1023 and the loop exits once %i is no
; longer greater than 0. Each iteration stores %X + %i to field 0 of %P1[%i]
; and rewrites field 1 of %P2[%i] with its loaded value minus %i.
; NOTE(review): both pointer arguments are declared readonly, yet the body
; stores through addresses derived from each of them; confirm that the
; attribute is intended.
584define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
585; CHECK-LABEL: @load_gap_reverse(
586; CHECK-NEXT:  entry:
587; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
588; CHECK:       vector.ph:
589; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i32 0
590; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
591; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
592; CHECK:       vector.body:
593; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
594; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
595; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
596; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
597; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
598; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
599; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
600; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
601; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP0]], i32 0
602; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP1]], i32 0
603; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP2]], i32 0
604; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
605; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP0]], i32 1
606; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP1]], i32 1
607; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP2]], i32 1
608; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP8]], align 8
609; CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP9]], align 8
610; CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP10]], align 8
611; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP11]], align 8
612; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
613; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
614; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
615; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
616; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
617; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
618; CHECK-NEXT:    store i64 [[TMP21]], i64* [[TMP4]], align 8
619; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
620; CHECK-NEXT:    store i64 [[TMP22]], i64* [[TMP5]], align 8
621; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
622; CHECK-NEXT:    store i64 [[TMP23]], i64* [[TMP6]], align 8
623; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
624; CHECK-NEXT:    store i64 [[TMP24]], i64* [[TMP7]], align 8
625; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
626; CHECK-NEXT:    store i64 [[TMP25]], i64* [[TMP8]], align 8
627; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
628; CHECK-NEXT:    store i64 [[TMP26]], i64* [[TMP9]], align 8
629; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
630; CHECK-NEXT:    store i64 [[TMP27]], i64* [[TMP10]], align 8
631; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
632; CHECK-NEXT:    store i64 [[TMP28]], i64* [[TMP11]], align 8
633; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
634; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
635; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
636; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
637; CHECK:       middle.block:
638; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
639; CHECK:       scalar.ph:
640; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
641; CHECK:       for.body:
642; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_EXIT]], [[LOOP17:!llvm.loop !.*]]
643; CHECK:       for.exit:
644; CHECK-NEXT:    ret void
645;
646entry:
647  br label %for.body
648
649for.body:
650  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
651  %0 = add nsw i64 %X, %i
  ; %1 addresses field 0 of %P1[%i] and is only stored to; %2 addresses
  ; field 1 of %P2[%i] and is both loaded from and stored to.
652  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
653  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
654  %3 = load i64, i64* %2, align 8
655  %4 = sub nsw i64 %3, %i
656  store i64 %0, i64* %1, align 8
657  store i64 %4, i64* %2, align 8
658  %i.next = add nsw i64 %i, -1
  ; The exit test compares the pre-decrement %i, so the iteration with
  ; %i == 0 still executes before the loop is left.
659  %cond = icmp sgt i64 %i, 0
660  br i1 %cond, label %for.body, label %for.exit
661
662for.exit:
663  ret void
664}
665
666; Check vectorization on interleaved access groups identified from mixed
667; loads/stores.
668; void mixed_load2_store2(int *A, int *B) {
669;   for (unsigned i = 0; i < 1024; i+=2)  {
670;     B[i] = A[i] * A[i+1];
671;     B[i+1] = A[i] + A[i+1];
672;   }
673; }
674
675
; Scalar input loop: %indvars.iv starts at 0 and advances by 2 for as long as
; the incremented value is unsigned-less-than 1024. Each iteration loads
; %A[iv] and %A[iv | 1], stores their product to %B[iv], loads the same two
; elements of %A again, and stores their sum to %B[iv | 1].
676define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
677; CHECK-LABEL: @mixed_load2_store2(
678; CHECK-NEXT:  entry:
679; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
680; CHECK:       vector.ph:
681; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
682; CHECK:       vector.body:
683; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
684; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
685; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
686; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
687; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
688; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
689; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
690; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
691; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
692; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
693; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
694; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
695; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 -1
696; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 [[TMP2]]
697; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
698; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
699; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
700; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
701; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
702; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
703; CHECK:       middle.block:
704; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
705; CHECK:       scalar.ph:
706; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
707; CHECK:       for.cond.cleanup:
708; CHECK-NEXT:    ret void
709; CHECK:       for.body:
710; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP19:!llvm.loop !.*]]
711;
712entry:
713  br label %for.body
714
715for.cond.cleanup:                                 ; preds = %for.body
716  ret void
717
718for.body:                                         ; preds = %for.body, %entry
719  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
720  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
721  %tmp = load i32, i32* %arrayidx, align 4
  ; %indvars.iv is always even here (starts at 0, step 2), so OR-ing in 1
  ; yields the index of the element that follows it.
722  %tmp1 = or i64 %indvars.iv, 1
723  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
724  %tmp2 = load i32, i32* %arrayidx2, align 4
725  %mul = mul nsw i32 %tmp2, %tmp
726  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
727  store i32 %mul, i32* %arrayidx4, align 4
  ; Both elements of %A are loaded a second time after the store to %B.
728  %tmp3 = load i32, i32* %arrayidx, align 4
729  %tmp4 = load i32, i32* %arrayidx2, align 4
730  %add10 = add nsw i32 %tmp4, %tmp3
731  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
732  store i32 %add10, i32* %arrayidx13, align 4
733  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
734  %cmp = icmp ult i64 %indvars.iv.next, 1024
735  br i1 %cmp, label %for.body, label %for.cond.cleanup
736}
737
738; Check vectorization on interleaved access groups identified from mixed
739; loads/stores.
740; void mixed_load3_store3(int *A) {
741;   for (unsigned i = 0; i < 1024; i++)  {
742;     *A++ += i;
743;     *A++ += i;
744;     *A++ += i;
745;   }
746; }
747
748
; Scalar input loop: %i.013 counts from 0 and the loop exits when the
; incremented counter equals 1024. A pointer phi starting at %A advances by
; three i32 elements per iteration; each of those three consecutive elements
; is loaded, has %i.013 added to it, and is stored back in place.
749define void @mixed_load3_store3(i32* nocapture %A) {
750; CHECK-LABEL: @mixed_load3_store3(
751; CHECK-NEXT:  entry:
752; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
753; CHECK:       vector.ph:
754; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
755; CHECK:       vector.body:
756; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
757; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
758; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
759; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]]
760; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
761; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
762; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
763; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
764; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
765; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
766; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2
767; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
768; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
769; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2
770; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
771; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
772; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
773; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
774; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
775; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
776; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
777; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
778; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
779; CHECK:       middle.block:
780; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
781; CHECK:       scalar.ph:
782; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
783; CHECK:       for.cond.cleanup:
784; CHECK-NEXT:    ret void
785; CHECK:       for.body:
786; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP21:!llvm.loop !.*]]
787;
788entry:
789  br label %for.body
790
791for.cond.cleanup:                                 ; preds = %for.body
792  ret void
793
794for.body:                                         ; preds = %for.body, %entry
795  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  ; Pointer induction: the back-edge value %incdec.ptr3 is this pointer plus 3.
796  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
797  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
  ; Element at offset 0: load, add the counter, store back.
798  %tmp = load i32, i32* %A.addr.012, align 4
799  %add = add i32 %tmp, %i.013
800  store i32 %add, i32* %A.addr.012, align 4
801  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
  ; Element at offset 1: same read-modify-write.
802  %tmp1 = load i32, i32* %incdec.ptr, align 4
803  %add2 = add i32 %tmp1, %i.013
804  store i32 %add2, i32* %incdec.ptr, align 4
805  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
  ; Element at offset 2: same read-modify-write.
806  %tmp2 = load i32, i32* %incdec.ptr1, align 4
807  %add4 = add i32 %tmp2, %i.013
808  store i32 %add4, i32* %incdec.ptr1, align 4
809  %inc = add nuw nsw i32 %i.013, 1
810  %exitcond = icmp eq i32 %inc, 1024
811  br i1 %exitcond, label %for.cond.cleanup, label %for.body
812}
813
814; Check vectorization on interleaved access groups whose members have
815; different types (here an i32 and a float).
816
817; struct IntFloat {
818;   int a;
819;   float b;
820; };
821;
822; int SA;
823; float SB;
824;
825; void int_float_struct(struct IntFloat *A) {
826;   int SumA;
827;   float SumB;
828;   for (unsigned i = 0; i < 1024; i++)  {
829;     SumA += A[i].a;
830;     SumB += A[i].b;
831;   }
832;   SA = SumA;
833;   SB = SumB;
834; }
835
836
; Aggregate with one i32 field followed by one float field; indexed by
; @int_float_struct.
837%struct.IntFloat = type { i32, float }
838
; Globals that receive the final integer and floating-point sums computed by
; @int_float_struct.
839@SA = common global i32 0, align 4
840@SB = common global float 0.000000e+00, align 4
841
; Scalar input loop: %indvars.iv counts from 0 and the loop exits when the
; incremented value equals 1024. Each iteration adds field 0 (i32) of
; %A[iv] into %SumA.013 and field 1 (float) of %A[iv] into %SumB.014; after
; the loop the two sums are stored to @SA and @SB.
; NOTE(review): attribute group #0 is not defined in the lines shown here;
; presumably it is declared elsewhere in the file.
842define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
843; CHECK-LABEL: @int_float_struct(
844; CHECK-NEXT:  entry:
845; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
846; CHECK:       vector.ph:
847; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
848; CHECK:       vector.body:
849; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
850; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
851; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
852; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0
853; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
854; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
855; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
856; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
857; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
858; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
859; CHECK-NEXT:    [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]]
860; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
861; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
862; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
863; CHECK:       middle.block:
864; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
865; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
866; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
867; CHECK:       scalar.ph:
868; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
869; CHECK:       for.cond.cleanup:
870; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
871; CHECK-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
872; CHECK-NEXT:    store i32 [[ADD_LCSSA]], i32* @SA, align 4
873; CHECK-NEXT:    store float [[ADD3_LCSSA]], float* @SB, align 4
874; CHECK-NEXT:    ret void
875; CHECK:       for.body:
876; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP23:!llvm.loop !.*]]
877;
878entry:
879  br label %for.body
880
881for.cond.cleanup:                                 ; preds = %for.body
  ; Publish the final values of both reductions.
882  store i32 %add, i32* @SA, align 4
883  store float %add3, float* @SB, align 4
884  ret void
885
886for.body:                                         ; preds = %for.body, %entry
887  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Both accumulators enter the loop as undef, matching the uninitialized
  ; SumA/SumB locals in the C sketch above.
888  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
889  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
890  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
891  %tmp = load i32, i32* %a, align 4
892  %add = add nsw i32 %tmp, %SumA.013
893  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
894  %tmp1 = load float, float* %b, align 4
895  %add3 = fadd fast float %SumB.014, %tmp1
896  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
897  %exitcond = icmp eq i64 %indvars.iv.next, 1024
898  br i1 %exitcond, label %for.cond.cleanup, label %for.body
899}
900
901; Check vectorization of interleaved access groups in the presence of
902; dependences (PR27626). The following tests check that we don't reorder
903; dependent loads and stores when generating code for interleaved access
904; groups. Stores should be scalarized because the required code motion would
905; break dependences, and the remaining interleaved load groups should have
906; gaps.
907
908; PR27626_0: Ensure a strided store is not moved after a dependent (zero
909;            distance) strided load.
910
911; void PR27626_0(struct pair *p, int z, int n) {
912;   for (int i = 0; i < n; i++) {
913;     p[i].x = z;
914;     p[i].y = p[i].x;
915;   }
916; }
917
918
; Two-field i32 aggregate indexed by the PR27626_* functions in this file.
919%pair.i32 = type { i32, i32 }
; Scalar input loop: %i counts up from 0 and the loop repeats while the
; incremented value is signed-less-than %n. Each iteration stores %z to
; field 0 of %p[%i], reloads that same address, and stores the reloaded value
; to field 1 of %p[%i]. The exit test sits at the bottom of the loop, so the
; body runs at least once regardless of %n.
920define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
921; CHECK-LABEL: @PR27626_0(
922; CHECK-NEXT:  entry:
923; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
924; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
925; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
926; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
927; CHECK:       vector.ph:
928; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
929; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
930; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
931; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
932; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
933; CHECK:       vector.body:
934; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
935; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
936; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
937; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
938; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
939; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
940; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
941; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
942; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
943; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
944; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
945; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
946; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP6]], align 4
947; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
948; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
949; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP9]], align 4
950; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
951; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
952; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
953; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
954; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
955; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
956; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
957; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
958; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
959; CHECK-NEXT:    store i32 [[TMP18]], i32* [[TMP13]], align 4
960; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
961; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
962; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]]
963; CHECK:       middle.block:
964; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
965; CHECK:       scalar.ph:
966; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
967; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
968; CHECK:       for.body:
969; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
970; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
971; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
972; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
973; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_Y]], align 4
974; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
975; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
976; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP25:!llvm.loop !.*]]
977; CHECK:       for.end:
978; CHECK-NEXT:    ret void
979;
980entry:
981  br label %for.body
982
983for.body:
984  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
985  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
986  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  ; Store to field 0, then load from the very same address (a zero-distance
  ; store-to-load dependence), and copy the loaded value into field 1.
987  store i32 %z, i32* %p_i.x, align 4
988  %0 = load i32, i32* %p_i.x, align 4
989  store i32 %0, i32 *%p_i.y, align 4
990  %i.next = add nuw nsw i64 %i, 1
991  %cond = icmp slt i64 %i.next, %n
992  br i1 %cond, label %for.body, label %for.end
993
994for.end:
995  ret void
996}
997
998; PR27626_1: Ensure a strided load is not moved before a dependent (zero
999;            distance) strided store.
1000
1001; int PR27626_1(struct pair *p, int n) {
1002;   int s = 0;
1003;   for (int i = 0; i < n; i++) {
1004;     p[i].y = p[i].x;
1005;     s += p[i].y;
1006;   }
;   return s;
1007; }
1008
1009
; Scalar input loop: %i counts up from 0 and the loop repeats while the
; incremented value is signed-less-than %n. Each iteration copies field 0 of
; %p[%i] into field 1 of the same element, reloads field 1, and adds it to the
; running sum %s (initially 0). The final sum is returned.
1010define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
1011; CHECK-LABEL: @PR27626_1(
1012; CHECK-NEXT:  entry:
1013; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
1014; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
1015; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1016; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1017; CHECK:       vector.ph:
1018; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1019; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1020; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
1021; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
1022; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1023; CHECK:       vector.body:
1024; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1025; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
1026; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
1027; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
1028; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
1029; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1030; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1031; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
1032; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
1033; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
1034; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
1035; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
1036; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1037; CHECK-NEXT:    store i32 [[TMP12]], i32* [[TMP7]], align 4
1038; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1039; CHECK-NEXT:    store i32 [[TMP13]], i32* [[TMP8]], align 4
1040; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1041; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP9]], align 4
1042; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1043; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
1044; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
1045; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4
1046; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1047; CHECK-NEXT:    [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1048; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1049; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1050; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
1051; CHECK:       middle.block:
1052; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
1053; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
1054; CHECK:       scalar.ph:
1055; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1056; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1057; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1058; CHECK:       for.body:
1059; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1060; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP21:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1061; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1062; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1063; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[P_I_X]], align 4
1064; CHECK-NEXT:    store i32 [[TMP20]], i32* [[P_I_Y]], align 4
1065; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP20]], [[S]]
1066; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1067; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1068; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP27:!llvm.loop !.*]]
1069; CHECK:       for.end:
1070; CHECK-NEXT:    [[TMP22:%.*]] = phi i32 [ [[TMP21]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
1071; CHECK-NEXT:    ret i32 [[TMP22]]
1072;
1073entry:
1074  br label %for.body
1075
1076for.body:
1077  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1078  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
1079  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
1080  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  ; Copy field 0 into field 1, then load field 1 back from the address just
  ; written (a zero-distance store-to-load dependence) to feed the sum.
1081  %0 = load i32, i32* %p_i.x, align 4
1082  store i32 %0, i32* %p_i.y, align 4
1083  %1 = load i32, i32* %p_i.y, align 4
1084  %2 = add nsw i32 %1, %s
1085  %i.next = add nuw nsw i64 %i, 1
1086  %cond = icmp slt i64 %i.next, %n
1087  br i1 %cond, label %for.body, label %for.end
1088
1089for.end:
  ; Single-entry phi that carries the final sum out of the loop.
1090  %3 = phi i32 [ %2, %for.body ]
1091  ret i32 %3
1092}
1093
1094; PR27626_2: Ensure a strided store is not moved after a dependent (negative
1095;            distance) strided load.
1096
1097; void PR27626_2(struct pair *p, int z, int n) {
1098;   for (int i = 0; i < n; i++) {
1099;     p[i].x = z;
1100;     p[i].y = p[i - 1].x;
1101;   }
1102; }
1103
1104
1105define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
1106; CHECK-LABEL: @PR27626_2(
1107; CHECK-NEXT:  entry:
1108; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
1109; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
1110; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1111; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1112; CHECK:       vector.ph:
1113; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1114; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1115; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
1116; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
1117; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1118; CHECK:       vector.body:
1119; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1120; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
1121; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
1122; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
1123; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1124; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
1125; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
1126; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
1127; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1128; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1129; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
1130; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
1131; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
1132; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP6]], align 4
1133; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
1134; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
1135; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP9]], align 4
1136; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
1137; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
1138; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1139; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
1140; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1141; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
1142; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1143; CHECK-NEXT:    store i32 [[TMP18]], i32* [[TMP13]], align 4
1144; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1145; CHECK-NEXT:    store i32 [[TMP19]], i32* [[TMP14]], align 4
1146; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1147; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1148; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP28:!llvm.loop !.*]]
1149; CHECK:       middle.block:
1150; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
1151; CHECK:       scalar.ph:
1152; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1153; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1154; CHECK:       for.body:
1155; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1156; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1157; CHECK-NEXT:    [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1158; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1159; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
1160; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4
1161; CHECK-NEXT:    store i32 [[TMP21]], i32* [[P_I_Y]], align 4
1162; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1163; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1164; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP29:!llvm.loop !.*]]
1165; CHECK:       for.end:
1166; CHECK-NEXT:    ret void
1167;
1168entry:
1169  br label %for.body
1170
1171for.body:
1172  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1173  %i_minus_1 = add nuw nsw i64 %i, -1
1174  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
1175  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
1176  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
1177  store i32 %z, i32* %p_i.x, align 4
1178  %0 = load i32, i32* %p_i_minus_1.x, align 4
1179  store i32 %0, i32 *%p_i.y, align 4
1180  %i.next = add nuw nsw i64 %i, 1
1181  %cond = icmp slt i64 %i.next, %n
1182  br i1 %cond, label %for.body, label %for.end
1183
1184for.end:
1185  ret void
1186}
1187
1188; PR27626_3: Ensure a strided load is not moved before a dependent (negative
1189;            distance) strided store.
1190
1191; int PR27626_3(struct pair *p, int z, int n) {
1192;   int s = 0;
1193;   for (int i = 0; i < n; i++) {
1194;     p[i + 1].y = p[i].x;
1195;     s += p[i].y;
1196;   }
1197;   return s;
1198; }
1199define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
1200; CHECK-LABEL: @PR27626_3(
1201; CHECK-NEXT:  entry:
1202; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
1203; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
1204; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1205; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1206; CHECK:       vector.ph:
1207; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1208; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1209; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
1210; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
1211; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1212; CHECK:       vector.body:
1213; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1214; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1215; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
1216; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
1217; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1218; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1219; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
1220; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP6]], i32 1
1221; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
1222; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1
1223; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
1224; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP10]], i32 1
1225; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
1226; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP12]], i32 1
1227; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
1228; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
1229; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1230; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP7]], align 4
1231; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1232; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP9]], align 4
1233; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1234; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP11]], align 4
1235; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1236; CHECK-NEXT:    store i32 [[TMP18]], i32* [[TMP13]], align 4
1237; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
1238; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4
1239; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1240; CHECK-NEXT:    [[TMP20]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
1241; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1242; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
1243; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1244; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP30:!llvm.loop !.*]]
1245; CHECK:       middle.block:
1246; CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]])
1247; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
1248; CHECK:       scalar.ph:
1249; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1250; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
1251; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1252; CHECK:       for.body:
1253; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1254; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP25:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
1255; CHECK-NEXT:    [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
1256; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1257; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1258; CHECK-NEXT:    [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1
1259; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[P_I_X]], align 4
1260; CHECK-NEXT:    store i32 [[TMP23]], i32* [[P_I_PLUS_1_Y]], align 4
1261; CHECK-NEXT:    [[TMP24:%.*]] = load i32, i32* [[P_I_Y]], align 4
1262; CHECK-NEXT:    [[TMP25]] = add nsw i32 [[TMP24]], [[S]]
1263; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1264; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1265; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP31:!llvm.loop !.*]]
1266; CHECK:       for.end:
1267; CHECK-NEXT:    [[TMP26:%.*]] = phi i32 [ [[TMP25]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
1268; CHECK-NEXT:    ret i32 [[TMP26]]
1269;
1270entry:
1271  br label %for.body
1272
1273for.body:
; %s is the running sum: 0 on entry from %entry, %2 on the back edge.
; %z is not referenced anywhere in this function body.
1274  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1275  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
1276  %i_plus_1 = add nuw nsw i64 %i, 1
1277  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
1278  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
1279  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
; Copy p[i].x into p[i + 1].y.
1280  %0 = load i32, i32* %p_i.x, align 4
1281  store i32 %0, i32* %p_i_plus_1.y, align 4
; Load p[i].y after the store above and add it to the running sum. The address
; written in this iteration (p[i + 1].y) is the address this load reads in the
; next iteration.
1282  %1 = load i32, i32* %p_i.y, align 4
1283  %2 = add nsw i32 %1, %s
1284  %i.next = add nuw nsw i64 %i, 1
1285  %cond = icmp slt i64 %i.next, %n
1286  br i1 %cond, label %for.body, label %for.end
1287
1288for.end:
; Single-entry phi that forwards the final sum out of the loop.
1289  %3 = phi i32 [ %2, %for.body ]
1290  ret i32 %3
1291}
1292
1293; PR27626_4: Ensure we form an interleaved group for strided stores in the
1294;            presence of a write-after-write dependence. We create a group for
1295;            (2) and (3) while excluding (1).
1296
1297; void PR27626_4(int *a, int x, int y, int z, int n) {
1298;   for (int i = 0; i < n; i += 2) {
1299;     a[i] = x;      // (1)
1300;     a[i] = y;      // (2)
1301;     a[i + 1] = z;  // (3)
1302;   }
1303; }
1304
1305
1306define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
1307; CHECK-LABEL: @PR27626_4(
1308; CHECK-NEXT:  entry:
1309; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 2
1310; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 2
1311; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[SMAX]], -1
1312; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 1
1313; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
1314; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 6
1315; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1316; CHECK:       vector.ph:
1317; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775804
1318; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
1319; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i32 0
1320; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
1321; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i32 0
1322; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
1323; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1324; CHECK:       vector.body:
1325; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1326; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1327; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
1328; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 4
1329; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 6
1330; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[OFFSET_IDX]], 1
1331; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
1332; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
1333; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
1334; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
1335; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 -1
1336; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP8]], align 4
1337; CHECK-NEXT:    store i32 [[X]], i32* [[TMP9]], align 4
1338; CHECK-NEXT:    store i32 [[X]], i32* [[TMP10]], align 4
1339; CHECK-NEXT:    store i32 [[X]], i32* [[TMP11]], align 4
1340; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 [[TMP7]]
1341; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <8 x i32>*
1342; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1343; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP14]], align 4
1344; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1345; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1346; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP32:!llvm.loop !.*]]
1347; CHECK:       middle.block:
1348; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
1349; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1350; CHECK:       scalar.ph:
1351; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1352; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1353; CHECK:       for.body:
1354; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1355; CHECK-NEXT:    [[I_PLUS_1:%.*]] = or i64 [[I]], 1
1356; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
1357; CHECK-NEXT:    [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]]
1358; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I]], align 4
1359; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4
1360; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1361; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1362; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP33:!llvm.loop !.*]]
1363; CHECK:       for.end:
1364; CHECK-NEXT:    ret void
1365;
1366entry:
1367  br label %for.body
1368
1369for.body:
1370  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1371  %i_plus_1 = add i64 %i, 1
1372  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
1373  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
; (1) a[i] = x. The next store overwrites the same address; the CHECK lines
; above expect this one to remain four scalar stores in the vector body.
1374  store i32 %x, i32* %a_i, align 4
; (2) a[i] = y and (3) a[i + 1] = z. The CHECK lines above expect these two to
; be emitted together as a single interleaved <8 x i32> store.
1375  store i32 %y, i32* %a_i, align 4
1376  store i32 %z, i32* %a_i_plus_1, align 4
; The induction variable advances by 2 per iteration.
1377  %i.next = add nuw nsw i64 %i, 2
1378  %cond = icmp slt i64 %i.next, %n
1379  br i1 %cond, label %for.body, label %for.end
1380
1381for.end:
1382  ret void
1383}
1384
1385; PR27626_5: Ensure we do not form an interleaved group for strided stores in
1386;            the presence of a write-after-write dependence.
1387
1388; void PR27626_5(int *a, int x, int y, int z, int n) {
1389;   for (int i = 3; i < n; i += 2) {
1390;     a[i - 1] = x;
1391;     a[i - 3] = y;
1392;     a[i] = z;
1393;   }
1394; }
1395
1396
1397define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
1398; CHECK-LABEL: @PR27626_5(
1399; CHECK-NEXT:  entry:
1400; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 5
1401; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 5
1402; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[SMAX]], -4
1403; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 1
1404; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
1405; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 6
1406; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1407; CHECK:       vector.ph:
1408; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775804
1409; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i64 [[N_VEC]], 1
1410; CHECK-NEXT:    [[IND_END:%.*]] = or i64 [[TMP4]], 3
1411; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1412; CHECK:       vector.body:
1413; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1414; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1415; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[INDEX]], 1
1416; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[TMP5]], 3
1417; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2
1418; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP5]], 7
1419; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6
1420; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
1421; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
1422; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
1423; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
1424; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
1425; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
1426; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
1427; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
1428; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
1429; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
1430; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
1431; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]]
1432; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
1433; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
1434; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP10]], i32 0
1435; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
1436; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP10]], i32 1
1437; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP25]]
1438; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2
1439; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP27]]
1440; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3
1441; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP29]]
1442; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP16]], align 4
1443; CHECK-NEXT:    store i32 [[X]], i32* [[TMP18]], align 4
1444; CHECK-NEXT:    store i32 [[X]], i32* [[TMP20]], align 4
1445; CHECK-NEXT:    store i32 [[X]], i32* [[TMP22]], align 4
1446; CHECK-NEXT:    store i32 [[Y:%.*]], i32* [[TMP24]], align 4
1447; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP26]], align 4
1448; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP28]], align 4
1449; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP30]], align 4
1450; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP11]], align 4
1451; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP12]], align 4
1452; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP13]], align 4
1453; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP14]], align 4
1454; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1455; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
1456; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1457; CHECK-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP34:!llvm.loop !.*]]
1458; CHECK:       middle.block:
1459; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
1460; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1461; CHECK:       scalar.ph:
1462; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
1463; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1464; CHECK:       for.body:
1465; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1466; CHECK-NEXT:    [[I_MINUS_1:%.*]] = add i64 [[I]], -1
1467; CHECK-NEXT:    [[I_MINUS_3:%.*]] = add i64 [[I]], -3
1468; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
1469; CHECK-NEXT:    [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
1470; CHECK-NEXT:    [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
1471; CHECK-NEXT:    store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
1472; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
1473; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I]], align 4
1474; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
1475; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1476; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP35:!llvm.loop !.*]]
1477; CHECK:       for.end:
1478; CHECK-NEXT:    ret void
1479;
1480entry:
1481  br label %for.body
1482
1483for.body:
; %i starts at 3 and advances by 2 per iteration.
1484  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
1485  %i_minus_1 = sub i64 %i, 1
; i - 3 is derived from the previous value: (i - 1) - 2.
1486  %i_minus_3 = sub i64 %i_minus_1, 2
1487  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
1488  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
1489  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
; Three stores per iteration, in this order: a[i - 1] = x, a[i - 3] = y,
; a[i] = z. The CHECK lines above expect twelve scalar stores in the vector
; body and no wide vector store.
1490  store i32 %x, i32* %a_i_minus_1, align 4
1491  store i32 %y, i32* %a_i_minus_3, align 4
1492  store i32 %z, i32* %a_i, align 4
1493  %i.next = add nuw nsw i64 %i, 2
1494  %cond = icmp slt i64 %i.next, %n
1495  br i1 %cond, label %for.body, label %for.end
1496
1497for.end:
1498  ret void
1499}
1500
1501; PR34743: Ensure that a cast which needs to sink after a load that belongs to
1502; an interleaved group indeed gets sunk.
1503
1504; void PR34743(short *a, int *b, int n) {
1505;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
1506;     b[i] = a[iv] * a[iv+1] * a[iv+2];
1507;   }
1508; }
1509
1510
1511define void @PR34743(i16* %a, i32* %b, i64 %n) {
1512; CHECK-LABEL: @PR34743(
1513; CHECK-NEXT:  entry:
1514; CHECK-NEXT:    [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2
1515; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
1516; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
1517; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
1518; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1519; CHECK:       vector.memcheck:
1520; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[N]], 1
1521; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
1522; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP3]]
1523; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1
1524; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[N]], -2
1525; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 3
1526; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP5]]
1527; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[SCEVGEP5]] to i32*
1528; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[TMP6]], [[B]]
1529; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[SCEVGEP]] to i16*
1530; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP7]]
1531; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1532; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1533; CHECK:       vector.ph:
1534; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], -4
1535; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
1536; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i32 3
1537; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1538; CHECK:       vector.body:
1539; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1540; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC8:%.*]], [[VECTOR_BODY]] ]
1541; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
1542; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[OFFSET_IDX]], 1
1543; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
1544; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
1545; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP10]], align 4
1546; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1547; CHECK-NEXT:    [[STRIDED_VEC8]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1548; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1549; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
1550; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
1551; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
1552; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP12]]
1553; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
1554; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
1555; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
1556; CHECK-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP18]], align 4, !alias.scope !36, !noalias !39
1557; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1558; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1559; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]]
1560; CHECK:       middle.block:
1561; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1562; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i32 7
1563; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
1564; CHECK:       scalar.ph:
1565; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
1566; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
1567; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
1568; CHECK-NEXT:    br label [[LOOP:%.*]]
1569; CHECK:       loop:
1570; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
1571; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
1572; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
1573; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
1574; CHECK-NEXT:    [[I1]] = add nuw nsw i64 [[I]], 1
1575; CHECK-NEXT:    [[IV1:%.*]] = or i64 [[IV]], 1
1576; CHECK-NEXT:    [[IV2]] = add nuw nsw i64 [[IV]], 2
1577; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]]
1578; CHECK-NEXT:    [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4
1579; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
1580; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]]
1581; CHECK-NEXT:    [[LOAD2]] = load i16, i16* [[GEP2]], align 4
1582; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
1583; CHECK-NEXT:    [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
1584; CHECK-NEXT:    [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
1585; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
1586; CHECK-NEXT:    store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4
1587; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
1588; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END]], label [[LOOP]], [[LOOP42:!llvm.loop !.*]]
1589; CHECK:       end:
1590; CHECK-NEXT:    ret void
1591;
1592entry:
; Load a[0] ahead of the loop; it is the initial value of the %0 phi below.
1593  %.pre = load i16, i16* %a
1594  br label %loop
1595
1596loop:
; %0 carries the previous iteration's %load2 (%.pre on the first iteration).
1597  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
; %iv advances by 2 per iteration and indexes %a; %i advances by 1 and
; indexes %b.
1598  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
1599  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
; Sign-extend the value carried in from the previous iteration. In the CHECK
; lines above, the matching vector sext ([[TMP13]]) appears after the wide
; load ([[WIDE_VEC]]).
1600  %conv = sext i16 %0 to i32
1601  %i1 = add nuw nsw i64 %i, 1
1602  %iv1 = add nuw nsw i64 %iv, 1
1603  %iv2 = add nuw nsw i64 %iv, 2
; Load a[iv + 1] and a[iv + 2]; %load2 also feeds the %0 phi on the back edge.
1604  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
1605  %load1 = load i16, i16* %gep1, align 4
1606  %conv1 = sext i16 %load1 to i32
1607  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
1608  %load2 = load i16, i16* %gep2, align 4
1609  %conv2 = sext i16 %load2 to i32
; b[i] = conv * conv1 * conv2.
1610  %mul01 = mul nsw i32 %conv, %conv1
1611  %mul012 = mul nsw i32 %mul01, %conv2
1612  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
1613  store i32 %mul012, i32* %arrayidx5
; The exit test compares the current %iv (not %iv2) against %n.
1614  %exitcond = icmp eq i64 %iv, %n
1615  br i1 %exitcond, label %end, label %loop
1616
1617end:
1618  ret void
1619}
1620
; Attribute group #0: sets the string attribute "unsafe-fp-math" to "true".
1621attributes #0 = { "unsafe-fp-math"="true" }
1622