1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
5
6; Check vectorization on an interleaved load group of factor 2 and an interleaved
7; store group of factor 2.
8
9; int AB[1024];
10; int CD[1024];
; void test_array_load2_store2(int C, int D) {
12;   for (int i = 0; i < 1024; i+=2) {
13;     int A = AB[i];
14;     int B = AB[i+1];
15;     CD[i] = A + C;
16;     CD[i+1] = B * D;
17;   }
18; }
19
20
21@AB = common global [1024 x i32] zeroinitializer, align 4
22@CD = common global [1024 x i32] zeroinitializer, align 4
23
define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-LABEL: @test_array_load2_store2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -1
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; The two loads below access AB[i] and AB[i+1] and form an interleaved load
  ; group of factor 2; after vectorization they become one wide <8 x i32> load
  ; split apart with strided shuffles (see CHECK lines above).
  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx0, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  ; The two stores below access CD[i] and CD[i+1] and form an interleaved store
  ; group of factor 2, combined into one wide <8 x i32> store.
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  store i32 %mul, i32* %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}
85
; Check vectorization on an interleaved load group of factor 3 and an
; interleaved store group of factor 3.

; int A[3072];
; struct ST3 S[1024];
; void test_struct_array_load3_store3() {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X1 = *ptr++;
;     int X2 = *ptr++;
;     int X3 = *ptr++;
;     S[i].x = X1 + 1;
;     S[i].y = X2 + 2;
;     S[i].z = X3 + 3;
;   }
; }
99
100
101%struct.ST3 = type { i32, i32, i32 }
102@A = common global [3072 x i32] zeroinitializer, align 4
103@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
104
define void @test_struct_array_load3_store3() {
; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP0]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; %ptr.016 is a pointer induction variable over @A advancing by 3 ints per
  ; iteration; the three consecutive loads form an interleaved load group of
  ; factor 3 (one wide <12 x i32> load after vectorization).
  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  %tmp = load i32, i32* %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  ; The three stores to S[i].x/.y/.z form an interleaved store group of
  ; factor 3 (one wide <12 x i32> store after vectorization).
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, i32* %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, i32* %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
170
171; Check vectorization on an interleaved load group of factor 4.
172
173; struct ST4{
174;   int x;
175;   int y;
176;   int z;
177;   int w;
178; };
179; int test_struct_load4(struct ST4 *S) {
180;   int r = 0;
181;   for (int i = 0; i < 1024; i++) {
182;      r += S[i].x;
183;      r -= S[i].y;
184;      r += S[i].z;
185;      r -= S[i].w;
186;   }
187;   return r;
188; }
189
190%struct.ST4 = type { i32, i32, i32, i32 }
191
define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
;
; CHECK-LABEL: @test_struct_load4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Integer add/sub reduction: r += S[i].x - S[i].y + S[i].z - S[i].w.
  ; The four field loads form an interleaved load group of factor 4, turned
  ; into one wide <16 x i32> load plus four strided shuffles; the reduction is
  ; finished in middle.block with llvm.vector.reduce.add.
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, i32* %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, i32* %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, i32* %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 %sub8
}
252
253; Check vectorization on an interleaved store group of factor 4.
254
255; void test_struct_store4(int *A, struct ST4 *B) {
256;   int *ptr = A;
257;   for (int i = 0; i < 1024; i++) {
258;     int X = *ptr++;
259;     B[i].x = X + 1;
260;     B[i].y = X * 2;
261;     B[i].z = X + 3;
262;     B[i].w = X + 4;
263;   }
264; }
265
266
define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
; CHECK-LABEL: @test_struct_store4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; A is read with a unit-stride pointer induction (plain wide load); the four
  ; stores to B[i].x/.y/.z/.w form an interleaved store group of factor 4,
  ; combined into one wide <16 x i32> store via concat + interleave shuffles.
  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
  %tmp = load i32, i32* %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
  store i32 %mul, i32* %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
  store i32 %add3, i32* %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
  store i32 %add6, i32* %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
328
329; Check vectorization on a reverse interleaved load group of factor 2 and
330; a reverse interleaved store group of factor 2.
331
332; struct ST2 {
333;  int x;
334;  int y;
335; };
336;
337; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
338;   for (int i = 1023; i >= 0; i--) {
339;     int a = A[i].x + i;  // interleaved load of index 0
340;     int b = A[i].y - i;  // interleaved load of index 1
341;     B[i].x = a;          // interleaved store of index 0
342;     B[i].y = b;          // interleaved store of index 1
343;   }
344; }
345
346
347%struct.ST2 = type { i32, i32 }
348
define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND3]]
; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <4 x i32> [[VEC_IND3]], <i32 -4, i32 -4, i32 -4, i32 -4>
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; The induction variable counts down from 1023, so both the factor-2 load
  ; group (A[i].x / A[i].y) and the factor-2 store group (B[i].x / B[i].y) are
  ; reverse accesses; the vectorizer adds extra reverse shuffles around the
  ; wide load/store (see CHECK lines above).
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, i32* %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
  store i32 %sub, i32* %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
412
413; Check vectorization on an interleaved load group of factor 2 with 1 gap
414; (missing the load of odd elements). Because the vectorized loop would
415; speculatively access memory out-of-bounds, we must execute at least one
416; iteration of the scalar loop.
417
418; void even_load_static_tc(int *A, int *B) {
419;  for (unsigned i = 0; i < 1024; i+=2)
420;     B[i/2] = A[i] * 2;
421; }
422
423
define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @even_load_static_tc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP5]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP13:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; Only the even elements of A are loaded (stride-2 group with a gap), so the
  ; wide <8 x i32> load would read past A's end on the final iterations; the
  ; CHECK lines verify the vector trip count stops at 508 and the remaining
  ; iterations (resuming at i = 1016) execute in the scalar loop.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
482
483; Check vectorization on an interleaved load group of factor 2 with 1 gap
484; (missing the load of odd elements). Because the vectorized loop would
485; speculatively access memory out-of-bounds, we must execute at least one
486; iteration of the scalar loop.
487
488; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
489;  for (unsigned i = 0; i < N; i+=2)
490;     B[i/2] = A[i] * 2;
491; }
492
493
define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP7:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP15:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; Same gapped stride-2 load as @even_load_static_tc but with a runtime trip
  ; count %N; the CHECK lines verify the vector trip count N_VEC is rounded
  ; down so that at least one iteration always remains for the scalar loop
  ; (the select forces a non-zero remainder even when TMP2 is a multiple of 4).
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
562
563; Check vectorization on a reverse interleaved load group of factor 2 with 1
564; gap and a reverse interleaved store group of factor 2. The interleaved load
565; group should be removed since it has a gap and is reverse.
566
567; struct pair {
568;  int x;
569;  int y;
570; };
571;
572; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
573;   for (int i = 1023; i >= 0; i--) {
574;     int a = X + i;
;     int b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
578;   }
579; }
580
581
582%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
; CHECK-LABEL: @load_gap_reverse(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP8]], align 8
; CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP9]], align 8
; CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP10]], align 8
; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP11]], align 8
; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; CHECK-NEXT:    store i64 [[TMP21]], i64* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; CHECK-NEXT:    store i64 [[TMP22]], i64* [[TMP5]], align 8
; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; CHECK-NEXT:    store i64 [[TMP23]], i64* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; CHECK-NEXT:    store i64 [[TMP24]], i64* [[TMP7]], align 8
; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
; CHECK-NEXT:    store i64 [[TMP25]], i64* [[TMP8]], align 8
; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
; CHECK-NEXT:    store i64 [[TMP26]], i64* [[TMP9]], align 8
; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
; CHECK-NEXT:    store i64 [[TMP27]], i64* [[TMP10]], align 8
; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
; CHECK-NEXT:    store i64 [[TMP28]], i64* [[TMP11]], align 8
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  ; i counts down from 1023 to 0. The accesses form reverse interleaved groups
  ; of factor 2; the load group touches only P2[i].y (P2[i].x is never read),
  ; so it has a gap and must be scalarized, as the CHECK lines above verify
  ; (four scalar loads/stores per vector iteration instead of wide accesses).
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  %0 = add nsw i64 %X, %i
  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
  %3 = load i64, i64* %2, align 8
  %4 = sub nsw i64 %3, %i
  ; NOTE(review): %P1 and %P2 are declared `readonly` but are stored through
  ; here -- confirm the argument attributes are intended.
  store i64 %0, i64* %1, align 8
  store i64 %4, i64* %2, align 8
  %i.next = add nsw i64 %i, -1
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit

for.exit:
  ret void
}
664
665; Check vectorization on interleaved access groups identified from mixed
666; loads/stores.
667; void mixed_load2_store2(int *A, int *B) {
668;   for (unsigned i = 0; i < 1024; i+=2)  {
669;     B[i] = A[i] * A[i+1];
670;     B[i+1] = A[i] + A[i+1];
671;   }
672; }
673
674
define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @mixed_load2_store2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 -1
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; The induction steps by 2. A[i] and A[i+1] are each loaded twice (once for
  ; the mul, once for the add); together with the stores to B[i] and B[i+1]
  ; they are recognized as interleaved load/store groups of factor 2, so the
  ; CHECK lines expect a single wide <8 x i32> load and store per iteration.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %tmp2, %tmp
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %mul, i32* %arrayidx4, align 4
  %tmp3 = load i32, i32* %arrayidx, align 4
  %tmp4 = load i32, i32* %arrayidx2, align 4
  %add10 = add nsw i32 %tmp4, %tmp3
  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %add10, i32* %arrayidx13, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
736
737; Check vectorization on interleaved access groups identified from mixed
738; loads/stores.
739; void mixed_load3_store3(int *A) {
740;   for (unsigned i = 0; i < 1024; i++)  {
741;     *A++ += i;
742;     *A++ += i;
743;     *A++ += i;
744;   }
745; }
746
747
define void @mixed_load3_store3(i32* nocapture %A) {
; CHECK-LABEL: @mixed_load3_store3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; The pointer %A.addr.012 advances by 3 i32s per iteration; the three
  ; load/add/store triples at offsets 0, 1 and 2 are recognized as interleaved
  ; groups of factor 3, so the CHECK lines expect one wide <12 x i32> load and
  ; one wide <12 x i32> store per vector iteration.
  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
  %tmp = load i32, i32* %A.addr.012, align 4
  %add = add i32 %tmp, %i.013
  store i32 %add, i32* %A.addr.012, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %add2 = add i32 %tmp1, %i.013
  store i32 %add2, i32* %incdec.ptr, align 4
  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add4 = add i32 %tmp2, %i.013
  store i32 %add4, i32* %incdec.ptr1, align 4
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
812
813; Check vectorization on interleaved access groups with members having different
814; kinds of type.
815
816; struct IntFloat {
817;   int a;
818;   float b;
819; };
820;
821; int SA;
822; float SB;
823;
824; void int_float_struct(struct IntFloat *A) {
825;   int SumA;
826;   float SumB;
827;   for (unsigned i = 0; i < 1024; i++)  {
828;     SumA += A[i].a;
829;     SumB += A[i].b;
830;   }
831;   SA = SumA;
832;   SB = SumB;
833; }
834
835
836%struct.IntFloat = type { i32, float }
837
838@SA = common global i32 0, align 4
839@SB = common global float 0.000000e+00, align 4
840
define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
; CHECK-LABEL: @int_float_struct(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
; CHECK-NEXT:    [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    store i32 [[ADD_LCSSA]], i32* @SA, align 4
; CHECK-NEXT:    store float [[ADD3_LCSSA]], float* @SB, align 4
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  store i32 %add, i32* @SA, align 4
  store float %add3, float* @SB, align 4
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; The interleave group mixes member types (i32 .a and float .b). The CHECK
  ; lines expect one wide <8 x i32> load; the float lanes are recovered from
  ; the strided shuffle via a bitcast before feeding the fadd reduction.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %a, align 4
  %add = add nsw i32 %tmp, %SumA.013
  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
  %tmp1 = load float, float* %b, align 4
  %add3 = fadd fast float %SumB.014, %tmp1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
899
900; Check vectorization of interleaved access groups in the presence of
901; dependences (PR27626). The following tests check that we don't reorder
902; dependent loads and stores when generating code for interleaved access
903; groups. Stores should be scalarized because the required code motion would
904; break dependences, and the remaining interleaved load groups should have
905; gaps.
906
907; PR27626_0: Ensure a strided store is not moved after a dependent (zero
908;            distance) strided load.
909
910; void PR27626_0(struct pair *p, int z, int n) {
911;   for (int i = 0; i < n; i++) {
912;     p[i].x = z;
913;     p[i].y = p[i].x;
914;   }
915; }
916
917
918%pair.i32 = type { i32, i32 }
define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_0(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP5]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP6]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP9]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_Y]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  ; p[i].x is stored and then immediately reloaded (a zero-distance
  ; dependence), so the scalarized stores of %z must stay ahead of the wide
  ; load in the vector body -- exactly the ordering the CHECK lines pin down.
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
995
996; PR27626_1: Ensure a strided load is not moved before a dependent (zero
997;            distance) strided store.
998
999; void PR27626_1(struct pair *p, int n) {
1000;   int s = 0;
1001;   for (int i = 0; i < n; i++) {
1002;     p[i].y = p[i].x;
;     s += p[i].y;
1004;   }
1005; }
1006
1007
define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
; CHECK-LABEL: @PR27626_1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT:    store i32 [[TMP11]], i32* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT:    store i32 [[TMP12]], i32* [[TMP7]], align 4
; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT:    store i32 [[TMP13]], i32* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP9]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP16]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]])
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[P_I_X]], align 4
; CHECK-NEXT:    store i32 [[TMP19]], i32* [[P_I_Y]], align 4
; CHECK-NEXT:    [[TMP20]] = add nsw i32 [[TMP19]], [[S]]
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ [[TMP20]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[TMP21]]
;
entry:
  br label %for.body

for.body:
  ; p[i].y is stored and then reloaded to feed the reduction (a zero-distance
  ; dependence), so the second wide load must not be hoisted above the
  ; scalarized stores -- the CHECK lines pin the stores before WIDE_VEC1.
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}
1090
1091; PR27626_2: Ensure a strided store is not moved after a dependent (negative
1092;            distance) strided load.
1093
1094; void PR27626_2(struct pair *p, int z, int n) {
1095;   for (int i = 0; i < n; i++) {
1096;     p[i].x = z;
1097;     p[i].y = p[i - 1].x;
1098;   }
1099; }
1100
1101
1102define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
1103; CHECK-LABEL: @PR27626_2(
1104; CHECK-NEXT:  entry:
1105; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
1106; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
1107; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1108; CHECK:       vector.ph:
1109; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
1110; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
1111; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
1112; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
1113; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1114; CHECK:       vector.body:
1115; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1116; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[INDEX]], 1
1117; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 2
1118; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 3
1119; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
1120; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0
1121; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
1122; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
1123; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1124; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
1125; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
1126; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
1127; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
1128; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP5]], align 4
1129; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP6]], align 4
1130; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
1131; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
1132; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
1133; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
1134; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
1135; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
1136; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
1137; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
1138; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
1139; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
1140; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
1141; CHECK-NEXT:    store i32 [[TMP18]], i32* [[TMP13]], align 4
1142; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
1143; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1144; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
1145; CHECK:       middle.block:
1146; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
1147; CHECK:       scalar.ph:
1148; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
1149; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
1150; CHECK:       for.body:
1151; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
1152; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
1153; CHECK-NEXT:    [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
1154; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
1155; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
1156; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4
1157; CHECK-NEXT:    store i32 [[TMP20]], i32* [[P_I_Y]], align 4
1158; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
1159; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
1160; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP29:![0-9]+]]
1161; CHECK:       for.end:
1162; CHECK-NEXT:    ret void
1163;
1164entry:
1165  br label %for.body
1166
1167for.body:
1168  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
1169  %i_minus_1 = add nuw nsw i64 %i, -1
1170  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
1171  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
1172  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
1173  store i32 %z, i32* %p_i.x, align 4
1174  %0 = load i32, i32* %p_i_minus_1.x, align 4
1175  store i32 %0, i32 *%p_i.y, align 4
1176  %i.next = add nuw nsw i64 %i, 1
1177  %cond = icmp slt i64 %i.next, %n
1178  br i1 %cond, label %for.body, label %for.end
1179
1180for.end:
1181  ret void
1182}
1183
1184; PR27626_3: Ensure a strided load is not moved before a dependent (negative
1185;            distance) strided store.
1186
1187; void PR27626_3(struct pair *p, int z, int n) {
1188;   for (int i = 0; i < n; i++) {
1189;     p[i + 1].y = p[i].x;
1190;     s += p[i].y;
1191;   }
1192; }
1193
1194
define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-LABEL: @PR27626_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP7]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP18]], align 4
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP19]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]])
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP24:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT:    [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1
; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[P_I_X]], align 4
; CHECK-NEXT:    store i32 [[TMP22]], i32* [[P_I_PLUS_1_Y]], align 4
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[P_I_Y]], align 4
; CHECK-NEXT:    [[TMP24]] = add nsw i32 [[TMP23]], [[S]]
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    [[TMP25:%.*]] = phi i32 [ [[TMP24]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[TMP25]]
;
; Scalar source loop: p[i + 1].y = p[i].x; s += p[i].y.  The store to
; p[i+1].y of the current iteration aliases the p[i].y load of the next
; iteration, so the strided load must not be hoisted above the store.
entry:
  br label %for.body

for.body:
  ; %i is the canonical induction variable; %s is an i32 add reduction
  ; accumulating the values loaded from p[i].y.
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ; Single-predecessor phi carrying the final reduction value out of the loop.
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}
1287
1288; PR27626_4: Ensure we form an interleaved group for strided stores in the
1289;            presence of a write-after-write dependence. We create a group for
1290;            (2) and (3) while excluding (1).
1291
1292; void PR27626_4(int *a, int x, int y, int z, int n) {
1293;   for (int i = 0; i < n; i += 2) {
1294;     a[i] = x;      // (1)
1295;     a[i] = y;      // (2)
1296;     a[i + 1] = z;  // (3)
1297;   }
1298; }
1299
1300
define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 -1
; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP7]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP8]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP9]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I_PLUS_1:%.*]] = or i64 [[I]], 1
; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT:    [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]]
; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
; Scalar source loop: both stores (1) and (2) write a[i] (write-after-write),
; then (3) writes a[i + 1].  The interleaved group is formed from (2) and (3);
; (1) stays scalarized, as checked above.
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  ; (1) and (2) store to the same address %a_i; (2) is the later write and
  ; joins (3) in the <8 x i32> interleaved store.
  store i32 %x, i32* %a_i, align 4
  store i32 %y, i32* %a_i, align 4
  store i32 %z, i32* %a_i_plus_1, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
1378
1379; PR27626_5: Ensure we do not form an interleaved group for strided stores in
1380;            the presence of a write-after-write dependence.
1381
1382; void PR27626_5(int *a, int x, int y, int z, int n) {
1383;   for (int i = 3; i < n; i += 2) {
1384;     a[i - 1] = x;
1385;     a[i - 3] = y;
1386;     a[i] = z;
1387;   }
1388; }
1389
1390
define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_5(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT:    [[IND_END:%.*]] = or i64 [[TMP3]], 3
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[TMP4]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 5
; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP4]], 7
; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP4]], 9
; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP24]]
; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP28]]
; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP15]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP17]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP19]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP21]], align 4
; CHECK-NEXT:    store i32 [[Y:%.*]], i32* [[TMP23]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP25]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP27]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP29]], align 4
; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP10]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP11]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP12]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP13]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I_MINUS_1:%.*]] = add i64 [[I]], -1
; CHECK-NEXT:    [[I_MINUS_3:%.*]] = add i64 [[I]], -3
; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT:    [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
; CHECK-NEXT:    [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
; CHECK-NEXT:    store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
; Scalar source loop (i starts at 3, step 2): stores to a[i-1], a[i-3] and
; a[i].  The a[i-3] store of one iteration writes the same location as the
; a[i-1] store two iterations earlier, so no interleaved store group may be
; formed; the checks above verify all stores stay scalarized.
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  ; i - 3 is computed as (i - 1) - 2 in the source IR.
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
1493
; PR34743: Ensure that a cast which needs to sink after a load that belongs to
; an interleaved group indeed gets sunk.
1496
1497; void PR34743(short *a, int *b, int n) {
1498;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
1499;     b[i] = a[iv] * a[iv+1] * a[iv+2];
1500;   }
1501; }
1502
1503
define void @PR34743(i16* %a, i32* %b, i64 %n) {
; CHECK-LABEL: @PR34743(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2
; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[N]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[N]], -2
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 3
; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[SCEVGEP5]] to i32*
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[TMP6]], [[B]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[SCEVGEP]] to i16*
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP7]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], -4
; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i32 3
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP10]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC8]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP12]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP18]], align 4, !alias.scope !36, !noalias !39
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i32 7
; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
; CHECK-NEXT:    [[I1]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[IV1:%.*]] = or i64 [[IV]], 1
; CHECK-NEXT:    [[IV2]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]]
; CHECK-NEXT:    [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4
; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]]
; CHECK-NEXT:    [[LOAD2]] = load i16, i16* [[GEP2]], align 4
; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
; CHECK-NEXT:    [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
; CHECK-NEXT:    [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT:    store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
; Scalar source loop: b[i] = a[iv] * a[iv+1] * a[iv+2], with a[iv] carried
; across iterations as a first-order recurrence (%0 is last iteration's
; %load2, seeded by the pre-loaded a[0]).
entry:
  %.pre = load i16, i16* %a
  br label %loop

loop:
  ; %0 is the recurrence phi; its sext (%conv) must be sunk after the
  ; interleaved load it depends on, which is what the checks above verify
  ; (VECTOR_RECUR feeding the shufflevector before the sext).
  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
  %conv = sext i16 %0 to i32
  %i1 = add nuw nsw i64 %i, 1
  %iv1 = add nuw nsw i64 %iv, 1
  %iv2 = add nuw nsw i64 %iv, 2
  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
  %load1 = load i16, i16* %gep1, align 4
  %conv1 = sext i16 %load1 to i32
  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
  %load2 = load i16, i16* %gep2, align 4
  %conv2 = sext i16 %load2 to i32
  %mul01 = mul nsw i32 %conv, %conv1
  %mul012 = mul nsw i32 %mul01, %conv2
  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
  store i32 %mul012, i32* %arrayidx5
  %exitcond = icmp eq i64 %iv, %n
  br i1 %exitcond, label %end, label %loop

end:
  ret void
}
1613
1614attributes #0 = { "unsafe-fp-math"="true" }
1615