; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Check vectorization on an interleaved load group of factor 2 and an
; interleaved store group of factor 2.

; int AB[1024];
; int CD[1024];
; void test_array_load2_store2(int C, int D) {
;   for (int i = 0; i < 1024; i+=2) {
;     int A = AB[i];
;     int B = AB[i+1];
;     CD[i] = A + C;
;     CD[i+1] = B * D;
;   }
; }

; CHECK-LABEL: @test_array_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: add nsw <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4

@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4

define void @test_array_load2_store2(i32 %C, i32 %D) {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx0, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  store i32 %mul, i32* %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

; int A[3072];
; struct ST3 S[1024];
; void test_struct_array_load3_store3() {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X1 = *ptr++;
;     int X2 = *ptr++;
;     int X3 = *ptr++;
;     S[i].x = X1 + 1;
;     S[i].y = X2 + 2;
;     S[i].z = X3 + 3;
;   }
; }

; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4

%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4

define void @test_struct_array_load3_store3() {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  %tmp = load i32, i32* %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, i32* %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, i32* %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
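
; Note (informational, not matched by FileCheck): the factor-3 re-interleave
; above can be traced by hand. If the three computed lane vectors are
; X = <x0 x1 x2 x3>, Y = <y0 y1 y2 y3> and Z = <z0 z1 z2 z3>, the first
; <8 x i32> shuffle concatenates X and Y, the second pads Z with undef, and
; the final mask <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> selects
; <x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3>, i.e. the memory layout of four
; consecutive %struct.ST3 elements.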

; Check vectorization on an interleaved load group of factor 4.

; struct ST4{
;   int x;
;   int y;
;   int z;
;   int w;
; };
; int test_struct_load4(struct ST4 *S) {
;   int r = 0;
;   for (int i = 0; i < 1024; i++) {
;     r += S[i].x;
;     r -= S[i].y;
;     r += S[i].z;
;     r -= S[i].w;
;   }
;   return r;
; }

%struct.ST4 = type { i32, i32, i32, i32 }

define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
; CHECK-LABEL: @test_struct_load4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK:       middle.block:
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !7
; CHECK:       for.end:
; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, i32* %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, i32* %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, i32* %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 %sub8
}

; Check vectorization on an interleaved store group of factor 4.

; void test_struct_store4(int *A, struct ST4 *B) {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X = *ptr++;
;     B[i].x = X + 1;
;     B[i].y = X * 2;
;     B[i].z = X + 3;
;     B[i].w = X + 4;
;   }
; }

; CHECK-LABEL: @test_struct_store4(
; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4

define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
  %tmp = load i32, i32* %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
  store i32 %mul, i32* %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
  store i32 %add3, i32* %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
  store i32 %add6, i32* %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
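
; Note (informational, not matched by FileCheck): the middle.block sequence
; in @test_struct_load4 is the usual log2(VF) shuffle-and-add reduction tree.
; Starting from the accumulator <r0 r1 r2 r3>, the mask <2, 3, undef, undef>
; adds the upper half onto the lower half, the mask <1, undef, undef, undef>
; adds the remaining pair, and lane 0 then holds r0 + r1 + r2 + r3, which is
; what the extractelement returns.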

; Check vectorization on a reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.

; struct ST2 {
;   int x;
;   int y;
; };
;
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = A[i].x + i;  // interleaved load of index 0
;     int b = A[i].y - i;  // interleaved load of index 1
;     B[i].x = a;          // interleaved store of index 0
;     B[i].y = b;          // interleaved store of index 1
;   }
; }

; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: add nsw <4 x i32>
; CHECK: sub nsw <4 x i32>
; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %[[B1]], align 4

%struct.ST2 = type { i32, i32 }

define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, i32* %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
  store i32 %sub, i32* %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
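
; Note (informational, not matched by FileCheck): for a reverse group the
; wide access must end at the current index rather than start there. With
; VF = 4 and factor 2, the load pointer above is rewound by
; -(4 * 2 - 2) = -6 i32 elements so the <8 x i32> load spans
; A[i-3].x .. A[i].y, and the store pointer by -7 elements from &B[i].y so
; the wide store spans B[i-3].x .. B[i].y; each strided lane is additionally
; reversed with the <3, 2, 1, 0> mask.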

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_static_tc(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2)
;     B[i/2] = A[i] * 2;
; }

; CHECK-LABEL: @even_load_static_tc(
; CHECK: vector.body:
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: icmp eq i64 %index.next, 508
; CHECK: middle.block:
; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
;   for (unsigned i = 0; i < N; i+=2)
;     B[i/2] = A[i] * 2;
; }

; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK: vector.ph:
; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
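
; Note (informational, not matched by FileCheck): the vector.ph arithmetic
; above reserves at least one scalar iteration. With
; R = (N & 3) == 0 ? 4 : (N & 3) computed on the derived trip count N, and
; n.vec = N - R, a trip count of, say, N = 8 yields n.vec = 4, so the last
; four iterations run in the scalar loop and the speculative <8 x i32> wide
; load never reads past the end of A.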

; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.

; struct pair {
;   long x;
;   long y;
; };
;
; void load_gap_reverse(struct pair *P1, struct pair *P2, long X) {
;   for (long i = 1023; i >= 0; i--) {
;     long a = X + i;
;     long b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
;   }
; }

; CHECK-LABEL: @load_gap_reverse(
; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture %P1, %pair* noalias nocapture %P2, i64 %X) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  %0 = add nsw i64 %X, %i
  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
  %3 = load i64, i64* %2, align 8
  %4 = sub nsw i64 %3, %i
  store i64 %0, i64* %1, align 8
  store i64 %4, i64* %2, align 8
  %i.next = add nsw i64 %i, -1
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit

for.exit:
  ret void
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.

; void mixed_load2_store2(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2) {
;     B[i] = A[i] * A[i+1];
;     B[i+1] = A[i] + A[i+1];
;   }
; }

; CHECK-LABEL: @mixed_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec

define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %tmp2, %tmp
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %mul, i32* %arrayidx4, align 4
  %tmp3 = load i32, i32* %arrayidx, align 4
  %tmp4 = load i32, i32* %arrayidx2, align 4
  %add10 = add nsw i32 %tmp4, %tmp3
  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %add10, i32* %arrayidx13, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
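
; Note (informational, not matched by FileCheck): although the scalar loop
; loads A[i] and A[i+1] twice, the checks above only require a single
; <8 x i32> wide load per iteration; its two strided lanes can feed both the
; multiply and the add, and the two strided stores to B are merged into one
; interleaved <8 x i32> store.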

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.

; void mixed_load3_store3(int *A) {
;   for (unsigned i = 0; i < 1024; i++) {
;     *A++ += i;
;     *A++ += i;
;     *A++ += i;
;   }
; }

; CHECK-LABEL: @mixed_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4

define void @mixed_load3_store3(i32* nocapture %A) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
  %tmp = load i32, i32* %A.addr.012, align 4
  %add = add i32 %tmp, %i.013
  store i32 %add, i32* %A.addr.012, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %add2 = add i32 %tmp1, %i.013
  store i32 %add2, i32* %incdec.ptr, align 4
  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add4 = add i32 %tmp2, %i.013
  store i32 %add4, i32* %incdec.ptr1, align 4
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on interleaved access groups whose members have
; different types.

; struct IntFloat {
;   int a;
;   float b;
; };
;
; int SA;
; float SB;
;
; void int_float_struct(struct IntFloat *A) {
;   int SumA;
;   float SumB;
;   for (unsigned i = 0; i < 1024; i++) {
;     SumA += A[i].a;
;     SumB += A[i].b;
;   }
;   SA = SumA;
;   SB = SumB;
; }

; CHECK-LABEL: @int_float_struct(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
; CHECK: add <4 x i32>
; CHECK: fadd fast <4 x float>

%struct.IntFloat = type { i32, float }

@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4

define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  store i32 %add, i32* @SA, align 4
  store float %add3, float* @SB, align 4
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %a, align 4
  %add = add nsw i32 %tmp, %SumA.013
  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
  %tmp1 = load float, float* %b, align 4
  %add3 = fadd fast float %SumB.014, %tmp1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
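
; Note (informational, not matched by FileCheck): group members only need
; equal size, not equal type. The whole %struct.IntFloat element is loaded
; as <8 x i32>; the lane vector holding the float member is then bitcast
; from <4 x i32> to <4 x float> before the fadd, which merely reinterprets
; the loaded bits.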

; Check vectorization of interleaved access groups in the presence of
; dependences (PR27626). The following tests check that we don't reorder
; dependent loads and stores when generating code for interleaved access
; groups. Stores should be scalarized because the required code motion would
; break dependences, and the remaining interleaved load groups should have
; gaps.

; PR27626_0: Ensure a strided store is not moved after a dependent (zero
; distance) strided load.

; void PR27626_0(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i].x;
;   }
; }

; CHECK-LABEL: @PR27626_0(
; CHECK: vector.ph:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}

%pair.i32 = type { i32, i32 }
define void @PR27626_0(%pair.i32* %p, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_1: Ensure a strided load is not moved before a dependent (zero
; distance) strided store.

; int PR27626_1(struct pair *p, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }

; CHECK-LABEL: @PR27626_1(
; CHECK: vector.ph:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: add <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_1(%pair.i32* %p, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_2: Ensure a strided store is not moved after a dependent (negative
; distance) strided load.

; void PR27626_2(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i - 1].x;
;   }
; }

; CHECK-LABEL: @PR27626_2(
; CHECK: vector.ph:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}

define void @PR27626_2(%pair.i32* %p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_minus_1 = add nsw i64 %i, -1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i_minus_1.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
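
; Note (informational, not matched by FileCheck): in the PR27626 tests the
; stores stay scalar to preserve the store-to-load dependences, so member 0
; of the factor-2 load group is consumed one lane at a time. With VF = 4 the
; x fields occupy lanes 0, 2, 4 and 6 of the <8 x i32> wide load, which is
; exactly the set of extractelement indices the checks expect.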

; PR27626_3: Ensure a strided load is not moved before a dependent (negative
; distance) strided store.

; int PR27626_3(struct pair *p, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }

; CHECK-LABEL: @PR27626_3(
; CHECK: vector.ph:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: add <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_3(%pair.i32* %p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_4: Ensure we form an interleaved group for strided stores in the
; presence of a write-after-write dependence. We create a group for
; (2) and (3) while excluding (1).

; void PR27626_4(int *a, int x, int y, int z, int n) {
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
; }

; CHECK-LABEL: @PR27626_4(
; CHECK: vector.ph:
; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> poison, i32 %y, i32 0
; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> poison, i32 %z, i32 0
; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK: vector.body:
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %[[VEC]], {{.*}}

define void @PR27626_4(i32* %a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  store i32 %x, i32* %a_i, align 4
  store i32 %y, i32* %a_i, align 4
  store i32 %z, i32* %a_i_plus_1, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_5: Ensure we do not form an interleaved group for strided stores in
; the presence of a write-after-write dependence.

; void PR27626_5(int *a, int x, int y, int z, int n) {
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }

; CHECK-LABEL: @PR27626_5(
; CHECK: vector.body:
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}

define void @PR27626_5(i32* %a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}
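
; Note (informational, not matched by FileCheck): in @PR27626_4 the
; overwritten store of %x stays scalar while %y and %z are splatted with the
; usual insertelement plus shufflevector-by-zeroinitializer idiom;
; interleaving the two splats with the <0, 4, 1, 5, 2, 6, 3, 7> mask then
; writes one (y, z) pair per scalar iteration, <y z y z y z y z>, over
; a[i..i+7].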

; PR34743: Ensure that a cast which needs to sink after a load that belongs
; to an interleaved group indeed gets sunk.

; void PR34743(short *a, int *b, int n) {
;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
;     b[i] = a[iv] * a[iv+1] * a[iv+2];
;   }
; }

; CHECK-LABEL: @PR34743(
; CHECK: vector.body:
; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[VSHUF1:.+]], %vector.body ]
; CHECK: %wide.vec = load <8 x i16>
; CHECK: %[[VSHUF0:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: %[[VSHUF1]] = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK: sext <4 x i16> %[[VSHUF0]] to <4 x i32>
; CHECK: sext <4 x i16> %[[VSHUF]] to <4 x i32>
; CHECK: sext <4 x i16> %[[VSHUF1]] to <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: mul nsw <4 x i32>

define void @PR34743(i16* %a, i32* %b, i64 %n) {
entry:
  %.pre = load i16, i16* %a
  br label %loop

loop:
  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
  %conv = sext i16 %0 to i32
  %i1 = add nuw nsw i64 %i, 1
  %iv1 = add nuw nsw i64 %iv, 1
  %iv2 = add nuw nsw i64 %iv, 2
  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
  %load1 = load i16, i16* %gep1, align 4
  %conv1 = sext i16 %load1 to i32
  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
  %load2 = load i16, i16* %gep2, align 4
  %conv2 = sext i16 %load2 to i32
  %mul01 = mul nsw i32 %conv, %conv1
  %mul012 = mul nsw i32 %mul01, %conv2
  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
  store i32 %mul012, i32* %arrayidx5
  %exitcond = icmp eq i64 %iv, %n
  br i1 %exitcond, label %end, label %loop

end:
  ret void
}

attributes #0 = { "unsafe-fp-math"="true" }
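
; Note (informational, not matched by FileCheck): @PR34743 combines an
; interleaved load group with a first-order recurrence. %vector.recur
; carries the previous iteration's vector of %load2 lanes, and the splice
; mask <3, 4, 5, 6> prepends its last element to the first three elements of
; the current %load2 lanes, reconstructing the recurrence value a[iv] for
; all four unrolled iterations; the sext is sunk below the wide load so it
; can be applied to these shuffled lanes.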