; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Check vectorization on an interleaved load group of factor 2 and an interleaved
; store group of factor 2.

; int AB[1024];
; int CD[1024];
; void test_array_load2_store2(int C, int D) {
;   for (int i = 0; i < 1024; i+=2) {
;     int A = AB[i];
;     int B = AB[i+1];
;     CD[i] = A + C;
;     CD[i+1] = B * D;
;   }
; }

; CHECK-LABEL: @test_array_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: add nsw <4 x i32>
; CHECK: mul nsw <4 x i32>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4

@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4

define void @test_array_load2_store2(i32 %C, i32 %D) {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx0, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  store i32 %mul, i32* %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

; int A[3072];
; struct ST3 S[1024];
; void test_struct_array_load3_store3() {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X1 = *ptr++;
;     int X2 = *ptr++;
;     int X3 = *ptr++;
;     S[i].x = X1 + 1;
;     S[i].y = X2 + 2;
;     S[i].z = X3 + 3;
;   }
; }

; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4

%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4

define void @test_struct_array_load3_store3() {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  %tmp = load i32, i32* %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, i32* %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, i32* %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 4.

; struct ST4 {
;   int x;
;   int y;
;   int z;
;   int w;
; };
; int test_struct_load4(struct ST4 *S) {
;   int r = 0;
;   for (int i = 0; i < 1024; i++) {
;     r += S[i].x;
;     r -= S[i].y;
;     r += S[i].z;
;     r -= S[i].w;
;   }
;   return r;
; }

; CHECK-LABEL: @test_struct_load4(
; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK: add nsw <4 x i32>
; CHECK: sub <4 x i32>
; CHECK: add nsw <4 x i32>
; CHECK: sub <4 x i32>

%struct.ST4 = type { i32, i32, i32, i32 }

define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, i32* %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, i32* %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, i32* %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 %sub8
}

; Check vectorization on an interleaved store group of factor 4.
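
; A factor-4 store group is written back with one wide store. As a rough
; sketch (placeholder names, not checked verbatim below): the four <4 x i32>
; member vectors are first concatenated pairwise into two <8 x i32> halves,
; and a final shufflevector interleaves the halves element-wise:
;
;   %xy = shufflevector <4 x i32> %vx, <4 x i32> %vy, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;   %zw = shufflevector <4 x i32> %vz, <4 x i32> %vw, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;   %interleaved.vec = shufflevector <8 x i32> %xy, <8 x i32> %zw, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
;   store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
;
; The CHECK lines below verify this pattern against the actual output.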

; void test_struct_store4(int *A, struct ST4 *B) {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X = *ptr++;
;     B[i].x = X + 1;
;     B[i].y = X * 2;
;     B[i].z = X + 3;
;     B[i].w = X + 4;
;   }
; }

; CHECK-LABEL: @test_struct_store4(
; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4

define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
  %tmp = load i32, i32* %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
  store i32 %mul, i32* %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
  store i32 %add3, i32* %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
  store i32 %add6, i32* %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on a reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.
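
; For a reverse group the access pointer is rewound so that a single wide
; access still covers VF complete structs. With VF = 4, the load below is
; anchored at &A[i].x - 6, i.e. at &A[i-3].x, so the <8 x i32> load spans
; A[i-3] through A[i]; stride-2 shuffles extract the members in memory order,
; and an extra <i32 3, i32 2, i32 1, i32 0> shuffle restores the loop's
; descending order. The store is likewise anchored at &B[i].y - 7, i.e. at
; &B[i-3].x. This is the reading of the -6/-7 offsets that the CHECK lines
; below verify.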

; struct ST2 {
;   int x;
;   int y;
; };
;
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = A[i].x + i;  // interleaved load of index 0
;     int b = A[i].y - i;  // interleaved load of index 1
;     B[i].x = a;          // interleaved store of index 0
;     B[i].y = b;          // interleaved store of index 1
;   }
; }

; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
; CHECK: %[[G1:.+]] = getelementptr i32, i32* %[[G0]], i64 -6
; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: add nsw <4 x i32>
; CHECK: sub nsw <4 x i32>
; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
; CHECK: %[[G3:.+]] = getelementptr i32, i32* %[[G2]], i64 -7
; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %[[B1]], align 4

%struct.ST2 = type { i32, i32 }

define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, i32* %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
  store i32 %sub, i32* %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
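
; For instance, with VF = 4 the group below is loaded as one <8 x i32>
; covering A[i] .. A[i+7], yet the scalar loop only reads the even elements,
; up to A[1022]. A fully vectorized loop's final wide load would span
; A[1016..1023] and so touch A[1023], an element the original loop never
; accesses and which may not be dereferenceable. Peeling the last iterations
; into the scalar loop (n.vec = 508 of the 512 iterations, matching the CHECK
; on %index.next below) keeps the final wide load within A[1008..1015].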

; void even_load_static_tc(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2)
;     B[i/2] = A[i] * 2;
; }

; CHECK-LABEL: @even_load_static_tc(
; CHECK: vector.body:
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: icmp eq i64 %index.next, 508
; CHECK: middle.block:
; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
;   for (unsigned i = 0; i < N; i+=2)
;     B[i/2] = A[i] * 2;
; }

; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: icmp eq i64 %index.next, %n.vec
; CHECK: middle.block:
; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph

define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.
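
; As in the even_load tests above, a wide load for the gapped group would read
; elements the original loop never accesses, and for a reverse group the
; rewound pointer cannot compensate for the missing member, so the load group
; is invalidated. The CHECK-NOT lines below confirm that no wide load or
; de-interleaving shuffle is emitted for it.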

; struct pair {
;   int x;
;   int y;
; };
;
; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = X + i;
;     int b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
;   }
; }

; CHECK-LABEL: @load_gap_reverse(
; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture %P1, %pair* noalias nocapture %P2, i64 %X) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  %0 = add nsw i64 %X, %i
  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
  %3 = load i64, i64* %2, align 8
  %4 = sub nsw i64 %3, %i
  store i64 %0, i64* %1, align 8
  store i64 %4, i64* %2, align 8
  %i.next = add nsw i64 %i, -1
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit

for.exit:
  ret void
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.

; void mixed_load2_store2(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2) {
;     B[i] = A[i] * A[i+1];
;     B[i+1] = A[i] + A[i+1];
;   }
; }

; CHECK-LABEL: @mixed_load2_store2(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %interleaved.vec

define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %tmp2, %tmp
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %mul, i32* %arrayidx4, align 4
  %tmp3 = load i32, i32* %arrayidx, align 4
  %tmp4 = load i32, i32* %arrayidx2, align 4
  %add10 = add nsw i32 %tmp4, %tmp3
  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %add10, i32* %arrayidx13, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.

; void mixed_load3_store3(int *A) {
;   for (unsigned i = 0; i < 1024; i++) {
;     *A++ += i;
;     *A++ += i;
;     *A++ += i;
;   }
; }

; CHECK-LABEL: @mixed_load3_store3(
; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4

define void @mixed_load3_store3(i32* nocapture %A) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
  %tmp = load i32, i32* %A.addr.012, align 4
  %add = add i32 %tmp, %i.013
  store i32 %add, i32* %A.addr.012, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %add2 = add i32 %tmp1, %i.013
  store i32 %add2, i32* %incdec.ptr, align 4
  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add4 = add i32 %tmp2, %i.013
  store i32 %add4, i32* %incdec.ptr1, align 4
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on interleaved access groups with members having different
; kinds of type.
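
; Both members are still covered by a single <8 x i32> wide load; the float
; member is then recovered by bitcasting the odd-element shuffle. Roughly
; (placeholder names, verified precisely by the CHECK lines below):
;
;   %v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
;   %vb = bitcast <4 x i32> %v1 to <4 x float>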

; struct IntFloat {
;   int a;
;   float b;
; };
;
; int SA;
; float SB;
;
; void int_float_struct(struct IntFloat *A) {
;   int SumA;
;   float SumB;
;   for (unsigned i = 0; i < 1024; i++) {
;     SumA += A[i].a;
;     SumB += A[i].b;
;   }
;   SA = SumA;
;   SB = SumB;
; }

; CHECK-LABEL: @int_float_struct(
; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
; CHECK: add nsw <4 x i32>
; CHECK: fadd fast <4 x float>

%struct.IntFloat = type { i32, float }

@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4

define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  store i32 %add, i32* @SA, align 4
  store float %add3, float* @SB, align 4
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %a, align 4
  %add = add nsw i32 %tmp, %SumA.013
  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
  %tmp1 = load float, float* %b, align 4
  %add3 = fadd fast float %SumB.014, %tmp1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization of interleaved access groups in the presence of
; dependences (PR27626). The following tests check that we don't reorder
; dependent loads and stores when generating code for interleaved access
; groups. Stores should be scalarized because the required code motion would
; break dependences, and the remaining interleaved load groups should have
; gaps.

; PR27626_0: Ensure a strided store is not moved after a dependent (zero
; distance) strided load.
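
; Forming a store group for p[i].x and p[i].y would require sinking the store
; of %z below the load of p[i].x that depends on it. A sketch of the invalid
; reordering that must be avoided (placeholder names):
;
;   %wide = load <8 x i32>, <8 x i32>* %pairs   ; reads p[i].x for four iterations
;   store i32 %z, i32* %p_i.x                   ; too late: %wide saw the old x values
;
; The stores are therefore scalarized; the CHECK lines verify the resulting
; extractelement/store sequence.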

; void PR27626_0(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i].x;
;   }
; }

; CHECK-LABEL: @PR27626_0(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}

%pair.i32 = type { i32, i32 }
define void @PR27626_0(%pair.i32* %p, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_1: Ensure a strided load is not moved before a dependent (zero
; distance) strided store.

; int PR27626_1(struct pair *p, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }

; CHECK-LABEL: @PR27626_1(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_1(%pair.i32* %p, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_2: Ensure a strided store is not moved after a dependent (negative
; distance) strided load.

; void PR27626_2(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i - 1].x;
;   }
; }

; CHECK-LABEL: @PR27626_2(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}

define void @PR27626_2(%pair.i32* %p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_minus_1 = add nuw nsw i64 %i, -1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i_minus_1.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_3: Ensure a strided load is not moved before a dependent (negative
; distance) strided store.
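
; Here lane j of a vector iteration stores to p[i+j+1].y while lane j+1 reads
; p[i+j+1].y, so the loads of the y members cannot be hoisted above the
; scalarized stores. The CHECK lines verify that the second wide load, which
; feeds the reduction, is emitted after the extractelement/store sequence.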

; int PR27626_3(struct pair *p, int z, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }

; CHECK-LABEL: @PR27626_3(
; CHECK: min.iters.checked:
; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
; CHECK: vector.body:
; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
; CHECK: store i32 %[[X1]], {{.*}}
; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
; CHECK: store i32 %[[X2]], {{.*}}
; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
; CHECK: store i32 %[[X3]], {{.*}}
; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
; CHECK: store i32 %[[X4]], {{.*}}
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]

define i32 @PR27626_3(%pair.i32* %p, i64 %n, i32 %z) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_4: Ensure we form an interleaved group for strided stores in the
; presence of a write-after-write dependence. We create a group for
; (2) and (3) while excluding (1).
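
; Since (2) overwrites (1) in every iteration, (2) and (3) can form a factor-2
; store group: the splats of %y and %z are interleaved and written with one
; wide store, while (1) remains four scalar stores of %x. Roughly (placeholder
; names, verified by the CHECK lines below):
;
;   %vec = shufflevector <4 x i32> %splat.y, <4 x i32> %splat.z, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
;   store <8 x i32> %vec, <8 x i32>* %ptr, align 4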

; void PR27626_4(int *a, int x, int y, int z, int n) {
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
; }

; CHECK-LABEL: @PR27626_4(
; CHECK: vector.ph:
; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK: vector.body:
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK: store <8 x i32> %[[VEC]], {{.*}}

define void @PR27626_4(i32* %a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  store i32 %x, i32* %a_i, align 4
  store i32 %y, i32* %a_i, align 4
  store i32 %z, i32* %a_i_plus_1, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_5: Ensure we do not form an interleaved group for strided stores in
; the presence of a write-after-write dependence.

; void PR27626_5(int *a, int x, int y, int z, int n) {
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }

; CHECK-LABEL: @PR27626_5(
; CHECK: vector.body:
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %x, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %y, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}
; CHECK: store i32 %z, {{.*}}

define void @PR27626_5(i32* %a, i32 %x, i32 %y, i32 %z, i64 %n) {
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

attributes #0 = { "unsafe-fp-math"="true" }