; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Check vectorization on an interleaved load group of factor 2 and an interleaved
; store group of factor 2.

; int AB[1024];
; int CD[1024];
; void test_array_load2_store2(int C, int D) {
;   for (int i = 0; i < 1024; i+=2) {
;     int A = AB[i];
;     int B = AB[i+1];
;     CD[i] = A + C;
;     CD[i+1] = B * D;
;   }
; }


@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4

define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-LABEL: @test_array_load2_store2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -1
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx0, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  store i32 %mul, i32* %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 3 (the flat @A
; array read three ints at a time) and an interleaved store group of factor 3
; (the three i32 fields of %struct.ST3).
; NOTE(review): the original pseudo-code named this "test_struct_st3" and wrote
; to a nonexistent "T"; corrected below to match the IR function and @S global.

; int A[3072];
; struct ST3 S[1024];
; void test_struct_array_load3_store3() {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X1 = *ptr++;
;     int X2 = *ptr++;
;     int X3 = *ptr++;
;     S[i].x = X1 + 1;
;     S[i].y = X2 + 2;
;     S[i].z = X3 + 3;
;   }
; }


%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4

define void @test_struct_array_load3_store3() {
; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP0]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  %tmp = load i32, i32* %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, i32* %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, i32* %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 4.

; struct ST4{
;   int x;
;   int y;
;   int z;
;   int w;
; };
; int test_struct_load4(struct ST4 *S) {
;   int r = 0;
;   for (int i = 0; i < 1024; i++) {
;     r += S[i].x;
;     r -= S[i].y;
;     r += S[i].z;
;     r -= S[i].w;
;   }
;   return r;
; }

%struct.ST4 = type { i32, i32, i32, i32 }

; The four field accesses form one factor-4 interleaved load group, combined
; into a single <16 x i32> wide load; the +x+z-y-w reduction is reassociated
; into two adds and one sub per iteration (see TMP2..TMP5 below) and reduced
; horizontally in the middle block.
define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
;
; CHECK-LABEL: @test_struct_load4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
; CHECK-NEXT:    [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[SUB8_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, i32* %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, i32* %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, i32* %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret i32 %sub8
}

; Check vectorization on an interleaved store group of factor 4.

; void test_struct_store4(int *A, struct ST4 *B) {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X = *ptr++;
;     B[i].x = X + 1;
;     B[i].y = X * 2;
;     B[i].z = X + 3;
;     B[i].w = X + 4;
;   }
; }


; The four field stores form one factor-4 interleaved store group: the four
; <4 x i32> results are concatenated and interleaved into a single
; <16 x i32> wide store (see INTERLEAVED_VEC below).
define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
; CHECK-LABEL: @test_struct_store4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT:    store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
  %tmp = load i32, i32* %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
  store i32 %mul, i32* %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
  store i32 %add3, i32* %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
  store i32 %add6, i32* %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; Check vectorization on a reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.

; struct ST2 {
;   int x;
;   int y;
; };
;
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = A[i].x + i; // interleaved load of index 0
;     int b = A[i].y - i; // interleaved load of index 1
;     B[i].x = a;         // interleaved store of index 0
;     B[i].y = b;         // interleaved store of index 1
;   }
; }


%struct.ST2 = type { i32, i32 }

; Because the loop runs backwards, each wide load/store is addressed at a
; negative offset from the current element (the -6/-7 GEPs below) and the
; strided lanes are reversed with an extra shufflevector before use.
define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND3]]
; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <4 x i32> [[VEC_IND3]], <i32 -4, i32 -4, i32 -4, i32 -4>
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, i32* %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
  store i32 %sub, i32* %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_static_tc(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2)
;     B[i/2] = A[i] * 2;
; }


; Only the even elements of A are loaded, so the factor-2 group has a gap;
; the vector loop stops at 508 of the 512 iterations and the middle block
; unconditionally (`br i1 false`) falls through to the scalar remainder.
define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @even_load_static_tc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP5]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP13:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.

; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
;   for (unsigned i = 0; i < N; i+=2)
;     B[i/2] = A[i] * 2;
; }


; Same gapped factor-2 load group as above, but with a runtime trip count:
; the vector count is rounded so that at least one scalar iteration always
; remains (the select of 4 when N_MOD_VF is 0).
define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP7:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP15:![0-9]+]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.

; NOTE(review): the original pseudo-code referenced nonexistent A/B arrays and
; declared int fields while the IR %pair uses i64; corrected to match the IR.
; struct pair {
;   long x;
;   long y;
; };
;
; void load_gap_reverse(struct pair *P1, struct pair *P2, long X) {
;   for (long i = 1023; i >= 0; i--) {
;     long a = X + i;
;     long b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
;   }
; }


%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
; CHECK-LABEL: @load_gap_reverse(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP8]], align 8
; CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP9]], align 8
; CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP10]], align 8
; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP11]], align 8
; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; CHECK-NEXT:    store i64 [[TMP21]], i64* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; CHECK-NEXT:    store i64 [[TMP22]], i64* [[TMP5]], align 8
; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; CHECK-NEXT:    store i64 [[TMP23]], i64* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; CHECK-NEXT:    store i64 [[TMP24]], i64* [[TMP7]], align 8
; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
; CHECK-NEXT:    store i64 [[TMP25]], i64* [[TMP8]], align 8
; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
; CHECK-NEXT:    store i64 [[TMP26]], i64* [[TMP9]], align 8
; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
; CHECK-NEXT:    store i64 [[TMP27]], i64* [[TMP10]], align 8
; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
; CHECK-NEXT:    store i64 [[TMP28]], i64* [[TMP11]], align 8
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  %0 = add nsw i64 %X, %i
  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
  %3 = load i64, i64* %2, align 8
  %4 = sub nsw i64 %3, %i
  store i64 %0, i64* %1, align 8
  store i64 %4, i64* %2, align 8
  %i.next = add nsw i64 %i, -1
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit

for.exit:
  ret void
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
667; void mixed_load2_store2(int *A, int *B) { 668; for (unsigned i = 0; i < 1024; i+=2) { 669; B[i] = A[i] * A[i+1]; 670; B[i+1] = A[i] + A[i+1]; 671; } 672; } 673 674 675define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { 676; CHECK-LABEL: @mixed_load2_store2( 677; CHECK-NEXT: entry: 678; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 679; CHECK: vector.ph: 680; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 681; CHECK: vector.body: 682; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 683; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 684; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]] 685; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* 686; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4 687; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 688; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 689; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1 690; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] 691; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 692; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 693; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] 694; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 -1 695; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 [[TMP2]] 696; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* 697; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x 
i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 698; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4 699; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 700; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 701; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] 702; CHECK: middle.block: 703; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 704; CHECK: scalar.ph: 705; CHECK-NEXT: br label [[FOR_BODY:%.*]] 706; CHECK: for.cond.cleanup: 707; CHECK-NEXT: ret void 708; CHECK: for.body: 709; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]] 710; 711entry: 712 br label %for.body 713 714for.cond.cleanup: ; preds = %for.body 715 ret void 716 717for.body: ; preds = %for.body, %entry 718 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 719 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 720 %tmp = load i32, i32* %arrayidx, align 4 721 %tmp1 = or i64 %indvars.iv, 1 722 %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1 723 %tmp2 = load i32, i32* %arrayidx2, align 4 724 %mul = mul nsw i32 %tmp2, %tmp 725 %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv 726 store i32 %mul, i32* %arrayidx4, align 4 727 %tmp3 = load i32, i32* %arrayidx, align 4 728 %tmp4 = load i32, i32* %arrayidx2, align 4 729 %add10 = add nsw i32 %tmp4, %tmp3 730 %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1 731 store i32 %add10, i32* %arrayidx13, align 4 732 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 733 %cmp = icmp ult i64 %indvars.iv.next, 1024 734 br i1 %cmp, label %for.body, label %for.cond.cleanup 735} 736 737; Check vectorization on interleaved access groups identified from mixed 738; loads/stores. 
739; void mixed_load3_store3(int *A) { 740; for (unsigned i = 0; i < 1024; i++) { 741; *A++ += i; 742; *A++ += i; 743; *A++ += i; 744; } 745; } 746 747 748define void @mixed_load3_store3(i32* nocapture %A) { 749; CHECK-LABEL: @mixed_load3_store3( 750; CHECK-NEXT: entry: 751; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 752; CHECK: vector.ph: 753; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 754; CHECK: vector.body: 755; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 756; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] 757; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3 758; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]] 759; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>* 760; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4 761; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 762; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 763; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 764; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]] 765; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2 766; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]] 767; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]] 768; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2 769; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>* 770; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 
6, i32 7> 771; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 772; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 773; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4 774; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 775; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4> 776; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 777; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] 778; CHECK: middle.block: 779; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 780; CHECK: scalar.ph: 781; CHECK-NEXT: br label [[FOR_BODY:%.*]] 782; CHECK: for.cond.cleanup: 783; CHECK-NEXT: ret void 784; CHECK: for.body: 785; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] 786; 787entry: 788 br label %for.body 789 790for.cond.cleanup: ; preds = %for.body 791 ret void 792 793for.body: ; preds = %for.body, %entry 794 %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ] 795 %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ] 796 %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1 797 %tmp = load i32, i32* %A.addr.012, align 4 798 %add = add i32 %tmp, %i.013 799 store i32 %add, i32* %A.addr.012, align 4 800 %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2 801 %tmp1 = load i32, i32* %incdec.ptr, align 4 802 %add2 = add i32 %tmp1, %i.013 803 store i32 %add2, i32* %incdec.ptr, align 4 804 %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3 805 %tmp2 = load i32, i32* %incdec.ptr1, align 4 806 %add4 = add i32 %tmp2, %i.013 807 store i32 %add4, i32* 
%incdec.ptr1, align 4 808 %inc = add nuw nsw i32 %i.013, 1 809 %exitcond = icmp eq i32 %inc, 1024 810 br i1 %exitcond, label %for.cond.cleanup, label %for.body 811} 812 813; Check vectorization on interleaved access groups with members having different 814; kinds of type. 815 816; struct IntFloat { 817; int a; 818; float b; 819; }; 820; 821; int SA; 822; float SB; 823; 824; void int_float_struct(struct IntFloat *A) { 825; int SumA; 826; float SumB; 827; for (unsigned i = 0; i < 1024; i++) { 828; SumA += A[i].a; 829; SumB += A[i].b; 830; } 831; SA = SumA; 832; SB = SumB; 833; } 834 835 836%struct.IntFloat = type { i32, float } 837 838@SA = common global i32 0, align 4 839@SB = common global float 0.000000e+00, align 4 840 841define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { 842; CHECK-LABEL: @int_float_struct( 843; CHECK-NEXT: entry: 844; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 845; CHECK: vector.ph: 846; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 847; CHECK: vector.body: 848; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 849; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] 850; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] 851; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0 852; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* 853; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4 854; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 855; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, 
i32 5, i32 7> 856; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float> 857; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]] 858; CHECK-NEXT: [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]] 859; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 860; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 861; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] 862; CHECK: middle.block: 863; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) 864; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]]) 865; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 866; CHECK: scalar.ph: 867; CHECK-NEXT: br label [[FOR_BODY:%.*]] 868; CHECK: for.cond.cleanup: 869; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] 870; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] 871; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* @SA, align 4 872; CHECK-NEXT: store float [[ADD3_LCSSA]], float* @SB, align 4 873; CHECK-NEXT: ret void 874; CHECK: for.body: 875; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] 876; 877entry: 878 br label %for.body 879 880for.cond.cleanup: ; preds = %for.body 881 store i32 %add, i32* @SA, align 4 882 store float %add3, float* @SB, align 4 883 ret void 884 885for.body: ; preds = %for.body, %entry 886 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 887 %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] 888 %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] 889 %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 890 %tmp = load i32, i32* %a, align 4 891 %add = add nsw i32 %tmp, %SumA.013 892 %b = getelementptr 
inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 893 %tmp1 = load float, float* %b, align 4 894 %add3 = fadd fast float %SumB.014, %tmp1 895 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 896 %exitcond = icmp eq i64 %indvars.iv.next, 1024 897 br i1 %exitcond, label %for.cond.cleanup, label %for.body 898} 899 900; Check vectorization of interleaved access groups in the presence of 901; dependences (PR27626). The following tests check that we don't reorder 902; dependent loads and stores when generating code for interleaved access 903; groups. Stores should be scalarized because the required code motion would 904; break dependences, and the remaining interleaved load groups should have 905; gaps. 906 907; PR27626_0: Ensure a strided store is not moved after a dependent (zero 908; distance) strided load. 909 910; void PR27626_0(struct pair *p, int z, int n) { 911; for (int i = 0; i < n; i++) { 912; p[i].x = z; 913; p[i].y = p[i].x; 914; } 915; } 916 917 918%pair.i32 = type { i32, i32 } 919define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) { 920; CHECK-LABEL: @PR27626_0( 921; CHECK-NEXT: entry: 922; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) 923; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 924; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 925; CHECK: vector.ph: 926; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 927; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 928; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] 929; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] 930; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 931; CHECK: vector.body: 932; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 933; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 934; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 935; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 936; 
CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0 937; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0 938; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0 939; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0 940; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 941; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 942; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1 943; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1 944; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP5]], align 4 945; CHECK-NEXT: store i32 [[Z]], i32* [[TMP6]], align 4 946; CHECK-NEXT: store i32 [[Z]], i32* [[TMP7]], align 4 947; CHECK-NEXT: store i32 [[Z]], i32* [[TMP8]], align 4 948; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* 949; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 950; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0 951; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP9]], align 4 952; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2 953; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP10]], align 4 954; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4 955; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP11]], align 4 956; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6 957; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4 958; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 959; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 960; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] 961; CHECK: middle.block: 962; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] 963; CHECK: scalar.ph: 964; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 965; CHECK-NEXT: br label [[FOR_BODY:%.*]] 966; CHECK: for.body: 967; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] 968; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 969; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 970; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4 971; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_Y]], align 4 972; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 973; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] 974; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP25:![0-9]+]] 975; CHECK: for.end: 976; CHECK-NEXT: ret void 977; 978entry: 979 br label %for.body 980 981for.body: 982 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] 983 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 984 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 985 store i32 %z, i32* %p_i.x, align 4 986 %0 = load i32, i32* %p_i.x, align 4 987 store i32 %0, i32 *%p_i.y, align 4 988 %i.next = add nuw nsw i64 %i, 1 989 %cond = icmp slt i64 %i.next, %n 990 br i1 %cond, label %for.body, label %for.end 991 992for.end: 993 ret void 994} 995 996; PR27626_1: Ensure a strided load is not moved before a dependent (zero 997; distance) strided store. 
998 999; void PR27626_1(struct pair *p, int n) { 1000; int s = 0; 1001; for (int i = 0; i < n; i++) { 1002; p[i].y = p[i].x; 1003; s += p[i].y 1004; } 1005; } 1006 1007 1008define i32 @PR27626_1(%pair.i32 *%p, i64 %n) { 1009; CHECK-LABEL: @PR27626_1( 1010; CHECK-NEXT: entry: 1011; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) 1012; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 1013; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 1014; CHECK: vector.ph: 1015; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 1016; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 1017; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] 1018; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] 1019; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 1020; CHECK: vector.body: 1021; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 1022; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] 1023; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 1024; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 1025; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 1026; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0 1027; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 1028; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 1029; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1 1030; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1 1031; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* 1032; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4 1033; CHECK-NEXT: [[TMP11:%.*]] = 
extractelement <8 x i32> [[WIDE_VEC]], i32 0 1034; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP6]], align 4 1035; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2 1036; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP7]], align 4 1037; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4 1038; CHECK-NEXT: store i32 [[TMP13]], i32* [[TMP8]], align 4 1039; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6 1040; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP9]], align 4 1041; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* 1042; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 1043; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1044; CHECK-NEXT: [[TMP16]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]] 1045; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 1046; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 1047; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] 1048; CHECK: middle.block: 1049; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]]) 1050; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] 1051; CHECK: scalar.ph: 1052; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 1053; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] 1054; CHECK-NEXT: br label [[FOR_BODY:%.*]] 1055; CHECK: for.body: 1056; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] 1057; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] 1058; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 1059; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr 
inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 1060; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[P_I_X]], align 4 1061; CHECK-NEXT: store i32 [[TMP19]], i32* [[P_I_Y]], align 4 1062; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]] 1063; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 1064; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] 1065; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP27:![0-9]+]] 1066; CHECK: for.end: 1067; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ [[TMP20]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] 1068; CHECK-NEXT: ret i32 [[TMP21]] 1069; 1070entry: 1071 br label %for.body 1072 1073for.body: 1074 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] 1075 %s = phi i32 [ %2, %for.body ], [ 0, %entry ] 1076 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 1077 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 1078 %0 = load i32, i32* %p_i.x, align 4 1079 store i32 %0, i32* %p_i.y, align 4 1080 %1 = load i32, i32* %p_i.y, align 4 1081 %2 = add nsw i32 %1, %s 1082 %i.next = add nuw nsw i64 %i, 1 1083 %cond = icmp slt i64 %i.next, %n 1084 br i1 %cond, label %for.body, label %for.end 1085 1086for.end: 1087 %3 = phi i32 [ %2, %for.body ] 1088 ret i32 %3 1089} 1090 1091; PR27626_2: Ensure a strided store is not moved after a dependent (negative 1092; distance) strided load. 
1093 1094; void PR27626_2(struct pair *p, int z, int n) { 1095; for (int i = 0; i < n; i++) { 1096; p[i].x = z; 1097; p[i].y = p[i - 1].x; 1098; } 1099; } 1100 1101 1102define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) { 1103; CHECK-LABEL: @PR27626_2( 1104; CHECK-NEXT: entry: 1105; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) 1106; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 1107; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 1108; CHECK: vector.ph: 1109; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 1110; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 1111; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] 1112; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] 1113; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 1114; CHECK: vector.body: 1115; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 1116; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 1117; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 1118; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 1119; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0 1120; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0 1121; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0 1122; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0 1123; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0 1124; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 1125; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 1126; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 
[[TMP3]], i32 1 1127; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1 1128; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP5]], align 4 1129; CHECK-NEXT: store i32 [[Z]], i32* [[TMP6]], align 4 1130; CHECK-NEXT: store i32 [[Z]], i32* [[TMP7]], align 4 1131; CHECK-NEXT: store i32 [[Z]], i32* [[TMP8]], align 4 1132; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* 1133; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4 1134; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0 1135; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP10]], align 4 1136; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2 1137; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP11]], align 4 1138; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4 1139; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4 1140; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6 1141; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP13]], align 4 1142; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 1143; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 1144; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] 1145; CHECK: middle.block: 1146; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] 1147; CHECK: scalar.ph: 1148; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 1149; CHECK-NEXT: br label [[FOR_BODY:%.*]] 1150; CHECK: for.body: 1151; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] 1152; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 1153; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0 1154; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr 
inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 1155; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4 1156; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4 1157; CHECK-NEXT: store i32 [[TMP20]], i32* [[P_I_Y]], align 4 1158; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 1159; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] 1160; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP29:![0-9]+]] 1161; CHECK: for.end: 1162; CHECK-NEXT: ret void 1163; 1164entry: 1165 br label %for.body 1166 1167for.body: 1168 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] 1169 %i_minus_1 = add nuw nsw i64 %i, -1 1170 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 1171 %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0 1172 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 1173 store i32 %z, i32* %p_i.x, align 4 1174 %0 = load i32, i32* %p_i_minus_1.x, align 4 1175 store i32 %0, i32 *%p_i.y, align 4 1176 %i.next = add nuw nsw i64 %i, 1 1177 %cond = icmp slt i64 %i.next, %n 1178 br i1 %cond, label %for.body, label %for.end 1179 1180for.end: 1181 ret void 1182} 1183 1184; PR27626_3: Ensure a strided load is not moved before a dependent (negative 1185; distance) strided store. 

; void PR27626_3(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
; }


define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-LABEL: @PR27626_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP7]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP6]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP18]], align 4
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP19]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]])
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP24:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT:    [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1
; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[P_I_X]], align 4
; CHECK-NEXT:    store i32 [[TMP22]], i32* [[P_I_PLUS_1_Y]], align 4
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[P_I_Y]], align 4
; CHECK-NEXT:    [[TMP24]] = add nsw i32 [[TMP23]], [[S]]
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    [[TMP25:%.*]] = phi i32 [ [[TMP24]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[TMP25]]
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_4: Ensure we form an interleaved group for strided stores in the
; presence of a write-after-write dependence. We create a group for
; (2) and (3) while excluding (1).

; void PR27626_4(int *a, int x, int y, int z, int n) {
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
; }


define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 -1
; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP7]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP8]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP9]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I_PLUS_1:%.*]] = or i64 [[I]], 1
; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT:    [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]]
; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  store i32 %x, i32* %a_i, align 4        ; (1) dead store, excluded from the interleave group
  store i32 %y, i32* %a_i, align 4        ; (2) grouped with (3)
  store i32 %z, i32* %a_i_plus_1, align 4 ; (3) grouped with (2)
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_5: Ensure we do not form an interleaved group for strided stores in
; the presence of a write-after-write dependence.

; void PR27626_5(int *a, int x, int y, int z, int n) {
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }


; Note: the CHECK lines below show all three stores remaining scalar in the
; vector body (twelve individual i32 stores, no wide interleaved store).
define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_5(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT:    [[IND_END:%.*]] = or i64 [[TMP3]], 3
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[TMP4]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 5
; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP4]], 7
; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP4]], 9
; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP8]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP8]], i32 3
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP24]]
; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP28]]
; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP15]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP17]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP19]], align 4
; CHECK-NEXT:    store i32 [[X]], i32* [[TMP21]], align 4
; CHECK-NEXT:    store i32 [[Y:%.*]], i32* [[TMP23]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP25]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP27]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[TMP29]], align 4
; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP10]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP11]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP12]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP13]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[I_MINUS_1:%.*]] = add i64 [[I]], -1
; CHECK-NEXT:    [[I_MINUS_3:%.*]] = add i64 [[I]], -3
; CHECK-NEXT:    [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT:    [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
; CHECK-NEXT:    [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
; CHECK-NEXT:    store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
; CHECK-NEXT:    store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[A_I]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR34743: Ensure that a cast which needs to sink after a load that belongs to
; an interleaved group, indeed gets sunk.

; void PR34743(short *a, int *b, int n) {
;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
;     b[i] = a[iv] * a[iv+1] * a[iv+2];
;   }
; }


define void @PR34743(i16* %a, i32* %b, i64 %n) {
; CHECK-LABEL: @PR34743(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2
; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[N]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[N]], -2
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], 3
; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16* [[SCEVGEP5]] to i32*
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[TMP6]], [[B]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[SCEVGEP]] to i16*
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP7]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP1]], -4
; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i32 3
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP10]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC8]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT:    [[TMP12:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
; CHECK-NEXT:    [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP12]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP18]], align 4, !alias.scope !36, !noalias !39
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i32 7
; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
; CHECK-NEXT:    [[I1]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[IV1:%.*]] = or i64 [[IV]], 1
; CHECK-NEXT:    [[IV2]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]]
; CHECK-NEXT:    [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4
; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]]
; CHECK-NEXT:    [[LOAD2]] = load i16, i16* [[GEP2]], align 4
; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
; CHECK-NEXT:    [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
; CHECK-NEXT:    [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT:    store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK:       end:
; CHECK-NEXT:    ret void
;
entry:
  %.pre = load i16, i16* %a
  br label %loop

loop:
  ; First-order recurrence: %0 carries the previous iteration's %load2,
  ; seeded with %.pre (a[0]) from the entry block.
  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
  %conv = sext i16 %0 to i32
  %i1 = add nuw nsw i64 %i, 1
  %iv1 = add nuw nsw i64 %iv, 1
  %iv2 = add nuw nsw i64 %iv, 2
  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
  %load1 = load i16, i16* %gep1, align 4
  %conv1 = sext i16 %load1 to i32
  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
  %load2 = load i16, i16* %gep2, align 4
  %conv2 = sext i16 %load2 to i32
  %mul01 = mul nsw i32 %conv, %conv1
  %mul012 = mul nsw i32 %mul01, %conv2
  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
  store i32 %mul012, i32* %arrayidx5
  %exitcond = icmp eq i64 %iv, %n
  br i1 %exitcond, label %end, label %loop

end:
  ret void
}

attributes #0 = { "unsafe-fp-math"="true" }