; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Check vectorization on an interleaved load group of factor 2 and an interleaved
; store group of factor 2.
;
; The assertions below expect a single <8 x i32> wide load from @AB that is
; split by two stride-2 shuffles, and a single <8 x i32> interleaved store to
; @CD.

; int AB[1024];
; int CD[1024];
; void test_array_load2_store2(int C, int D) {
;   for (int i = 0; i < 1024; i+=2) {
;     int A = AB[i];
;     int B = AB[i+1];
;     CD[i] = A + C;
;     CD[i+1] = B * D;
;   }
; }


@AB = common global [1024 x i32] zeroinitializer, align 4
@CD = common global [1024 x i32] zeroinitializer, align 4

define void @test_array_load2_store2(i32 %C, i32 %D) {
; CHECK-LABEL: @test_array_load2_store2(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -1
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_END]], [[LOOP2:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:  ; preds = %for.body, %entry
  ; %indvars.iv is the source-level index i; it starts at 0 and advances by 2.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx0, align 4
  ; i is always even here, so or-ing in bit 0 yields i + 1.
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx1, align 4
  %add = add nsw i32 %tmp, %C
  %mul = mul nsw i32 %tmp2, %D
  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
  store i32 %add, i32* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
  store i32 %mul, i32* %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp slt i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:  ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 3 and an
; interleaved store group of factor 3.
;
; The assertions below expect a single <12 x i32> wide load from @A that is
; split by three stride-3 shuffles, and a single <12 x i32> interleaved store
; to @S.

; struct ST3 {
;   int x;
;   int y;
;   int z;
; };
; int A[3072];
; struct ST3 S[1024];
; void test_struct_array_load3_store3() {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X1 = *ptr++;
;     int X2 = *ptr++;
;     int X3 = *ptr++;
;     S[i].x = X1 + 1;
;     S[i].y = X2 + 2;
;     S[i].z = X3 + 3;
;   }
; }


%struct.ST3 = type { i32, i32, i32 }
@A = common global [3072 x i32] zeroinitializer, align 4
@S = common global [1024 x %struct.ST3] zeroinitializer, align 4

define void @test_struct_array_load3_store3() {
; CHECK-LABEL: @test_struct_array_load3_store3(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:  ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; %ptr.016 walks @A; it advances by three i32 elements per iteration
  ; (see %incdec.ptr2).
  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
  %tmp = load i32, i32* %ptr.016, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  %add3 = add nsw i32 %tmp1, 2
  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
  store i32 %add3, i32* %y, align 4
  %add6 = add nsw i32 %tmp2, 3
  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
  store i32 %add6, i32* %z, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:  ; preds = %for.body
  ret void
}

; Check vectorization on an interleaved load group of factor 4.

; Source-level equivalent of @test_struct_load4. The assertions expect one
; <16 x i32> wide load split by four stride-4 shuffles, with the final
; reduction done by llvm.vector.reduce.add in the middle block.

; struct ST4{
;   int x;
;   int y;
;   int z;
;   int w;
; };
; int test_struct_load4(struct ST4 *S) {
;   int r = 0;
;   for (int i = 0; i < 1024; i++) {
;     r += S[i].x;
;     r -= S[i].y;
;     r += S[i].z;
;     r -= S[i].w;
;   }
;   return r;
; }

%struct.ST4 = type { i32, i32, i32, i32 }

define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
;
; CHECK-LABEL: @test_struct_load4(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUB8_LCSSA]]
;
entry:
  br label %for.body

for.body:  ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; %r.022 is the running value of r; it starts at 0 and is updated with the
  ; four fields of S[i] in order: + x, - y, + z, - w.
  %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  %add = add nsw i32 %tmp, %r.022
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
  %tmp1 = load i32, i32* %y, align 4
  %sub = sub i32 %add, %tmp1
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
  %tmp2 = load i32, i32* %z, align 4
  %add5 = add nsw i32 %sub, %tmp2
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
  %tmp3 = load i32, i32* %w, align 4
  %sub8 = sub i32 %add5, %tmp3
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.end, label %for.body

for.end:  ; preds = %for.body
  ret i32 %sub8
}

; Check vectorization on an interleaved store group of factor 4.

; Source-level equivalent of @test_struct_store4. The assertions expect a
; consecutive <4 x i32> load from A and a single <16 x i32> interleaved store
; to B built from the four per-field result vectors.

; void test_struct_store4(int *A, struct ST4 *B) {
;   int *ptr = A;
;   for (int i = 0; i < 1024; i++) {
;     int X = *ptr++;
;     B[i].x = X + 1;
;     B[i].y = X * 2;
;     B[i].z = X + 3;
;     B[i].w = X + 4;
;   }
; }


define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
; CHECK-LABEL: @test_struct_store4(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
;
entry:
  br label %for.body

for.cond.cleanup:  ; preds = %for.body
  ret void

for.body:  ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; %ptr.024 walks A one i32 element per iteration.
  %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
  %tmp = load i32, i32* %ptr.024, align 4
  %add = add nsw i32 %tmp, 1
  %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x, align 4
  ; X * 2 from the C source, expressed as a left shift by one.
  %mul = shl nsw i32 %tmp, 1
  %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
  store i32 %mul, i32* %y, align 4
  %add3 = add nsw i32 %tmp, 3
  %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
  store i32 %add3, i32* %z, align 4
  %add6 = add nsw i32 %tmp, 4
  %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
  store i32 %add6, i32* %w, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on a
; reverse interleaved load group of factor 2 and
; a reverse interleaved store group of factor 2.
;
; The assertions for @test_reversed_load2_store2 expect one <8 x i32> wide
; load and one <8 x i32> interleaved store, with each strided vector passed
; through a <3, 2, 1, 0> reverse shuffle.

; struct ST2 {
;   int x;
;   int y;
; };
;
; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = A[i].x + i; // interleaved load of index 0
;     int b = A[i].y - i; // interleaved load of index 1
;     B[i].x = a; // interleaved store of index 0
;     B[i].y = b; // interleaved store of index 1
;   }
; }


%struct.ST2 = type { i32, i32 }

define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
; CHECK-LABEL: @test_reversed_load2_store2(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND3]]
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <4 x i32> [[VEC_IND3]], <i32 -4, i32 -4, i32 -4, i32 -4>
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP11:!llvm.loop !.*]]
;
entry:
  br label %for.body

for.cond.cleanup:  ; preds = %for.body
  ret void

for.body:  ; preds = %for.body, %entry
  ; The index counts down from 1023 in steps of -1.
  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
  %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %x, align 4
  ; 32-bit copy of i used in the arithmetic below.
  %tmp1 = trunc i64 %indvars.iv to i32
  %add = add nsw i32 %tmp, %tmp1
  %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
  %tmp2 = load i32, i32* %y, align 4
  %sub = sub nsw i32 %tmp2, %tmp1
  %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
  store i32 %add, i32* %x5, align 4
  %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
  store i32 %sub, i32* %y8, align 4
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  ; The exit test uses the pre-decrement value, so the i == 0 iteration runs.
  %cmp = icmp sgt i64 %indvars.iv, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
;
; In the assertions the vector loop stops at index 508, the middle block
; branches to scalar.ph with a constant-false condition, and the scalar loop
; resumes at induction value 1016.

; void even_load_static_tc(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2)
;     B[i/2] = A[i] * 2;
; }


define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @even_load_static_tc(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP13:!llvm.loop !.*]]
;
entry:
  br label %for.body

for.cond.cleanup:  ; preds = %for.body
  ret void

for.body:  ; preds = %for.body, %entry
  ; Only even indices of A are read: the index starts at 0 and steps by 2.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  ; i / 2, used as the index into B.
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on an interleaved load group of factor 2 with 1 gap
; (missing the load of odd elements). Because the vectorized loop would
; speculatively access memory out-of-bounds, we must execute at least one
; iteration of the scalar loop.
;
; In the assertions the remainder subtracted from the trip count is forced to
; 4 when it would otherwise be 0, and the middle block branches to scalar.ph
; with a constant-false condition.

; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
;   for (unsigned i = 0; i < N; i+=2)
;     B[i/2] = A[i] * 2;
; }


define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[N:%.*]], 2
; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP3]], 3
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP8:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[INDEX]], 9223372036854775804
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1
; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP15:!llvm.loop !.*]]
;
entry:
  br label %for.body

for.cond.cleanup:  ; preds = %for.body
  ret void

for.body:  ; preds = %for.body, %entry
  ; Same loop body as @even_load_static_tc, but bounded by the runtime
  ; value %N instead of the constant 1024.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = shl nsw i32 %tmp, 1
  %tmp1 = lshr exact i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %mul, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, %N
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on a reverse interleaved load group of factor 2 with 1
; gap and a reverse interleaved store group of factor 2. The interleaved load
; group should be removed since it has a gap and is reverse.

; struct pair {
;   int x;
;   int y;
; };
;
; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
;   for (int i = 1023; i >= 0; i--) {
;     int a = X + i;
;     int b = P2[i].y - i;
;     P1[i].x = a;
;     P2[i].y = b;
;   }
; }
;
; The IR below models `struct pair` as { i64, i64 } and `X` as i64. Only the
; `y` member of P2 is loaded (so the load group has a gap at `x`); `x` is
; stored through P1 and `y` is stored through P2. The expected output keeps
; the loads and stores scalar (one per lane) instead of forming wide
; interleaved accesses.
; NOTE(review): both pointer arguments carry `readonly` even though the loop
; stores through them -- confirm this is intentional for the test.


%pair = type { i64, i64 }
define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
; CHECK-LABEL: @load_gap_reverse(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 1022, [[INDEX]]
; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1021, [[INDEX]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 1020, [[INDEX]]
; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = load i64, i64* [[TMP8]], align 8
; CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP9]], align 8
; CHECK-NEXT:    [[TMP14:%.*]] = load i64, i64* [[TMP10]], align 8
; CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP11]], align 8
; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[TMP12]], i32 0
; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3
; CHECK-NEXT:    [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; CHECK-NEXT:    store i64 [[TMP21]], i64* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; CHECK-NEXT:    store i64 [[TMP22]], i64* [[TMP5]], align 8
; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; CHECK-NEXT:    store i64 [[TMP23]], i64* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; CHECK-NEXT:    store i64 [[TMP24]], i64* [[TMP7]], align 8
; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0
; CHECK-NEXT:    store i64 [[TMP25]], i64* [[TMP8]], align 8
; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1
; CHECK-NEXT:    store i64 [[TMP26]], i64* [[TMP9]], align 8
; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2
; CHECK-NEXT:    store i64 [[TMP27]], i64* [[TMP10]], align 8
; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3
; CHECK-NEXT:    store i64 [[TMP28]], i64* [[TMP11]], align 8
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_EXIT]], [[LOOP17:!llvm.loop !.*]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  ; %i counts down from 1023; the loop exits after the iteration with %i == 0.
  %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
  ; a = X + i
  %0 = add nsw i64 %X, %i
  ; %1 = &P1[i].x, %2 = &P2[i].y
  %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
  %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
  ; b = P2[i].y - i
  %3 = load i64, i64* %2, align 8
  %4 = sub nsw i64 %3, %i
  ; P1[i].x = a; P2[i].y = b
  store i64 %0, i64* %1, align 8
  store i64 %4, i64* %2, align 8
  %i.next = add nsw i64 %i, -1
  ; The exit test uses the pre-decrement value of %i.
  %cond = icmp sgt i64 %i, 0
  br i1 %cond, label %for.body, label %for.exit

for.exit:
  ret void
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load2_store2(int *A, int *B) {
;   for (unsigned i = 0; i < 1024; i+=2) {
;     B[i] = A[i] * A[i+1];
;     B[i+1] = A[i] + A[i+1];
;   }
; }
;
; In the IR, A[i] and A[i+1] are each loaded twice (once before and once
; after the first store to B). The expected output uses one wide <8 x i32>
; load of A and one wide interleaved <8 x i32> store to B per vector
; iteration.


define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
; CHECK-LABEL: @mixed_load2_store2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 -1
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP19:!llvm.loop !.*]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  ; %indvars.iv steps by 2; %tmp1 (%indvars.iv | 1) is the odd index i+1.
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %tmp1 = or i64 %indvars.iv, 1
  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
  %tmp2 = load i32, i32* %arrayidx2, align 4
  ; B[i] = A[i+1] * A[i]
  %mul = mul nsw i32 %tmp2, %tmp
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %mul, i32* %arrayidx4, align 4
  ; A[i] and A[i+1] are re-loaded after the store to B[i].
  %tmp3 = load i32, i32* %arrayidx, align 4
  %tmp4 = load i32, i32* %arrayidx2, align 4
  ; B[i+1] = A[i+1] + A[i]
  %add10 = add nsw i32 %tmp4, %tmp3
  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
  store i32 %add10, i32* %arrayidx13, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
  %cmp = icmp ult i64 %indvars.iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

; Check vectorization on interleaved access groups identified from mixed
; loads/stores.
; void mixed_load3_store3(int *A) {
;   for (unsigned i = 0; i < 1024; i++) {
;     *A++ += i;
;     *A++ += i;
;     *A++ += i;
;   }
; }
;
; Each iteration reads, updates and writes back three consecutive elements
; and advances the pointer by three. The expected output uses one wide
; <12 x i32> load and one wide <12 x i32> interleaved store per vector
; iteration.


define void @mixed_load3_store3(i32* nocapture %A) {
; CHECK-LABEL: @mixed_load3_store3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2
; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP21:!llvm.loop !.*]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ret void

for.body:                                         ; preds = %for.body, %entry
  %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  ; Running pointer; it advances by three elements per iteration
  ; (%incdec.ptr3).
  %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
  %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
  ; A[0] += i
  %tmp = load i32, i32* %A.addr.012, align 4
  %add = add i32 %tmp, %i.013
  store i32 %add, i32* %A.addr.012, align 4
  %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
  ; A[1] += i
  %tmp1 = load i32, i32* %incdec.ptr, align 4
  %add2 = add i32 %tmp1, %i.013
  store i32 %add2, i32* %incdec.ptr, align 4
  %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
  ; A[2] += i
  %tmp2 = load i32, i32* %incdec.ptr1, align 4
  %add4 = add i32 %tmp2, %i.013
  store i32 %add4, i32* %incdec.ptr1, align 4
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization on interleaved access groups whose members have
; different types (here an int and a float).

; struct IntFloat {
;   int a;
;   float b;
; };
;
; int SA;
; float SB;
;
; void int_float_struct(struct IntFloat *A) {
;   int SumA;
;   float SumB;
;   for (unsigned i = 0; i < 1024; i++)  {
;     SumA += A[i].a;
;     SumB += A[i].b;
;   }
;   SA = SumA;
;   SB = SumB;
; }
;
; SumA and SumB are deliberately uninitialized in the C source; the IR
; reflects this with `undef` incoming values on the reduction phis. The
; expected output loads both members with a single <8 x i32> load and
; bitcasts the odd lanes to float.


%struct.IntFloat = type { i32, float }

@SA = common global i32 0, align 4
@SB = common global float 0.000000e+00, align 4

define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
; CHECK-LABEL: @int_float_struct(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float>
; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]]
; CHECK-NEXT:    [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    [[ADD3_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    store i32 [[ADD_LCSSA]], i32* @SA, align 4
; CHECK-NEXT:    store float [[ADD3_LCSSA]], float* @SB, align 4
; CHECK-NEXT:    ret void
; CHECK:       for.body:
; CHECK-NEXT:    br i1 undef, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP23:!llvm.loop !.*]]
;
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body
  ; SA = SumA; SB = SumB
  store i32 %add, i32* @SA, align 4
  store float %add3, float* @SB, align 4
  ret void

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Both reductions start from undef (uninitialized SumA/SumB).
  %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
  %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
  ; SumA += A[i].a
  %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
  %tmp = load i32, i32* %a, align 4
  %add = add nsw i32 %tmp, %SumA.013
  ; SumB += A[i].b
  %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
  %tmp1 = load float, float* %b, align 4
  %add3 = fadd fast float %SumB.014, %tmp1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check vectorization of interleaved access groups in the presence of
; dependences (PR27626). The following tests check that we don't reorder
; dependent loads and stores when generating code for interleaved access
; groups. Stores should be scalarized because the required code motion would
; break dependences, and the remaining interleaved load groups should have
; gaps.

; PR27626_0: Ensure a strided store is not moved after a dependent (zero
; distance) strided load.

; void PR27626_0(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i].x;
;   }
; }
;
; In the expected vector body the four scalar stores of z to p[i].x come
; before the wide load that starts at the same address.


%pair.i32 = type { i32, i32 }
define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_0(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP6]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP9]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT:    store i32 [[TMP18]], i32* [[TMP13]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_Y]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP25:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  ; p[i].x = z, immediately followed by a load of the same address (zero
  ; dependence distance).
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i.x, align 4
  ; p[i].y = p[i].x
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_1: Ensure a strided load is not moved before a dependent (zero
; distance) strided store.

; int PR27626_1(struct pair *p, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }
;
; The IR function returns the accumulated sum as i32. In the expected vector
; body the scalar stores to p[i].y come before the second wide load (the one
; based at the `y` member) that feeds the reduction.


define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
; CHECK-LABEL: @PR27626_1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT:    store i32 [[TMP12]], i32* [[TMP7]], align 4
; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT:    store i32 [[TMP13]], i32* [[TMP8]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT:    store i32 [[TMP14]], i32* [[TMP9]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT:    store i32 [[TMP15]], i32* [[TMP10]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4
; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT:    [[TMP17]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[S:%.*]] = phi i32 [ [[TMP21:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[P_I_X]], align 4
; CHECK-NEXT:    store i32 [[TMP20]], i32* [[P_I_Y]], align 4
; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP20]], [[S]]
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP27:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    [[TMP22:%.*]] = phi i32 [ [[TMP21]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[TMP22]]
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  ; %s is the running sum, starting at 0.
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  ; p[i].y = p[i].x
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i.y, align 4
  ; p[i].y is re-loaded right after the store to it (zero dependence
  ; distance), then accumulated: s += p[i].y
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ; Single-predecessor phi carrying the final sum out of the loop.
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_2: Ensure a strided store is not moved after a dependent (negative
; distance) strided load.

; void PR27626_2(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i - 1].x;
;   }
; }
;
; Note: the IR function takes its arguments in the order (p, n, z), unlike
; the C sketch above.
; NOTE(review): `%i_minus_1` is computed as `add nuw nsw i64 %i, -1`, and the
; expected output indexes the load address with the constant `i64 -1` (see
; [[TMP10]] and [[P_I_MINUS_1_X]]) rather than with i - 1. It looks like the
; `nuw` flag lets the optimizer fold the index to -1, so this test may be
; exercising a loop-invariant p[-1].x instead of p[i - 1].x -- confirm
; whether `nuw` is intended before regenerating the assertions.


define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-LABEL: @PR27626_2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
; CHECK-NEXT:    store i32 [[Z:%.*]], i32* [[TMP6]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP7]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP8]], align 4
; CHECK-NEXT:    store i32 [[Z]], i32* [[TMP9]], align 4
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT:    store i32 [[TMP16]], i32* [[TMP11]], align 4
; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT:    store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT:    store i32 [[TMP18]], i32* [[TMP13]], align 4
; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT:    store i32 [[TMP19]], i32* [[TMP14]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP28:!llvm.loop !.*]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT:    [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
; CHECK-NEXT:    [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_X]], align 4
; CHECK-NEXT:    [[TMP21:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4
; CHECK-NEXT:    store i32 [[TMP21]], i32* [[P_I_Y]], align 4
; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP29:!llvm.loop !.*]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  ; Index of the previous element (i - 1); see the NOTE(review) above about
  ; the `nuw` flag on this add.
  %i_minus_1 = add nuw nsw i64 %i, -1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  ; p[i].x = z
  store i32 %z, i32* %p_i.x, align 4
  ; p[i].y = p[i - 1].x
  %0 = load i32, i32* %p_i_minus_1.x, align 4
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_3: Ensure a strided load is not moved before a dependent (negative
; distance) strided store.

; int PR27626_3(struct pair *p, int z, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
;   return s;
; }
;
; NOTE: the IR below takes its arguments as (p, n, z) with a 64-bit n, and %z
; is not used by the loop body.


define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-LABEL: @PR27626_3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP2]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP8]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP10]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP12]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 0
; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP7]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 2
; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP9]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 4
; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP11]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i32 6
; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP13]], align 4
; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP20]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP30:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP25:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[P_I_X]], align 4
; CHECK-NEXT: store i32 [[TMP23]], i32* [[P_I_PLUS_1_Y]], align 4
; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[P_I_Y]], align 4
; CHECK-NEXT: [[TMP25]] = add nsw i32 [[TMP24]], [[S]]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP31:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: [[TMP26:%.*]] = phi i32 [ [[TMP25]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[TMP26]]
;
entry:
  br label %for.body

for.body:
  ; %s is the running sum: 0 on entry, %2 on the back edge.
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  ; Copy p[i].x into p[i + 1].y.
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  ; Read p[i].y after the store above and add it to the sum. The expected
  ; vector body keeps the second wide load after the four scalarized stores.
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_4: Ensure we form an interleaved group for strided stores in the
; presence of a write-after-write dependence. We create a group for
; (2) and (3) while excluding (1).

; void PR27626_4(int *a, int x, int y, int z, int n) {
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
; }


define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 2
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 2
; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[SMAX]], -1
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775804
; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 -1
; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP8]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP9]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP10]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <8 x i32>*
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP32:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[I_PLUS_1:%.*]] = or i64 [[I]], 1
; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT: [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]]
; CHECK-NEXT: store i32 [[Y]], i32* [[A_I]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP33:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  ; Stores (1) and (2) both write a[i]; store (3) writes a[i + 1].
  store i32 %x, i32* %a_i, align 4
  store i32 %y, i32* %a_i, align 4
  store i32 %z, i32* %a_i_plus_1, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_5: Ensure we do not form an interleaved group for strided stores in
; the presence of a write-after-write dependence.

; void PR27626_5(int *a, int x, int y, int z, int n) {
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }


define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 5
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 5
; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[SMAX]], -4
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775804
; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT: [[IND_END:%.*]] = or i64 [[TMP4]], 3
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[TMP5]], 3
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP5]], 7
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP10]], i32 1
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP10]], i32 2
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP10]], i32 3
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP29]]
; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP16]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP18]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP20]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP22]], align 4
; CHECK-NEXT: store i32 [[Y:%.*]], i32* [[TMP24]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[TMP26]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[TMP28]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[TMP30]], align 4
; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP11]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP12]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP13]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP34:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[I_MINUS_1:%.*]] = add i64 [[I]], -1
; CHECK-NEXT: [[I_MINUS_3:%.*]] = add i64 [[I]], -3
; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
; CHECK-NEXT: store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[A_I]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP35:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  ; i - 3 is computed as (i - 1) - 2.
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  ; Three stores per iteration: a[i - 1] = x, a[i - 3] = y, a[i] = z. The
  ; expected vector body emits all of them as scalar stores.
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR34743: Ensure that a cast which needs to sink after a load that belongs to
; an interleaved group, indeed gets sunk.

; void PR34743(short *a, int *b, int n) {
;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
;     b[i] = a[iv] * a[iv+1] * a[iv+2];
;   }
; }


define void @PR34743(i16* %a, i32* %b, i64 %n) {
; CHECK-LABEL: @PR34743(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2
; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[N]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[N]], -2
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 3
; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[SCEVGEP5]] to i32*
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[TMP6]], [[B]]
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[SCEVGEP]] to i16*
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP7]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], -4
; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i32 3
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP10]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC8]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP12]]
; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP18]], align 4, !alias.scope !36, !noalias !39
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i32 7
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[IV1:%.*]] = or i64 [[IV]], 1
; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]]
; CHECK-NEXT: [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4
; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]]
; CHECK-NEXT: [[LOAD2]] = load i16, i16* [[GEP2]], align 4
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
; CHECK-NEXT: [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
; CHECK-NEXT: [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT: store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[LOOP]], [[LOOP42:!llvm.loop !.*]]
; CHECK: end:
; CHECK-NEXT: ret void
;
entry:
  ; a[0] is loaded once before the loop and seeds the %0 phi below.
  %.pre = load i16, i16* %a
  br label %loop

loop:
  ; %0 carries a value across iterations: %.pre on entry, then the previous
  ; iteration's %load2 (a[iv + 2]).
  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
  ; This cast of the carried value is written before both loads; in the
  ; expected vector body the corresponding sext follows the wide load.
  %conv = sext i16 %0 to i32
  %i1 = add nuw nsw i64 %i, 1
  %iv1 = add nuw nsw i64 %iv, 1
  %iv2 = add nuw nsw i64 %iv, 2
  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
  %load1 = load i16, i16* %gep1, align 4
  %conv1 = sext i16 %load1 to i32
  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
  %load2 = load i16, i16* %gep2, align 4
  %conv2 = sext i16 %load2 to i32
  %mul01 = mul nsw i32 %conv, %conv1
  %mul012 = mul nsw i32 %mul01, %conv2
  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
  store i32 %mul012, i32* %arrayidx5
  ; The exit test compares the pre-increment %iv against %n.
  %exitcond = icmp eq i64 %iv, %n
  br i1 %exitcond, label %end, label %loop

end:
  ret void
}

attributes #0 = { "unsafe-fp-math"="true" }