1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s 3 4target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" 5 6; Check vectorization on an interleaved load group of factor 2 and an interleaved 7; store group of factor 2. 8 9; int AB[1024]; 10; int CD[1024]; 11; void test_array_load2_store2(int C, int D) { 12; for (int i = 0; i < 1024; i+=2) { 13; int A = AB[i]; 14; int B = AB[i+1]; 15; CD[i] = A + C; 16; CD[i+1] = B * D; 17; } 18; } 19 20 21@AB = common global [1024 x i32] zeroinitializer, align 4 22@CD = common global [1024 x i32] zeroinitializer, align 4 23 24define void @test_array_load2_store2(i32 %C, i32 %D) { 25; CHECK-LABEL: @test_array_load2_store2( 26; CHECK-NEXT: entry: 27; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 28; CHECK: vector.ph: 29; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0 30; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer 31; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[D:%.*]], i64 0 32; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer 33; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 34; CHECK: vector.body: 35; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 36; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 37; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 [[OFFSET_IDX]] 38; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* 39; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4 40; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 41; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 42; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1 43; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] 44; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]] 45; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 [[TMP2]] 46; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -1 47; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* 48; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 49; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4 50; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 51; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 52; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] 53; CHECK: middle.block: 54; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 55; CHECK: scalar.ph: 56; CHECK-NEXT: br label [[FOR_BODY:%.*]] 57; CHECK: for.body: 58; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]] 59; CHECK: for.end: 60; CHECK-NEXT: ret void 61; 62entry: 63 br label 
%for.body 64 65for.body: ; preds = %for.body, %entry 66 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 67 %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv 68 %tmp = load i32, i32* %arrayidx0, align 4 69 %tmp1 = or i64 %indvars.iv, 1 70 %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1 71 %tmp2 = load i32, i32* %arrayidx1, align 4 72 %add = add nsw i32 %tmp, %C 73 %mul = mul nsw i32 %tmp2, %D 74 %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv 75 store i32 %add, i32* %arrayidx2, align 4 76 %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1 77 store i32 %mul, i32* %arrayidx3, align 4 78 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 79 %cmp = icmp slt i64 %indvars.iv.next, 1024 80 br i1 %cmp, label %for.body, label %for.end 81 82for.end: ; preds = %for.body 83 ret void 84} 85 86; int A[3072]; 87; struct ST S[1024]; 88; void test_struct_st3() { 89; int *ptr = A; 90; for (int i = 0; i < 1024; i++) { 91; int X1 = *ptr++; 92; int X2 = *ptr++; 93; int X3 = *ptr++; 94; T[i].x = X1 + 1; 95; T[i].y = X2 + 2; 96; T[i].z = X3 + 3; 97; } 98; } 99 100 101%struct.ST3 = type { i32, i32, i32 } 102@A = common global [3072 x i32] zeroinitializer, align 4 103@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 104 105define void @test_struct_array_load3_store3() { 106; CHECK-LABEL: @test_struct_array_load3_store3( 107; CHECK-NEXT: entry: 108; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 109; CHECK: vector.ph: 110; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 111; CHECK: vector.body: 112; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 113; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3 114; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr [3072 x i32], [3072 x i32]* @A, i64 0, i64 [[TMP0]] 115; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>* 116; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4 117; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 118; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 119; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 120; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1> 121; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2> 122; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3> 123; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2 124; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2 125; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>* 126; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 127; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 128; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, 
i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 129; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4 130; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 131; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 132; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] 133; CHECK: middle.block: 134; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 135; CHECK: scalar.ph: 136; CHECK-NEXT: br label [[FOR_BODY:%.*]] 137; CHECK: for.body: 138; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] 139; CHECK: for.end: 140; CHECK-NEXT: ret void 141; 142entry: 143 br label %for.body 144 145for.body: ; preds = %for.body, %entry 146 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 147 %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] 148 %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 149 %tmp = load i32, i32* %ptr.016, align 4 150 %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 151 %tmp1 = load i32, i32* %incdec.ptr, align 4 152 %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 153 %tmp2 = load i32, i32* %incdec.ptr1, align 4 154 %add = add nsw i32 %tmp, 1 155 %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 156 store i32 %add, i32* %x, align 4 157 %add3 = add nsw i32 %tmp1, 2 158 %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 159 store i32 %add3, i32* %y, align 4 160 %add6 = add nsw i32 %tmp2, 3 161 %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 162 store i32 %add6, i32* %z, align 4 163 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 164 %exitcond = icmp eq i64 %indvars.iv.next, 1024 165 br i1 %exitcond, label %for.end, label %for.body 166 167for.end: ; preds = %for.body 168 ret void 169} 170 171; Check vectorization on an interleaved load group of factor 4. 
172 173; struct ST4{ 174; int x; 175; int y; 176; int z; 177; int w; 178; }; 179; int test_struct_load4(struct ST4 *S) { 180; int r = 0; 181; for (int i = 0; i < 1024; i++) { 182; r += S[i].x; 183; r -= S[i].y; 184; r += S[i].z; 185; r -= S[i].w; 186; } 187; return r; 188; } 189 190%struct.ST4 = type { i32, i32, i32, i32 } 191 192define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { 193; 194; CHECK-LABEL: @test_struct_load4( 195; CHECK-NEXT: entry: 196; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 197; CHECK: vector.ph: 198; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 199; CHECK: vector.body: 200; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 201; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] 202; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0 203; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* 204; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4 205; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 206; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 207; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 208; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 209; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]] 210; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]] 211; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]] 212; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]] 213; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 214; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 215; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] 216; CHECK: middle.block: 217; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) 218; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] 219; CHECK: scalar.ph: 220; CHECK-NEXT: br label [[FOR_BODY:%.*]] 221; CHECK: for.body: 222; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] 223; CHECK: for.end: 224; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] 225; CHECK-NEXT: ret i32 [[SUB8_LCSSA]] 226; 227entry: 228 br label %for.body 229 230for.body: ; preds = %for.body, %entry 231 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 232 %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] 233 %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 234 %tmp = load i32, i32* %x, align 4 235 %add = add nsw i32 %tmp, %r.022 236 %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 237 %tmp1 = load i32, i32* %y, align 4 238 %sub = sub i32 %add, %tmp1 239 %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 240 %tmp2 = load i32, i32* %z, align 4 241 %add5 = add nsw i32 %sub, %tmp2 242 %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 243 %tmp3 = load i32, i32* 
%w, align 4 244 %sub8 = sub i32 %add5, %tmp3 245 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 246 %exitcond = icmp eq i64 %indvars.iv.next, 1024 247 br i1 %exitcond, label %for.end, label %for.body 248 249for.end: ; preds = %for.body 250 ret i32 %sub8 251} 252 253; Check vectorization on an interleaved store group of factor 4. 254 255; void test_struct_store4(int *A, struct ST4 *B) { 256; int *ptr = A; 257; for (int i = 0; i < 1024; i++) { 258; int X = *ptr++; 259; B[i].x = X + 1; 260; B[i].y = X * 2; 261; B[i].z = X + 3; 262; B[i].w = X + 4; 263; } 264; } 265 266 267define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) { 268; CHECK-LABEL: @test_struct_store4( 269; CHECK-NEXT: entry: 270; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 271; CHECK: vector.ph: 272; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 273; CHECK: vector.body: 274; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 275; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]] 276; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>* 277; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 278; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1> 279; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1> 280; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3> 281; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 4, i32 4, i32 4, i32 4> 282; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDEX]], i32 3 283; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3 284; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* 285; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 286; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 287; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15> 288; CHECK-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], <16 x i32>* [[TMP7]], align 4 289; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 290; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 291; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] 292; CHECK: middle.block: 293; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 294; CHECK: scalar.ph: 295; CHECK-NEXT: br label [[FOR_BODY:%.*]] 296; CHECK: for.cond.cleanup: 297; CHECK-NEXT: ret void 298; CHECK: for.body: 299; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] 300; 301entry: 302 br label %for.body 303 304for.cond.cleanup: ; preds = %for.body 305 ret void 306 307for.body: ; preds = %for.body, %entry 308 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 309 %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ] 310 %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1 311 %tmp = load i32, i32* %ptr.024, align 4 312 %add = add nsw i32 %tmp, 1 313 %x 
= getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0 314 store i32 %add, i32* %x, align 4 315 %mul = shl nsw i32 %tmp, 1 316 %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1 317 store i32 %mul, i32* %y, align 4 318 %add3 = add nsw i32 %tmp, 3 319 %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2 320 store i32 %add3, i32* %z, align 4 321 %add6 = add nsw i32 %tmp, 4 322 %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3 323 store i32 %add6, i32* %w, align 4 324 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 325 %exitcond = icmp eq i64 %indvars.iv.next, 1024 326 br i1 %exitcond, label %for.cond.cleanup, label %for.body 327} 328 329; Check vectorization on a reverse interleaved load group of factor 2 and 330; a reverse interleaved store group of factor 2. 331 332; struct ST2 { 333; int x; 334; int y; 335; }; 336; 337; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { 338; for (int i = 1023; i >= 0; i--) { 339; int a = A[i].x + i; // interleaved load of index 0 340; int b = A[i].y - i; // interleaved load of index 1 341; B[i].x = a; // interleaved store of index 0 342; B[i].y = b; // interleaved store of index 1 343; } 344; } 345 346 347%struct.ST2 = type { i32, i32 } 348 349define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) { 350; CHECK-LABEL: @test_reversed_load2_store2( 351; CHECK-NEXT: entry: 352; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 353; CHECK: vector.ph: 354; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 355; CHECK: vector.body: 356; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 357; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] 358; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] 359; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 360; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6 361; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <8 x i32>* 362; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4 363; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 364; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 365; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 366; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 367; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]] 368; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]] 369; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1 370; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7 371; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* 372; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 373; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, 
i32 2, i32 1, i32 0> 374; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 375; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4 376; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 377; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 -4, i32 -4, i32 -4, i32 -4> 378; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 379; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] 380; CHECK: middle.block: 381; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 382; CHECK: scalar.ph: 383; CHECK-NEXT: br label [[FOR_BODY:%.*]] 384; CHECK: for.cond.cleanup: 385; CHECK-NEXT: ret void 386; CHECK: for.body: 387; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] 388; 389entry: 390 br label %for.body 391 392for.cond.cleanup: ; preds = %for.body 393 ret void 394 395for.body: ; preds = %for.body, %entry 396 %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] 397 %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 398 %tmp = load i32, i32* %x, align 4 399 %tmp1 = trunc i64 %indvars.iv to i32 400 %add = add nsw i32 %tmp, %tmp1 401 %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 402 %tmp2 = load i32, i32* %y, align 4 403 %sub = sub nsw i32 %tmp2, %tmp1 404 %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 405 store i32 %add, i32* %x5, align 4 406 %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 407 store i32 %sub, i32* %y8, align 4 408 %indvars.iv.next = add nsw i64 %indvars.iv, -1 409 %cmp = icmp sgt i64 %indvars.iv, 0 410 br i1 %cmp, label %for.body, label %for.cond.cleanup 411} 412 413; Check vectorization on an interleaved load group of factor 2 with 1 gap 414; (missing the load of odd elements). Because the vectorized loop would 415; speculatively access memory out-of-bounds, we must execute at least one 416; iteration of the scalar loop. 
417 418; void even_load_static_tc(int *A, int *B) { 419; for (unsigned i = 0; i < 1024; i+=2) 420; B[i/2] = A[i] * 2; 421; } 422 423 424define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { 425; CHECK-LABEL: @even_load_static_tc( 426; CHECK-NEXT: entry: 427; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 428; CHECK: vector.ph: 429; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 430; CHECK: vector.body: 431; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 432; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 433; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]] 434; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* 435; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4 436; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 437; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1> 438; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[INDEX]], 9223372036854775804 439; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]] 440; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* 441; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP5]], align 4 442; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 443; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508 444; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] 445; CHECK: middle.block: 446; CHECK-NEXT: br label [[SCALAR_PH]] 447; CHECK: scalar.ph: 448; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 449; CHECK-NEXT: br label [[FOR_BODY:%.*]] 450; CHECK: for.cond.cleanup: 451; CHECK-NEXT: ret void 452; CHECK: for.body: 453; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] 454; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] 455; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 456; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 457; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 458; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] 459; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 460; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 461; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 462; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]] 463; 464entry: 465 br label %for.body 466 467for.cond.cleanup: ; preds = %for.body 468 ret void 469 470for.body: ; preds = %for.body, %entry 471 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 472 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 473 %tmp = load i32, i32* %arrayidx, align 4 474 %mul = shl nsw i32 %tmp, 1 475 %tmp1 = lshr exact i64 %indvars.iv, 1 476 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1 477 store i32 %mul, i32* %arrayidx2, align 4 478 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 479 %cmp = icmp ult i64 %indvars.iv.next, 1024 480 br i1 %cmp, label %for.body, label %for.cond.cleanup 481} 482 483; Check vectorization on an interleaved load group of factor 2 with 1 gap 484; (missing the load of odd 
elements). Because the vectorized loop would 485; speculatively access memory out-of-bounds, we must execute at least one 486; iteration of the scalar loop. 487 488; void even_load_dynamic_tc(int *A, int *B, unsigned N) { 489; for (unsigned i = 0; i < N; i+=2) 490; B[i/2] = A[i] * 2; 491; } 492 493 494define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) { 495; CHECK-LABEL: @even_load_dynamic_tc( 496; CHECK-NEXT: entry: 497; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2) 498; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 499; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 500; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 501; CHECK: vector.ph: 502; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 503; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 504; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3 505; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 506; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] 507; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] 508; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 509; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 510; CHECK: vector.body: 511; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 512; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 513; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]] 514; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* 515; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4 516; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 517; CHECK-NEXT: [[TMP7:%.*]] = shl nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1> 518; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[INDEX]], 9223372036854775804 519; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP8]] 520; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* 521; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4 522; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 523; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 524; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] 525; CHECK: middle.block: 526; CHECK-NEXT: br label [[SCALAR_PH]] 527; CHECK: scalar.ph: 528; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 529; CHECK-NEXT: br label [[FOR_BODY:%.*]] 530; CHECK: for.cond.cleanup: 531; CHECK-NEXT: ret void 532; CHECK: for.body: 533; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] 534; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] 535; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 536; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP]], 1 537; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 538; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] 539; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 540; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 541; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]] 542; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label 
[[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP15:![0-9]+]] 543; 544entry: 545 br label %for.body 546 547for.cond.cleanup: ; preds = %for.body 548 ret void 549 550for.body: ; preds = %for.body, %entry 551 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 552 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 553 %tmp = load i32, i32* %arrayidx, align 4 554 %mul = shl nsw i32 %tmp, 1 555 %tmp1 = lshr exact i64 %indvars.iv, 1 556 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1 557 store i32 %mul, i32* %arrayidx2, align 4 558 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 559 %cmp = icmp ult i64 %indvars.iv.next, %N 560 br i1 %cmp, label %for.body, label %for.cond.cleanup 561} 562 563; Check vectorization on a reverse interleaved load group of factor 2 with 1 564; gap and a reverse interleaved store group of factor 2. The interleaved load 565; group should be removed since it has a gap and is reverse. 566 567; struct pair { 568; int x; 569; int y; 570; }; 571; 572; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) { 573; for (int i = 1023; i >= 0; i--) { 574; int a = X + i; 575; int b = A[i].y - i; 576; B[i].x = a; 577; B[i].y = b; 578; } 579; } 580 581 582%pair = type { i64, i64 } 583define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) { 584; CHECK-LABEL: @load_gap_reverse( 585; CHECK-NEXT: entry: 586; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 587; CHECK: vector.ph: 588; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X:%.*]], i64 0 589; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer 590; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 591; CHECK: vector.body: 592; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 593; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] 594; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] 595; CHECK-NEXT: [[TMP0:%.*]] = sub i64 1022, [[INDEX]] 596; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]] 597; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]] 598; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]] 599; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], %pair* [[P1:%.*]], i64 [[OFFSET_IDX]], i32 0 600; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP0]], i32 0 601; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP1]], i32 0 602; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P1]], i64 [[TMP2]], i32 0 603; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2:%.*]], i64 [[OFFSET_IDX]], i32 1 604; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP0]], i32 1 605; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP1]], i32 1 606; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR]], %pair* [[P2]], i64 [[TMP2]], i32 1 607; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[TMP8]], align 8 608; CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP9]], align 8 609; CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP10]], align 8 610; CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP11]], align 8 611; CHECK-NEXT: [[TMP16:%.*]] = 
insertelement <4 x i64> poison, i64 [[TMP12]], i64 0 612; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i64 1 613; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2 614; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3 615; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]] 616; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0 617; CHECK-NEXT: store i64 [[TMP21]], i64* [[TMP4]], align 8 618; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1 619; CHECK-NEXT: store i64 [[TMP22]], i64* [[TMP5]], align 8 620; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2 621; CHECK-NEXT: store i64 [[TMP23]], i64* [[TMP6]], align 8 622; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3 623; CHECK-NEXT: store i64 [[TMP24]], i64* [[TMP7]], align 8 624; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0 625; CHECK-NEXT: store i64 [[TMP25]], i64* [[TMP8]], align 8 626; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1 627; CHECK-NEXT: store i64 [[TMP26]], i64* [[TMP9]], align 8 628; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2 629; CHECK-NEXT: store i64 [[TMP27]], i64* [[TMP10]], align 8 630; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3 631; CHECK-NEXT: store i64 [[TMP28]], i64* [[TMP11]], align 8 632; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 633; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4> 634; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 635; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] 636; CHECK: middle.block: 637; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] 638; CHECK: scalar.ph: 639; CHECK-NEXT: br label [[FOR_BODY:%.*]] 640; CHECK: for.body: 641; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]] 642; CHECK: for.exit: 643; CHECK-NEXT: ret void 644; 645entry: 646 br label %for.body 647 648for.body: 649 %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ] 650 %0 = add nsw i64 %X, %i 651 %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0 652 %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1 653 %3 = load i64, i64* %2, align 8 654 %4 = sub nsw i64 %3, %i 655 store i64 %0, i64* %1, align 8 656 store i64 %4, i64* %2, align 8 657 %i.next = add nsw i64 %i, -1 658 %cond = icmp sgt i64 %i, 0 659 br i1 %cond, label %for.body, label %for.exit 660 661for.exit: 662 ret void 663} 664 665; Check vectorization on interleaved access groups identified from mixed 666; loads/stores. 
667; void mixed_load2_store2(int *A, int *B) { 668; for (unsigned i = 0; i < 1024; i+=2) { 669; B[i] = A[i] * A[i+1]; 670; B[i+1] = A[i] + A[i+1]; 671; } 672; } 673 674 675define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { 676; CHECK-LABEL: @mixed_load2_store2( 677; CHECK-NEXT: entry: 678; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 679; CHECK: vector.ph: 680; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 681; CHECK: vector.body: 682; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 683; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 684; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]] 685; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* 686; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4 687; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 688; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 689; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1 690; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]] 691; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 692; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 693; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] 694; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 -1 695; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 [[TMP2]] 696; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* 697; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 698; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4 699; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 700; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 701; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] 702; CHECK: middle.block: 703; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 704; CHECK: scalar.ph: 705; CHECK-NEXT: br label [[FOR_BODY:%.*]] 706; CHECK: for.cond.cleanup: 707; CHECK-NEXT: ret void 708; CHECK: for.body: 709; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]] 710; 711entry: 712 br label %for.body 713 714for.cond.cleanup: ; preds = %for.body 715 ret void 716 717for.body: ; preds = %for.body, %entry 718 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 719 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv 720 %tmp = load i32, i32* %arrayidx, align 4 721 %tmp1 = or i64 %indvars.iv, 1 722 %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1 723 %tmp2 = load i32, i32* %arrayidx2, align 4 724 %mul = mul nsw i32 %tmp2, %tmp 725 %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv 726 store i32 %mul, i32* %arrayidx4, align 4 727 %tmp3 = load i32, i32* %arrayidx, align 4 728 %tmp4 = load i32, i32* %arrayidx2, align 4 729 %add10 = add nsw i32 %tmp4, %tmp3 730 %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 
%tmp1 731 store i32 %add10, i32* %arrayidx13, align 4 732 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 733 %cmp = icmp ult i64 %indvars.iv.next, 1024 734 br i1 %cmp, label %for.body, label %for.cond.cleanup 735} 736 737; Check vectorization on interleaved access groups identified from mixed 738; loads/stores. 739; void mixed_load3_store3(int *A) { 740; for (unsigned i = 0; i < 1024; i++) { 741; *A++ += i; 742; *A++ += i; 743; *A++ += i; 744; } 745; } 746 747 748define void @mixed_load3_store3(i32* nocapture %A) { 749; CHECK-LABEL: @mixed_load3_store3( 750; CHECK-NEXT: entry: 751; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 752; CHECK: vector.ph: 753; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 754; CHECK: vector.body: 755; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 756; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] 757; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[INDEX]], 3 758; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP0]] 759; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>* 760; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4 761; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 762; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 763; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 764; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]] 765; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2 766; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]] 767; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]] 768; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2 769; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>* 770; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 771; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 772; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11> 773; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4 774; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 775; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4> 776; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 777; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] 778; CHECK: middle.block: 779; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 780; CHECK: scalar.ph: 781; CHECK-NEXT: br label [[FOR_BODY:%.*]] 782; CHECK: for.cond.cleanup: 783; CHECK-NEXT: ret void 784; CHECK: for.body: 785; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] 786; 787entry: 788 br label %for.body 789 790for.cond.cleanup: ; preds = %for.body 791 ret 
void 792 793for.body: ; preds = %for.body, %entry 794 %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ] 795 %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ] 796 %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1 797 %tmp = load i32, i32* %A.addr.012, align 4 798 %add = add i32 %tmp, %i.013 799 store i32 %add, i32* %A.addr.012, align 4 800 %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2 801 %tmp1 = load i32, i32* %incdec.ptr, align 4 802 %add2 = add i32 %tmp1, %i.013 803 store i32 %add2, i32* %incdec.ptr, align 4 804 %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3 805 %tmp2 = load i32, i32* %incdec.ptr1, align 4 806 %add4 = add i32 %tmp2, %i.013 807 store i32 %add4, i32* %incdec.ptr1, align 4 808 %inc = add nuw nsw i32 %i.013, 1 809 %exitcond = icmp eq i32 %inc, 1024 810 br i1 %exitcond, label %for.cond.cleanup, label %for.body 811} 812 813; Check vectorization on interleaved access groups with members having different 814; kinds of type. 815 816; struct IntFloat { 817; int a; 818; float b; 819; }; 820; 821; int SA; 822; float SB; 823; 824; void int_float_struct(struct IntFloat *A) { 825; int SumA; 826; float SumB; 827; for (unsigned i = 0; i < 1024; i++) { 828; SumA += A[i].a; 829; SumB += A[i].b; 830; } 831; SA = SumA; 832; SB = SumB; 833; } 834 835 836%struct.IntFloat = type { i32, float } 837 838@SA = common global i32 0, align 4 839@SB = common global float 0.000000e+00, align 4 840 841define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { 842; CHECK-LABEL: @int_float_struct( 843; CHECK-NEXT: entry: 844; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 845; CHECK: vector.ph: 846; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 847; CHECK: vector.body: 848; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 849; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] 850; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] 851; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], %struct.IntFloat* [[A:%.*]], i64 [[INDEX]], i32 0 852; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* 853; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4 854; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 855; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 856; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[STRIDED_VEC2]] to <4 x float> 857; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI1]] 858; CHECK-NEXT: [[TMP4]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP2]] 859; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 860; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 861; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] 862; CHECK: middle.block: 863; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) 864; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]]) 865; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] 866; 
CHECK: scalar.ph: 867; CHECK-NEXT: br label [[FOR_BODY:%.*]] 868; CHECK: for.cond.cleanup: 869; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] 870; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] 871; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* @SA, align 4 872; CHECK-NEXT: store float [[ADD3_LCSSA]], float* @SB, align 4 873; CHECK-NEXT: ret void 874; CHECK: for.body: 875; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] 876; 877entry: 878 br label %for.body 879 880for.cond.cleanup: ; preds = %for.body 881 store i32 %add, i32* @SA, align 4 882 store float %add3, float* @SB, align 4 883 ret void 884 885for.body: ; preds = %for.body, %entry 886 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] 887 %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] 888 %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] 889 %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 890 %tmp = load i32, i32* %a, align 4 891 %add = add nsw i32 %tmp, %SumA.013 892 %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 893 %tmp1 = load float, float* %b, align 4 894 %add3 = fadd fast float %SumB.014, %tmp1 895 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 896 %exitcond = icmp eq i64 %indvars.iv.next, 1024 897 br i1 %exitcond, label %for.cond.cleanup, label %for.body 898} 899 900; Check vectorization of interleaved access groups in the presence of 901; dependences (PR27626). The following tests check that we don't reorder 902; dependent loads and stores when generating code for interleaved access 903; groups. Stores should be scalarized because the required code motion would 904; break dependences, and the remaining interleaved load groups should have 905; gaps. 906 907; PR27626_0: Ensure a strided store is not moved after a dependent (zero 908; distance) strided load. 
909 910; void PR27626_0(struct pair *p, int z, int n) { 911; for (int i = 0; i < n; i++) { 912; p[i].x = z; 913; p[i].y = p[i].x; 914; } 915; } 916 917 918%pair.i32 = type { i32, i32 } 919define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) { 920; CHECK-LABEL: @PR27626_0( 921; CHECK-NEXT: entry: 922; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) 923; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 924; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 925; CHECK: vector.ph: 926; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 927; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 928; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] 929; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] 930; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 931; CHECK: vector.body: 932; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 933; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 934; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 935; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 936; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0 937; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0 938; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0 939; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0 940; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 941; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 942; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1 943; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1 944; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP5]], align 4 945; CHECK-NEXT: store i32 [[Z]], i32* [[TMP6]], align 4 946; CHECK-NEXT: store i32 [[Z]], i32* [[TMP7]], align 4 947; CHECK-NEXT: store i32 [[Z]], i32* [[TMP8]], align 4 948; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* 949; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 950; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 951; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP9]], align 4 952; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 953; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP10]], align 4 954; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 955; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP11]], align 4 956; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 957; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4 958; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 959; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 960; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] 961; CHECK: middle.block: 962; CHECK-NEXT: br label [[SCALAR_PH]] 963; CHECK: scalar.ph: 964; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 965; CHECK-NEXT: br label [[FOR_BODY:%.*]] 966; CHECK: for.body: 967; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ] 968; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 969; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 970; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4 971; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_Y]], align 4 972; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 973; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] 974; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]] 975; CHECK: for.end: 976; CHECK-NEXT: ret void 977; 978entry: 979 br label %for.body 980 981for.body: 982 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] 983 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 984 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 985 store i32 %z, i32* %p_i.x, align 4 986 %0 = load i32, i32* %p_i.x, align 4 987 store i32 %0, i32 *%p_i.y, align 4 988 %i.next = add nuw nsw i64 %i, 1 989 %cond = icmp slt i64 %i.next, %n 990 br i1 %cond, label %for.body, label %for.end 991 992for.end: 993 ret void 994} 995 996; PR27626_1: Ensure a strided load is not moved before a dependent (zero 997; distance) strided store. 998 999; void PR27626_1(struct pair *p, int n) { 1000; int s = 0; 1001; for (int i = 0; i < n; i++) { 1002; p[i].y = p[i].x; 1003; s += p[i].y 1004; } 1005; } 1006 1007 1008define i32 @PR27626_1(%pair.i32 *%p, i64 %n) { 1009; CHECK-LABEL: @PR27626_1( 1010; CHECK-NEXT: entry: 1011; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) 1012; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5 1013; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 1014; CHECK: vector.ph: 1015; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3 1016; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 1017; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]] 1018; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]] 1019; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 1020; CHECK: vector.body: 1021; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 1022; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] 1023; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1 1024; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2 1025; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3 1026; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0 1027; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1 1028; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1 1029; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1 1030; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1 1031; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* 1032; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4 1033; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 1034; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP6]], align 4 1035; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 1036; CHECK-NEXT: store i32 [[TMP12]], i32* [[TMP7]], align 4 1037; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 
x i32> [[WIDE_VEC]], i64 4 1038; CHECK-NEXT: store i32 [[TMP13]], i32* [[TMP8]], align 4 1039; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 1040; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP9]], align 4 1041; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* 1042; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 1043; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1044; CHECK-NEXT: [[TMP16]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]] 1045; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 1046; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 1047; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] 1048; CHECK: middle.block: 1049; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]]) 1050; CHECK-NEXT: br label [[SCALAR_PH]] 1051; CHECK: scalar.ph: 1052; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 1053; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] 1054; CHECK-NEXT: br label [[FOR_BODY:%.*]] 1055; CHECK: for.body: 1056; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] 1057; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP20:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] 1058; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0 1059; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1 1060; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[P_I_X]], align 4 1061; CHECK-NEXT: store i32 [[TMP19]], i32* [[P_I_Y]], align 4 1062; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP19]], [[S]] 1063; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 1064; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] 1065; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]] 1066; CHECK: for.end: 1067; CHECK-NEXT: ret i32 [[TMP20]] 1068; 1069entry: 1070 br label %for.body 1071 1072for.body: 1073 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] 1074 %s = phi i32 [ %2, %for.body ], [ 0, %entry ] 1075 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0 1076 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1 1077 %0 = load i32, i32* %p_i.x, align 4 1078 store i32 %0, i32* %p_i.y, align 4 1079 %1 = load i32, i32* %p_i.y, align 4 1080 %2 = add nsw i32 %1, %s 1081 %i.next = add nuw nsw i64 %i, 1 1082 %cond = icmp slt i64 %i.next, %n 1083 br i1 %cond, label %for.body, label %for.end 1084 1085for.end: 1086 %3 = phi i32 [ %2, %for.body ] 1087 ret i32 %3 1088} 1089 1090; PR27626_2: Ensure a strided store is not moved after a dependent (negative 1091; distance) strided load. 

; void PR27626_2(struct pair *p, int z, int n) {
;   for (int i = 0; i < n; i++) {
;     p[i].x = z;
;     p[i].y = p[i - 1].x;
;   }
; }


define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-LABEL: @PR27626_2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP5]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP6]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP7]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP8]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP14]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP10]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP11]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
; CHECK-NEXT: store i32 [[TMP18]], i32* [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 -1, i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT: store i32 [[Z]], i32* [[P_I_X]], align 4
; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[P_I_MINUS_1_X]], align 4
; CHECK-NEXT: store i32 [[TMP20]], i32* [[P_I_Y]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_minus_1 = add nuw nsw i64 %i, -1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  store i32 %z, i32* %p_i.x, align 4
  %0 = load i32, i32* %p_i_minus_1.x, align 4
  store i32 %0, i32 *%p_i.y, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_3: Ensure a strided load is not moved before a dependent (negative
; distance) strided store.

; void PR27626_3(struct pair *p, int z, int n) {
;   int s = 0;
;   for (int i = 0; i < n; i++) {
;     p[i + 1].y = p[i].x;
;     s += p[i].y;
;   }
; }


define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-LABEL: @PR27626_3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 5
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 3
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], %pair.i32* [[P:%.*]], i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[INDEX]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP5]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP7]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP9]], i32 1
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[TMP11]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0
; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP6]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2
; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP8]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4
; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP10]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6
; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP12]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP18]], align 4
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[TMP19]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP19]])
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP24:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 0
; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I]], i32 1
; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], %pair.i32* [[P]], i64 [[I_PLUS_1]], i32 1
; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[P_I_X]], align 4
; CHECK-NEXT: store i32 [[TMP22]], i32* [[P_I_PLUS_1_Y]], align 4
; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[P_I_Y]], align 4
; CHECK-NEXT: [[TMP24]] = add nsw i32 [[TMP23]], [[S]]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret i32 [[TMP24]]
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
  %i_plus_1 = add nuw nsw i64 %i, 1
  %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
  %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
  %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
  %0 = load i32, i32* %p_i.x, align 4
  store i32 %0, i32* %p_i_plus_1.y, align 4
  %1 = load i32, i32* %p_i.y, align 4
  %2 = add nsw i32 %1, %s
  %i.next = add nuw nsw i64 %i, 1
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  %3 = phi i32 [ %2, %for.body ]
  ret i32 %3
}

; PR27626_4: Ensure we form an interleaved group for strided stores in the
; presence of a write-after-write dependence. We create a group for
; (2) and (3) while excluding (1).

; void PR27626_4(int *a, int x, int y, int z, int n) {
;   for (int i = 0; i < n; i += 2) {
;     a[i] = x;      // (1)
;     a[i] = y;      // (2)
;     a[i + 1] = z;  // (3)
;   }
; }


define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[Y:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Z:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 -1
; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP7]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP8]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP9]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP10]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[I_PLUS_1:%.*]] = or i64 [[I]], 1
; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT: [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_PLUS_1]]
; CHECK-NEXT: store i32 [[Y]], i32* [[A_I]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[A_I_PLUS_1]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
  %i_plus_1 = add i64 %i, 1
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
  store i32 %x, i32* %a_i, align 4
  store i32 %y, i32* %a_i, align 4
  store i32 %z, i32* %a_i_plus_1, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR27626_5: Ensure we do not form an interleaved group for strided stores in
; the presence of a write-after-write dependence.
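; (a[i - 1] written on one iteration is the same location as a[i - 3] written
; two iterations later, so no store group can be formed here; the checks below
; expect all three stores to remain scalarized.)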

; void PR27626_5(int *a, int x, int y, int z, int n) {
;   for (int i = 3; i < n; i += 2) {
;     a[i - 1] = x;
;     a[i - 3] = y;
;     a[i] = z;
;   }
; }


define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
; CHECK-LABEL: @PR27626_5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 9223372036854775804
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[N_VEC]], 1
; CHECK-NEXT: [[IND_END:%.*]] = or i64 [[TMP3]], 3
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[TMP4]], 3
; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 5
; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP4]], 7
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 9
; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[VEC_IND]], <i64 -3, i64 -3, i64 -3, i64 -3>
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP9]], i64 0
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP9]], i64 1
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP9]], i64 2
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP9]], i64 3
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP28]]
; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP15]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP17]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP19]], align 4
; CHECK-NEXT: store i32 [[X]], i32* [[TMP21]], align 4
; CHECK-NEXT: store i32 [[Y:%.*]], i32* [[TMP23]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[TMP25]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[TMP27]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[TMP29]], align 4
; CHECK-NEXT: store i32 [[Z:%.*]], i32* [[TMP10]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP11]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP12]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8>
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[I_MINUS_1:%.*]] = add i64 [[I]], -1
; CHECK-NEXT: [[I_MINUS_3:%.*]] = add i64 [[I]], -3
; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_1]]
; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_MINUS_3]]
; CHECK-NEXT: store i32 [[X]], i32* [[A_I_MINUS_1]], align 4
; CHECK-NEXT: store i32 [[Y]], i32* [[A_I_MINUS_3]], align 4
; CHECK-NEXT: store i32 [[Z]], i32* [[A_I]], align 4
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP35:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
  %i_minus_1 = sub i64 %i, 1
  %i_minus_3 = sub i64 %i_minus_1, 2
  %a_i = getelementptr inbounds i32, i32* %a, i64 %i
  %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
  %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
  store i32 %x, i32* %a_i_minus_1, align 4
  store i32 %y, i32* %a_i_minus_3, align 4
  store i32 %z, i32* %a_i, align 4
  %i.next = add nuw nsw i64 %i, 2
  %cond = icmp slt i64 %i.next, %n
  br i1 %cond, label %for.body, label %for.end

for.end:
  ret void
}

; PR34743: Ensure that a cast which needs to sink after a load that belongs to
; an interleaved group indeed gets sunk.
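; (a[iv + 2] loaded on one iteration is re-read as a[iv] on the next, so it is
; modelled as a first-order recurrence; the sext of that recurrence value must
; be sunk below the interleaved load, as the VECTOR_RECUR shuffle in the checks
; below shows.)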

; void PR34743(short *a, int *b, int n) {
;   for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
;     b[i] = a[iv] * a[iv+1] * a[iv+2];
;   }
; }


define void @PR34743(i16* %a, i32* %b, i64 %n) {
; CHECK-LABEL: @PR34743(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[A:%.*]], align 2
; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 6
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[N]], 1
; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i16, i16* [[A]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[N]], -2
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 3
; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i16, i16* [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[SCEVGEP5]] to i32*
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[TMP6]], [[B]]
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[SCEVGEP]] to i16*
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i16* [[SCEVGEP3]], [[TMP7]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], -4
; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i16> poison, i16 [[DOTPRE]], i64 3
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STRIDED_VEC8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, <8 x i16>* [[TMP10]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[STRIDED_VEC8]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[STRIDED_VEC8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP12]] to <4 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP11]]
; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP18]], align 4, !alias.scope !36, !noalias !39
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i16> [[WIDE_VEC]], i64 7
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[IV1:%.*]] = or i64 [[IV]], 1
; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV1]]
; CHECK-NEXT: [[LOAD1:%.*]] = load i16, i16* [[GEP1]], align 4
; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[IV2]]
; CHECK-NEXT: [[LOAD2]] = load i16, i16* [[GEP2]], align 4
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32
; CHECK-NEXT: [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]]
; CHECK-NEXT: [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]]
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT: store i32 [[MUL012]], i32* [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
entry:
  %.pre = load i16, i16* %a
  br label %loop

loop:
  %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
  %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
  %conv = sext i16 %0 to i32
  %i1 = add nuw nsw i64 %i, 1
  %iv1 = add nuw nsw i64 %iv, 1
  %iv2 = add nuw nsw i64 %iv, 2
  %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
  %load1 = load i16, i16* %gep1, align 4
  %conv1 = sext i16 %load1 to i32
  %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
  %load2 = load i16, i16* %gep2, align 4
  %conv2 = sext i16 %load2 to i32
  %mul01 = mul nsw i32 %conv, %conv1
  %mul012 = mul nsw i32 %mul01, %conv2
  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
  store i32 %mul012, i32* %arrayidx5
  %exitcond = icmp eq i64 %iv, %n
  br i1 %exitcond, label %end, label %loop

end:
  ret void
}

attributes #0 = { "unsafe-fp-math"="true" }