; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -simplifycfg -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
; RUN: opt < %s -loop-vectorize -simplifycfg -mcpu=knl -force-vector-width=2 -force-target-max-vector-interleave=1 -S | FileCheck %s -check-prefix=FVW2
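
; To regenerate the CHECK lines after changing the IR below, rerun the script
; named above against this file; a sketch (the placeholders are not literal):
;   utils/update_test_checks.py --opt-binary=<path-to-opt> <this-file>.ll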

; With a forced vector width, it is sometimes more profitable to generate
; scalarized and predicated stores instead of a masked scatter. Interleaving
; is disabled to simplify the CHECK lines in that scenario.
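;
; For example, where the AVX512 run emits a single masked scatter such as
;   call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %val, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
; the FVW2 run scalarizes the conditional store into per-lane predicated blocks
; (extractelement of the mask bit, a branch, and a plain scalar store); see the
; pred.store.if/pred.store.continue blocks in the @foo2 checks below. The
; %val/%ptrs/%mask names here are illustrative only.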

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"

; The source code:
;
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
;  for (int i=0; i < SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i] = in[index[i]] + (float) 0.5;
;    }
;  }
;}

; Function Attrs: nounwind uwtable
define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
; AVX512-LABEL: @foo1(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4
; AVX512-NEXT: [[TMP4:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 4, <16 x i1> [[TMP4]], <16 x i32> poison)
; AVX512-NEXT: [[TMP8:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP8]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP4]], <16 x float> undef)
; AVX512-NEXT: [[TMP10:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
; AVX512-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP10]], <16 x float>* [[TMP13]], i32 4, <16 x i1> [[TMP4]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[IDXPROM4:%.*]] = sext i32 [[TMP16]] to i64
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[IDXPROM4]]
; AVX512-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; AVX512-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo1(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX1]], 0
; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
; FVW2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
; FVW2-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP7]], i32 4, <2 x i1> [[TMP4]], <2 x i32> poison)
; FVW2-NEXT: [[TMP8:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP8]]
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
; FVW2-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP10]], <2 x float>* [[TMP13]], i32 4, <2 x i1> [[TMP4]])
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; FVW2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], 0
; FVW2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2: if.then:
; FVW2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDVARS_IV]]
; FVW2-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; FVW2-NEXT: [[IDXPROM4:%.*]] = sext i32 [[TMP16]] to i64
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[IDXPROM4]]
; FVW2-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4
; FVW2-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], 5.000000e-01
; FVW2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
; FVW2-NEXT: br label [[FOR_INC]]
; FVW2: for.inc:
; FVW2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; FVW2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; FVW2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %arrayidx3 = getelementptr inbounds i32, i32* %index, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx3, align 4
  %idxprom4 = sext i32 %1 to i64
  %arrayidx5 = getelementptr inbounds float, float* %in, i64 %idxprom4
  %2 = load float, float* %arrayidx5, align 4
  %add = fadd float %2, 5.000000e-01
  %arrayidx7 = getelementptr inbounds float, float* %out, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

; The source code:
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}

%struct.In = type { float, float }
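; The C definition of In is not shown above; from %struct.In it is presumably:
;   struct In { float a; float b; };
; (field names assumed, with b being the second field, matching the "i32 1"
; index of the struct GEPs in the checks below).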

define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
; AVX512-LABEL: @foo2(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[B]], align 4
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], float* [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], float* [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2: if.then:
; FVW2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT: [[TMP19:%.*]] = load float, float* [[B]], align 4
; FVW2-NEXT: [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
; FVW2-NEXT: br label [[FOR_INC]]
; FVW2: for.inc:
; FVW2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In* %in, i64 %indvars.iv, i32 1
  %1 = load float, float* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float* %out, i64 %indvars.iv
  store float %add, float* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; The source code:
;struct Out {
;  float a;
;  float b;
;};
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i].b = in[i].b + (float) 0.5;
;    }
;  }
;}

%struct.Out = type { float, float }
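; Because both in[i].b and out[i].b step through a two-float struct with i += 16,
; neither the load nor the store addresses are consecutive, so the AVX512 checks
; below use a masked gather for the load and a masked scatter for the store.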

define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
; AVX512-LABEL: @foo3(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[B]], align 4
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[B6:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT: store float [[ADD]], float* [[B6]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo3(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[TMP0]], i32 1
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], float* [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
; FVW2: pred.store.if1:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP1]], i32 1
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], float* [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE2]]
; FVW2: pred.store.continue2:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2: if.then:
; FVW2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT: [[TMP19:%.*]] = load float, float* [[B]], align 4
; FVW2-NEXT: [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT: [[B6:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT: store float [[ADD]], float* [[B6]], align 4
; FVW2-NEXT: br label [[FOR_INC]]
; FVW2: for.inc:
; FVW2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In* %in, i64 %indvars.iv, i32 1
  %1 = load float, float* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %b6 = getelementptr inbounds %struct.Out, %struct.Out* %out, i64 %indvars.iv, i32 1
  store float %add, float* %b6, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}
declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)

; The same as @foo2, but the scatter/gather argument is a vector of pointers in address space 1.
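; The address space is encoded in the intrinsic name mangling: the checks below
; expect @llvm.masked.gather.v16f32.v16p1f32 (a vector of 16 float addrspace(1)*
; pointers) where the earlier tests used the ...v16p0f32 form.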

define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
; AVX512-LABEL: @foo2_addrspace(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT: [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], float addrspace(1)* [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], float addrspace(1)* [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2: if.then:
; FVW2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT: [[TMP19:%.*]] = load float, float addrspace(1)* [[B]], align 4
; FVW2-NEXT: [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; FVW2-NEXT: br label [[FOR_INC]]
; FVW2: for.inc:
; FVW2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, i64 %indvars.iv, i32 1
  %1 = load float, float addrspace(1)* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %out, i64 %indvars.iv
  store float %add, float addrspace(1)* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Same as @foo2_addrspace, but here only the input has the non-default address space.
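; The checks below therefore mix the manglings: the gather from %in keeps the
; p1 suffix (v16p1f32 / v2p1f32) while the scatter and scalar stores to %out
; use the default-address-space p0 form.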

define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
; AVX512-LABEL: @foo2_addrspace2(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT: [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace2(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], float* [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], float* [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2: if.then:
; FVW2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT: [[TMP19:%.*]] = load float, float addrspace(1)* [[B]], align 4
; FVW2-NEXT: [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
; FVW2-NEXT: br label [[FOR_INC]]
; FVW2: for.inc:
; FVW2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, i64 %indvars.iv, i32 1
  %1 = load float, float addrspace(1)* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float* %out, i64 %indvars.iv
  store float %add, float* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Same as @foo2_addrspace, but here only the output has the non-default address space.
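; This is the mirror image of @foo2_addrspace2: the gather from %in uses the
; p0 mangling while the scatter and scalar stores to %out use p1.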

define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
; AVX512-LABEL: @foo2_addrspace3(
; AVX512-NEXT: entry:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[B]], align 4
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP13:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace3(
; FVW2-NEXT: entry:
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2: pred.store.if:
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT: store float [[TMP13]], float addrspace(1)* [[TMP12]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
; FVW2: pred.store.continue:
; FVW2-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.if2:
; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT: store float [[TMP16]], float addrspace(1)* [[TMP15]], align 4
; FVW2-NEXT: br label [[PRED_STORE_CONTINUE3]]
; FVW2: pred.store.continue3:
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2: if.then:
; FVW2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT: [[TMP19:%.*]] = load float, float* [[B]], align 4
; FVW2-NEXT: [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; FVW2-NEXT: br label [[FOR_INC]]
; FVW2: for.inc:
; FVW2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP13:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In* %in, i64 %indvars.iv, i32 1
  %1 = load float, float* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %out, i64 %indvars.iv
  store float %add, float addrspace(1)* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Using gathers is not profitable for this function. PR48429.
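; A plausible C equivalent, reconstructed from the IR below (parameter names
; come from the IR; the exact loop shape is an assumption):
;   void test(int d, float *ptr, float *dest) {
;     float *end = ptr + d;
;     while (ptr != end) {
;       dest[0] = ptr[-d];
;       dest[1] = *ptr;
;       ++ptr;
;       dest += 16;
;     }
;   }
; The loads from ptr are consecutive, so widening them and scattering the
; results beats emitting gathers here.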
define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly noalias %ptr, float* nocapture noalias %dest) {
; AVX512-LABEL: @test_gather_not_profitable_pr48429(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[DEST1:%.*]] = bitcast float* [[DEST:%.*]] to i8*
; AVX512-NEXT: [[PTR3:%.*]] = bitcast float* [[PTR:%.*]] to i8*
; AVX512-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
; AVX512-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 [[IDX_EXT]]
; AVX512-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
; AVX512-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
; AVX512: for.body.lr.ph:
; AVX512-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]]
; AVX512-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; AVX512-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; AVX512-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
; AVX512-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
; AVX512-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 32
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; AVX512-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
; AVX512-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
; AVX512-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 4
; AVX512-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 2
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP8]]
; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
; AVX512-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1
; AVX512-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
; AVX512-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
; AVX512-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]]
; AVX512-NEXT: [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8*
; AVX512-NEXT: [[TMP10:%.*]] = add i64 [[TMP6]], 1
; AVX512-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]]
; AVX512-NEXT: [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]]
; AVX512-NEXT: [[SCEVGEP89:%.*]] = bitcast float* [[SCEVGEP8]] to i8*
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP45]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR3]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP89]]
; AVX512-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
; AVX512-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; AVX512-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
; AVX512-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 16
; AVX512-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP13]]
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i32 0
; AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP17]], align 4, !alias.scope !14
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP14]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !17, !noalias !19
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !21
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP14]], i64 1
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !17, !noalias !19
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 256
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi float* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: [[BC_RESUME_VAL13:%.*]] = phi float* [ [[IND_END14]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[BC_RESUME_VAL13]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
; AVX512-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4
; AVX512-NEXT: store float [[TMP22]], float* [[DEST_ADDR_011]], align 4
; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
; AVX512-NEXT: store float [[TMP23]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @test_gather_not_profitable_pr48429(
; FVW2-NEXT: entry:
; FVW2-NEXT: [[DEST1:%.*]] = bitcast float* [[DEST:%.*]] to i8*
; FVW2-NEXT: [[PTR3:%.*]] = bitcast float* [[PTR:%.*]] to i8*
; FVW2-NEXT: [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
; FVW2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 [[IDX_EXT]]
; FVW2-NEXT: [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
; FVW2-NEXT: br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
; FVW2: for.body.lr.ph:
; FVW2-NEXT: [[MUL:%.*]] = sub nsw i32 0, [[D]]
; FVW2-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; FVW2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; FVW2-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
; FVW2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
; FVW2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; FVW2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
; FVW2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; FVW2: vector.memcheck:
; FVW2-NEXT: [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; FVW2-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
; FVW2-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
; FVW2-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 4
; FVW2-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 2
; FVW2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP8]]
; FVW2-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
; FVW2-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1
; FVW2-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
; FVW2-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
; FVW2-NEXT: [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]]
; FVW2-NEXT: [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8*
; FVW2-NEXT: [[TMP10:%.*]] = add i64 [[TMP6]], 1
; FVW2-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]]
; FVW2-NEXT: [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]]
; FVW2-NEXT: [[SCEVGEP89:%.*]] = bitcast float* [[SCEVGEP8]] to i8*
; FVW2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP45]]
; FVW2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR3]], [[SCEVGEP2]]
; FVW2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; FVW2-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP89]]
; FVW2-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
; FVW2-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
; FVW2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
; FVW2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; FVW2: vector.ph:
; FVW2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
; FVW2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; FVW2-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
; FVW2-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 16
; FVW2-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
; FVW2-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP13]]
; FVW2-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0
; FVW2-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 16
; FVW2-NEXT: [[NEXT_GEP15:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP15]]
; FVW2-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 1
; FVW2-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 16
; FVW2-NEXT: [[NEXT_GEP16:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP17]]
; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
; FVW2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 0
; FVW2-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP20]], align 4, !alias.scope !14
; FVW2-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; FVW2-NEXT: store float [[TMP21]], float* [[NEXT_GEP15]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; FVW2-NEXT: store float [[TMP22]], float* [[NEXT_GEP16]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
; FVW2-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <2 x float>*
; FVW2-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP24]], align 4, !alias.scope !21
; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP15]], i64 1
; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP16]], i64 1
; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[WIDE_LOAD17]], i32 0
; FVW2-NEXT: store float [[TMP27]], float* [[TMP25]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD17]], i32 1
; FVW2-NEXT: store float [[TMP28]], float* [[TMP26]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; FVW2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; FVW2-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; FVW2: middle.block:
; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
; FVW2: scalar.ph:
; FVW2-NEXT: [[BC_RESUME_VAL:%.*]] = phi float* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
; FVW2-NEXT: [[BC_RESUME_VAL13:%.*]] = phi float* [ [[IND_END14]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ]
; FVW2-NEXT: br label [[FOR_BODY:%.*]]
; FVW2: for.body:
; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[BC_RESUME_VAL13]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
; FVW2-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX]], align 4
; FVW2-NEXT: store float [[TMP30]], float* [[DEST_ADDR_011]], align 4
; FVW2-NEXT: [[TMP31:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
; FVW2-NEXT: store float [[TMP31]], float* [[ARRAYIDX5]], align 4
; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; FVW2: for.end:
; FVW2-NEXT: ret void
;
entry:
  %idx.ext = sext i32 %d to i64
  %add.ptr = getelementptr inbounds float, float* %ptr, i64 %idx.ext
  %cmp.not10 = icmp eq i32 %d, 0
  br i1 %cmp.not10, label %for.end, label %for.body.lr.ph

for.body.lr.ph:
  %mul = sub nsw i32 0, %d
  %idxprom = sext i32 %mul to i64
  br label %for.body

for.body:
  %ptr.addr.012 = phi float* [ %ptr, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %dest.addr.011 = phi float* [ %dest, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %ptr.addr.012, i64 %idxprom
  %0 = load float, float* %arrayidx, align 4
  store float %0, float* %dest.addr.011, align 4
  %1 = load float, float* %ptr.addr.012, align 4
  %arrayidx5 = getelementptr inbounds float, float* %dest.addr.011, i64 1
  store float %1, float* %arrayidx5, align 4
  %incdec.ptr = getelementptr inbounds float, float* %ptr.addr.012, i64 1
  %add.ptr6 = getelementptr inbounds float, float* %dest.addr.011, i64 16
  %cmp.not = icmp eq float* %incdec.ptr, %add.ptr
  br i1 %cmp.not, label %for.end, label %for.body

for.end:
  ret void
}