; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -simplifycfg -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
; RUN: opt < %s -loop-vectorize -simplifycfg -mcpu=knl -force-vector-width=2 -force-target-max-vector-interleave=1 -S | FileCheck %s -check-prefix=FVW2

; With a force-vector-width, it is sometimes more profitable to generate
; scalarized and predicated stores instead of masked scatter. Disable
; interleaving to simplify CHECKs in that scenario.
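; For reference, a masked scatter is a single intrinsic call per vector
; store, e.g. (illustrative operand names):
;   call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %val, <2 x float*> %ptrs, i32 4, <2 x i1> %mask)
; whereas scalarized predicated stores branch per lane; compare the
; pred.store.if/pred.store.continue blocks in the FVW2 checks below.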

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"

; The source code:
;
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
;  for (int i=0; i < SIZE; ++i) {
;    if (trigger[i] > 0) {
;      out[i] = in[index[i]] + (float) 0.5;
;    }
;  }
;}
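; (SIZE is not defined in the snippet; the vector and scalar loops below
; run to a trip count of 4096, so SIZE is presumably 4096.)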

; Function Attrs: nounwind uwtable
define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
; AVX512-LABEL: @foo1(
; AVX512-NEXT:  iter.check:
; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX512:       vector.body:
; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX1]], 0
; AVX512-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
; AVX512-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4
; AVX512-NEXT:    [[TMP4:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[TMP0]]
; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
; AVX512-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 4, <16 x i1> [[TMP4]], <16 x i32> poison)
; AVX512-NEXT:    [[TMP8:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP8]]
; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP4]], <16 x float> undef)
; AVX512-NEXT:    [[TMP10:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
; AVX512-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <16 x float>*
; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP10]], <16 x float>* [[TMP13]], i32 4, <16 x i1> [[TMP4]])
; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX512:       middle.block:
; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512:       for.body:
; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], 0
; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512:       if.then:
; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; AVX512-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[TMP16]] to i64
; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[IDXPROM4]]
; AVX512-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4
; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP17]], 5.000000e-01
; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
; AVX512-NEXT:    br label [[FOR_INC]]
; AVX512:       for.inc:
; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; AVX512-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; AVX512:       for.end:
; AVX512-NEXT:    ret void
;
; FVW2-LABEL: @foo1(
; FVW2-NEXT:  entry:
; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
; FVW2:       vector.body:
; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX1]], 0
; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
; FVW2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
; FVW2-NEXT:    [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
; FVW2-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
; FVW2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP7]], i32 4, <2 x i1> [[TMP4]], <2 x i32> poison)
; FVW2-NEXT:    [[TMP8:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP8]]
; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
; FVW2-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP10]], <2 x float>* [[TMP13]], i32 4, <2 x i1> [[TMP4]])
; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; FVW2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; FVW2:       middle.block:
; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2:       for.body:
; FVW2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP15]], 0
; FVW2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2:       if.then:
; FVW2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
; FVW2-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[TMP16]] to i64
; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[IN]], i64 [[IDXPROM4]]
; FVW2-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4
; FVW2-NEXT:    [[ADD:%.*]] = fadd float [[TMP17]], 5.000000e-01
; FVW2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
; FVW2-NEXT:    br label [[FOR_INC]]
; FVW2:       for.inc:
; FVW2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; FVW2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
; FVW2-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; FVW2:       for.end:
; FVW2-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %arrayidx3 = getelementptr inbounds i32, i32* %index, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx3, align 4
  %idxprom4 = sext i32 %1 to i64
  %arrayidx5 = getelementptr inbounds float, float* %in, i64 %idxprom4
  %2 = load float, float* %arrayidx5, align 4
  %add = fadd float %2, 5.000000e-01
  %arrayidx7 = getelementptr inbounds float, float* %out, i64 %indvars.iv
  store float %add, float* %arrayidx7, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

; The source code:
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}
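; (The C definition of struct In is not shown; matching %struct.In below,
; it is presumably: struct In { float a; float b; };)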

%struct.In = type { float, float }

define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
; AVX512-LABEL: @foo2(
; AVX512-NEXT:  entry:
; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX512:       vector.body:
; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX512:       middle.block:
; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512:       for.body:
; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512:       if.then:
; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT:    [[TMP23:%.*]] = load float, float* [[B]], align 4
; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT:    br label [[FOR_INC]]
; AVX512:       for.inc:
; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX512:       for.end:
; AVX512-NEXT:    ret void
;
; FVW2-LABEL: @foo2(
; FVW2-NEXT:  entry:
; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
; FVW2:       vector.body:
; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2:       pred.store.if:
; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT:    store float [[TMP13]], float* [[TMP12]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
; FVW2:       pred.store.continue:
; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.if2:
; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT:    store float [[TMP16]], float* [[TMP15]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.continue3:
; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; FVW2:       middle.block:
; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2:       for.body:
; FVW2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2:       if.then:
; FVW2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT:    [[TMP19:%.*]] = load float, float* [[B]], align 4
; FVW2-NEXT:    [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
; FVW2-NEXT:    br label [[FOR_INC]]
; FVW2:       for.inc:
; FVW2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]]
; FVW2:       for.end:
; FVW2-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In* %in, i64 %indvars.iv, i32 1
  %1 = load float, float* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float* %out, i64 %indvars.iv
  store float %add, float* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; The source code:
;struct Out {
;  float a;
;  float b;
;};
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
;
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i].b = in[i].b + (float) 0.5;
;    }
;  }
;}
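; (Note that i advances by 16 and sizeof(struct In) == 8, so consecutive
; vector lanes access addresses 128 bytes apart; the field accesses stay
; gathers/scatters rather than becoming wide consecutive loads/stores.)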

%struct.Out = type { float, float }

define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
; AVX512-LABEL: @foo3(
; AVX512-NEXT:  entry:
; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX512:       vector.body:
; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX512:       middle.block:
; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512:       for.body:
; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512:       if.then:
; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT:    [[TMP23:%.*]] = load float, float* [[B]], align 4
; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT:    [[B6:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT:    store float [[ADD]], float* [[B6]], align 4
; AVX512-NEXT:    br label [[FOR_INC]]
; AVX512:       for.inc:
; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX512:       for.end:
; AVX512-NEXT:    ret void
;
; FVW2-LABEL: @foo3(
; FVW2-NEXT:  entry:
; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
; FVW2:       vector.body:
; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2:       pred.store.if:
; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[TMP0]], i32 1
; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT:    store float [[TMP13]], float* [[TMP12]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
; FVW2:       pred.store.continue:
; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
; FVW2:       pred.store.if1:
; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP1]], i32 1
; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT:    store float [[TMP16]], float* [[TMP15]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE2]]
; FVW2:       pred.store.continue2:
; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; FVW2:       middle.block:
; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2:       for.body:
; FVW2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2:       if.then:
; FVW2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT:    [[TMP19:%.*]] = load float, float* [[B]], align 4
; FVW2-NEXT:    [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT:    [[B6:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT:    store float [[ADD]], float* [[B6]], align 4
; FVW2-NEXT:    br label [[FOR_INC]]
; FVW2:       for.inc:
; FVW2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
; FVW2:       for.end:
; FVW2-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In* %in, i64 %indvars.iv, i32 1
  %1 = load float, float* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %b6 = getelementptr inbounds %struct.Out, %struct.Out* %out, i64 %indvars.iv, i32 1
  store float %add, float* %b6, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}
declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)

; Same as @foo2, but the scatter/gather arguments are vectors of pointers in address space 1.
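; (The address space is visible in the intrinsic name mangling below:
; v16p1f32/v2p1f32 instead of v16p0f32/v2p0f32.)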

define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
; AVX512-LABEL: @foo2_addrspace(
; AVX512-NEXT:  entry:
; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX512:       vector.body:
; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; AVX512:       middle.block:
; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512:       for.body:
; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512:       if.then:
; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT:    [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; AVX512-NEXT:    br label [[FOR_INC]]
; AVX512:       for.inc:
; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
; AVX512:       for.end:
; AVX512-NEXT:    ret void
;
; FVW2-LABEL: @foo2_addrspace(
; FVW2-NEXT:  entry:
; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
; FVW2:       vector.body:
; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2:       pred.store.if:
; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT:    store float [[TMP13]], float addrspace(1)* [[TMP12]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
; FVW2:       pred.store.continue:
; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.if2:
; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT:    store float [[TMP16]], float addrspace(1)* [[TMP15]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.continue3:
; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; FVW2:       middle.block:
; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2:       for.body:
; FVW2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2:       if.then:
; FVW2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT:    [[TMP19:%.*]] = load float, float addrspace(1)* [[B]], align 4
; FVW2-NEXT:    [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; FVW2-NEXT:    br label [[FOR_INC]]
; FVW2:       for.inc:
; FVW2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
; FVW2:       for.end:
; FVW2-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, i64 %indvars.iv, i32 1
  %1 = load float, float addrspace(1)* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %out, i64 %indvars.iv
  store float %add, float addrspace(1)* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Same as foo2_addrspace but here only the input has the non-default address space.
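; (Here the gather of the struct field keeps the p1 mangling, while the
; scatter to out uses the default address space, p0.)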

define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
; AVX512-LABEL: @foo2_addrspace2(
; AVX512-NEXT:  entry:
; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX512:       vector.body:
; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; AVX512:       middle.block:
; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512:       for.body:
; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512:       if.then:
; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT:    [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT:    br label [[FOR_INC]]
; AVX512:       for.inc:
; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
; AVX512:       for.end:
; AVX512-NEXT:    ret void
;
; FVW2-LABEL: @foo2_addrspace2(
; FVW2-NEXT:  entry:
; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
; FVW2:       vector.body:
; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2:       pred.store.if:
; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT:    store float [[TMP13]], float* [[TMP12]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
; FVW2:       pred.store.continue:
; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.if2:
; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT:    store float [[TMP16]], float* [[TMP15]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.continue3:
; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; FVW2:       middle.block:
; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2:       for.body:
; FVW2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2:       if.then:
; FVW2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT:    [[TMP19:%.*]] = load float, float addrspace(1)* [[B]], align 4
; FVW2-NEXT:    [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    store float [[ADD]], float* [[ARRAYIDX5]], align 4
; FVW2-NEXT:    br label [[FOR_INC]]
; FVW2:       for.inc:
; FVW2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]]
; FVW2:       for.end:
; FVW2-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, i64 %indvars.iv, i32 1
  %1 = load float, float addrspace(1)* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float* %out, i64 %indvars.iv
  store float %add, float* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Same as foo2_addrspace but here only the output has the non-default address space.
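; (Mirrored case: the struct-field gather is mangled p0 and the scatter
; to out is mangled p1.)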
717
718define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
719; AVX512-LABEL: @foo2_addrspace3(
720; AVX512-NEXT:  entry:
721; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
722; AVX512:       vector.body:
723; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
724; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
725; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
726; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
727; AVX512-NEXT:    [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
728; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
729; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
730; AVX512-NEXT:    [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
731; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
732; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
733; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
734; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
735; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
736; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
737; AVX512:       middle.block:
738; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
739; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
740; AVX512:       for.body:
741; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
742; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
743; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
744; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
745; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
746; AVX512:       if.then:
747; AVX512-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
748; AVX512-NEXT:    [[TMP23:%.*]] = load float, float* [[B]], align 4
749; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
750; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
751; AVX512-NEXT:    store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
752; AVX512-NEXT:    br label [[FOR_INC]]
753; AVX512:       for.inc:
754; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
755; AVX512-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
756; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP13:![0-9]+]]
757; AVX512:       for.end:
758; AVX512-NEXT:    ret void
759;
760; FVW2-LABEL: @foo2_addrspace3(
761; FVW2-NEXT:  entry:
; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
; FVW2:       vector.body:
; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
; FVW2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; FVW2:       pred.store.if:
; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[TMP0]]
; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
; FVW2-NEXT:    store float [[TMP13]], float addrspace(1)* [[TMP12]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
; FVW2:       pred.store.continue:
; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.if2:
; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP1]]
; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
; FVW2-NEXT:    store float [[TMP16]], float addrspace(1)* [[TMP15]], align 4
; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
; FVW2:       pred.store.continue3:
; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; FVW2-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; FVW2:       middle.block:
; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 256, 256
; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; FVW2:       for.body:
; FVW2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; FVW2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP18]], 0
; FVW2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; FVW2:       if.then:
; FVW2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
; FVW2-NEXT:    [[TMP19:%.*]] = load float, float* [[B]], align 4
; FVW2-NEXT:    [[ADD:%.*]] = fadd float [[TMP19]], 5.000000e-01
; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; FVW2-NEXT:    store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; FVW2-NEXT:    br label [[FOR_INC]]
; FVW2:       for.inc:
; FVW2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; FVW2-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 4080
; FVW2-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP13:![0-9]+]]
; FVW2:       for.end:
; FVW2-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %cmp1 = icmp sgt i32 %0, 0
  br i1 %cmp1, label %if.then, label %for.inc

if.then:
  %b = getelementptr inbounds %struct.In, %struct.In* %in, i64 %indvars.iv, i32 1
  %1 = load float, float* %b, align 4
  %add = fadd float %1, 5.000000e-01
  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %out, i64 %indvars.iv
  store float %add, float addrspace(1)* %arrayidx5, align 4
  br label %for.inc

for.inc:
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
  %cmp = icmp ult i64 %indvars.iv, 4080
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Using gathers is not profitable for this function. PR48429.
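;
; A plausible C equivalent, reconstructed from the IR below (this file does not
; carry the original source for this function, so the shape and names here are
; illustrative rather than the verbatim reproducer from PR48429):
;
;void test_gather_not_profitable_pr48429(int d, float * __restrict__ ptr, float * __restrict__ dest) {
;  float *end = ptr + d;
;  while (ptr != end) {
;    *dest = ptr[-d];    // both loads advance by one float per iteration,
;    dest[1] = *ptr++;   // so wide consecutive loads beat gathers; only the
;    dest += 16;         // stride-16 stores need scatter or scalarization.
;  }
;}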
define void @test_gather_not_profitable_pr48429(i32 %d, float* readonly noalias %ptr, float* nocapture noalias %dest) {
; AVX512-LABEL: @test_gather_not_profitable_pr48429(
; AVX512-NEXT:  entry:
; AVX512-NEXT:    [[DEST1:%.*]] = bitcast float* [[DEST:%.*]] to i8*
; AVX512-NEXT:    [[PTR3:%.*]] = bitcast float* [[PTR:%.*]] to i8*
; AVX512-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
; AVX512-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 [[IDX_EXT]]
; AVX512-NEXT:    [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
; AVX512-NEXT:    br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
; AVX512:       for.body.lr.ph:
; AVX512-NEXT:    [[MUL:%.*]] = sub nsw i32 0, [[D]]
; AVX512-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; AVX512-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; AVX512-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
; AVX512-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
; AVX512-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 32
; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512:       vector.memcheck:
; AVX512-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; AVX512-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
; AVX512-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
; AVX512-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 4
; AVX512-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 2
; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP8]]
; AVX512-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
; AVX512-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1
; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
; AVX512-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
; AVX512-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]]
; AVX512-NEXT:    [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8*
; AVX512-NEXT:    [[TMP10:%.*]] = add i64 [[TMP6]], 1
; AVX512-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]]
; AVX512-NEXT:    [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]]
; AVX512-NEXT:    [[SCEVGEP89:%.*]] = bitcast float* [[SCEVGEP8]] to i8*
; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP45]]
; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[PTR3]], [[SCEVGEP2]]
; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT:    [[BOUND010:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP89]]
; AVX512-NEXT:    [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
; AVX512-NEXT:    [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512:       vector.ph:
; AVX512-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
; AVX512-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; AVX512-NEXT:    [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
; AVX512-NEXT:    [[TMP12:%.*]] = mul i64 [[N_VEC]], 16
; AVX512-NEXT:    [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX512:       vector.body:
; AVX512-NEXT:    [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP13]]
; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP15]], i32 0
; AVX512-NEXT:    [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP17]], align 4, !alias.scope !14
; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP14]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !17, !noalias !19
; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
; AVX512-NEXT:    [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
; AVX512-NEXT:    [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !21
; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP14]], i64 1
; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !17, !noalias !19
; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT:    [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 256
; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; AVX512:       middle.block:
; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
; AVX512:       scalar.ph:
; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi float* [ [[IND_END14]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
; AVX512:       for.body:
; AVX512-NEXT:    [[PTR_ADDR_012:%.*]] = phi float* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT:    [[DEST_ADDR_011:%.*]] = phi float* [ [[BC_RESUME_VAL13]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
; AVX512-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4
; AVX512-NEXT:    store float [[TMP22]], float* [[DEST_ADDR_011]], align 4
; AVX512-NEXT:    [[TMP23:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
; AVX512-NEXT:    store float [[TMP23]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
; AVX512-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
; AVX512-NEXT:    [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
; AVX512-NEXT:    br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; AVX512:       for.end:
; AVX512-NEXT:    ret void
;
; FVW2-LABEL: @test_gather_not_profitable_pr48429(
; FVW2-NEXT:  entry:
; FVW2-NEXT:    [[DEST1:%.*]] = bitcast float* [[DEST:%.*]] to i8*
; FVW2-NEXT:    [[PTR3:%.*]] = bitcast float* [[PTR:%.*]] to i8*
; FVW2-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
; FVW2-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 [[IDX_EXT]]
; FVW2-NEXT:    [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
; FVW2-NEXT:    br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
; FVW2:       for.body.lr.ph:
; FVW2-NEXT:    [[MUL:%.*]] = sub nsw i32 0, [[D]]
; FVW2-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
; FVW2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; FVW2-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
; FVW2-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
; FVW2-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; FVW2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
; FVW2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; FVW2:       vector.memcheck:
; FVW2-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
; FVW2-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
; FVW2-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
; FVW2-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 4
; FVW2-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 2
; FVW2-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP8]]
; FVW2-NEXT:    [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
; FVW2-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], 1
; FVW2-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP9]]
; FVW2-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
; FVW2-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[PTR]], i64 [[IDXPROM]]
; FVW2-NEXT:    [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8*
; FVW2-NEXT:    [[TMP10:%.*]] = add i64 [[TMP6]], 1
; FVW2-NEXT:    [[TMP11:%.*]] = sub i64 [[TMP10]], [[IDX_EXT]]
; FVW2-NEXT:    [[SCEVGEP8:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP11]]
; FVW2-NEXT:    [[SCEVGEP89:%.*]] = bitcast float* [[SCEVGEP8]] to i8*
; FVW2-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP45]]
; FVW2-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[PTR3]], [[SCEVGEP2]]
; FVW2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; FVW2-NEXT:    [[BOUND010:%.*]] = icmp ult i8* [[DEST1]], [[SCEVGEP89]]
; FVW2-NEXT:    [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
; FVW2-NEXT:    [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
; FVW2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
; FVW2-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; FVW2:       vector.ph:
; FVW2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
; FVW2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; FVW2-NEXT:    [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
; FVW2-NEXT:    [[TMP12:%.*]] = mul i64 [[N_VEC]], 16
; FVW2-NEXT:    [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
; FVW2:       vector.body:
; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
; FVW2-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[TMP13]]
; FVW2-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
; FVW2-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP14]], 16
; FVW2-NEXT:    [[NEXT_GEP15:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP15]]
; FVW2-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 1
; FVW2-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP16]], 16
; FVW2-NEXT:    [[NEXT_GEP16:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP17]]
; FVW2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
; FVW2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 0
; FVW2-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP20]], align 4, !alias.scope !14
; FVW2-NEXT:    [[TMP21:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
; FVW2-NEXT:    store float [[TMP21]], float* [[NEXT_GEP15]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
; FVW2-NEXT:    store float [[TMP22]], float* [[NEXT_GEP16]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT:    [[TMP23:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
; FVW2-NEXT:    [[TMP24:%.*]] = bitcast float* [[TMP23]] to <2 x float>*
; FVW2-NEXT:    [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP24]], align 4, !alias.scope !21
; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP15]], i64 1
; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP16]], i64 1
; FVW2-NEXT:    [[TMP27:%.*]] = extractelement <2 x float> [[WIDE_LOAD17]], i32 0
; FVW2-NEXT:    store float [[TMP27]], float* [[TMP25]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD17]], i32 1
; FVW2-NEXT:    store float [[TMP28]], float* [[TMP26]], align 4, !alias.scope !17, !noalias !19
; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; FVW2-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; FVW2-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; FVW2:       middle.block:
; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
; FVW2:       scalar.ph:
; FVW2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi float* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
; FVW2-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi float* [ [[IND_END14]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ]
; FVW2-NEXT:    br label [[FOR_BODY:%.*]]
; FVW2:       for.body:
; FVW2-NEXT:    [[PTR_ADDR_012:%.*]] = phi float* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
; FVW2-NEXT:    [[DEST_ADDR_011:%.*]] = phi float* [ [[BC_RESUME_VAL13]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
; FVW2-NEXT:    [[TMP30:%.*]] = load float, float* [[ARRAYIDX]], align 4
; FVW2-NEXT:    store float [[TMP30]], float* [[DEST_ADDR_011]], align 4
; FVW2-NEXT:    [[TMP31:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
; FVW2-NEXT:    store float [[TMP31]], float* [[ARRAYIDX5]], align 4
; FVW2-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
; FVW2-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
; FVW2-NEXT:    [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
; FVW2-NEXT:    br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; FVW2:       for.end:
; FVW2-NEXT:    ret void
;
entry:
  %idx.ext = sext i32 %d to i64
  %add.ptr = getelementptr inbounds float, float* %ptr, i64 %idx.ext
  %cmp.not10 = icmp eq i32 %d, 0
  br i1 %cmp.not10, label %for.end, label %for.body.lr.ph

for.body.lr.ph:
  %mul = sub nsw i32 0, %d
  %idxprom = sext i32 %mul to i64
  br label %for.body

for.body:
  %ptr.addr.012 = phi float* [ %ptr, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
  %dest.addr.011 = phi float* [ %dest, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %ptr.addr.012, i64 %idxprom
  %0 = load float, float* %arrayidx, align 4
  store float %0, float* %dest.addr.011, align 4
  %1 = load float, float* %ptr.addr.012, align 4
  %arrayidx5 = getelementptr inbounds float, float* %dest.addr.011, i64 1
  store float %1, float* %arrayidx5, align 4
  %incdec.ptr = getelementptr inbounds float, float* %ptr.addr.012, i64 1
  %add.ptr6 = getelementptr inbounds float, float* %dest.addr.011, i64 16
  %cmp.not = icmp eq float* %incdec.ptr, %add.ptr
  br i1 %cmp.not, label %for.end, label %for.body

for.end:
  ret void
}