1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s  -loop-vectorize -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
3; RUN: opt < %s  -loop-vectorize -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
4; RUN: opt < %s  -loop-vectorize -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
5
6target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
7target triple = "x86_64-pc_linux"
8
9; The source code:
10;
11;void foo1(int *A, int *B, int *trigger) {
12;
13;  for (int i=0; i<10000; i++) {
14;    if (trigger[i] < 100) {
15;          A[i] = B[i] + trigger[i];
16;    }
17;  }
18;}
19
20define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
21; AVX1-LABEL: @foo1(
22; AVX1-NEXT:  entry:
23; AVX1-NEXT:    [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
24; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
25; AVX1-NEXT:    [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
26; AVX1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
27; AVX1:       vector.memcheck:
28; AVX1-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
29; AVX1-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
30; AVX1-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
31; AVX1-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
32; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
33; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
34; AVX1:       vector.ph:
35; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
36; AVX1:       vector.body:
37; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
38; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
39; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
40; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
41; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
42; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4
43; AVX1-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
44; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
45; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
46; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
47; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison)
48; AVX1-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
49; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
50; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
51; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
52; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP6]])
53; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
54; AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
55; AVX1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
56; AVX1:       middle.block:
57; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
58; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
59; AVX1:       scalar.ph:
60; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
61; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
62; AVX1:       for.body:
63; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
64; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
65; AVX1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
66; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100
67; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
68; AVX1:       if.then:
69; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
70; AVX1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
71; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
72; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
73; AVX1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
74; AVX1-NEXT:    br label [[FOR_INC]]
75; AVX1:       for.inc:
76; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
77; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
78; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
79; AVX1:       for.end:
80; AVX1-NEXT:    ret void
81;
82; AVX2-LABEL: @foo1(
83; AVX2-NEXT:  entry:
84; AVX2-NEXT:    [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
85; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
86; AVX2-NEXT:    [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
87; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
88; AVX2:       vector.memcheck:
89; AVX2-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
90; AVX2-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
91; AVX2-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
92; AVX2-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
93; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
94; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
95; AVX2:       vector.ph:
96; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
97; AVX2:       vector.body:
98; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
99; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
100; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
101; AVX2-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 16
102; AVX2-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 24
103; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
104; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
105; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
106; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
107; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
108; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
109; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
110; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8
111; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
112; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
113; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
114; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
115; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
116; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24
117; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
118; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4
119; AVX2-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
120; AVX2-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
121; AVX2-NEXT:    [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
122; AVX2-NEXT:    [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
123; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
124; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
125; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
126; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]]
127; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0
128; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
129; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison)
130; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 8
131; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
132; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison)
133; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16
134; AVX2-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
135; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison)
136; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 24
137; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>*
138; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison)
139; AVX2-NEXT:    [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
140; AVX2-NEXT:    [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
141; AVX2-NEXT:    [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
142; AVX2-NEXT:    [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
143; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
144; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
145; AVX2-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
146; AVX2-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]]
147; AVX2-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0
148; AVX2-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
149; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP18]])
150; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 8
151; AVX2-NEXT:    [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
152; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP19]])
153; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16
154; AVX2-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
155; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP36]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP20]])
156; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 24
157; AVX2-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>*
158; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP37]], <8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP21]])
159; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
160; AVX2-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
161; AVX2-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
162; AVX2:       middle.block:
163; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
164; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
165; AVX2:       scalar.ph:
166; AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
167; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
168; AVX2:       for.body:
169; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
170; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
171; AVX2-NEXT:    [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
172; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100
173; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
174; AVX2:       if.then:
175; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
176; AVX2-NEXT:    [[TMP52:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
177; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]]
178; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
179; AVX2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
180; AVX2-NEXT:    br label [[FOR_INC]]
181; AVX2:       for.inc:
182; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
183; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
184; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
185; AVX2:       for.end:
186; AVX2-NEXT:    ret void
187;
188; AVX512-LABEL: @foo1(
189; AVX512-NEXT:  iter.check:
190; AVX512-NEXT:    [[B3:%.*]] = ptrtoint i32* [[B:%.*]] to i64
191; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
192; AVX512-NEXT:    [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64
193; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
194; AVX512:       vector.memcheck:
195; AVX512-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
196; AVX512-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
197; AVX512-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
198; AVX512-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
199; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
200; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
201; AVX512:       vector.main.loop.iter.check:
202; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
203; AVX512:       vector.ph:
204; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
205; AVX512:       vector.body:
206; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
207; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
208; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 16
209; AVX512-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 32
210; AVX512-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 48
211; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
212; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
213; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
214; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
215; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
216; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
217; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
218; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
219; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
220; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4
221; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32
222; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
223; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4
224; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48
225; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
226; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4
227; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
228; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
229; AVX512-NEXT:    [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
230; AVX512-NEXT:    [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
231; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
232; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
233; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
234; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]]
235; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0
236; AVX512-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
237; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison)
238; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16
239; AVX512-NEXT:    [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
240; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison)
241; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 32
242; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
243; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison)
244; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 48
245; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>*
246; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison)
247; AVX512-NEXT:    [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
248; AVX512-NEXT:    [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
249; AVX512-NEXT:    [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
250; AVX512-NEXT:    [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
251; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
252; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
253; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
254; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]]
255; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0
256; AVX512-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>*
257; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP18]])
258; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16
259; AVX512-NEXT:    [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
260; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP19]])
261; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 32
262; AVX512-NEXT:    [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
263; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP36]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP20]])
264; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 48
265; AVX512-NEXT:    [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>*
266; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP37]], <16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP21]])
267; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
268; AVX512-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
269; AVX512-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
270; AVX512:       middle.block:
271; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
272; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
273; AVX512:       vec.epilog.iter.check:
274; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
275; AVX512:       vec.epilog.ph:
276; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
277; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
278; AVX512:       vec.epilog.vector.body:
279; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
280; AVX512-NEXT:    [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0
281; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP51]]
282; AVX512-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0
283; AVX512-NEXT:    [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <8 x i32>*
284; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP54]], align 4
285; AVX512-NEXT:    [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
286; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP51]]
287; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr i32, i32* [[TMP56]], i32 0
288; AVX512-NEXT:    [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>*
289; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison)
290; AVX512-NEXT:    [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
291; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP51]]
292; AVX512-NEXT:    [[TMP61:%.*]] = getelementptr i32, i32* [[TMP60]], i32 0
293; AVX512-NEXT:    [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <8 x i32>*
294; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP59]], <8 x i32>* [[TMP62]], i32 4, <8 x i1> [[TMP55]])
295; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
296; AVX512-NEXT:    [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
297; AVX512-NEXT:    br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
298; AVX512:       vec.epilog.middle.block:
299; AVX512-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
300; AVX512-NEXT:    br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
301; AVX512:       vec.epilog.scalar.ph:
302; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
303; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
304; AVX512:       for.body:
305; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
306; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
307; AVX512-NEXT:    [[TMP64:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
308; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100
309; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
310; AVX512:       if.then:
311; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
312; AVX512-NEXT:    [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
313; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]]
314; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
315; AVX512-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
316; AVX512-NEXT:    br label [[FOR_INC]]
317; AVX512:       for.inc:
318; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
319; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
320; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
321; AVX512:       for.end:
322; AVX512-NEXT:    ret void
323;
324entry:
325  br label %for.body
326
327for.body:                                         ; preds = %for.inc, %entry
328  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
329  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
330  %0 = load i32, i32* %arrayidx, align 4
331  %cmp1 = icmp slt i32 %0, 100
332  br i1 %cmp1, label %if.then, label %for.inc
333
334if.then:                                          ; preds = %for.body
335  %arrayidx3 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
336  %1 = load i32, i32* %arrayidx3, align 4
337  %add = add nsw i32 %1, %0
338  %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
339  store i32 %add, i32* %arrayidx7, align 4
340  br label %for.inc
341
342for.inc:                                          ; preds = %for.body, %if.then
343  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
344  %exitcond = icmp eq i64 %indvars.iv.next, 10000
345  br i1 %exitcond, label %for.end, label %for.body
346
347for.end:                                          ; preds = %for.inc
348  ret void
349}
350
351; The same as @foo1 but all the pointers are address space 1 pointers.
352
353define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* nocapture readonly %B, i32 addrspace(1)* nocapture readonly %trigger) local_unnamed_addr #0 {
354; AVX1-LABEL: @foo1_addrspace1(
355; AVX1-NEXT:  entry:
356; AVX1-NEXT:    [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
357; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
358; AVX1-NEXT:    [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
359; AVX1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
360; AVX1:       vector.memcheck:
361; AVX1-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
362; AVX1-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
363; AVX1-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
364; AVX1-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
365; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
366; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
367; AVX1:       vector.ph:
368; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
369; AVX1:       vector.body:
370; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
371; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
372; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
373; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP3]], i32 0
374; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)*
375; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4
376; AVX1-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
377; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
378; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP7]], i32 0
379; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)*
380; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison)
381; AVX1-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
382; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
383; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP11]], i32 0
384; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
385; AVX1-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP10]], <8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP6]])
386; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
387; AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
388; AVX1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
389; AVX1:       middle.block:
390; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
391; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
392; AVX1:       scalar.ph:
393; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
394; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
395; AVX1:       for.body:
396; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
397; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
398; AVX1-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
399; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100
400; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
401; AVX1:       if.then:
402; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
403; AVX1-NEXT:    [[TMP16:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
404; AVX1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
405; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
406; AVX1-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
407; AVX1-NEXT:    br label [[FOR_INC]]
408; AVX1:       for.inc:
409; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
410; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
411; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
412; AVX1:       for.end:
413; AVX1-NEXT:    ret void
414;
415; AVX2-LABEL: @foo1_addrspace1(
416; AVX2-NEXT:  entry:
417; AVX2-NEXT:    [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
418; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
419; AVX2-NEXT:    [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
420; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
421; AVX2:       vector.memcheck:
422; AVX2-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
423; AVX2-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
424; AVX2-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
425; AVX2-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
426; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
427; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
428; AVX2:       vector.ph:
429; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
430; AVX2:       vector.body:
431; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
432; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
433; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
434; AVX2-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 16
435; AVX2-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 24
436; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
437; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]]
438; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]]
439; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]]
440; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0
441; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)*
442; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4
443; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 8
444; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
445; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4
446; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16
447; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
448; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4
449; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 24
450; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)*
451; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP17]], align 4
452; AVX2-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
453; AVX2-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
454; AVX2-NEXT:    [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
455; AVX2-NEXT:    [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
456; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
457; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]]
458; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]]
459; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]]
460; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0
461; AVX2-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)*
462; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison)
463; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 8
464; AVX2-NEXT:    [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)*
465; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison)
466; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16
467; AVX2-NEXT:    [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
468; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison)
469; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 24
470; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)*
471; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison)
472; AVX2-NEXT:    [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
473; AVX2-NEXT:    [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
474; AVX2-NEXT:    [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
475; AVX2-NEXT:    [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
476; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
477; AVX2-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]]
478; AVX2-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]]
479; AVX2-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]]
480; AVX2-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0
481; AVX2-NEXT:    [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)*
482; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP18]])
483; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 8
484; AVX2-NEXT:    [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)*
485; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP19]])
486; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16
487; AVX2-NEXT:    [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)*
488; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP36]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP20]])
489; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 24
490; AVX2-NEXT:    [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)*
491; AVX2-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP37]], <8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP21]])
492; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
493; AVX2-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
494; AVX2-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
495; AVX2:       middle.block:
496; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
497; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
498; AVX2:       scalar.ph:
499; AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
500; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
501; AVX2:       for.body:
502; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
503; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
504; AVX2-NEXT:    [[TMP51:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
505; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100
506; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
507; AVX2:       if.then:
508; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
509; AVX2-NEXT:    [[TMP52:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
510; AVX2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]]
511; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
512; AVX2-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
513; AVX2-NEXT:    br label [[FOR_INC]]
514; AVX2:       for.inc:
515; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
516; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
517; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
518; AVX2:       for.end:
519; AVX2-NEXT:    ret void
520;
521; AVX512-LABEL: @foo1_addrspace1(
522; AVX512-NEXT:  iter.check:
523; AVX512-NEXT:    [[B3:%.*]] = ptrtoint i32 addrspace(1)* [[B:%.*]] to i64
524; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32 addrspace(1)* [[TRIGGER:%.*]] to i64
525; AVX512-NEXT:    [[A1:%.*]] = ptrtoint i32 addrspace(1)* [[A:%.*]] to i64
526; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
527; AVX512:       vector.memcheck:
528; AVX512-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
529; AVX512-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
530; AVX512-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
531; AVX512-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
532; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
533; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
534; AVX512:       vector.main.loop.iter.check:
535; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
536; AVX512:       vector.ph:
537; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
538; AVX512:       vector.body:
539; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
540; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
541; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 16
542; AVX512-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 32
543; AVX512-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 48
544; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]]
545; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]]
546; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]]
547; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]]
548; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0
549; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)*
550; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4
551; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16
552; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)*
553; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4
554; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 32
555; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)*
556; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4
557; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 48
558; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)*
559; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP17]], align 4
560; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
561; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
562; AVX512-NEXT:    [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
563; AVX512-NEXT:    [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
564; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]]
565; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]]
566; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]]
567; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]]
568; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0
569; AVX512-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)*
570; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison)
571; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16
572; AVX512-NEXT:    [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)*
573; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison)
574; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 32
575; AVX512-NEXT:    [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)*
576; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison)
577; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 48
578; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)*
579; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison)
580; AVX512-NEXT:    [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
581; AVX512-NEXT:    [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
582; AVX512-NEXT:    [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
583; AVX512-NEXT:    [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
584; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]]
585; AVX512-NEXT:    [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]]
586; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]]
587; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]]
588; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0
589; AVX512-NEXT:    [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)*
590; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP18]])
591; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16
592; AVX512-NEXT:    [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)*
593; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP19]])
594; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 32
595; AVX512-NEXT:    [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)*
596; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP36]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP20]])
597; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 48
598; AVX512-NEXT:    [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)*
599; AVX512-NEXT:    call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP37]], <16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP21]])
600; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
601; AVX512-NEXT:    [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
602; AVX512-NEXT:    br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
603; AVX512:       middle.block:
604; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
605; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
606; AVX512:       vec.epilog.iter.check:
607; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
608; AVX512:       vec.epilog.ph:
609; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
610; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
611; AVX512:       vec.epilog.vector.body:
612; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
613; AVX512-NEXT:    [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0
614; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP51]]
615; AVX512-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP52]], i32 0
616; AVX512-NEXT:    [[TMP54:%.*]] = bitcast i32 addrspace(1)* [[TMP53]] to <8 x i32> addrspace(1)*
617; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP54]], align 4
618; AVX512-NEXT:    [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
619; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP51]]
620; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i32 0
621; AVX512-NEXT:    [[TMP58:%.*]] = bitcast i32 addrspace(1)* [[TMP57]] to <8 x i32> addrspace(1)*
622; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison)
623; AVX512-NEXT:    [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]]
624; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP51]]
625; AVX512-NEXT:    [[TMP61:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP60]], i32 0
626; AVX512-NEXT:    [[TMP62:%.*]] = bitcast i32 addrspace(1)* [[TMP61]] to <8 x i32> addrspace(1)*
627; AVX512-NEXT:    call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP59]], <8 x i32> addrspace(1)* [[TMP62]], i32 4, <8 x i1> [[TMP55]])
628; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
629; AVX512-NEXT:    [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
630; AVX512-NEXT:    br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
631; AVX512:       vec.epilog.middle.block:
632; AVX512-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
633; AVX512-NEXT:    br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
634; AVX512:       vec.epilog.scalar.ph:
635; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
636; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
637; AVX512:       for.body:
638; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
639; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
640; AVX512-NEXT:    [[TMP64:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
641; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100
642; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
643; AVX512:       if.then:
644; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
645; AVX512-NEXT:    [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
646; AVX512-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]]
647; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
648; AVX512-NEXT:    store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
649; AVX512-NEXT:    br label [[FOR_INC]]
650; AVX512:       for.inc:
651; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
652; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
653; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
654; AVX512:       for.end:
655; AVX512-NEXT:    ret void
656;
657entry:
658  br label %for.body
659
660for.body:                                         ; preds = %for.inc, %entry
661  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
662  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %trigger, i64 %indvars.iv
663  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
664  %cmp1 = icmp slt i32 %0, 100
665  br i1 %cmp1, label %if.then, label %for.inc
666
667if.then:                                          ; preds = %for.body
668  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %indvars.iv
669  %1 = load i32, i32 addrspace(1)* %arrayidx3, align 4
670  %add = add nsw i32 %1, %0
671  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %indvars.iv
672  store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
673  br label %for.inc
674
675for.inc:                                          ; preds = %for.body, %if.then
676  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
677  %exitcond = icmp eq i64 %indvars.iv.next, 10000
678  br i1 %exitcond, label %for.end, label %for.body
679
680for.end:                                          ; preds = %for.inc
681  ret void
682}
683
684; The source code:
685;
686;void foo2(float *A, float *B, int *trigger) {
687;
688;  for (int i=0; i<10000; i++) {
689;    if (trigger[i] < 100) {
690;          A[i] = B[i] + trigger[i];
691;    }
692;  }
693;}
694
695define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
696; AVX1-LABEL: @foo2(
697; AVX1-NEXT:  entry:
698; AVX1-NEXT:    [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64
699; AVX1-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
700; AVX1-NEXT:    [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64
701; AVX1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
702; AVX1:       vector.memcheck:
703; AVX1-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
704; AVX1-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
705; AVX1-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
706; AVX1-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
707; AVX1-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
708; AVX1-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
709; AVX1:       vector.ph:
710; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
711; AVX1:       vector.body:
712; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
713; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
714; AVX1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
715; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
716; AVX1-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
717; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4
718; AVX1-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
719; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]]
720; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr float, float* [[TMP7]], i32 0
721; AVX1-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
722; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x float> poison)
723; AVX1-NEXT:    [[TMP10:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
724; AVX1-NEXT:    [[TMP11:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP10]]
725; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]]
726; AVX1-NEXT:    [[TMP13:%.*]] = getelementptr float, float* [[TMP12]], i32 0
727; AVX1-NEXT:    [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>*
728; AVX1-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP11]], <8 x float>* [[TMP14]], i32 4, <8 x i1> [[TMP6]])
729; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
730; AVX1-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
731; AVX1-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
732; AVX1:       middle.block:
733; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
734; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
735; AVX1:       scalar.ph:
736; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
737; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
738; AVX1:       for.body:
739; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
740; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
741; AVX1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
742; AVX1-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP16]], 100
743; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
744; AVX1:       if.then:
745; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
746; AVX1-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX3]], align 4
747; AVX1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP16]] to float
748; AVX1-NEXT:    [[ADD:%.*]] = fadd float [[TMP17]], [[CONV]]
749; AVX1-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
750; AVX1-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
751; AVX1-NEXT:    br label [[FOR_INC]]
752; AVX1:       for.inc:
753; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
754; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
755; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
756; AVX1:       for.end:
757; AVX1-NEXT:    ret void
758;
759; AVX2-LABEL: @foo2(
760; AVX2-NEXT:  entry:
761; AVX2-NEXT:    [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64
762; AVX2-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
763; AVX2-NEXT:    [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64
764; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
765; AVX2:       vector.memcheck:
766; AVX2-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
767; AVX2-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
768; AVX2-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
769; AVX2-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
770; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
771; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
772; AVX2:       vector.ph:
773; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
774; AVX2:       vector.body:
775; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
776; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
777; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
778; AVX2-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 16
779; AVX2-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 24
780; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
781; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
782; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
783; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
784; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
785; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
786; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
787; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8
788; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
789; AVX2-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
790; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
791; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
792; AVX2-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
793; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24
794; AVX2-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
795; AVX2-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4
796; AVX2-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
797; AVX2-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
798; AVX2-NEXT:    [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
799; AVX2-NEXT:    [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
800; AVX2-NEXT:    [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]]
801; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]]
802; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]]
803; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]]
804; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0
805; AVX2-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
806; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison)
807; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 8
808; AVX2-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>*
809; AVX2-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison)
810; AVX2-NEXT:    [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 16
811; AVX2-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
812; AVX2-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x float> poison)
813; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 24
814; AVX2-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
815; AVX2-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x float> poison)
816; AVX2-NEXT:    [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
817; AVX2-NEXT:    [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float>
818; AVX2-NEXT:    [[TMP36:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float>
819; AVX2-NEXT:    [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float>
820; AVX2-NEXT:    [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP34]]
821; AVX2-NEXT:    [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]]
822; AVX2-NEXT:    [[TMP40:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]]
823; AVX2-NEXT:    [[TMP41:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]]
824; AVX2-NEXT:    [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]]
825; AVX2-NEXT:    [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]]
826; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]]
827; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]]
828; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0
829; AVX2-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>*
830; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP18]])
831; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 8
832; AVX2-NEXT:    [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>*
833; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP19]])
834; AVX2-NEXT:    [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 16
835; AVX2-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>*
836; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP40]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP20]])
837; AVX2-NEXT:    [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 24
838; AVX2-NEXT:    [[TMP53:%.*]] = bitcast float* [[TMP52]] to <8 x float>*
839; AVX2-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP41]], <8 x float>* [[TMP53]], i32 4, <8 x i1> [[TMP21]])
840; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
841; AVX2-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
842; AVX2-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
843; AVX2:       middle.block:
844; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
845; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
846; AVX2:       scalar.ph:
847; AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
848; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
849; AVX2:       for.body:
850; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
851; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
852; AVX2-NEXT:    [[TMP55:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
853; AVX2-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP55]], 100
854; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
855; AVX2:       if.then:
856; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
857; AVX2-NEXT:    [[TMP56:%.*]] = load float, float* [[ARRAYIDX3]], align 4
858; AVX2-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP55]] to float
859; AVX2-NEXT:    [[ADD:%.*]] = fadd float [[TMP56]], [[CONV]]
860; AVX2-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
861; AVX2-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
862; AVX2-NEXT:    br label [[FOR_INC]]
863; AVX2:       for.inc:
864; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
865; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
866; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
867; AVX2:       for.end:
868; AVX2-NEXT:    ret void
869;
870; AVX512-LABEL: @foo2(
871; AVX512-NEXT:  iter.check:
872; AVX512-NEXT:    [[B3:%.*]] = ptrtoint float* [[B:%.*]] to i64
873; AVX512-NEXT:    [[TRIGGER2:%.*]] = ptrtoint i32* [[TRIGGER:%.*]] to i64
874; AVX512-NEXT:    [[A1:%.*]] = ptrtoint float* [[A:%.*]] to i64
875; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
876; AVX512:       vector.memcheck:
877; AVX512-NEXT:    [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
878; AVX512-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
879; AVX512-NEXT:    [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
880; AVX512-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
881; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
882; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
883; AVX512:       vector.main.loop.iter.check:
884; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
885; AVX512:       vector.ph:
886; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
887; AVX512:       vector.body:
888; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
889; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
890; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 16
891; AVX512-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 32
892; AVX512-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 48
893; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
894; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
895; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]]
896; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]]
897; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
898; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
899; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
900; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
901; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
902; AVX512-NEXT:    [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4
903; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32
904; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
905; AVX512-NEXT:    [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4
906; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48
907; AVX512-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
908; AVX512-NEXT:    [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4
909; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
910; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
911; AVX512-NEXT:    [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
912; AVX512-NEXT:    [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
913; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]]
914; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]]
915; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]]
916; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]]
917; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0
918; AVX512-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>*
919; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison)
920; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 16
921; AVX512-NEXT:    [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
922; AVX512-NEXT:    [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison)
923; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 32
924; AVX512-NEXT:    [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
925; AVX512-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x float> poison)
926; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 48
927; AVX512-NEXT:    [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>*
928; AVX512-NEXT:    [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x float> poison)
929; AVX512-NEXT:    [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
930; AVX512-NEXT:    [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float>
931; AVX512-NEXT:    [[TMP36:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float>
932; AVX512-NEXT:    [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float>
933; AVX512-NEXT:    [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP34]]
934; AVX512-NEXT:    [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]]
935; AVX512-NEXT:    [[TMP40:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]]
936; AVX512-NEXT:    [[TMP41:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]]
937; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]]
938; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]]
939; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]]
940; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]]
941; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0
942; AVX512-NEXT:    [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>*
943; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP18]])
944; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 16
945; AVX512-NEXT:    [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
946; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP19]])
947; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 32
948; AVX512-NEXT:    [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>*
949; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP40]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP20]])
950; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 48
951; AVX512-NEXT:    [[TMP53:%.*]] = bitcast float* [[TMP52]] to <16 x float>*
952; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP41]], <16 x float>* [[TMP53]], i32 4, <16 x i1> [[TMP21]])
953; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
954; AVX512-NEXT:    [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
955; AVX512-NEXT:    br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
956; AVX512:       middle.block:
957; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
958; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
959; AVX512:       vec.epilog.iter.check:
960; AVX512-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
961; AVX512:       vec.epilog.ph:
962; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
963; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
964; AVX512:       vec.epilog.vector.body:
965; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
966; AVX512-NEXT:    [[TMP55:%.*]] = add i64 [[OFFSET_IDX]], 0
967; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP55]]
968; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i32 0
969; AVX512-NEXT:    [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>*
970; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP58]], align 4
971; AVX512-NEXT:    [[TMP59:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
972; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr float, float* [[B]], i64 [[TMP55]]
973; AVX512-NEXT:    [[TMP61:%.*]] = getelementptr float, float* [[TMP60]], i32 0
974; AVX512-NEXT:    [[TMP62:%.*]] = bitcast float* [[TMP61]] to <8 x float>*
975; AVX512-NEXT:    [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP62]], i32 4, <8 x i1> [[TMP59]], <8 x float> poison)
976; AVX512-NEXT:    [[TMP63:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float>
977; AVX512-NEXT:    [[TMP64:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP63]]
978; AVX512-NEXT:    [[TMP65:%.*]] = getelementptr float, float* [[A]], i64 [[TMP55]]
979; AVX512-NEXT:    [[TMP66:%.*]] = getelementptr float, float* [[TMP65]], i32 0
980; AVX512-NEXT:    [[TMP67:%.*]] = bitcast float* [[TMP66]] to <8 x float>*
981; AVX512-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP64]], <8 x float>* [[TMP67]], i32 4, <8 x i1> [[TMP59]])
982; AVX512-NEXT:    [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
983; AVX512-NEXT:    [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
984; AVX512-NEXT:    br i1 [[TMP68]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
985; AVX512:       vec.epilog.middle.block:
986; AVX512-NEXT:    [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
987; AVX512-NEXT:    br i1 [[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
988; AVX512:       vec.epilog.scalar.ph:
989; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
990; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
991; AVX512:       for.body:
992; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
993; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
994; AVX512-NEXT:    [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
995; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP69]], 100
996; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
997; AVX512:       if.then:
998; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
999; AVX512-NEXT:    [[TMP70:%.*]] = load float, float* [[ARRAYIDX3]], align 4
1000; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP69]] to float
1001; AVX512-NEXT:    [[ADD:%.*]] = fadd float [[TMP70]], [[CONV]]
1002; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
1003; AVX512-NEXT:    store float [[ADD]], float* [[ARRAYIDX7]], align 4
1004; AVX512-NEXT:    br label [[FOR_INC]]
1005; AVX512:       for.inc:
1006; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
1007; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
1008; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
1009; AVX512:       for.end:
1010; AVX512-NEXT:    ret void
1011;
1012entry:
1013  br label %for.body
1014
1015for.body:                                         ; preds = %for.inc, %entry
1016  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
1017  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
1018  %0 = load i32, i32* %arrayidx, align 4
1019  %cmp1 = icmp slt i32 %0, 100
1020  br i1 %cmp1, label %if.then, label %for.inc
1021
1022if.then:                                          ; preds = %for.body
1023  %arrayidx3 = getelementptr inbounds float, float* %B, i64 %indvars.iv
1024  %1 = load float, float* %arrayidx3, align 4
1025  %conv = sitofp i32 %0 to float
1026  %add = fadd float %1, %conv
1027  %arrayidx7 = getelementptr inbounds float, float* %A, i64 %indvars.iv
1028  store float %add, float* %arrayidx7, align 4
1029  br label %for.inc
1030
1031for.inc:                                          ; preds = %for.body, %if.then
1032  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
1033  %exitcond = icmp eq i64 %indvars.iv.next, 10000
1034  br i1 %exitcond, label %for.end, label %for.body
1035
1036for.end:                                          ; preds = %for.inc
1037  ret void
1038}
1039
1040; The source code:
1041;
1042;void foo3(double *A, double *B, int *trigger) {
1043;
1044;  for (int i=0; i<10000; i++) {
1045;    if (trigger[i] < 100) {
1046;          A[i] = B[i] + trigger[i];
1047;    }
1048;  }
1049;}
1050
1051define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
1052; AVX-LABEL: @foo3(
1053; AVX-NEXT:  entry:
1054; AVX-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
1055; AVX-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
1056; AVX-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
1057; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1058; AVX:       vector.memcheck:
1059; AVX-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000
1060; AVX-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
1061; AVX-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
1062; AVX-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
1063; AVX-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000
1064; AVX-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
1065; AVX-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
1066; AVX-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
1067; AVX-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1068; AVX-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
1069; AVX-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
1070; AVX-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
1071; AVX-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
1072; AVX-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1073; AVX:       vector.ph:
1074; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
1075; AVX:       vector.body:
1076; AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1077; AVX-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1078; AVX-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
1079; AVX-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
1080; AVX-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
1081; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
1082; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
1083; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
1084; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
1085; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
1086; AVX-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
1087; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !7
1088; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
1089; AVX-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
1090; AVX-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !7
1091; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
1092; AVX-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
1093; AVX-NEXT:    [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !7
1094; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
1095; AVX-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
1096; AVX-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !7
1097; AVX-NEXT:    [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
1098; AVX-NEXT:    [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100>
1099; AVX-NEXT:    [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100>
1100; AVX-NEXT:    [[TMP19:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100>
1101; AVX-NEXT:    [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]]
1102; AVX-NEXT:    [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
1103; AVX-NEXT:    [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]]
1104; AVX-NEXT:    [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]]
1105; AVX-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0
1106; AVX-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
1107; AVX-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !10
1108; AVX-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4
1109; AVX-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
1110; AVX-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !10
1111; AVX-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8
1112; AVX-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
1113; AVX-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !10
1114; AVX-NEXT:    [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12
1115; AVX-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
1116; AVX-NEXT:    [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !10
1117; AVX-NEXT:    [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
1118; AVX-NEXT:    [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double>
1119; AVX-NEXT:    [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double>
1120; AVX-NEXT:    [[TMP35:%.*]] = sitofp <4 x i32> [[WIDE_LOAD14]] to <4 x double>
1121; AVX-NEXT:    [[TMP36:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP32]]
1122; AVX-NEXT:    [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]]
1123; AVX-NEXT:    [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]]
1124; AVX-NEXT:    [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
1125; AVX-NEXT:    [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]]
1126; AVX-NEXT:    [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]]
1127; AVX-NEXT:    [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]]
1128; AVX-NEXT:    [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]]
1129; AVX-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0
1130; AVX-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
1131; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !12, !noalias !14
1132; AVX-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4
1133; AVX-NEXT:    [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>*
1134; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !12, !noalias !14
1135; AVX-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8
1136; AVX-NEXT:    [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>*
1137; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !12, !noalias !14
1138; AVX-NEXT:    [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 12
1139; AVX-NEXT:    [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>*
1140; AVX-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !12, !noalias !14
1141; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
1142; AVX-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
1143; AVX-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
1144; AVX:       middle.block:
1145; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 10000
1146; AVX-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1147; AVX:       scalar.ph:
1148; AVX-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
1149; AVX-NEXT:    br label [[FOR_BODY:%.*]]
1150; AVX:       for.body:
1151; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1152; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
1153; AVX-NEXT:    [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1154; AVX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100
1155; AVX-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1156; AVX:       if.then:
1157; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
1158; AVX-NEXT:    [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8
1159; AVX-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP53]] to double
1160; AVX-NEXT:    [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]]
1161; AVX-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
1162; AVX-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
1163; AVX-NEXT:    br label [[FOR_INC]]
1164; AVX:       for.inc:
1165; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
1166; AVX-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
1167; AVX-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
1168; AVX:       for.end:
1169; AVX-NEXT:    ret void
1170;
1171; AVX512-LABEL: @foo3(
1172; AVX512-NEXT:  entry:
1173; AVX512-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
1174; AVX512-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
1175; AVX512-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
1176; AVX512-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1177; AVX512:       vector.memcheck:
1178; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 10000
1179; AVX512-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
1180; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000
1181; AVX512-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
1182; AVX512-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 10000
1183; AVX512-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
1184; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
1185; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
1186; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1187; AVX512-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
1188; AVX512-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
1189; AVX512-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
1190; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
1191; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1192; AVX512:       vector.ph:
1193; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
1194; AVX512:       vector.body:
1195; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1196; AVX512-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1197; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
1198; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
1199; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
1200; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
1201; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
1202; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
1203; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
1204; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
1205; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
1206; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !11
1207; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
1208; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
1209; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !11
1210; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
1211; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
1212; AVX512-NEXT:    [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !11
1213; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
1214; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
1215; AVX512-NEXT:    [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !11
1216; AVX512-NEXT:    [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
1217; AVX512-NEXT:    [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
1218; AVX512-NEXT:    [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
1219; AVX512-NEXT:    [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
1220; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]]
1221; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
1222; AVX512-NEXT:    [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]]
1223; AVX512-NEXT:    [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]]
1224; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0
1225; AVX512-NEXT:    [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
1226; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> poison), !alias.scope !14
1227; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 8
1228; AVX512-NEXT:    [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>*
1229; AVX512-NEXT:    [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> poison), !alias.scope !14
1230; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 16
1231; AVX512-NEXT:    [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
1232; AVX512-NEXT:    [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> poison), !alias.scope !14
1233; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 24
1234; AVX512-NEXT:    [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
1235; AVX512-NEXT:    [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> poison), !alias.scope !14
1236; AVX512-NEXT:    [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
1237; AVX512-NEXT:    [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double>
1238; AVX512-NEXT:    [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double>
1239; AVX512-NEXT:    [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x double>
1240; AVX512-NEXT:    [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP32]]
1241; AVX512-NEXT:    [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]]
1242; AVX512-NEXT:    [[TMP38:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]]
1243; AVX512-NEXT:    [[TMP39:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
1244; AVX512-NEXT:    [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]]
1245; AVX512-NEXT:    [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]]
1246; AVX512-NEXT:    [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]]
1247; AVX512-NEXT:    [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]]
1248; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0
1249; AVX512-NEXT:    [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>*
1250; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18
1251; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 8
1252; AVX512-NEXT:    [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>*
1253; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18
1254; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 16
1255; AVX512-NEXT:    [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>*
1256; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18
1257; AVX512-NEXT:    [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 24
1258; AVX512-NEXT:    [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>*
1259; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18
1260; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
1261; AVX512-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
1262; AVX512-NEXT:    br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
1263; AVX512:       middle.block:
1264; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 10000, 9984
1265; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1266; AVX512:       scalar.ph:
1267; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
1268; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
1269; AVX512:       for.body:
1270; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1271; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
1272; AVX512-NEXT:    [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1273; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100
1274; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1275; AVX512:       if.then:
1276; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
1277; AVX512-NEXT:    [[TMP54:%.*]] = load double, double* [[ARRAYIDX3]], align 8
1278; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP53]] to double
1279; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP54]], [[CONV]]
1280; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
1281; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
1282; AVX512-NEXT:    br label [[FOR_INC]]
1283; AVX512:       for.inc:
1284; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
1285; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
1286; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
1287; AVX512:       for.end:
1288; AVX512-NEXT:    ret void
1289;
1290entry:
1291  br label %for.body
1292
1293for.body:                                         ; preds = %for.inc, %entry
1294  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
1295  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
1296  %0 = load i32, i32* %arrayidx, align 4
1297  %cmp1 = icmp slt i32 %0, 100
1298  br i1 %cmp1, label %if.then, label %for.inc
1299
1300if.then:                                          ; preds = %for.body
1301  %arrayidx3 = getelementptr inbounds double, double* %B, i64 %indvars.iv
1302  %1 = load double, double* %arrayidx3, align 8
1303  %conv = sitofp i32 %0 to double
1304  %add = fadd double %1, %conv
1305  %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv
1306  store double %add, double* %arrayidx7, align 8
1307  br label %for.inc
1308
1309for.inc:                                          ; preds = %for.body, %if.then
1310  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
1311  %exitcond = icmp eq i64 %indvars.iv.next, 10000
1312  br i1 %exitcond, label %for.end, label %for.body
1313
1314for.end:                                          ; preds = %for.inc
1315  ret void
1316}
1317
1318; The source code:
1319;
1320;void foo4(double *A, double *B, int *trigger) {
1321;
1322;  for (int i=0; i<10000; i += 16) {
1323;    if (trigger[i] < 100) {
1324;          A[i] = B[i*2] + trigger[i]; << non-cosecutive access
1325;    }
1326;  }
1327;}
1328
1329define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
1330; AVX-LABEL: @foo4(
1331; AVX-NEXT:  entry:
1332; AVX-NEXT:    br label [[FOR_BODY:%.*]]
1333; AVX:       for.body:
1334; AVX-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1335; AVX-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
1336; AVX-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1337; AVX-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
1338; AVX-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1339; AVX:       if.then:
1340; AVX-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
1341; AVX-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
1342; AVX-NEXT:    [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
1343; AVX-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
1344; AVX-NEXT:    [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
1345; AVX-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
1346; AVX-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
1347; AVX-NEXT:    br label [[FOR_INC]]
1348; AVX:       for.inc:
1349; AVX-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
1350; AVX-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
1351; AVX-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
1352; AVX:       for.end:
1353; AVX-NEXT:    ret void
1354;
1355; AVX512-LABEL: @foo4(
1356; AVX512-NEXT:  entry:
1357; AVX512-NEXT:    [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
1358; AVX512-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
1359; AVX512-NEXT:    [[B6:%.*]] = bitcast double* [[B:%.*]] to i8*
1360; AVX512-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1361; AVX512:       vector.memcheck:
1362; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 9985
1363; AVX512-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
1364; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 9985
1365; AVX512-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
1366; AVX512-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[B]], i64 19969
1367; AVX512-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
1368; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
1369; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
1370; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1371; AVX512-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
1372; AVX512-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP2]]
1373; AVX512-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
1374; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
1375; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1376; AVX512:       vector.ph:
1377; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
1378; AVX512:       vector.body:
1379; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1380; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1381; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
1382; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !21
1383; AVX512-NEXT:    [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
1384; AVX512-NEXT:    [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
1385; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]]
1386; AVX512-NEXT:    [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !24
1387; AVX512-NEXT:    [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
1388; AVX512-NEXT:    [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]]
1389; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]]
1390; AVX512-NEXT:    call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !26, !noalias !28
1391; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
1392; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
1393; AVX512-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
1394; AVX512-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
1395; AVX512:       middle.block:
1396; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 625, 624
1397; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1398; AVX512:       scalar.ph:
1399; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
1400; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
1401; AVX512:       for.body:
1402; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1403; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
1404; AVX512-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1405; AVX512-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100
1406; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1407; AVX512:       if.then:
1408; AVX512-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
1409; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]]
1410; AVX512-NEXT:    [[TMP10:%.*]] = load double, double* [[ARRAYIDX3]], align 8
1411; AVX512-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP8]] to double
1412; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]]
1413; AVX512-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
1414; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX7]], align 8
1415; AVX512-NEXT:    br label [[FOR_INC]]
1416; AVX512:       for.inc:
1417; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
1418; AVX512-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
1419; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP30:![0-9]+]]
1420; AVX512:       for.end:
1421; AVX512-NEXT:    ret void
1422;
1423entry:
1424  br label %for.body
1425
1426for.body:                                         ; preds = %entry, %for.inc
1427  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
1428  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
1429  %0 = load i32, i32* %arrayidx, align 4
1430  %cmp1 = icmp slt i32 %0, 100
1431  br i1 %cmp1, label %if.then, label %for.inc
1432
1433if.then:                                          ; preds = %for.body
1434  %1 = shl nuw nsw i64 %indvars.iv, 1
1435  %arrayidx3 = getelementptr inbounds double, double* %B, i64 %1
1436  %2 = load double, double* %arrayidx3, align 8
1437  %conv = sitofp i32 %0 to double
1438  %add = fadd double %2, %conv
1439  %arrayidx7 = getelementptr inbounds double, double* %A, i64 %indvars.iv
1440  store double %add, double* %arrayidx7, align 8
1441  br label %for.inc
1442
1443for.inc:                                          ; preds = %for.body, %if.then
1444  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
1445  %cmp = icmp ult i64 %indvars.iv.next, 10000
1446  br i1 %cmp, label %for.body, label %for.end
1447
1448for.end:                                          ; preds = %for.inc
1449  ret void
1450}
1451
1452@a = common global [1 x i32*] zeroinitializer, align 8
1453@c = common global i32* null, align 8
1454
1455; Reverse loop
1456;void foo6(double *in, double *out, unsigned size, int *trigger) {
1457;
1458;  for (int i=SIZE-1; i>=0; i--) {
1459;    if (trigger[i] > 0) {
1460;      out[i] = in[i] + (double) 0.5;
1461;    }
1462;  }
1463;}
1464
1465define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %size, i32* nocapture readonly %trigger) local_unnamed_addr #0 {
1466; AVX1-LABEL: @foo6(
1467; AVX1-NEXT:  entry:
1468; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
1469; AVX1:       for.body:
1470; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1471; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
1472; AVX1-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1473; AVX1-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
1474; AVX1-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1475; AVX1:       if.then:
1476; AVX1-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN:%.*]], i64 [[INDVARS_IV]]
1477; AVX1-NEXT:    [[TMP1:%.*]] = load double, double* [[ARRAYIDX3]], align 8
1478; AVX1-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01
1479; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDVARS_IV]]
1480; AVX1-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
1481; AVX1-NEXT:    br label [[FOR_INC]]
1482; AVX1:       for.inc:
1483; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
1484; AVX1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
1485; AVX1-NEXT:    br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
1486; AVX1:       for.end:
1487; AVX1-NEXT:    ret void
1488;
1489; AVX2-LABEL: @foo6(
1490; AVX2-NEXT:  entry:
1491; AVX2-NEXT:    [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8*
1492; AVX2-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
1493; AVX2-NEXT:    [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8*
1494; AVX2-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1495; AVX2:       vector.memcheck:
1496; AVX2-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096
1497; AVX2-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
1498; AVX2-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096
1499; AVX2-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
1500; AVX2-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096
1501; AVX2-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
1502; AVX2-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]]
1503; AVX2-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
1504; AVX2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1505; AVX2-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]]
1506; AVX2-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]]
1507; AVX2-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
1508; AVX2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
1509; AVX2-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1510; AVX2:       vector.ph:
1511; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
1512; AVX2:       vector.body:
1513; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1514; AVX2-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
1515; AVX2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
1516; AVX2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4
1517; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -8
1518; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -12
1519; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
1520; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
1521; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
1522; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
1523; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
1524; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3
1525; AVX2-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
1526; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !17
1527; AVX2-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1528; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4
1529; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3
1530; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
1531; AVX2-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !17
1532; AVX2-NEXT:    [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1533; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8
1534; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3
1535; AVX2-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
1536; AVX2-NEXT:    [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !17
1537; AVX2-NEXT:    [[REVERSE15:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1538; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12
1539; AVX2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3
1540; AVX2-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
1541; AVX2-NEXT:    [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !17
1542; AVX2-NEXT:    [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1543; AVX2-NEXT:    [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
1544; AVX2-NEXT:    [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer
1545; AVX2-NEXT:    [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE15]], zeroinitializer
1546; AVX2-NEXT:    [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE17]], zeroinitializer
1547; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]]
1548; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]]
1549; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP2]]
1550; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]]
1551; AVX2-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0
1552; AVX2-NEXT:    [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -3
1553; AVX2-NEXT:    [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1554; AVX2-NEXT:    [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>*
1555; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !20
1556; AVX2-NEXT:    [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1557; AVX2-NEXT:    [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -4
1558; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3
1559; AVX2-NEXT:    [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1560; AVX2-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
1561; AVX2-NEXT:    [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !20
1562; AVX2-NEXT:    [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1563; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -8
1564; AVX2-NEXT:    [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3
1565; AVX2-NEXT:    [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1566; AVX2-NEXT:    [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
1567; AVX2-NEXT:    [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !20
1568; AVX2-NEXT:    [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1569; AVX2-NEXT:    [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -12
1570; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3
1571; AVX2-NEXT:    [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1572; AVX2-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
1573; AVX2-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !20
1574; AVX2-NEXT:    [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1575; AVX2-NEXT:    [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1576; AVX2-NEXT:    [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1577; AVX2-NEXT:    [[TMP42:%.*]] = fadd <4 x double> [[REVERSE25]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1578; AVX2-NEXT:    [[TMP43:%.*]] = fadd <4 x double> [[REVERSE28]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1579; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP0]]
1580; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
1581; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
1582; AVX2-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
1583; AVX2-NEXT:    [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1584; AVX2-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0
1585; AVX2-NEXT:    [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3
1586; AVX2-NEXT:    [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>*
1587; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !22, !noalias !24
1588; AVX2-NEXT:    [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1589; AVX2-NEXT:    [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -4
1590; AVX2-NEXT:    [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3
1591; AVX2-NEXT:    [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>*
1592; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !22, !noalias !24
1593; AVX2-NEXT:    [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1594; AVX2-NEXT:    [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -8
1595; AVX2-NEXT:    [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3
1596; AVX2-NEXT:    [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>*
1597; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !22, !noalias !24
1598; AVX2-NEXT:    [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1599; AVX2-NEXT:    [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -12
1600; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3
1601; AVX2-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
1602; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !22, !noalias !24
1603; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
1604; AVX2-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
1605; AVX2-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
1606; AVX2:       middle.block:
1607; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
1608; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1609; AVX2:       scalar.ph:
1610; AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
1611; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
1612; AVX2:       for.body:
1613; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1614; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
1615; AVX2-NEXT:    [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1616; AVX2-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0
1617; AVX2-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1618; AVX2:       if.then:
1619; AVX2-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
1620; AVX2-NEXT:    [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8
1621; AVX2-NEXT:    [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01
1622; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
1623; AVX2-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
1624; AVX2-NEXT:    br label [[FOR_INC]]
1625; AVX2:       for.inc:
1626; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
1627; AVX2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
1628; AVX2-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
1629; AVX2:       for.end:
1630; AVX2-NEXT:    ret void
1631;
1632; AVX512-LABEL: @foo6(
1633; AVX512-NEXT:  entry:
1634; AVX512-NEXT:    [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8*
1635; AVX512-NEXT:    [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8*
1636; AVX512-NEXT:    [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8*
1637; AVX512-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1638; AVX512:       vector.memcheck:
1639; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT]], i64 4096
1640; AVX512-NEXT:    [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
1641; AVX512-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 4096
1642; AVX512-NEXT:    [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
1643; AVX512-NEXT:    [[SCEVGEP7:%.*]] = getelementptr double, double* [[IN]], i64 4096
1644; AVX512-NEXT:    [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
1645; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP45]]
1646; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP2]]
1647; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1648; AVX512-NEXT:    [[BOUND09:%.*]] = icmp ult i8* [[OUT1]], [[SCEVGEP78]]
1649; AVX512-NEXT:    [[BOUND110:%.*]] = icmp ult i8* [[IN6]], [[SCEVGEP2]]
1650; AVX512-NEXT:    [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
1651; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
1652; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1653; AVX512:       vector.ph:
1654; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
1655; AVX512:       vector.body:
1656; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1657; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
1658; AVX512-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
1659; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -8
1660; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -16
1661; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], -24
1662; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]]
1663; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]]
1664; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]]
1665; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]]
1666; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
1667; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -7
1668; AVX512-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
1669; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !31
1670; AVX512-NEXT:    [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1671; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8
1672; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7
1673; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
1674; AVX512-NEXT:    [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31
1675; AVX512-NEXT:    [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1676; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16
1677; AVX512-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7
1678; AVX512-NEXT:    [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
1679; AVX512-NEXT:    [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !31
1680; AVX512-NEXT:    [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1681; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24
1682; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7
1683; AVX512-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>*
1684; AVX512-NEXT:    [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !31
1685; AVX512-NEXT:    [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1686; AVX512-NEXT:    [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
1687; AVX512-NEXT:    [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer
1688; AVX512-NEXT:    [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE15]], zeroinitializer
1689; AVX512-NEXT:    [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE17]], zeroinitializer
1690; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP0]]
1691; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP1]]
1692; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP2]]
1693; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 [[TMP3]]
1694; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], i32 0
1695; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -7
1696; AVX512-NEXT:    [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1697; AVX512-NEXT:    [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>*
1698; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> poison), !alias.scope !34
1699; AVX512-NEXT:    [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1700; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -8
1701; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -7
1702; AVX512-NEXT:    [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1703; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
1704; AVX512-NEXT:    [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34
1705; AVX512-NEXT:    [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1706; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -16
1707; AVX512-NEXT:    [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -7
1708; AVX512-NEXT:    [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1709; AVX512-NEXT:    [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
1710; AVX512-NEXT:    [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> poison), !alias.scope !34
1711; AVX512-NEXT:    [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1712; AVX512-NEXT:    [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -24
1713; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -7
1714; AVX512-NEXT:    [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1715; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
1716; AVX512-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> poison), !alias.scope !34
1717; AVX512-NEXT:    [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1718; AVX512-NEXT:    [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1719; AVX512-NEXT:    [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1720; AVX512-NEXT:    [[TMP42:%.*]] = fadd <8 x double> [[REVERSE25]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1721; AVX512-NEXT:    [[TMP43:%.*]] = fadd <8 x double> [[REVERSE28]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
1722; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP0]]
1723; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
1724; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
1725; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
1726; AVX512-NEXT:    [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP40]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1727; AVX512-NEXT:    [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0
1728; AVX512-NEXT:    [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -7
1729; AVX512-NEXT:    [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>*
1730; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !36, !noalias !38
1731; AVX512-NEXT:    [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1732; AVX512-NEXT:    [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -8
1733; AVX512-NEXT:    [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -7
1734; AVX512-NEXT:    [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>*
1735; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !36, !noalias !38
1736; AVX512-NEXT:    [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1737; AVX512-NEXT:    [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -16
1738; AVX512-NEXT:    [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -7
1739; AVX512-NEXT:    [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>*
1740; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !36, !noalias !38
1741; AVX512-NEXT:    [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1742; AVX512-NEXT:    [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -24
1743; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -7
1744; AVX512-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
1745; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !36, !noalias !38
1746; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
1747; AVX512-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
1748; AVX512-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
1749; AVX512:       middle.block:
1750; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
1751; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
1752; AVX512:       scalar.ph:
1753; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
1754; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
1755; AVX512:       for.body:
1756; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1757; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
1758; AVX512-NEXT:    [[TMP61:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
1759; AVX512-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP61]], 0
1760; AVX512-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
1761; AVX512:       if.then:
1762; AVX512-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
1763; AVX512-NEXT:    [[TMP62:%.*]] = load double, double* [[ARRAYIDX3]], align 8
1764; AVX512-NEXT:    [[ADD:%.*]] = fadd double [[TMP62]], 5.000000e-01
1765; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
1766; AVX512-NEXT:    store double [[ADD]], double* [[ARRAYIDX5]], align 8
1767; AVX512-NEXT:    br label [[FOR_INC]]
1768; AVX512:       for.inc:
1769; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
1770; AVX512-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
1771; AVX512-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
1772; AVX512:       for.end:
1773; AVX512-NEXT:    ret void
1774;
1775entry:
1776  br label %for.body
1777
1778for.body:                                         ; preds = %for.inc, %entry
1779  %indvars.iv = phi i64 [ 4095, %entry ], [ %indvars.iv.next, %for.inc ]
1780  %arrayidx = getelementptr inbounds i32, i32* %trigger, i64 %indvars.iv
1781  %0 = load i32, i32* %arrayidx, align 4
1782  %cmp1 = icmp sgt i32 %0, 0
1783  br i1 %cmp1, label %if.then, label %for.inc
1784
1785if.then:                                          ; preds = %for.body
1786  %arrayidx3 = getelementptr inbounds double, double* %in, i64 %indvars.iv
1787  %1 = load double, double* %arrayidx3, align 8
1788  %add = fadd double %1, 5.000000e-01
1789  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
1790  store double %add, double* %arrayidx5, align 8
1791  br label %for.inc
1792
1793for.inc:                                          ; preds = %for.body, %if.then
1794  %indvars.iv.next = add nsw i64 %indvars.iv, -1
1795  %cmp = icmp eq i64 %indvars.iv, 0
1796  br i1 %cmp, label %for.end, label %for.body
1797
1798for.end:                                          ; preds = %for.inc
1799  ret void
1800}
1801
1802; void foo7 (double * __restrict__  out, double ** __restrict__  in,
1803;           bool * __restrict__ trigger, unsigned size) {
1804;
1805;  for (unsigned i=0; i<size; i++)
1806;    if (trigger[i] && (in[i] != 0))
1807;      out[i] = (double) 0.5;
1808; }
1809
1810define void @foo7(double* noalias nocapture %out, double** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
1811; AVX1-LABEL: @foo7(
1812; AVX1-NEXT:  entry:
1813; AVX1-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
1814; AVX1-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
1815; AVX1:       for.body.preheader:
1816; AVX1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
1817; AVX1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
1818; AVX1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1819; AVX1:       vector.ph:
1820; AVX1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
1821; AVX1-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
1822; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
1823; AVX1:       vector.body:
1824; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1825; AVX1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1826; AVX1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
1827; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
1828; AVX1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
1829; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
1830; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
1831; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
1832; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
1833; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
1834; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
1835; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
1836; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
1837; AVX1-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
1838; AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
1839; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
1840; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
1841; AVX1-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
1842; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
1843; AVX1-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
1844; AVX1-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
1845; AVX1-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
1846; AVX1-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
1847; AVX1-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
1848; AVX1-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
1849; AVX1-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
1850; AVX1-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
1851; AVX1-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
1852; AVX1-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
1853; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
1854; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
1855; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
1856; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
1857; AVX1-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
1858; AVX1-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
1859; AVX1-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
1860; AVX1-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
1861; AVX1-NEXT:    [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
1862; AVX1-NEXT:    [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
1863; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
1864; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4
1865; AVX1-NEXT:    [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
1866; AVX1-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
1867; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
1868; AVX1-NEXT:    [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
1869; AVX1-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
1870; AVX1-NEXT:    [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12
1871; AVX1-NEXT:    [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
1872; AVX1-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
1873; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
1874; AVX1-NEXT:    [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
1875; AVX1-NEXT:    [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
1876; AVX1-NEXT:    [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
1877; AVX1-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
1878; AVX1-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
1879; AVX1-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
1880; AVX1-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
1881; AVX1-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
1882; AVX1-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
1883; AVX1-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
1884; AVX1-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
1885; AVX1-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
1886; AVX1-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
1887; AVX1-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
1888; AVX1-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
1889; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
1890; AVX1-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
1891; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
1892; AVX1-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
1893; AVX1-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
1894; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
1895; AVX1-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
1896; AVX1-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
1897; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
1898; AVX1-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
1899; AVX1-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
1900; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
1901; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
1902; AVX1-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1903; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
1904; AVX1:       middle.block:
1905; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
1906; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
1907; AVX1:       scalar.ph:
1908; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
1909; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
1910; AVX1:       for.body:
1911; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
1912; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
1913; AVX1-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
1914; AVX1-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
1915; AVX1-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
1916; AVX1-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
1917; AVX1:       land.lhs.true:
1918; AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
1919; AVX1-NEXT:    [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
1920; AVX1-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
1921; AVX1-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
1922; AVX1:       if.then:
1923; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
1924; AVX1-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
1925; AVX1-NEXT:    br label [[FOR_INC]]
1926; AVX1:       for.inc:
1927; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
1928; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
1929; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
1930; AVX1:       for.end.loopexit:
1931; AVX1-NEXT:    br label [[FOR_END]]
1932; AVX1:       for.end:
1933; AVX1-NEXT:    ret void
1934;
1935; AVX2-LABEL: @foo7(
1936; AVX2-NEXT:  entry:
1937; AVX2-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
1938; AVX2-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
1939; AVX2:       for.body.preheader:
1940; AVX2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
1941; AVX2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
1942; AVX2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1943; AVX2:       vector.ph:
1944; AVX2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
1945; AVX2-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
1946; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
1947; AVX2:       vector.body:
1948; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1949; AVX2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
1950; AVX2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
1951; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
1952; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
1953; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
1954; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
1955; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
1956; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
1957; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
1958; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
1959; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
1960; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
1961; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
1962; AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
1963; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
1964; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
1965; AVX2-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
1966; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
1967; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
1968; AVX2-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
1969; AVX2-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
1970; AVX2-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
1971; AVX2-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
1972; AVX2-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
1973; AVX2-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
1974; AVX2-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
1975; AVX2-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
1976; AVX2-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
1977; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
1978; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
1979; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
1980; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
1981; AVX2-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
1982; AVX2-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
1983; AVX2-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
1984; AVX2-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
1985; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
1986; AVX2-NEXT:    [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
1987; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> poison)
1988; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 4
1989; AVX2-NEXT:    [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
1990; AVX2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> poison)
1991; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
1992; AVX2-NEXT:    [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
1993; AVX2-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> poison)
1994; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 12
1995; AVX2-NEXT:    [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
1996; AVX2-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> poison)
1997; AVX2-NEXT:    [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
1998; AVX2-NEXT:    [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
1999; AVX2-NEXT:    [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
2000; AVX2-NEXT:    [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
2001; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
2002; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
2003; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
2004; AVX2-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
2005; AVX2-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
2006; AVX2-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
2007; AVX2-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
2008; AVX2-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
2009; AVX2-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
2010; AVX2-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
2011; AVX2-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
2012; AVX2-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
2013; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
2014; AVX2-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
2015; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
2016; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
2017; AVX2-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
2018; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
2019; AVX2-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
2020; AVX2-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
2021; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
2022; AVX2-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
2023; AVX2-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
2024; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
2025; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
2026; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2027; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
2028; AVX2:       middle.block:
2029; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2030; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2031; AVX2:       scalar.ph:
2032; AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
2033; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
2034; AVX2:       for.body:
2035; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
2036; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
2037; AVX2-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
2038; AVX2-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
2039; AVX2-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
2040; AVX2-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
2041; AVX2:       land.lhs.true:
2042; AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
2043; AVX2-NEXT:    [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
2044; AVX2-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
2045; AVX2-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
2046; AVX2:       if.then:
2047; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
2048; AVX2-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
2049; AVX2-NEXT:    br label [[FOR_INC]]
2050; AVX2:       for.inc:
2051; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
2052; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
2053; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
2054; AVX2:       for.end.loopexit:
2055; AVX2-NEXT:    br label [[FOR_END]]
2056; AVX2:       for.end:
2057; AVX2-NEXT:    ret void
2058;
2059; AVX512-LABEL: @foo7(
2060; AVX512-NEXT:  entry:
2061; AVX512-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
2062; AVX512-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
2063; AVX512:       for.body.preheader:
2064; AVX512-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
2065; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
2066; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2067; AVX512:       vector.ph:
2068; AVX512-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
2069; AVX512-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
2070; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
2071; AVX512:       vector.body:
2072; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2073; AVX512-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2074; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
2075; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
2076; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
2077; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
2078; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
2079; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
2080; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
2081; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
2082; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
2083; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1
2084; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
2085; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
2086; AVX512-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1
2087; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16
2088; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
2089; AVX512-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
2090; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24
2091; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
2092; AVX512-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1
2093; AVX512-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2094; AVX512-NEXT:    [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2095; AVX512-NEXT:    [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2096; AVX512-NEXT:    [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2097; AVX512-NEXT:    [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer
2098; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
2099; AVX512-NEXT:    [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
2100; AVX512-NEXT:    [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
2101; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], i64 [[TMP0]]
2102; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP1]]
2103; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP2]]
2104; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr double*, double** [[IN]], i64 [[TMP3]]
2105; AVX512-NEXT:    [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2106; AVX512-NEXT:    [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2107; AVX512-NEXT:    [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2108; AVX512-NEXT:    [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2109; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 0
2110; AVX512-NEXT:    [[TMP33:%.*]] = bitcast double** [[TMP32]] to <8 x double*>*
2111; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x double*> poison)
2112; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 8
2113; AVX512-NEXT:    [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 x double*>*
2114; AVX512-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x double*> poison)
2115; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 16
2116; AVX512-NEXT:    [[TMP37:%.*]] = bitcast double** [[TMP36]] to <8 x double*>*
2117; AVX512-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x double*> poison)
2118; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], i32 24
2119; AVX512-NEXT:    [[TMP39:%.*]] = bitcast double** [[TMP38]] to <8 x double*>*
2120; AVX512-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x double*> poison)
2121; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
2122; AVX512-NEXT:    [[TMP41:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer
2123; AVX512-NEXT:    [[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer
2124; AVX512-NEXT:    [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer
2125; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
2126; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
2127; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
2128; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
2129; AVX512-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2130; AVX512-NEXT:    [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2131; AVX512-NEXT:    [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2132; AVX512-NEXT:    [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2133; AVX512-NEXT:    [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
2134; AVX512-NEXT:    [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
2135; AVX512-NEXT:    [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
2136; AVX512-NEXT:    [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
2137; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
2138; AVX512-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
2139; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
2140; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8
2141; AVX512-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
2142; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
2143; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16
2144; AVX512-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
2145; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
2146; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24
2147; AVX512-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
2148; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
2149; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
2150; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2151; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
2152; AVX512:       middle.block:
2153; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2154; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2155; AVX512:       scalar.ph:
2156; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
2157; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
2158; AVX512:       for.body:
2159; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
2160; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
2161; AVX512-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
2162; AVX512-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
2163; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
2164; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
2165; AVX512:       land.lhs.true:
2166; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
2167; AVX512-NEXT:    [[TMP67:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
2168; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq double* [[TMP67]], null
2169; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
2170; AVX512:       if.then:
2171; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
2172; AVX512-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
2173; AVX512-NEXT:    br label [[FOR_INC]]
2174; AVX512:       for.inc:
2175; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
2176; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
2177; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
2178; AVX512:       for.end.loopexit:
2179; AVX512-NEXT:    br label [[FOR_END]]
2180; AVX512:       for.end:
2181; AVX512-NEXT:    ret void
2182;
2183entry:
2184  %cmp5 = icmp eq i32 %size, 0
2185  br i1 %cmp5, label %for.end, label %for.body.preheader
2186
2187for.body.preheader:                               ; preds = %entry
2188  %wide.trip.count = zext i32 %size to i64
2189  br label %for.body
2190
2191for.body:                                         ; preds = %for.inc, %for.body.preheader
2192  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
2193  %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv
2194  %0 = load i8, i8* %arrayidx, align 1
2195  %1 = and i8 %0, 1
2196  %tobool = icmp eq i8 %1, 0
2197  br i1 %tobool, label %for.inc, label %land.lhs.true
2198
2199land.lhs.true:                                    ; preds = %for.body
2200  %arrayidx2 = getelementptr inbounds double*, double** %in, i64 %indvars.iv
2201  %2 = load double*, double** %arrayidx2, align 8
2202  %cmp3 = icmp eq double* %2, null
2203  br i1 %cmp3, label %for.inc, label %if.then
2204
2205if.then:                                          ; preds = %land.lhs.true
2206  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
2207  store double 5.000000e-01, double* %arrayidx5, align 8
2208  br label %for.inc
2209
2210for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then
2211  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
2212  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
2213  br i1 %exitcond, label %for.end, label %for.body
2214
2215for.end:                                          ; preds = %for.inc, %entry
2216  ret void
2217}
2218
2219;typedef int (*fp)();
2220;void foo8 (double* __restrict__  out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
2221;
2222;  for (unsigned i=0; i<size; i++)
2223;    if (trigger[i] && (in[i] != 0))
2224;      out[i] = (double) 0.5;
2225;}
2226
2227define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture readonly %in, i8* noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
2228; AVX1-LABEL: @foo8(
2229; AVX1-NEXT:  entry:
2230; AVX1-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
2231; AVX1-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
2232; AVX1:       for.body.preheader:
2233; AVX1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
2234; AVX1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
2235; AVX1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2236; AVX1:       vector.ph:
2237; AVX1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
2238; AVX1-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
2239; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
2240; AVX1:       vector.body:
2241; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2242; AVX1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2243; AVX1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
2244; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
2245; AVX1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
2246; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
2247; AVX1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
2248; AVX1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
2249; AVX1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
2250; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
2251; AVX1-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
2252; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
2253; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
2254; AVX1-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
2255; AVX1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
2256; AVX1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
2257; AVX1-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
2258; AVX1-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
2259; AVX1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
2260; AVX1-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
2261; AVX1-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
2262; AVX1-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
2263; AVX1-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
2264; AVX1-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
2265; AVX1-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
2266; AVX1-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
2267; AVX1-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
2268; AVX1-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
2269; AVX1-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
2270; AVX1-NEXT:    [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
2271; AVX1-NEXT:    [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
2272; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
2273; AVX1-NEXT:    [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
2274; AVX1-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
2275; AVX1-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
2276; AVX1-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
2277; AVX1-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
2278; AVX1-NEXT:    [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
2279; AVX1-NEXT:    [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
2280; AVX1-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
2281; AVX1-NEXT:    [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4
2282; AVX1-NEXT:    [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
2283; AVX1-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
2284; AVX1-NEXT:    [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
2285; AVX1-NEXT:    [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
2286; AVX1-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
2287; AVX1-NEXT:    [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12
2288; AVX1-NEXT:    [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
2289; AVX1-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
2290; AVX1-NEXT:    [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
2291; AVX1-NEXT:    [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
2292; AVX1-NEXT:    [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
2293; AVX1-NEXT:    [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
2294; AVX1-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
2295; AVX1-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
2296; AVX1-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
2297; AVX1-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
2298; AVX1-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
2299; AVX1-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
2300; AVX1-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
2301; AVX1-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
2302; AVX1-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
2303; AVX1-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
2304; AVX1-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
2305; AVX1-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
2306; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
2307; AVX1-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
2308; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
2309; AVX1-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
2310; AVX1-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
2311; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
2312; AVX1-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
2313; AVX1-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
2314; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
2315; AVX1-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
2316; AVX1-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
2317; AVX1-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
2318; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
2319; AVX1-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2320; AVX1-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
2321; AVX1:       middle.block:
2322; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2323; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2324; AVX1:       scalar.ph:
2325; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
2326; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
2327; AVX1:       for.body:
2328; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
2329; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
2330; AVX1-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
2331; AVX1-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
2332; AVX1-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
2333; AVX1-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
2334; AVX1:       land.lhs.true:
2335; AVX1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
2336; AVX1-NEXT:    [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
2337; AVX1-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
2338; AVX1-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
2339; AVX1:       if.then:
2340; AVX1-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
2341; AVX1-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
2342; AVX1-NEXT:    br label [[FOR_INC]]
2343; AVX1:       for.inc:
2344; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
2345; AVX1-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
2346; AVX1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
2347; AVX1:       for.end.loopexit:
2348; AVX1-NEXT:    br label [[FOR_END]]
2349; AVX1:       for.end:
2350; AVX1-NEXT:    ret void
2351;
2352; AVX2-LABEL: @foo8(
2353; AVX2-NEXT:  entry:
2354; AVX2-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
2355; AVX2-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
2356; AVX2:       for.body.preheader:
2357; AVX2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
2358; AVX2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
2359; AVX2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2360; AVX2:       vector.ph:
2361; AVX2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
2362; AVX2-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
2363; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
2364; AVX2:       vector.body:
2365; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2366; AVX2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2367; AVX2-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
2368; AVX2-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
2369; AVX2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
2370; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
2371; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
2372; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
2373; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
2374; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
2375; AVX2-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
2376; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1
2377; AVX2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4
2378; AVX2-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>*
2379; AVX2-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1
2380; AVX2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
2381; AVX2-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
2382; AVX2-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
2383; AVX2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12
2384; AVX2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
2385; AVX2-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
2386; AVX2-NEXT:    [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
2387; AVX2-NEXT:    [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
2388; AVX2-NEXT:    [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
2389; AVX2-NEXT:    [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
2390; AVX2-NEXT:    [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer
2391; AVX2-NEXT:    [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
2392; AVX2-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
2393; AVX2-NEXT:    [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
2394; AVX2-NEXT:    [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
2395; AVX2-NEXT:    [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
2396; AVX2-NEXT:    [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
2397; AVX2-NEXT:    [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
2398; AVX2-NEXT:    [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true>
2399; AVX2-NEXT:    [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true>
2400; AVX2-NEXT:    [[TMP30:%.*]] = xor <4 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true>
2401; AVX2-NEXT:    [[TMP31:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
2402; AVX2-NEXT:    [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
2403; AVX2-NEXT:    [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
2404; AVX2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> poison)
2405; AVX2-NEXT:    [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 4
2406; AVX2-NEXT:    [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
2407; AVX2-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> poison)
2408; AVX2-NEXT:    [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
2409; AVX2-NEXT:    [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
2410; AVX2-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> poison)
2411; AVX2-NEXT:    [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 12
2412; AVX2-NEXT:    [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
2413; AVX2-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> poison)
2414; AVX2-NEXT:    [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
2415; AVX2-NEXT:    [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
2416; AVX2-NEXT:    [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
2417; AVX2-NEXT:    [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
2418; AVX2-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
2419; AVX2-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
2420; AVX2-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
2421; AVX2-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
2422; AVX2-NEXT:    [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true>
2423; AVX2-NEXT:    [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true>
2424; AVX2-NEXT:    [[TMP50:%.*]] = xor <4 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true>
2425; AVX2-NEXT:    [[TMP51:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
2426; AVX2-NEXT:    [[TMP52:%.*]] = select <4 x i1> [[TMP28]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
2427; AVX2-NEXT:    [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x i1> zeroinitializer
2428; AVX2-NEXT:    [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer
2429; AVX2-NEXT:    [[TMP55:%.*]] = select <4 x i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
2430; AVX2-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
2431; AVX2-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
2432; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
2433; AVX2-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
2434; AVX2-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
2435; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
2436; AVX2-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
2437; AVX2-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
2438; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
2439; AVX2-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 12
2440; AVX2-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
2441; AVX2-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
2442; AVX2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
2443; AVX2-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2444; AVX2-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
2445; AVX2:       middle.block:
2446; AVX2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2447; AVX2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2448; AVX2:       scalar.ph:
2449; AVX2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
2450; AVX2-NEXT:    br label [[FOR_BODY:%.*]]
2451; AVX2:       for.body:
2452; AVX2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
2453; AVX2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
2454; AVX2-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
2455; AVX2-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
2456; AVX2-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
2457; AVX2-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
2458; AVX2:       land.lhs.true:
2459; AVX2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
2460; AVX2-NEXT:    [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
2461; AVX2-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
2462; AVX2-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
2463; AVX2:       if.then:
2464; AVX2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
2465; AVX2-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
2466; AVX2-NEXT:    br label [[FOR_INC]]
2467; AVX2:       for.inc:
2468; AVX2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
2469; AVX2-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
2470; AVX2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
2471; AVX2:       for.end.loopexit:
2472; AVX2-NEXT:    br label [[FOR_END]]
2473; AVX2:       for.end:
2474; AVX2-NEXT:    ret void
2475;
2476; AVX512-LABEL: @foo8(
2477; AVX512-NEXT:  entry:
2478; AVX512-NEXT:    [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
2479; AVX512-NEXT:    br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
2480; AVX512:       for.body.preheader:
2481; AVX512-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
2482; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
2483; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2484; AVX512:       vector.ph:
2485; AVX512-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
2486; AVX512-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
2487; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
2488; AVX512:       vector.body:
2489; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
2490; AVX512-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
2491; AVX512-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
2492; AVX512-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
2493; AVX512-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
2494; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[TMP0]]
2495; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP1]]
2496; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP2]]
2497; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[TMP3]]
2498; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
2499; AVX512-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
2500; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1
2501; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8
2502; AVX512-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
2503; AVX512-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1
2504; AVX512-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16
2505; AVX512-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
2506; AVX512-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1
2507; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24
2508; AVX512-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
2509; AVX512-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1
2510; AVX512-NEXT:    [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2511; AVX512-NEXT:    [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2512; AVX512-NEXT:    [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2513; AVX512-NEXT:    [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2514; AVX512-NEXT:    [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer
2515; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
2516; AVX512-NEXT:    [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
2517; AVX512-NEXT:    [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
2518; AVX512-NEXT:    [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], i64 [[TMP0]]
2519; AVX512-NEXT:    [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP1]]
2520; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP2]]
2521; AVX512-NEXT:    [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], i64 [[TMP3]]
2522; AVX512-NEXT:    [[TMP28:%.*]] = xor <8 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2523; AVX512-NEXT:    [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2524; AVX512-NEXT:    [[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2525; AVX512-NEXT:    [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2526; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 0
2527; AVX512-NEXT:    [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <8 x i32 ()*>*
2528; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x i32 ()*> poison)
2529; AVX512-NEXT:    [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 8
2530; AVX512-NEXT:    [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 x i32 ()*>*
2531; AVX512-NEXT:    [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x i32 ()*> poison)
2532; AVX512-NEXT:    [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 16
2533; AVX512-NEXT:    [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <8 x i32 ()*>*
2534; AVX512-NEXT:    [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x i32 ()*> poison)
2535; AVX512-NEXT:    [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 24
2536; AVX512-NEXT:    [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <8 x i32 ()*>*
2537; AVX512-NEXT:    [[WIDE_MASKED_LOAD6:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x i32 ()*> poison)
2538; AVX512-NEXT:    [[TMP40:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
2539; AVX512-NEXT:    [[TMP41:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer
2540; AVX512-NEXT:    [[TMP42:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer
2541; AVX512-NEXT:    [[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer
2542; AVX512-NEXT:    [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 [[TMP0]]
2543; AVX512-NEXT:    [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP1]]
2544; AVX512-NEXT:    [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP2]]
2545; AVX512-NEXT:    [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 [[TMP3]]
2546; AVX512-NEXT:    [[TMP48:%.*]] = xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2547; AVX512-NEXT:    [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2548; AVX512-NEXT:    [[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2549; AVX512-NEXT:    [[TMP51:%.*]] = xor <8 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
2550; AVX512-NEXT:    [[TMP52:%.*]] = select <8 x i1> [[TMP28]], <8 x i1> [[TMP48]], <8 x i1> zeroinitializer
2551; AVX512-NEXT:    [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 x i1> zeroinitializer
2552; AVX512-NEXT:    [[TMP54:%.*]] = select <8 x i1> [[TMP30]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer
2553; AVX512-NEXT:    [[TMP55:%.*]] = select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> zeroinitializer
2554; AVX512-NEXT:    [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
2555; AVX512-NEXT:    [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
2556; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
2557; AVX512-NEXT:    [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 8
2558; AVX512-NEXT:    [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
2559; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
2560; AVX512-NEXT:    [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 16
2561; AVX512-NEXT:    [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
2562; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
2563; AVX512-NEXT:    [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 24
2564; AVX512-NEXT:    [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
2565; AVX512-NEXT:    call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
2566; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
2567; AVX512-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
2568; AVX512-NEXT:    br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
2569; AVX512:       middle.block:
2570; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
2571; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
2572; AVX512:       scalar.ph:
2573; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
2574; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
2575; AVX512:       for.body:
2576; AVX512-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
2577; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
2578; AVX512-NEXT:    [[TMP65:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
2579; AVX512-NEXT:    [[TMP66:%.*]] = and i8 [[TMP65]], 1
2580; AVX512-NEXT:    [[TOBOOL:%.*]] = icmp eq i8 [[TMP66]], 0
2581; AVX512-NEXT:    br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
2582; AVX512:       land.lhs.true:
2583; AVX512-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
2584; AVX512-NEXT:    [[TMP67:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
2585; AVX512-NEXT:    [[CMP3:%.*]] = icmp eq i32 ()* [[TMP67]], null
2586; AVX512-NEXT:    br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
2587; AVX512:       if.then:
2588; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
2589; AVX512-NEXT:    store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
2590; AVX512-NEXT:    br label [[FOR_INC]]
2591; AVX512:       for.inc:
2592; AVX512-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
2593; AVX512-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
2594; AVX512-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
2595; AVX512:       for.end.loopexit:
2596; AVX512-NEXT:    br label [[FOR_END]]
2597; AVX512:       for.end:
2598; AVX512-NEXT:    ret void
2599;
2600entry:
2601  %cmp5 = icmp eq i32 %size, 0
2602  br i1 %cmp5, label %for.end, label %for.body.preheader
2603
2604for.body.preheader:                               ; preds = %entry
2605  %wide.trip.count = zext i32 %size to i64
2606  br label %for.body
2607
2608for.body:                                         ; preds = %for.inc, %for.body.preheader
2609  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
2610  %arrayidx = getelementptr inbounds i8, i8* %trigger, i64 %indvars.iv
2611  %0 = load i8, i8* %arrayidx, align 1
2612  %1 = and i8 %0, 1
2613  %tobool = icmp eq i8 %1, 0
2614  br i1 %tobool, label %for.inc, label %land.lhs.true
2615
2616land.lhs.true:                                    ; preds = %for.body
2617  %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %in, i64 %indvars.iv
2618  %2 = load i32 ()*, i32 ()** %arrayidx2, align 8
2619  %cmp3 = icmp eq i32 ()* %2, null
2620  br i1 %cmp3, label %for.inc, label %if.then
2621
2622if.then:                                          ; preds = %land.lhs.true
2623  %arrayidx5 = getelementptr inbounds double, double* %out, i64 %indvars.iv
2624  store double 5.000000e-01, double* %arrayidx5, align 8
2625  br label %for.inc
2626
2627for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then
2628  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
2629  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
2630  br i1 %exitcond, label %for.end, label %for.body
2631
2632for.end:                                          ; preds = %for.inc, %entry
2633  ret void
2634}
2635
2636attributes #0 = { norecurse nounwind }
2637