; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin -mattr=+avx %s | FileCheck %s --check-prefixes=CHECK,AVX

; Two mostly identical functions. The only difference is the presence of
; fast-math flags on the second. The loop is a pretty simple reduction:

; for (int i = 0; i < 32; ++i)
;   if (arr[i] != 42)
;     tot += arr[i];
define double @sumIfScalar(double* nocapture readonly %arr) {
; CHECK-LABEL: @sumIfScalar(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; CHECK-NEXT:    [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
; CHECK-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
; CHECK-NEXT:    [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
; CHECK-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; CHECK:       do.add:
; CHECK-NEXT:    [[TOT_NEW:%.*]] = fadd double [[TOT]], [[NEXTVAL]]
; CHECK-NEXT:    br label [[NEXT_ITER]]
; CHECK:       no.add:
; CHECK-NEXT:    br label [[NEXT_ITER]]
; CHECK:       next.iter:
; CHECK-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; CHECK-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
; CHECK:       done:
; CHECK-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
; CHECK-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.next, %next.iter]
  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]

  %addr = getelementptr double, double* %arr, i32 %i
  %nextval = load double, double* %addr

  ; No fast-math flags on the fcmp/fadd here, so the conditional fp
  ; reduction is not legal to vectorize: the scalar loop must survive.
  %tst = fcmp une double %nextval, 42.0
  br i1 %tst, label %do.add, label %no.add

do.add:
  %tot.new = fadd double %tot, %nextval
  br label %next.iter

no.add:
  br label %next.iter

next.iter:
  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
  %i.next = add i32 %i, 1
  %again = icmp ult i32 %i.next, 32
  br i1 %again, label %loop, label %done

done:
  ret double %tot.next
}
; Same loop as @sumIfScalar, but with 'fast' fast-math flags, which permit
; the predicated fadd reduction to be vectorized (VF=2 for SSE, VF=4 for AVX).
define double @sumIfVector(double* nocapture readonly %arr) {
; SSE-LABEL: @sumIfVector(
; SSE-NEXT:  entry:
; SSE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; SSE:       vector.ph:
; SSE-NEXT:    br label [[VECTOR_BODY:%.*]]
; SSE:       vector.body:
; SSE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; SSE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; SSE-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
; SSE-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
; SSE-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
; SSE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
; SSE-NEXT:    [[TMP4:%.*]] = fcmp fast une <2 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01>
; SSE-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; SSE-NEXT:    [[TMP6:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
; SSE-NEXT:    [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[VEC_PHI]]
; SSE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; SSE-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; SSE-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE:       middle.block:
; SSE-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[PREDPHI]])
; SSE-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
; SSE-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
; SSE:       scalar.ph:
; SSE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; SSE-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT:    br label [[LOOP:%.*]]
; SSE:       loop:
; SSE-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; SSE-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; SSE-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
; SSE-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
; SSE-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; SSE-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; SSE:       do.add:
; SSE-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
; SSE-NEXT:    br label [[NEXT_ITER]]
; SSE:       no.add:
; SSE-NEXT:    br label [[NEXT_ITER]]
; SSE:       next.iter:
; SSE-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; SSE-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; SSE-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; SSE-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
; SSE:       done:
; SSE-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; SSE-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
; AVX-LABEL: @sumIfVector(
; AVX-NEXT:  entry:
; AVX-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX:       vector.ph:
; AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
; AVX:       vector.body:
; AVX-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AVX-NEXT:    [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
; AVX-NEXT:    [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
; AVX-NEXT:    [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>*
; AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
; AVX-NEXT:    [[TMP4:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01>
; AVX-NEXT:    [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; AVX-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
; AVX-NEXT:    [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
; AVX-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; AVX-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
; AVX-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX:       middle.block:
; AVX-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
; AVX-NEXT:    [[CMP_N:%.*]] = icmp eq i32 32, 32
; AVX-NEXT:    br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
; AVX:       scalar.ph:
; AVX-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; AVX-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT:    br label [[LOOP:%.*]]
; AVX:       loop:
; AVX-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
; AVX-NEXT:    [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
; AVX-NEXT:    [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
; AVX-NEXT:    [[NEXTVAL:%.*]] = load double, double* [[ADDR]], align 8
; AVX-NEXT:    [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
; AVX-NEXT:    br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
; AVX:       do.add:
; AVX-NEXT:    [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
; AVX-NEXT:    br label [[NEXT_ITER]]
; AVX:       no.add:
; AVX-NEXT:    br label [[NEXT_ITER]]
; AVX:       next.iter:
; AVX-NEXT:    [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; AVX-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
; AVX-NEXT:    [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
; AVX-NEXT:    br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
; AVX:       done:
; AVX-NEXT:    [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT:    ret double [[TOT_NEXT_LCSSA]]
;
entry:
  br label %loop

loop:
  %i = phi i32 [0, %entry], [%i.next, %next.iter]
  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]

  %addr = getelementptr double, double* %arr, i32 %i
  %nextval = load double, double* %addr

  ; 'fast' flags make the reduction reassociable, so the vectorizer can
  ; turn the predicated fadd into a select + vector reduce.
  %tst = fcmp fast une double %nextval, 42.0
  br i1 %tst, label %do.add, label %no.add

do.add:
  %tot.new = fadd fast double %tot, %nextval
  br label %next.iter

no.add:
  br label %next.iter

next.iter:
  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
  %i.next = add i32 %i, 1
  %again = icmp ult i32 %i.next, 32
  br i1 %again, label %loop, label %done

done:
  ret double %tot.next
}