; Exercises the SLP vectorizer on scalar code whose results are reassembled
; into a vector through chains of insertelement instructions.
; The file is run through opt twice: once with -slp-threshold=-10000, verified
; against the default check prefix, and once with -slp-threshold=0, verified
; only against the ZEROTHRESH prefix.
1; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
2; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s -check-prefix=ZEROTHRESH
3target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
4
5target triple = "x86_64-apple-macosx10.8.0"
6
; Baseline case: a fully scalarized 4-lane "select a or b where c != 0".
; The expected output is a single vector compare of %c against zero followed
; directly by a single vector select between %a and %b.
7define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
8; CHECK-LABEL: @simple_select(
9; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
10; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
  ; Split every input vector into its four scalar lanes.
11  %c0 = extractelement <4 x i32> %c, i32 0
12  %c1 = extractelement <4 x i32> %c, i32 1
13  %c2 = extractelement <4 x i32> %c, i32 2
14  %c3 = extractelement <4 x i32> %c, i32 3
15  %a0 = extractelement <4 x float> %a, i32 0
16  %a1 = extractelement <4 x float> %a, i32 1
17  %a2 = extractelement <4 x float> %a, i32 2
18  %a3 = extractelement <4 x float> %a, i32 3
19  %b0 = extractelement <4 x float> %b, i32 0
20  %b1 = extractelement <4 x float> %b, i32 1
21  %b2 = extractelement <4 x float> %b, i32 2
22  %b3 = extractelement <4 x float> %b, i32 3
  ; Per-lane condition: the lane of %c is non-zero.
23  %cmp0 = icmp ne i32 %c0, 0
24  %cmp1 = icmp ne i32 %c1, 0
25  %cmp2 = icmp ne i32 %c2, 0
26  %cmp3 = icmp ne i32 %c3, 0
  ; Per-lane choice between the matching lanes of %a and %b.
27  %s0 = select i1 %cmp0, float %a0, float %b0
28  %s1 = select i1 %cmp1, float %a1, float %b1
29  %s2 = select i1 %cmp2, float %a2, float %b2
30  %s3 = select i1 %cmp3, float %a3, float %b3
  ; Rebuild the result vector, inserting lane i at index i, starting from undef.
31  %ra = insertelement <4 x float> undef, float %s0, i32 0
32  %rb = insertelement <4 x float> %ra, float %s1, i32 1
33  %rc = insertelement <4 x float> %rb, float %s2, i32 2
34  %rd = insertelement <4 x float> %rc, float %s3, i32 3
35  ret <4 x float> %rd
36}
37
38; Insert in an order different from the vector indices to make sure it
39; doesn't matter
40define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
41; CHECK-LABEL: @simple_select_insert_out_of_order(
42; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
43; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
  ; Same scalar extract / compare / select pattern as @simple_select.
44  %c0 = extractelement <4 x i32> %c, i32 0
45  %c1 = extractelement <4 x i32> %c, i32 1
46  %c2 = extractelement <4 x i32> %c, i32 2
47  %c3 = extractelement <4 x i32> %c, i32 3
48  %a0 = extractelement <4 x float> %a, i32 0
49  %a1 = extractelement <4 x float> %a, i32 1
50  %a2 = extractelement <4 x float> %a, i32 2
51  %a3 = extractelement <4 x float> %a, i32 3
52  %b0 = extractelement <4 x float> %b, i32 0
53  %b1 = extractelement <4 x float> %b, i32 1
54  %b2 = extractelement <4 x float> %b, i32 2
55  %b3 = extractelement <4 x float> %b, i32 3
56  %cmp0 = icmp ne i32 %c0, 0
57  %cmp1 = icmp ne i32 %c1, 0
58  %cmp2 = icmp ne i32 %c2, 0
59  %cmp3 = icmp ne i32 %c3, 0
60  %s0 = select i1 %cmp0, float %a0, float %b0
61  %s1 = select i1 %cmp1, float %a1, float %b1
62  %s2 = select i1 %cmp2, float %a2, float %b2
63  %s3 = select i1 %cmp3, float %a3, float %b3
  ; The insert indices are 2, 1, 0, 3: %s0 lands in lane 2 and %s2 in lane 0,
  ; so the returned vector has lanes 0 and 2 swapped relative to @simple_select.
  ; NOTE(review): the checks above only require the vector compare and select
  ; to appear first; nothing verifies that this lane swap is preserved in the
  ; output (for example by a later shuffle). Confirm whether that is intended.
64  %ra = insertelement <4 x float> undef, float %s0, i32 2
65  %rb = insertelement <4 x float> %ra, float %s1, i32 1
66  %rc = insertelement <4 x float> %rb, float %s2, i32 0
67  %rd = insertelement <4 x float> %rc, float %s3, i32 3
68  ret <4 x float> %rd
69}
70
; External functions available as extra users of values in the tests.
; @v4f32_user is called from @simple_select_users; @f32_user is not referenced
; by any function visible in this file.
71declare void @v4f32_user(<4 x float>) #0
72declare void @f32_user(float) #0
73
74; Multiple users of the final constructed vector
75define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
76; CHECK-LABEL: @simple_select_users(
77; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
78; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
  ; Same scalar extract / compare / select / insert pattern as @simple_select.
79  %c0 = extractelement <4 x i32> %c, i32 0
80  %c1 = extractelement <4 x i32> %c, i32 1
81  %c2 = extractelement <4 x i32> %c, i32 2
82  %c3 = extractelement <4 x i32> %c, i32 3
83  %a0 = extractelement <4 x float> %a, i32 0
84  %a1 = extractelement <4 x float> %a, i32 1
85  %a2 = extractelement <4 x float> %a, i32 2
86  %a3 = extractelement <4 x float> %a, i32 3
87  %b0 = extractelement <4 x float> %b, i32 0
88  %b1 = extractelement <4 x float> %b, i32 1
89  %b2 = extractelement <4 x float> %b, i32 2
90  %b3 = extractelement <4 x float> %b, i32 3
91  %cmp0 = icmp ne i32 %c0, 0
92  %cmp1 = icmp ne i32 %c1, 0
93  %cmp2 = icmp ne i32 %c2, 0
94  %cmp3 = icmp ne i32 %c3, 0
95  %s0 = select i1 %cmp0, float %a0, float %b0
96  %s1 = select i1 %cmp1, float %a1, float %b1
97  %s2 = select i1 %cmp2, float %a2, float %b2
98  %s3 = select i1 %cmp3, float %a3, float %b3
99  %ra = insertelement <4 x float> undef, float %s0, i32 0
100  %rb = insertelement <4 x float> %ra, float %s1, i32 1
101  %rc = insertelement <4 x float> %rb, float %s2, i32 2
102  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ; The fully built vector %rd has two users: this call and the return below.
103  call void @v4f32_user(<4 x float> %rd) #0
104  ret <4 x float> %rd
105}
106
107; Unused insertelement
108define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
109; CHECK-LABEL: @simple_select_no_users(
110; CHECK-NOT: icmp ne <4 x i32>
111; CHECK-NOT: select <4 x i1>
  ; The negative checks above require that no 4-wide vector compare or select
  ; appears in this function's output.
112  %c0 = extractelement <4 x i32> %c, i32 0
113  %c1 = extractelement <4 x i32> %c, i32 1
114  %c2 = extractelement <4 x i32> %c, i32 2
115  %c3 = extractelement <4 x i32> %c, i32 3
116  %a0 = extractelement <4 x float> %a, i32 0
117  %a1 = extractelement <4 x float> %a, i32 1
118  %a2 = extractelement <4 x float> %a, i32 2
119  %a3 = extractelement <4 x float> %a, i32 3
120  %b0 = extractelement <4 x float> %b, i32 0
121  %b1 = extractelement <4 x float> %b, i32 1
122  %b2 = extractelement <4 x float> %b, i32 2
123  %b3 = extractelement <4 x float> %b, i32 3
124  %cmp0 = icmp ne i32 %c0, 0
125  %cmp1 = icmp ne i32 %c1, 0
126  %cmp2 = icmp ne i32 %c2, 0
127  %cmp3 = icmp ne i32 %c3, 0
128  %s0 = select i1 %cmp0, float %a0, float %b0
129  %s1 = select i1 %cmp1, float %a1, float %b1
130  %s2 = select i1 %cmp2, float %a2, float %b2
131  %s3 = select i1 %cmp3, float %a3, float %b3
  ; Two separate insert chains: %ra/%rb fill lanes 0 and 1 but %rb is never
  ; used, because %rc restarts from undef instead of continuing from %rb.
  ; Only the second chain (lanes 2 and 3) reaches the return.
132  %ra = insertelement <4 x float> undef, float %s0, i32 0
133  %rb = insertelement <4 x float> %ra, float %s1, i32 1
134  %rc = insertelement <4 x float> undef, float %s2, i32 2
135  %rd = insertelement <4 x float> %rc, float %s3, i32 3
136  ret <4 x float> %rd
137}
138
139; Make sure infinite loop doesn't happen which I ran into when trying
140; to do this backwards this backwards
141define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
142; CHECK-LABEL: @reconstruct(
  ; Only the function label is checked; no particular output is required.
  ; The body extracts every lane of %c and re-inserts each one at its original
  ; index, with no arithmetic between the extracts and the inserts.
143  %c0 = extractelement <4 x i32> %c, i32 0
144  %c1 = extractelement <4 x i32> %c, i32 1
145  %c2 = extractelement <4 x i32> %c, i32 2
146  %c3 = extractelement <4 x i32> %c, i32 3
147  %ra = insertelement <4 x i32> undef, i32 %c0, i32 0
148  %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
149  %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
150  %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
151  ret <4 x i32> %rd
152}
153
; Two-lane variant of @simple_select: the expected output contains a 2-wide
; vector compare followed (not necessarily immediately) by a 2-wide select.
154define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
155; CHECK-LABEL: @simple_select_v2(
156; CHECK: icmp ne <2 x i32>
157; CHECK: select <2 x i1>
  ; Split the inputs into scalar lanes.
158  %c0 = extractelement <2 x i32> %c, i32 0
159  %c1 = extractelement <2 x i32> %c, i32 1
160  %a0 = extractelement <2 x float> %a, i32 0
161  %a1 = extractelement <2 x float> %a, i32 1
162  %b0 = extractelement <2 x float> %b, i32 0
163  %b1 = extractelement <2 x float> %b, i32 1
  ; Per-lane compare against zero and per-lane select.
164  %cmp0 = icmp ne i32 %c0, 0
165  %cmp1 = icmp ne i32 %c1, 0
166  %s0 = select i1 %cmp0, float %a0, float %b0
167  %s1 = select i1 %cmp1, float %a1, float %b1
  ; Rebuild the 2-lane result in index order.
168  %ra = insertelement <2 x float> undef, float %s0, i32 0
169  %rb = insertelement <2 x float> %ra, float %s1, i32 1
170  ret <2 x float> %rb
171}
172
173; Make sure when we construct partial vectors, we don't keep
174; re-visiting the insertelement chains starting with undef
175; (low cost threshold needed to force this to happen)
176define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_partial_vector(
  ; The label directive above opens a separate check region for this function.
  ; Without it, the unanchored 2-wide compare/select checks of the preceding
  ; function could be satisfied by the 2-wide icmp and select that already
  ; exist in this body, even if that function was not vectorized.
  ; No specific output is required here; the test passes as long as opt
  ; finishes on this input.
  ;
  ; Input is already partially vectorized: lanes 0 and 1 of %c, %a and %b are
  ; extracted and packed into 2-wide vectors built from undef.
177  %c0 = extractelement <4 x i32> %c, i32 0
178  %c1 = extractelement <4 x i32> %c, i32 1
179  %a0 = extractelement <4 x float> %a, i32 0
180  %a1 = extractelement <4 x float> %a, i32 1
181  %b0 = extractelement <4 x float> %b, i32 0
182  %b1 = extractelement <4 x float> %b, i32 1
183  %1 = insertelement <2 x i32> undef, i32 %c0, i32 0
184  %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
185  %3 = icmp ne <2 x i32> %2, zeroinitializer
186  %4 = insertelement <2 x float> undef, float %a0, i32 0
187  %5 = insertelement <2 x float> %4, float %a1, i32 1
188  %6 = insertelement <2 x float> undef, float %b0, i32 0
189  %7 = insertelement <2 x float> %6, float %b1, i32 1
190  %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
  ; The two selected lanes are extracted again and inserted into lanes 0 and 1
  ; of a 4-wide vector that starts from undef; lanes 2 and 3 are never written.
191  %9 = extractelement <2 x float> %8, i32 0
192  %ra = insertelement <4 x float> undef, float %9, i32 0
193  %10 = extractelement <2 x float> %8, i32 1
194  %rb = insertelement <4 x float> %ra, float %10, i32 1
195  ret <4 x float> %rb
196}
197
198; Make sure that vectorization happens even if insertelements operations
199; must be rescheduled. The case here is from compiling Julia.
200define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
201; CHECK-LABEL: @reschedule_extract(
202; CHECK: %1 = fadd <4 x float> %a, %b
  ; Each lane's extract / fadd / insert sequence is emitted together, so the
  ; insertelement chain %v0..%v3 is interleaved with the scalar arithmetic
  ; instead of being grouped at the end. The expected output is a single
  ; vector fadd of %a and %b.
  ; Lane 0.
203  %a0 = extractelement <4 x float> %a, i32 0
204  %b0 = extractelement <4 x float> %b, i32 0
205  %c0 = fadd float %a0, %b0
206  %v0 = insertelement <4 x float> undef, float %c0, i32 0
  ; Lane 1.
207  %a1 = extractelement <4 x float> %a, i32 1
208  %b1 = extractelement <4 x float> %b, i32 1
209  %c1 = fadd float %a1, %b1
210  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
  ; Lane 2.
211  %a2 = extractelement <4 x float> %a, i32 2
212  %b2 = extractelement <4 x float> %b, i32 2
213  %c2 = fadd float %a2, %b2
214  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
  ; Lane 3.
215  %a3 = extractelement <4 x float> %a, i32 3
216  %b3 = extractelement <4 x float> %b, i32 3
217  %c3 = fadd float %a3, %b3
218  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
219  ret <4 x float> %v3
220}
221
222; Check that cost model for vectorization takes credit for
223; instructions that are erased.
224define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
225; ZEROTHRESH-LABEL: @take_credit(
226; ZEROTHRESH: %1 = fadd <4 x float> %a, %b
  ; This function is verified only by the -slp-threshold=0 invocation, which
  ; uses the ZEROTHRESH prefix. The expected output is a single vector fadd of
  ; %a and %b. The directive above must be spelled as the bare prefix followed
  ; by a colon; any other suffix is not recognised by FileCheck and the
  ; expectation would be silently skipped.
  ;
  ; All four scalar lane additions come first ...
227  %a0 = extractelement <4 x float> %a, i32 0
228  %b0 = extractelement <4 x float> %b, i32 0
229  %c0 = fadd float %a0, %b0
230  %a1 = extractelement <4 x float> %a, i32 1
231  %b1 = extractelement <4 x float> %b, i32 1
232  %c1 = fadd float %a1, %b1
233  %a2 = extractelement <4 x float> %a, i32 2
234  %b2 = extractelement <4 x float> %b, i32 2
235  %c2 = fadd float %a2, %b2
236  %a3 = extractelement <4 x float> %a, i32 3
237  %b3 = extractelement <4 x float> %b, i32 3
238  %c3 = fadd float %a3, %b3
  ; ... followed by the insertelement chain that rebuilds the result in index
  ; order, starting from undef.
239  %v0 = insertelement <4 x float> undef, float %c0, i32 0
240  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
241  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
242  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
243  ret <4 x float> %v3
244}
245
; Attribute group #0, referenced by the declarations and by every function in
; this file whose signature ends in #0 (the last two functions do not use it).
246attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
247