1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
3; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX
4
5;
6; Check that we can commute operands based on the predicate.
7;
8
; Integer equality with mixed operand order: lanes 0 and 3 compare (a, b) while
; lanes 1 and 2 compare (b, a). The assertions below expect a single <4 x i32>
; load through %b and a single vector "icmp eq" whose first operand is the
; loaded vector, followed by the sign-extension.
9define <4 x i32> @icmp_eq_v4i32(<4 x i32> %a, i32* %b) {
10; CHECK-LABEL: @icmp_eq_v4i32(
11; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
12; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
13; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[A:%.*]]
14; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
15; CHECK-NEXT:    ret <4 x i32> [[R]]
16;
; Split %a into its four scalar lanes.
17  %a0 = extractelement <4 x i32> %a, i32 0
18  %a1 = extractelement <4 x i32> %a, i32 1
19  %a2 = extractelement <4 x i32> %a, i32 2
20  %a3 = extractelement <4 x i32> %a, i32 3
; Address and load four consecutive i32 elements starting at %b.
21  %p0 = getelementptr inbounds i32, i32* %b, i32 0
22  %p1 = getelementptr inbounds i32, i32* %b, i32 1
23  %p2 = getelementptr inbounds i32, i32* %b, i32 2
24  %p3 = getelementptr inbounds i32, i32* %b, i32 3
25  %b0 = load i32, i32* %p0, align 4
26  %b1 = load i32, i32* %p1, align 4
27  %b2 = load i32, i32* %p2, align 4
28  %b3 = load i32, i32* %p3, align 4
; Per-lane compares: (a, b) in lanes 0 and 3, (b, a) in lanes 1 and 2.
29  %c0 = icmp eq i32 %a0, %b0
30  %c1 = icmp eq i32 %b1, %a1
31  %c2 = icmp eq i32 %b2, %a2
32  %c3 = icmp eq i32 %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
33  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
34  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
35  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
36  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
37  %r = sext <4 x i1> %d3 to <4 x i32>
38  ret <4 x i32> %r
39}
40
; Integer inequality with mixed operand order: lanes 0 and 3 compare (a, b)
; while lanes 1 and 2 compare (b, a). The assertions below expect a single
; <4 x i32> load through %b and a single vector "icmp ne" whose first operand
; is the loaded vector, followed by the sign-extension.
41define <4 x i32> @icmp_ne_v4i32(<4 x i32> %a, i32* %b) {
42; CHECK-LABEL: @icmp_ne_v4i32(
43; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
44; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
45; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], [[A:%.*]]
46; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
47; CHECK-NEXT:    ret <4 x i32> [[R]]
48;
; Split %a into its four scalar lanes.
49  %a0 = extractelement <4 x i32> %a, i32 0
50  %a1 = extractelement <4 x i32> %a, i32 1
51  %a2 = extractelement <4 x i32> %a, i32 2
52  %a3 = extractelement <4 x i32> %a, i32 3
; Address and load four consecutive i32 elements starting at %b.
53  %p0 = getelementptr inbounds i32, i32* %b, i32 0
54  %p1 = getelementptr inbounds i32, i32* %b, i32 1
55  %p2 = getelementptr inbounds i32, i32* %b, i32 2
56  %p3 = getelementptr inbounds i32, i32* %b, i32 3
57  %b0 = load i32, i32* %p0, align 4
58  %b1 = load i32, i32* %p1, align 4
59  %b2 = load i32, i32* %p2, align 4
60  %b3 = load i32, i32* %p3, align 4
; Per-lane compares: (a, b) in lanes 0 and 3, (b, a) in lanes 1 and 2.
61  %c0 = icmp ne i32 %a0, %b0
62  %c1 = icmp ne i32 %b1, %a1
63  %c2 = icmp ne i32 %b2, %a2
64  %c3 = icmp ne i32 %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
65  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
66  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
67  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
68  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
69  %r = sext <4 x i1> %d3 to <4 x i32>
70  ret <4 x i32> %r
71}
72
; Floating-point ordered-equal with mixed operand order: lanes 0 and 3 compare
; (a, b) while lanes 1 and 2 compare (b, a). The assertions below expect a
; single <4 x float> load through %b and a single vector "fcmp oeq" whose first
; operand is the loaded vector. The result is still <4 x i32> (sign-extended
; compare mask) even though the inputs are float.
73define <4 x i32> @fcmp_oeq_v4i32(<4 x float> %a, float* %b) {
74; CHECK-LABEL: @fcmp_oeq_v4i32(
75; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
76; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
77; CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], [[A:%.*]]
78; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
79; CHECK-NEXT:    ret <4 x i32> [[R]]
80;
; Split %a into its four scalar lanes.
81  %a0 = extractelement <4 x float> %a, i32 0
82  %a1 = extractelement <4 x float> %a, i32 1
83  %a2 = extractelement <4 x float> %a, i32 2
84  %a3 = extractelement <4 x float> %a, i32 3
; Address and load four consecutive float elements starting at %b.
85  %p0 = getelementptr inbounds float, float* %b, i32 0
86  %p1 = getelementptr inbounds float, float* %b, i32 1
87  %p2 = getelementptr inbounds float, float* %b, i32 2
88  %p3 = getelementptr inbounds float, float* %b, i32 3
89  %b0 = load float, float* %p0, align 4
90  %b1 = load float, float* %p1, align 4
91  %b2 = load float, float* %p2, align 4
92  %b3 = load float, float* %p3, align 4
; Per-lane compares: (a, b) in lanes 0 and 3, (b, a) in lanes 1 and 2.
93  %c0 = fcmp oeq float %a0, %b0
94  %c1 = fcmp oeq float %b1, %a1
95  %c2 = fcmp oeq float %b2, %a2
96  %c3 = fcmp oeq float %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
97  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
98  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
99  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
100  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
101  %r = sext <4 x i1> %d3 to <4 x i32>
102  ret <4 x i32> %r
103}
104
; Floating-point unordered test with mixed operand order: lanes 0 and 3 compare
; (a, b) while lanes 1 and 2 compare (b, a), all with the same "uno" predicate.
; The assertions below expect a single <4 x float> load through %b and a single
; vector "fcmp uno" whose first operand is the loaded vector.
105define <4 x i32> @fcmp_uno_v4i32(<4 x float> %a, float* %b) {
106; CHECK-LABEL: @fcmp_uno_v4i32(
107; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
108; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
109; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A:%.*]]
110; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
111; CHECK-NEXT:    ret <4 x i32> [[R]]
112;
; Split %a into its four scalar lanes.
113  %a0 = extractelement <4 x float> %a, i32 0
114  %a1 = extractelement <4 x float> %a, i32 1
115  %a2 = extractelement <4 x float> %a, i32 2
116  %a3 = extractelement <4 x float> %a, i32 3
; Address and load four consecutive float elements starting at %b.
117  %p0 = getelementptr inbounds float, float* %b, i32 0
118  %p1 = getelementptr inbounds float, float* %b, i32 1
119  %p2 = getelementptr inbounds float, float* %b, i32 2
120  %p3 = getelementptr inbounds float, float* %b, i32 3
121  %b0 = load float, float* %p0, align 4
122  %b1 = load float, float* %p1, align 4
123  %b2 = load float, float* %p2, align 4
124  %b3 = load float, float* %p3, align 4
; Per-lane compares: (a, b) in lanes 0 and 3, (b, a) in lanes 1 and 2.
125  %c0 = fcmp uno float %a0, %b0
126  %c1 = fcmp uno float %b1, %a1
127  %c2 = fcmp uno float %b2, %a2
128  %c3 = fcmp uno float %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
129  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
130  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
131  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
132  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
133  %r = sext <4 x i1> %d3 to <4 x i32>
134  ret <4 x i32> %r
135}
136
137;
138; Check that we can commute operands by swapping the predicate.
139;
140
; Signed compare written two ways: lanes 0 and 3 use "icmp sgt" on (a, b),
; lanes 1 and 2 use "icmp slt" on (b, a). Both spell the same relation with the
; operands swapped, and the assertions below expect them to merge into a single
; vector "icmp slt" whose first operand is the <4 x i32> loaded through %b.
141define <4 x i32> @icmp_sgt_slt_v4i32(<4 x i32> %a, i32* %b) {
142; CHECK-LABEL: @icmp_sgt_slt_v4i32(
143; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
144; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
145; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP2]], [[A:%.*]]
146; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
147; CHECK-NEXT:    ret <4 x i32> [[R]]
148;
; Split %a into its four scalar lanes.
149  %a0 = extractelement <4 x i32> %a, i32 0
150  %a1 = extractelement <4 x i32> %a, i32 1
151  %a2 = extractelement <4 x i32> %a, i32 2
152  %a3 = extractelement <4 x i32> %a, i32 3
; Address and load four consecutive i32 elements starting at %b.
153  %p0 = getelementptr inbounds i32, i32* %b, i32 0
154  %p1 = getelementptr inbounds i32, i32* %b, i32 1
155  %p2 = getelementptr inbounds i32, i32* %b, i32 2
156  %p3 = getelementptr inbounds i32, i32* %b, i32 3
157  %b0 = load i32, i32* %p0, align 4
158  %b1 = load i32, i32* %p1, align 4
159  %b2 = load i32, i32* %p2, align 4
160  %b3 = load i32, i32* %p3, align 4
; Per-lane compares: sgt (a, b) in lanes 0 and 3, slt (b, a) in lanes 1 and 2.
161  %c0 = icmp sgt i32 %a0, %b0
162  %c1 = icmp slt i32 %b1, %a1
163  %c2 = icmp slt i32 %b2, %a2
164  %c3 = icmp sgt i32 %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
165  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
166  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
167  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
168  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
169  %r = sext <4 x i1> %d3 to <4 x i32>
170  ret <4 x i32> %r
171}
172
; Unsigned compare written two ways: lanes 0 and 3 use "icmp uge" on (a, b),
; lanes 1 and 2 use "icmp ule" on (b, a). Both spell the same relation with the
; operands swapped, and the assertions below expect them to merge into a single
; vector "icmp ule" whose first operand is the <4 x i32> loaded through %b.
173define <4 x i32> @icmp_uge_ule_v4i32(<4 x i32> %a, i32* %b) {
174; CHECK-LABEL: @icmp_uge_ule_v4i32(
175; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
176; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
177; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule <4 x i32> [[TMP2]], [[A:%.*]]
178; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
179; CHECK-NEXT:    ret <4 x i32> [[R]]
180;
; Split %a into its four scalar lanes.
181  %a0 = extractelement <4 x i32> %a, i32 0
182  %a1 = extractelement <4 x i32> %a, i32 1
183  %a2 = extractelement <4 x i32> %a, i32 2
184  %a3 = extractelement <4 x i32> %a, i32 3
; Address and load four consecutive i32 elements starting at %b.
185  %p0 = getelementptr inbounds i32, i32* %b, i32 0
186  %p1 = getelementptr inbounds i32, i32* %b, i32 1
187  %p2 = getelementptr inbounds i32, i32* %b, i32 2
188  %p3 = getelementptr inbounds i32, i32* %b, i32 3
189  %b0 = load i32, i32* %p0, align 4
190  %b1 = load i32, i32* %p1, align 4
191  %b2 = load i32, i32* %p2, align 4
192  %b3 = load i32, i32* %p3, align 4
; Per-lane compares: uge (a, b) in lanes 0 and 3, ule (b, a) in lanes 1 and 2.
193  %c0 = icmp uge i32 %a0, %b0
194  %c1 = icmp ule i32 %b1, %a1
195  %c2 = icmp ule i32 %b2, %a2
196  %c3 = icmp uge i32 %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
197  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
198  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
199  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
200  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
201  %r = sext <4 x i1> %d3 to <4 x i32>
202  ret <4 x i32> %r
203}
204
; Ordered floating-point compare written two ways: lanes 0 and 3 use "fcmp ogt"
; on (a, b), lanes 1 and 2 use "fcmp olt" on (b, a). Both spell the same
; relation with the operands swapped, and the assertions below expect them to
; merge into a single vector "fcmp olt" whose first operand is the <4 x float>
; loaded through %b.
205define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
206; CHECK-LABEL: @fcmp_ogt_olt_v4i32(
207; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
208; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
209; CHECK-NEXT:    [[TMP3:%.*]] = fcmp olt <4 x float> [[TMP2]], [[A:%.*]]
210; CHECK-NEXT:    [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
211; CHECK-NEXT:    ret <4 x i32> [[R]]
212;
; Split %a into its four scalar lanes.
213  %a0 = extractelement <4 x float> %a, i32 0
214  %a1 = extractelement <4 x float> %a, i32 1
215  %a2 = extractelement <4 x float> %a, i32 2
216  %a3 = extractelement <4 x float> %a, i32 3
; Address and load four consecutive float elements starting at %b.
217  %p0 = getelementptr inbounds float, float* %b, i32 0
218  %p1 = getelementptr inbounds float, float* %b, i32 1
219  %p2 = getelementptr inbounds float, float* %b, i32 2
220  %p3 = getelementptr inbounds float, float* %b, i32 3
221  %b0 = load float, float* %p0, align 4
222  %b1 = load float, float* %p1, align 4
223  %b2 = load float, float* %p2, align 4
224  %b3 = load float, float* %p3, align 4
; Per-lane compares: ogt (a, b) in lanes 0 and 3, olt (b, a) in lanes 1 and 2.
225  %c0 = fcmp ogt float %a0, %b0
226  %c1 = fcmp olt float %b1, %a1
227  %c2 = fcmp olt float %b2, %a2
228  %c3 = fcmp ogt float %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
229  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
230  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
231  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
232  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
233  %r = sext <4 x i1> %d3 to <4 x i32>
234  ret <4 x i32> %r
235}
236
; Mixed predicates: lanes 0 and 3 use "fcmp ord" on (a, b), lanes 1 and 2 use
; "fcmp uno" on (b, a). Unlike the other functions in this file, the assertions
; do not expect one <4 x float> compare, and they differ per check prefix:
;   - under the SSE prefix both pairs become <2 x float> compares (uno on lanes
;     1 and 2, ord on lanes 3 and 0) that are recombined with shuffles;
;   - under the AVX prefix only the uno pair becomes a <2 x float> compare and
;     the two ord compares remain scalar.
; NOTE(review): ord and uno are complementary predicates rather than
; operand-swapped forms of each other, which is presumably why this case does
; not collapse into a single compare -- confirm against the LangRef fcmp
; description before relying on this explanation.
237define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
238; SSE-LABEL: @fcmp_ord_uno_v4i32(
239; SSE-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
240; SSE-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
241; SSE-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
242; SSE-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
243; SSE-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
244; SSE-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
245; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
246; SSE-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
247; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
248; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i64 0
249; SSE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i64 1
250; SSE-NEXT:    [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
251; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
252; SSE-NEXT:    [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
253; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
254; SSE-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
255; SSE-NEXT:    [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
256; SSE-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
257; SSE-NEXT:    ret <4 x i32> [[R]]
258;
259; AVX-LABEL: @fcmp_ord_uno_v4i32(
260; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
261; AVX-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
262; AVX-NEXT:    [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
263; AVX-NEXT:    [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
264; AVX-NEXT:    [[B0:%.*]] = load float, float* [[B]], align 4
265; AVX-NEXT:    [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
266; AVX-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
267; AVX-NEXT:    [[B3:%.*]] = load float, float* [[P3]], align 4
268; AVX-NEXT:    [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
269; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
270; AVX-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
271; AVX-NEXT:    [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
272; AVX-NEXT:    [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i64 0
273; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
274; AVX-NEXT:    [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
275; AVX-NEXT:    [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i64 3
276; AVX-NEXT:    [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
277; AVX-NEXT:    ret <4 x i32> [[R]]
278;
; Split %a into its four scalar lanes.
279  %a0 = extractelement <4 x float> %a, i32 0
280  %a1 = extractelement <4 x float> %a, i32 1
281  %a2 = extractelement <4 x float> %a, i32 2
282  %a3 = extractelement <4 x float> %a, i32 3
; Address and load four consecutive float elements starting at %b.
283  %p0 = getelementptr inbounds float, float* %b, i32 0
284  %p1 = getelementptr inbounds float, float* %b, i32 1
285  %p2 = getelementptr inbounds float, float* %b, i32 2
286  %p3 = getelementptr inbounds float, float* %b, i32 3
287  %b0 = load float, float* %p0, align 4
288  %b1 = load float, float* %p1, align 4
289  %b2 = load float, float* %p2, align 4
290  %b3 = load float, float* %p3, align 4
; Per-lane compares: ord (a, b) in lanes 0 and 3, uno (b, a) in lanes 1 and 2.
291  %c0 = fcmp ord float %a0, %b0
292  %c1 = fcmp uno float %b1, %a1
293  %c2 = fcmp uno float %b2, %a2
294  %c3 = fcmp ord float %a3, %b3
; Rebuild a <4 x i1> from the scalar results and sign-extend each lane to i32.
295  %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
296  %d1 = insertelement <4 x i1>   %d0, i1 %c1, i32 1
297  %d2 = insertelement <4 x i1>   %d1, i1 %c2, i32 2
298  %d3 = insertelement <4 x i1>   %d2, i1 %c3, i32 3
299  %r = sext <4 x i1> %d3 to <4 x i32>
300  ret <4 x i32> %r
301}
302