; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s

; Every function below feeds scalar element-wise arithmetic through the SLP
; vectorizer followed by instcombine (see the command line above) and pins the
; resulting IR. The module is configured for a little-endian AArch64 Linux
; target via the two "target" lines that follow.
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; build_vec_v2i64: lane-wise add and sub of two <2 x i64> arguments, then a
; cross-lane add of each pair, packed back into a vector with insertelement:
;   result[0] = (v0[0] + v1[0]) + (v0[1] + v1[1])
;   result[1] = (v0[0] - v1[0]) + (v0[1] - v1[1])
; The assertions expect one vector add, one vector sub, two shufflevectors
; (masks <1, 2> and <0, 3>) that regroup the lanes, and a final vector add.
define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
; CHECK-LABEL: @build_vec_v2i64(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    ret <2 x i64> [[TMP5]]
;
  ; Scalarize both input vectors.
  %v0.0 = extractelement <2 x i64> %v0, i32 0
  %v0.1 = extractelement <2 x i64> %v0, i32 1
  %v1.0 = extractelement <2 x i64> %v1, i32 0
  %v1.1 = extractelement <2 x i64> %v1, i32 1
  ; Per-lane sum (tmp0.*) and difference (tmp1.*).
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  ; Cross-lane adds: both sums together, both differences together.
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  ; Rebuild the result vector one lane at a time, starting from undef.
  %tmp3.0 = insertelement <2 x i64> undef, i64 %tmp2.0, i32 0
  %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
  ret <2 x i64> %tmp3.1
}

; store_chain_v2i64: the same add/sub-then-cross-lane-add computation as a
; two-element build vector, but with operands loaded from memory and results
; stored to memory:
;   c[0] = (a[0] + b[0]) + (a[1] + b[1])
;   c[1] = (a[0] - b[0]) + (a[1] - b[1])
; The assertions expect the scalar loads and stores to become <2 x i64>
; loads/stores through bitcast pointers, with a vector add, a vector sub,
; two shufflevectors (masks <1, 2> and <0, 3>) and a vector add in between.
define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
; CHECK-LABEL: @store_chain_v2i64(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>*
; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 8
; CHECK-NEXT:    ret void
;
  ; Addresses of elements 0 and 1 of each of the three arrays.
  %a.0 = getelementptr i64, i64* %a, i64 0
  %a.1 = getelementptr i64, i64* %a, i64 1
  %b.0 = getelementptr i64, i64* %b, i64 0
  %b.1 = getelementptr i64, i64* %b, i64 1
  %c.0 = getelementptr i64, i64* %c, i64 0
  %c.1 = getelementptr i64, i64* %c, i64 1
  ; Scalar loads of adjacent elements from a and b.
  %v0.0 = load i64, i64* %a.0, align 8
  %v0.1 = load i64, i64* %a.1, align 8
  %v1.0 = load i64, i64* %b.0, align 8
  %v1.1 = load i64, i64* %b.1, align 8
  ; Per-lane sum (tmp0.*) and difference (tmp1.*).
  %tmp0.0 = add i64 %v0.0, %v1.0
  %tmp0.1 = add i64 %v0.1, %v1.1
  %tmp1.0 = sub i64 %v0.0, %v1.0
  %tmp1.1 = sub i64 %v0.1, %v1.1
  ; Cross-lane adds: both sums together, both differences together.
  %tmp2.0 = add i64 %tmp0.0, %tmp0.1
  %tmp2.1 = add i64 %tmp1.0, %tmp1.1
  ; Scalar stores to adjacent elements of c.
  store i64 %tmp2.0, i64* %c.0, align 8
  store i64 %tmp2.1, i64* %c.1, align 8
  ret void
}

; build_vec_v4i32: four-lane variant of the add/sub build vector. Writing
; s[i] = v0[i] + v1[i] and d[i] = v0[i] - v1[i], the result is
;   <s[0] + s[1], d[0] + d[1], s[2] + s[3], d[2] + d[3]>
; The assertions expect one vector add, one vector sub, two shufflevectors
; (masks <0, 5, 3, 6> and <1, 4, 2, 7>) and a final vector add.
define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
;
  ; Scalarize both input vectors.
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  ; Per-lane sums.
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  ; Per-lane differences.
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  ; Pairwise adds: sums and differences alternate across the result lanes.
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  ; Rebuild the result vector one lane at a time, starting from undef.
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}

; build_vec_v4i32_reuse_0: two-lane add/sub computation whose two scalar
; results are each inserted twice into a four-lane vector:
;   x = (v0[0] + v1[0]) + (v0[1] + v1[1])
;   y = (v0[0] - v1[0]) + (v0[1] - v1[1])
;   result = <x, y, x, y>
; The assertions expect the arithmetic to be done once on <2 x i32> (add,
; sub, two shufflevectors, add) and then widened to four lanes by a single
; shufflevector with mask <0, 1, 0, 1>.
define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_0(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT:    ret <4 x i32> [[SHUFFLE]]
;
  ; Scalarize both input vectors.
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  ; Per-lane sum (tmp0.*) and difference (tmp1.*).
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  ; Cross-lane adds: both sums together, both differences together.
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  ; Lanes 2 and 3 reuse the same scalars as lanes 0 and 1.
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
  ret <4 x i32> %tmp3.3
}

; build_vec_v4i32_reuse_1: four-lane build vector over two-lane inputs in
; which lanes 0 and 1 are the same expression and lanes 2 and 3 are the same
; subtraction with its operands swapped. Writing s[i] = v0[i] + v1[i] and
; x[i] = v0[i] ^ v1[i], the result is
;   <s[0] - s[1], s[0] - s[1], x[0] - x[1], x[1] - x[0]>
; The assertions expect the two adds to stay scalar (fed by extractelement),
; the subtractions and the xor to be done on <2 x i32>, and the two halves to
; be widened and blended into the <4 x i32> result with shufflevectors.
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT:    ret <4 x i32> [[TMP2_31]]
;
  ; Scalarize both input vectors.
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  ; Per-lane sum (tmp0.0, tmp0.1) and per-lane xor (tmp0.2, tmp0.3).
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  ; tmp1.0 and tmp1.1 are identical expressions; tmp1.3 is tmp1.2 with its
  ; operands swapped.
  %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
  %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
  %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
  ; Rebuild the result vector one lane at a time, starting from undef.
  %tmp2.0 = insertelement <4 x i32> undef, i32 %tmp1.0, i32 0
  %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
  %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
  %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
  ret <4 x i32> %tmp2.3
}

; build_vec_v4i32_3_binops: four-lane build vector over two-lane inputs that
; mixes three different binary operators (add, mul, xor) in the first stage.
; Writing s[i] = v0[i] + v1[i], m[i] = v0[i] * v1[i], x[i] = v0[i] ^ v1[i],
; the result is
;   <s[0] + s[1], m[0] + m[1], x[0] + x[1], x[0] + x[1]>
; The assertions expect two independent <2 x i32> halves: an add/mul half
; regrouped with shufflevectors (masks <1, 2> and <0, 3>) and an xor half
; combined with its lane-swapped copy, concatenated by a final shufflevector
; with mask <0, 1, 2, 3>.
define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: @build_vec_v4i32_3_binops(
; CHECK-NEXT:    [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 1, i32 2>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]]
; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
;
  ; Scalarize both input vectors.
  %v0.0 = extractelement <2 x i32> %v0, i32 0
  %v0.1 = extractelement <2 x i32> %v0, i32 1
  %v1.0 = extractelement <2 x i32> %v1, i32 0
  %v1.1 = extractelement <2 x i32> %v1, i32 1
  ; First operand group: per-lane add (tmp0.0, tmp0.1) and xor (tmp0.2, tmp0.3).
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = xor i32 %v0.0, %v1.0
  %tmp0.3 = xor i32 %v0.1, %v1.1
  ; Second operand group: per-lane mul (tmp1.0, tmp1.1) and the same xor
  ; expressions again (tmp1.2, tmp1.3).
  %tmp1.0 = mul i32 %v0.0, %v1.0
  %tmp1.1 = mul i32 %v0.1, %v1.1
  %tmp1.2 = xor i32 %v0.0, %v1.0
  %tmp1.3 = xor i32 %v0.1, %v1.1
  ; Cross-lane add within each operator pair.
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  ; Rebuild the result vector one lane at a time, starting from undef.
  %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
  %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
  %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
  %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
  ret <4 x i32> %tmp3.3
}

; reduction_v4i32: the four-lane add/sub pairwise computation, followed by a
; per-lane chain (lshr by 15, and with 65537, mul nuw by 65535, add, xor) and
; a scalar add chain that sums the four lanes into a single i32.
; The assertions expect every stage to be done on <4 x i32>, with the same
; shufflevector masks as the plain four-lane build vector (<0, 5, 3, 6> and
; <1, 4, 2, 7>), and the final add chain to become a call to
; llvm.vector.reduce.add.v4i32.
define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
; CHECK-LABEL: @reduction_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]]
; CHECK-NEXT:    [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]]
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 3, i32 6>
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 1, i32 4, i32 2, i32 7>
; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = lshr <4 x i32> [[TMP5]], <i32 15, i32 15, i32 15, i32 15>
; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP6]], <i32 65537, i32 65537, i32 65537, i32 65537>
; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw <4 x i32> [[TMP7]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]]
; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], [[TMP8]]
; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
; CHECK-NEXT:    ret i32 [[TMP11]]
;
  ; Scalarize both input vectors.
  %v0.0 = extractelement <4 x i32> %v0, i32 0
  %v0.1 = extractelement <4 x i32> %v0, i32 1
  %v0.2 = extractelement <4 x i32> %v0, i32 2
  %v0.3 = extractelement <4 x i32> %v0, i32 3
  %v1.0 = extractelement <4 x i32> %v1, i32 0
  %v1.1 = extractelement <4 x i32> %v1, i32 1
  %v1.2 = extractelement <4 x i32> %v1, i32 2
  %v1.3 = extractelement <4 x i32> %v1, i32 3
  ; Per-lane sums.
  %tmp0.0 = add i32 %v0.0, %v1.0
  %tmp0.1 = add i32 %v0.1, %v1.1
  %tmp0.2 = add i32 %v0.2, %v1.2
  %tmp0.3 = add i32 %v0.3, %v1.3
  ; Per-lane differences.
  %tmp1.0 = sub i32 %v0.0, %v1.0
  %tmp1.1 = sub i32 %v0.1, %v1.1
  %tmp1.2 = sub i32 %v0.2, %v1.2
  %tmp1.3 = sub i32 %v0.3, %v1.3
  ; Pairwise adds: sums and differences alternate across the four lanes.
  %tmp2.0 = add i32 %tmp0.0, %tmp0.1
  %tmp2.1 = add i32 %tmp1.0, %tmp1.1
  %tmp2.2 = add i32 %tmp0.2, %tmp0.3
  %tmp2.3 = add i32 %tmp1.2, %tmp1.3
  ; Per-lane post-processing: shift right by 15, mask with 65537, and scale
  ; by 65535 to form tmp5.*.
  %tmp3.0 = lshr i32 %tmp2.0, 15
  %tmp3.1 = lshr i32 %tmp2.1, 15
  %tmp3.2 = lshr i32 %tmp2.2, 15
  %tmp3.3 = lshr i32 %tmp2.3, 15
  %tmp4.0 = and i32 %tmp3.0, 65537
  %tmp4.1 = and i32 %tmp3.1, 65537
  %tmp4.2 = and i32 %tmp3.2, 65537
  %tmp4.3 = and i32 %tmp3.3, 65537
  %tmp5.0 = mul nuw i32 %tmp4.0, 65535
  %tmp5.1 = mul nuw i32 %tmp4.1, 65535
  %tmp5.2 = mul nuw i32 %tmp4.2, 65535
  %tmp5.3 = mul nuw i32 %tmp4.3, 65535
  ; Add tmp5.* to the pairwise result, then xor with tmp5.* again.
  %tmp6.0 = add i32 %tmp5.0, %tmp2.0
  %tmp6.1 = add i32 %tmp5.1, %tmp2.1
  %tmp6.2 = add i32 %tmp5.2, %tmp2.2
  %tmp6.3 = add i32 %tmp5.3, %tmp2.3
  %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
  %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
  %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
  %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
  ; Linear add chain summing the four lanes.
  %reduce.0 = add i32 %tmp7.1, %tmp7.0
  %reduce.1 = add i32 %reduce.0, %tmp7.2
  %reduce.2 = add i32 %reduce.1, %tmp7.3
  ret i32 %reduce.2
}
