1; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
2
3; CHECK-LABEL: exchange_1
4; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
5; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
6; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
7; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
8; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
; Test input: returns (a[0]*b[1] + a[1]*b[0]) + acc, where every element is an
; i16 load sign-extended to i32. Each product pairs offset 0 of one array with
; offset 1 of the other, and both multiplies name the %a value first.
; The FileCheck lines above expect the i16 load pairs to be replaced by i32
; loads feeding llvm.arm.smladx, with the word loaded from %a as first operand.
9define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) {
10entry:
11  %addr.a.1 = getelementptr i16, i16* %a, i32 1
12  %addr.b.1 = getelementptr i16, i16* %b, i32 1
13  %ld.a.0 = load i16, i16* %a
14  %sext.a.0 = sext i16 %ld.a.0 to i32
15  %ld.b.0 = load i16, i16* %b
16  %ld.a.1 = load i16, i16* %addr.a.1
17  %ld.b.1 = load i16, i16* %addr.b.1
18  %sext.a.1 = sext i16 %ld.a.1 to i32
19  %sext.b.1 = sext i16 %ld.b.1 to i32
20  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Exchanged products: a[0]*b[1] and a[1]*b[0].
21  %mul.0 = mul i32 %sext.a.0, %sext.b.1
22  %mul.1 = mul i32 %sext.a.1, %sext.b.0
  ; Sum the two products (mul.0 first), then add the incoming accumulator.
23  %add = add i32 %mul.0, %mul.1
24  %res = add i32 %add, %acc
25  ret i32 %res
26}
27
28; CHECK-LABEL: exchange_2
29; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
30; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
31; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
32; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
33; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
; Test input: returns (b[1]*a[0] + b[0]*a[1]) + acc on i16 elements
; sign-extended to i32. Same exchanged pairing as exchange_1, but here both
; multiplies name the %b value as their first operand.
; The FileCheck lines above still expect llvm.arm.smladx with the word loaded
; from %a as first operand.
34define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) {
35entry:
36  %addr.a.1 = getelementptr i16, i16* %a, i32 1
37  %addr.b.1 = getelementptr i16, i16* %b, i32 1
38  %ld.a.0 = load i16, i16* %a
39  %sext.a.0 = sext i16 %ld.a.0 to i32
40  %ld.b.0 = load i16, i16* %b
41  %ld.a.1 = load i16, i16* %addr.a.1
42  %ld.b.1 = load i16, i16* %addr.b.1
43  %sext.a.1 = sext i16 %ld.a.1 to i32
44  %sext.b.1 = sext i16 %ld.b.1 to i32
45  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Exchanged products with the %b value as the first mul operand.
46  %mul.0 = mul i32 %sext.b.1, %sext.a.0
47  %mul.1 = mul i32 %sext.b.0, %sext.a.1
  ; Sum the two products (mul.0 first), then add the incoming accumulator.
48  %add = add i32 %mul.0, %mul.1
49  %res = add i32 %add, %acc
50  ret i32 %res
51}
52
53; CHECK-LABEL: exchange_3
54; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
55; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
56; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
57; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
58; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
; Test input: returns (a[1]*b[0] + a[0]*b[1]) + acc on i16 elements
; sign-extended to i32. The multiplies match exchange_1; the difference is that
; the add takes %mul.1 before %mul.0.
; The FileCheck lines above expect llvm.arm.smladx with the word loaded from %b
; as first operand and the word loaded from %a as second.
59define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) {
60entry:
61  %addr.a.1 = getelementptr i16, i16* %a, i32 1
62  %addr.b.1 = getelementptr i16, i16* %b, i32 1
63  %ld.a.0 = load i16, i16* %a
64  %sext.a.0 = sext i16 %ld.a.0 to i32
65  %ld.b.0 = load i16, i16* %b
66  %ld.a.1 = load i16, i16* %addr.a.1
67  %ld.b.1 = load i16, i16* %addr.b.1
68  %sext.a.1 = sext i16 %ld.a.1 to i32
69  %sext.b.1 = sext i16 %ld.b.1 to i32
70  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Exchanged products: a[0]*b[1] and a[1]*b[0].
71  %mul.0 = mul i32 %sext.a.0, %sext.b.1
72  %mul.1 = mul i32 %sext.a.1, %sext.b.0
  ; Add operands are swapped relative to exchange_1 (mul.1 first).
73  %add = add i32 %mul.1, %mul.0
74  %res = add i32 %add, %acc
75  ret i32 %res
76}
77
78; CHECK-LABEL: exchange_4
79; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
80; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
81; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
82; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
83; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
; Test input: returns (b[0]*a[1] + b[1]*a[0]) + acc on i16 elements
; sign-extended to i32. The multiplies match exchange_2 (%b value first); the
; difference is that the add takes %mul.1 before %mul.0.
; The FileCheck lines above expect llvm.arm.smladx with the word loaded from %b
; as first operand and the word loaded from %a as second.
84define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) {
85entry:
86  %addr.a.1 = getelementptr i16, i16* %a, i32 1
87  %addr.b.1 = getelementptr i16, i16* %b, i32 1
88  %ld.a.0 = load i16, i16* %a
89  %sext.a.0 = sext i16 %ld.a.0 to i32
90  %ld.b.0 = load i16, i16* %b
91  %ld.a.1 = load i16, i16* %addr.a.1
92  %ld.b.1 = load i16, i16* %addr.b.1
93  %sext.a.1 = sext i16 %ld.a.1 to i32
94  %sext.b.1 = sext i16 %ld.b.1 to i32
95  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Exchanged products with the %b value as the first mul operand.
96  %mul.0 = mul i32 %sext.b.1, %sext.a.0
97  %mul.1 = mul i32 %sext.b.0, %sext.a.1
  ; Add operands are swapped relative to exchange_2 (mul.1 first).
98  %add = add i32 %mul.1, %mul.0
99  %res = add i32 %add, %acc
100  ret i32 %res
101}
102
103; CHECK-LABEL: exchange_multi_use_1
104; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
105; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
106; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
107; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
108; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
109; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
110; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
111; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
112; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]])
; Test input: returns
;   ((a[0]*b[1] + a[1]*b[0]) + (a[3]*b[1] + a[2]*b[0])) + acc
; on i16 elements sign-extended to i32. The sign-extended b[0] and b[1] values
; are each used by two multiplies: once in an exchanged pairing with a[0..1]
; and once in a same-offset pairing with a[2..3].
; The FileCheck lines above expect llvm.arm.smladx on the a[0..1] word with
; %acc as accumulator, followed by llvm.arm.smlad on the a[2..3] word that
; reuses the same i32 load of %b and accumulates onto the smladx result.
113define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) {
114entry:
115  %addr.a.1 = getelementptr i16, i16* %a, i32 1
116  %addr.b.1 = getelementptr i16, i16* %b, i32 1
117  %ld.a.0 = load i16, i16* %a
118  %sext.a.0 = sext i16 %ld.a.0 to i32
119  %ld.b.0 = load i16, i16* %b
120  %ld.a.1 = load i16, i16* %addr.a.1
121  %ld.b.1 = load i16, i16* %addr.b.1
122  %sext.a.1 = sext i16 %ld.a.1 to i32
123  %sext.b.1 = sext i16 %ld.b.1 to i32
124  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; First pair, exchanged: a[0]*b[1] + a[1]*b[0].
125  %mul.0 = mul i32 %sext.a.0, %sext.b.1
126  %mul.1 = mul i32 %sext.a.1, %sext.b.0
127  %add = add i32 %mul.0, %mul.1
128  %addr.a.2 = getelementptr i16, i16* %a, i32 2
129  %addr.a.3 = getelementptr i16, i16* %a, i32 3
130  %ld.a.2 = load i16, i16* %addr.a.2
131  %ld.a.3 = load i16, i16* %addr.a.3
132  %sext.a.2 = sext i16 %ld.a.2 to i32
133  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Second pair, same-offset: a[3]*b[1] + a[2]*b[0], reusing the b values.
134  %mul.2 = mul i32 %sext.a.3, %sext.b.1
135  %mul.3 = mul i32 %sext.a.2, %sext.b.0
136  %add.1 = add i32 %mul.2, %mul.3
  ; Combine both partial sums, then add the incoming accumulator.
137  %add.2 = add i32 %add, %add.1
138  %res = add i32 %add.2, %acc
139  ret i32 %res
140}
141
142; CHECK-LABEL: exchange_multi_use_64_1
143; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
144; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
145; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
146; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
147; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
148; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
149; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
150; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
151; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
; Test input: 64-bit accumulator variant of exchange_multi_use_1. The four
; products and both partial sums are computed in i32; the combined i32 sum is
; sign-extended to i64 once and then added to the i64 %acc.
; The FileCheck lines above expect llvm.arm.smlaldx on the a[0..1] word with
; %acc as accumulator, followed by llvm.arm.smlald on the a[2..3] word that
; reuses the same i32 load of %b and accumulates onto the smlaldx result.
152define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) {
153entry:
154  %addr.a.1 = getelementptr i16, i16* %a, i32 1
155  %addr.b.1 = getelementptr i16, i16* %b, i32 1
156  %ld.a.0 = load i16, i16* %a
157  %sext.a.0 = sext i16 %ld.a.0 to i32
158  %ld.b.0 = load i16, i16* %b
159  %ld.a.1 = load i16, i16* %addr.a.1
160  %ld.b.1 = load i16, i16* %addr.b.1
161  %sext.a.1 = sext i16 %ld.a.1 to i32
162  %sext.b.1 = sext i16 %ld.b.1 to i32
163  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; First pair, exchanged: a[0]*b[1] + a[1]*b[0].
164  %mul.0 = mul i32 %sext.a.0, %sext.b.1
165  %mul.1 = mul i32 %sext.a.1, %sext.b.0
166  %add = add i32 %mul.0, %mul.1
167  %addr.a.2 = getelementptr i16, i16* %a, i32 2
168  %addr.a.3 = getelementptr i16, i16* %a, i32 3
169  %ld.a.2 = load i16, i16* %addr.a.2
170  %ld.a.3 = load i16, i16* %addr.a.3
171  %sext.a.2 = sext i16 %ld.a.2 to i32
172  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Second pair, same-offset: a[3]*b[1] + a[2]*b[0], reusing the b values.
173  %mul.2 = mul i32 %sext.a.3, %sext.b.1
174  %mul.3 = mul i32 %sext.a.2, %sext.b.0
175  %add.1 = add i32 %mul.2, %mul.3
  ; Partial sums are combined in i32 and widened to i64 only afterwards.
176  %add.2 = add i32 %add, %add.1
177  %sext.add.2 = sext i32 %add.2 to i64
178  %res = add i64 %sext.add.2, %acc
179  ret i64 %res
180}
181
182; CHECK-LABEL: exchange_multi_use_64_2
183; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
184; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
185; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
186; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
187; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc
188; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
189; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
190; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
191; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]])
; Test input: like exchange_multi_use_64_1, except that each i32 partial sum is
; sign-extended to i64 on its own and the two are then added in i64 before the
; i64 %acc is added.
; The FileCheck lines above expect the same output shape as for
; exchange_multi_use_64_1: llvm.arm.smlaldx accumulating onto %acc, then
; llvm.arm.smlald reusing the i32 load of %b and accumulating onto that result.
192define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) {
193entry:
194  %addr.a.1 = getelementptr i16, i16* %a, i32 1
195  %addr.b.1 = getelementptr i16, i16* %b, i32 1
196  %ld.a.0 = load i16, i16* %a
197  %sext.a.0 = sext i16 %ld.a.0 to i32
198  %ld.b.0 = load i16, i16* %b
199  %ld.a.1 = load i16, i16* %addr.a.1
200  %ld.b.1 = load i16, i16* %addr.b.1
201  %sext.a.1 = sext i16 %ld.a.1 to i32
202  %sext.b.1 = sext i16 %ld.b.1 to i32
203  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; First pair, exchanged: a[0]*b[1] + a[1]*b[0], widened to i64 immediately.
204  %mul.0 = mul i32 %sext.a.0, %sext.b.1
205  %mul.1 = mul i32 %sext.a.1, %sext.b.0
206  %add = add i32 %mul.0, %mul.1
207  %sext.add = sext i32 %add to i64
208  %addr.a.2 = getelementptr i16, i16* %a, i32 2
209  %addr.a.3 = getelementptr i16, i16* %a, i32 3
210  %ld.a.2 = load i16, i16* %addr.a.2
211  %ld.a.3 = load i16, i16* %addr.a.3
212  %sext.a.2 = sext i16 %ld.a.2 to i32
213  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Second pair, same-offset: a[3]*b[1] + a[2]*b[0], also widened separately.
214  %mul.2 = mul i32 %sext.a.3, %sext.b.1
215  %mul.3 = mul i32 %sext.a.2, %sext.b.0
216  %add.1 = add i32 %mul.2, %mul.3
217  %sext.add.1 = sext i32 %add.1 to i64
  ; The two widened partial sums are combined in i64, then %acc is added.
218  %add.2 = add i64 %sext.add, %sext.add.1
219  %res = add i64 %add.2, %acc
220  ret i64 %res
221}
222
223; CHECK-LABEL: exchange_multi_use_2
224; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
225; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
226; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
227; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
228; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
229; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
230; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
231; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
232; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]])
; Test input: returns
;   ((a[0]*b[0] + a[1]*b[1]) + (b[0]*a[3] + b[1]*a[2])) + acc
; on i16 elements sign-extended to i32. Compared with exchange_multi_use_1 the
; roles are reversed: the a[0..1] pairing is same-offset and the a[2..3]
; pairing is exchanged, with the %b value as first mul operand.
; The FileCheck lines above expect llvm.arm.smlad on the a[0..1] word with %acc
; as accumulator, followed by llvm.arm.smladx that takes the i32 load of %b
; first, the a[2..3] word second, and accumulates onto the smlad result.
233define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) {
234entry:
235  %addr.a.1 = getelementptr i16, i16* %a, i32 1
236  %addr.b.1 = getelementptr i16, i16* %b, i32 1
237  %ld.a.0 = load i16, i16* %a
238  %sext.a.0 = sext i16 %ld.a.0 to i32
239  %ld.b.0 = load i16, i16* %b
240  %ld.a.1 = load i16, i16* %addr.a.1
241  %ld.b.1 = load i16, i16* %addr.b.1
242  %sext.a.1 = sext i16 %ld.a.1 to i32
243  %sext.b.1 = sext i16 %ld.b.1 to i32
244  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; First pair, same-offset: a[0]*b[0] + a[1]*b[1].
245  %mul.0 = mul i32 %sext.a.0, %sext.b.0
246  %mul.1 = mul i32 %sext.a.1, %sext.b.1
247  %add = add i32 %mul.0, %mul.1
248  %addr.a.2 = getelementptr i16, i16* %a, i32 2
249  %addr.a.3 = getelementptr i16, i16* %a, i32 3
250  %ld.a.2 = load i16, i16* %addr.a.2
251  %ld.a.3 = load i16, i16* %addr.a.3
252  %sext.a.2 = sext i16 %ld.a.2 to i32
253  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Second pair, exchanged: b[0]*a[3] + b[1]*a[2], reusing the b values.
254  %mul.2 = mul i32 %sext.b.0, %sext.a.3
255  %mul.3 = mul i32 %sext.b.1, %sext.a.2
256  %add.1 = add i32 %mul.2, %mul.3
  ; Combine both partial sums, then add the incoming accumulator.
257  %add.2 = add i32 %add, %add.1
258  %res = add i32 %add.2, %acc
259  ret i32 %res
260}
261
262; TODO: Why aren't two intrinsics generated?
263; CHECK-LABEL: exchange_multi_use_3
264; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
265; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
266; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
267; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
268; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
269; CHECK-NOT: call i32 @llvm.arm.smlad
270; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0
; Test input: returns
;   acc + ((a[0]*b[0] + a[1]*b[1]) - (b[0]*a[3] + b[1]*a[2]))
; on i16 elements sign-extended to i32. Unlike exchange_multi_use_2, the two
; partial sums are combined with a sub, and the exchanged multiplies
; (%mul.2, %mul.3) are emitted before the same-offset ones (%mul.0, %mul.1).
; The FileCheck lines above expect a single llvm.arm.smladx (i32 load of %b,
; then the a[2..3] word, accumulator 0) with no llvm.arm.smlad call before it;
; the existing TODO records that the second intrinsic is not generated.
271define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) {
272entry:
273  %addr.a.1 = getelementptr i16, i16* %a, i32 1
274  %addr.b.1 = getelementptr i16, i16* %b, i32 1
275  %ld.a.0 = load i16, i16* %a
276  %sext.a.0 = sext i16 %ld.a.0 to i32
277  %ld.b.0 = load i16, i16* %b
278  %ld.a.1 = load i16, i16* %addr.a.1
279  %ld.b.1 = load i16, i16* %addr.b.1
280  %sext.a.1 = sext i16 %ld.a.1 to i32
281  %sext.b.1 = sext i16 %ld.b.1 to i32
282  %sext.b.0 = sext i16 %ld.b.0 to i32
283  %addr.a.2 = getelementptr i16, i16* %a, i32 2
284  %addr.a.3 = getelementptr i16, i16* %a, i32 3
285  %ld.a.2 = load i16, i16* %addr.a.2
286  %ld.a.3 = load i16, i16* %addr.a.3
287  %sext.a.2 = sext i16 %ld.a.2 to i32
288  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Exchanged products on a[2..3]: b[0]*a[3] and b[1]*a[2].
289  %mul.2 = mul i32 %sext.b.0, %sext.a.3
290  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  ; Same-offset products on a[0..1]: a[0]*b[0] and a[1]*b[1].
291  %mul.0 = mul i32 %sext.a.0, %sext.b.0
292  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; %add (same-offset sum) is defined before %add.1 (exchanged sum).
293  %add = add i32 %mul.0, %mul.1
294  %add.1 = add i32 %mul.2, %mul.3
  ; The partial sums are subtracted, not added, before %acc is applied.
295  %sub = sub i32 %add, %add.1
296  %res = add i32 %acc, %sub
297  ret i32 %res
298}
299
300; TODO: Would it be better to generate a smlad and then sign extend it?
301; CHECK-LABEL: exchange_multi_use_64_3
302; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
303; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
304; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
305; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
306; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
307; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32*
308; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
309; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0)
310; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]])
; Test input: returns
;   acc - (sext(a[0]*b[0] + a[1]*b[1]) + sext(b[0]*a[3] + b[1]*a[2]))
; where the products and partial sums are i32 and each partial sum is
; sign-extended to i64 before the i64 add. %acc only enters through the final
; sub.
; The FileCheck lines above expect llvm.arm.smlaldx (i32 load of %b, then the
; a[2..3] word) with accumulator 0, followed by llvm.arm.smlald on the a[0..1]
; and %b words accumulating onto the smlaldx result.
311define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) {
312entry:
313  %addr.a.1 = getelementptr i16, i16* %a, i32 1
314  %addr.b.1 = getelementptr i16, i16* %b, i32 1
315  %ld.a.0 = load i16, i16* %a
316  %sext.a.0 = sext i16 %ld.a.0 to i32
317  %ld.b.0 = load i16, i16* %b
318  %ld.a.1 = load i16, i16* %addr.a.1
319  %ld.b.1 = load i16, i16* %addr.b.1
320  %sext.a.1 = sext i16 %ld.a.1 to i32
321  %sext.b.1 = sext i16 %ld.b.1 to i32
322  %sext.b.0 = sext i16 %ld.b.0 to i32
323  %addr.a.2 = getelementptr i16, i16* %a, i32 2
324  %addr.a.3 = getelementptr i16, i16* %a, i32 3
325  %ld.a.2 = load i16, i16* %addr.a.2
326  %ld.a.3 = load i16, i16* %addr.a.3
327  %sext.a.2 = sext i16 %ld.a.2 to i32
328  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Exchanged products on a[2..3]: b[0]*a[3] and b[1]*a[2].
329  %mul.2 = mul i32 %sext.b.0, %sext.a.3
330  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  ; Same-offset products on a[0..1]: a[0]*b[0] and a[1]*b[1].
331  %mul.0 = mul i32 %sext.a.0, %sext.b.0
332  %mul.1 = mul i32 %sext.a.1, %sext.b.1
333  %add = add i32 %mul.0, %mul.1
334  %add.1 = add i32 %mul.2, %mul.3
  ; Each i32 partial sum is widened separately, then combined in i64.
335  %sext.add = sext i32 %add to i64
336  %sext.add.1 = sext i32 %add.1 to i64
337  %add.2 = add i64 %sext.add, %sext.add.1
  ; The combined sum is subtracted from the accumulator.
338  %res = sub i64 %acc, %add.2
339  ret i64 %res
340}
341
342; TODO: Why isn't smladx generated too?
343; CHECK-LABEL: exchange_multi_use_4
344; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
345; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
346; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
347; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
348; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0
349; CHECK-NOT: call i32 @llvm.arm.smlad
; Test input: computes the same value as exchange_multi_use_3,
;   acc + ((a[0]*b[0] + a[1]*b[1]) - (b[0]*a[3] + b[1]*a[2])),
; with one change in instruction order: %add.1 (exchanged sum) is defined
; before %add (same-offset sum).
; The FileCheck lines above expect a single llvm.arm.smlad on the a[0..1] and
; %b words with accumulator 0, and no further call whose name starts with
; llvm.arm.smlad after it; the existing TODO records that smladx is missing.
350define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) {
351entry:
352  %addr.a.1 = getelementptr i16, i16* %a, i32 1
353  %addr.b.1 = getelementptr i16, i16* %b, i32 1
354  %ld.a.0 = load i16, i16* %a
355  %sext.a.0 = sext i16 %ld.a.0 to i32
356  %ld.b.0 = load i16, i16* %b
357  %ld.a.1 = load i16, i16* %addr.a.1
358  %ld.b.1 = load i16, i16* %addr.b.1
359  %sext.a.1 = sext i16 %ld.a.1 to i32
360  %sext.b.1 = sext i16 %ld.b.1 to i32
361  %sext.b.0 = sext i16 %ld.b.0 to i32
362  %addr.a.2 = getelementptr i16, i16* %a, i32 2
363  %addr.a.3 = getelementptr i16, i16* %a, i32 3
364  %ld.a.2 = load i16, i16* %addr.a.2
365  %ld.a.3 = load i16, i16* %addr.a.3
366  %sext.a.2 = sext i16 %ld.a.2 to i32
367  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Exchanged products on a[2..3]: b[0]*a[3] and b[1]*a[2].
368  %mul.2 = mul i32 %sext.b.0, %sext.a.3
369  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  ; Same-offset products on a[0..1]: a[0]*b[0] and a[1]*b[1].
370  %mul.0 = mul i32 %sext.a.0, %sext.b.0
371  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; Here the exchanged sum (%add.1) comes first, then the same-offset sum.
372  %add.1 = add i32 %mul.2, %mul.3
373  %add = add i32 %mul.0, %mul.1
  ; The partial sums are subtracted, not added, before %acc is applied.
374  %sub = sub i32 %add, %add.1
375  %res = add i32 %acc, %sub
376  ret i32 %res
377}
378
379; CHECK-LABEL: exchange_swap
380; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
381; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
382; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
383; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
384; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]]
; Test input: returns (a[1]*b[0] + a[0]*b[1]) + acc on i16 elements
; sign-extended to i32. Compared with exchange_1, the product that uses a[1] is
; defined first (%mul.0) and the one that uses a[0] second (%mul.1); the add
; still takes %mul.0 before %mul.1.
; The FileCheck lines above expect llvm.arm.smladx with the word loaded from %b
; as first operand and the word loaded from %a as second.
385define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) {
386entry:
387  %addr.a.1 = getelementptr i16, i16* %a, i32 1
388  %addr.b.1 = getelementptr i16, i16* %b, i32 1
389  %ld.a.0 = load i16, i16* %a
390  %sext.a.0 = sext i16 %ld.a.0 to i32
391  %ld.b.0 = load i16, i16* %b
392  %ld.a.1 = load i16, i16* %addr.a.1
393  %ld.b.1 = load i16, i16* %addr.b.1
394  %sext.a.1 = sext i16 %ld.a.1 to i32
395  %sext.b.1 = sext i16 %ld.b.1 to i32
396  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Exchanged products, a[1]*b[0] defined before a[0]*b[1].
397  %mul.0 = mul i32 %sext.a.1, %sext.b.0
398  %mul.1 = mul i32 %sext.a.0, %sext.b.1
  ; Sum the two products (mul.0 first), then add the incoming accumulator.
399  %add = add i32 %mul.0, %mul.1
400  %res = add i32 %add, %acc
401  ret i32 %res
402}
403
404; CHECK-LABEL: exchange_swap_2
405; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
406; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
407; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
408; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
409; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
; Test input: returns (a[0]*b[1] + a[1]*b[0]) + acc on i16 elements
; sign-extended to i32. The multiplies match exchange_swap; the difference is
; that the add takes %mul.1 before %mul.0.
; The FileCheck lines above expect llvm.arm.smladx with the word loaded from %a
; as first operand and the word loaded from %b as second.
410define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) {
411entry:
412  %addr.a.1 = getelementptr i16, i16* %a, i32 1
413  %addr.b.1 = getelementptr i16, i16* %b, i32 1
414  %ld.a.0 = load i16, i16* %a
415  %sext.a.0 = sext i16 %ld.a.0 to i32
416  %ld.b.0 = load i16, i16* %b
417  %ld.a.1 = load i16, i16* %addr.a.1
418  %ld.b.1 = load i16, i16* %addr.b.1
419  %sext.a.1 = sext i16 %ld.a.1 to i32
420  %sext.b.1 = sext i16 %ld.b.1 to i32
421  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Exchanged products, a[1]*b[0] defined before a[0]*b[1].
422  %mul.0 = mul i32 %sext.a.1, %sext.b.0
423  %mul.1 = mul i32 %sext.a.0, %sext.b.1
  ; Add operands are swapped relative to exchange_swap (mul.1 first).
424  %add = add i32 %mul.1, %mul.0
425  %res = add i32 %add, %acc
426  ret i32 %res
427}
428
429; CHECK-LABEL: exchange_swap_3
430; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
431; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
432; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
433; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
434; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]]
; Test input: returns (b[1]*a[0] + b[0]*a[1]) + acc on i16 elements
; sign-extended to i32. Same shape as exchange_swap_2 (add takes %mul.1 before
; %mul.0), but both multiplies name the %b value as their first operand.
; The FileCheck lines above expect llvm.arm.smladx with the word loaded from %a
; as first operand and the word loaded from %b as second.
435define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) {
436entry:
437  %addr.a.1 = getelementptr i16, i16* %a, i32 1
438  %addr.b.1 = getelementptr i16, i16* %b, i32 1
439  %ld.a.0 = load i16, i16* %a
440  %sext.a.0 = sext i16 %ld.a.0 to i32
441  %ld.b.0 = load i16, i16* %b
442  %ld.a.1 = load i16, i16* %addr.a.1
443  %ld.b.1 = load i16, i16* %addr.b.1
444  %sext.a.1 = sext i16 %ld.a.1 to i32
445  %sext.b.1 = sext i16 %ld.b.1 to i32
446  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Exchanged products with the %b value as the first mul operand.
447  %mul.0 = mul i32 %sext.b.0, %sext.a.1
448  %mul.1 = mul i32 %sext.b.1, %sext.a.0
  ; The add takes mul.1 before mul.0.
449  %add = add i32 %mul.1, %mul.0
450  %res = add i32 %add, %acc
451  ret i32 %res
452}
453