1; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,FUNC %s
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,FUNC %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=FUNC,GFX9PLUS %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=EG,FUNC %s
7
8; mul24 and mad24 are affected
9
10; FUNC-LABEL: {{^}}test_mul_v2i32:
11; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
12; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
13
14; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
15; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
16
; A <2 x i32> multiply should scalarize to two 32-bit low multiplies
; (v_mul_lo_u32 on GCN, MULLO_INT on EG per the checks above).
define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  ; Second operand comes from the vector element immediately after %in.
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
  %result = mul <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}
25
26; FUNC-LABEL: {{^}}v_mul_v4i32:
27; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
28; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
29; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
30; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
31
32; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
33; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
34; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
35; GCN: v_mul_lo_u32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
36
; A <4 x i32> multiply should scalarize to four 32-bit low multiplies.
define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
  ; Second operand comes from the vector element immediately after %in.
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
  %result = mul <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}
45
46; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
47; GCN: s_load_dword
48; GCN: s_load_dword
49; GCN: s_mul_i32
50; GCN: buffer_store_dword
; A truncated i64 multiply of uniform (SGPR) operands only needs the low
; 32-bit product, so it should shrink to a single s_mul_i32.
define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
  %mul = mul i64 %b, %a
  ; Only the low 32 bits of the product are kept.
  %trunc = trunc i64 %mul to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 8
  ret void
}
57
58; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
59; GCN: s_load_dword
60; GCN: s_load_dword
61; GCN: v_mul_lo_u32
62; GCN: buffer_store_dword
; Same shrink as above but with loaded (VGPR) operands: the truncated i64
; multiply should become a single v_mul_lo_u32.
define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %mul = mul i64 %b, %a
  ; Only the low 32 bits of the product are kept.
  %trunc = trunc i64 %mul to i32
  store i32 %trunc, i32 addrspace(1)* %out, align 8
  ret void
}
71
72; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
73; 32-bits of both arguments are sign bits.
74; FUNC-LABEL: {{^}}mul64_sext_c:
75; EG-DAG: MULLO_INT
76; EG-DAG: MULHI_INT
77; SI-DAG: s_mulk_i32
78; SI-DAG: v_mul_hi_i32
79; VI: v_mad_i64_i32
; sext(i32) * 80: both inputs fit in 32 bits, so the i64 multiply should
; lower to a 32x32->64 mul-lo/mul-hi pair (or v_mad_i64_i32 on VI).
define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
entry:
  %0 = sext i32 %in to i64
  %1 = mul i64 %0, 80
  store i64 %1, i64 addrspace(1)* %out
  ret void
}
87
88; FUNC-LABEL: {{^}}v_mul64_sext_c:
89; EG-DAG: MULLO_INT
90; EG-DAG: MULHI_INT
91; SI-DAG: v_mul_lo_u32
92; SI-DAG: v_mul_hi_i32
93; VI: v_mad_i64_i32
94; GCN: s_endpgm
; Same as mul64_sext_c, but the 32-bit operand is loaded so the multiply
; uses the VALU (v_mul_lo_u32 / v_mul_hi_i32, or v_mad_i64_i32 on VI).
define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %ext = sext i32 %val to i64
  %mul = mul i64 %ext, 80
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}
102
103; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
104; SI-DAG: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, 9
105; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
106; VI: v_mad_i64_i32 v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, 9, 0
107; GCN: s_endpgm
; Like v_mul64_sext_c, but the constant 9 is an inline immediate, so it
; should appear directly as an operand of the multiply (see checks above).
define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %ext = sext i32 %val to i64
  %mul = mul i64 %ext, 9
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}
115
116; FUNC-LABEL: {{^}}s_mul_i32:
117; GCN: s_load_dword [[SRC0:s[0-9]+]],
118; GCN: s_load_dword [[SRC1:s[0-9]+]],
119; GCN: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
120; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
121; GCN: buffer_store_dword [[VRESULT]],
122; GCN: s_endpgm
; Uniform i32 multiply selects the scalar s_mul_i32; the [8 x i32] pads
; force each scalar argument into its own kernarg dword slot.
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
  %mul = mul i32 %a, %b
  store i32 %mul, i32 addrspace(1)* %out, align 4
  ret void
}
128
129; FUNC-LABEL: {{^}}v_mul_i32:
130; GCN: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; Divergent (loaded) i32 multiply selects the vector v_mul_lo_u32.
define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  ; Second operand is loaded from the dword after %in.
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = mul i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}
139
140; A standard 64-bit multiply.  The expansion should be around 6 instructions.
141; It would be difficult to match the expansion correctly without writing
142; a really complicated list of FileCheck expressions.  I don't want
143; to confuse people who may 'break' this test with a correct optimization,
144; so this test just uses FUNC-LABEL to make sure the compiler does not
145; crash with a 'failed to select' error.
146
147; FUNC-LABEL: {{^}}s_mul_i64:
148; GFX9PLUS-DAG: s_mul_i32
149; GFX9PLUS-DAG: s_mul_hi_u32
150; GFX9PLUS-DAG: s_mul_i32
151; GFX9PLUS-DAG: s_mul_i32
152; GFX9PLUS: s_endpgm
; Full 64x64 multiply of uniform operands; only checked loosely (see the
; comment above) to avoid coupling the test to the exact expansion.
define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %mul = mul i64 %a, %b
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}
158
159; FUNC-LABEL: {{^}}v_mul_i64:
160; GCN: v_mul_lo_u32
; Full 64x64 multiply of divergent (loaded) operands; just check that the
; expansion uses v_mul_lo_u32 and does not fail to select.
define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
  %a = load i64, i64 addrspace(1)* %aptr, align 8
  %b = load i64, i64 addrspace(1)* %bptr, align 8
  %mul = mul i64 %a, %b
  store i64 %mul, i64 addrspace(1)* %out, align 8
  ret void
}
168
169; FUNC-LABEL: {{^}}mul32_in_branch:
170; GCN: s_mul_i32
; An i32 multiply of uniform operands inside a branch should still select
; s_mul_i32 (control flow must not block the scalar selection).
define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
entry:
  %0 = icmp eq i32 %a, 0
  br i1 %0, label %if, label %else

if:
  ; %a == 0: bypass the multiply and load the result instead.
  %1 = load i32, i32 addrspace(1)* %in
  br label %endif

else:
  %2 = mul i32 %a, %b
  br label %endif

endif:
  %3 = phi i32 [%1, %if], [%2, %else]
  store i32 %3, i32 addrspace(1)* %out
  ret void
}
189
190; FUNC-LABEL: {{^}}mul64_in_branch:
191; SI-DAG: s_mul_i32
192; SI-DAG: v_mul_hi_u32
193; VI: v_mad_u64_u32
194; GCN: s_endpgm
; An i64 multiply of uniform operands inside a branch; the expansion
; (mul-lo/mul-hi on SI, v_mad_u64_u32 on VI) must survive control flow.
define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
  %0 = icmp eq i64 %a, 0
  br i1 %0, label %if, label %else

if:
  ; %a == 0: bypass the multiply and load the result instead.
  %1 = load i64, i64 addrspace(1)* %in
  br label %endif

else:
  %2 = mul i64 %a, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, i64 addrspace(1)* %out
  ret void
}
213
214; FIXME: Load dwordx4
215; FUNC-LABEL: {{^}}s_mul_i128:
216; GCN: s_load_dwordx4
217; GCN: s_load_dwordx4
218
219; SI: v_mul_hi_u32
220; SI: v_mul_hi_u32
221; SI: s_mul_i32
222; SI: v_mul_hi_u32
223; SI: s_mul_i32
224; SI: s_mul_i32
225
226; SI-DAG: s_mul_i32
227; SI-DAG: v_mul_hi_u32
228; SI-DAG: v_mul_hi_u32
229; SI-DAG: s_mul_i32
230; SI-DAG: s_mul_i32
231; SI-DAG: v_mul_hi_u32
232
233; VI-DAG: v_mad_u64_u32
234; VI-DAG: v_mad_u64_u32
235; VI-DAG: v_mad_u64_u32
236; VI-DAG: v_mad_u64_u32
237; VI-DAG: v_mad_u64_u32
238; VI-DAG: v_mad_u64_u32
239; VI-DAG: s_mul_i32
240; VI-DAG: s_mul_i32
241; VI-DAG: s_mul_i32
242; VI-DAG: s_mul_i32
243
244
245; GCN: buffer_store_dwordx4
; 128x128 multiply of uniform operands; the [8 x i32] pads keep each i128
; argument in its own kernarg region so it is loaded with s_load_dwordx4.
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
  %mul = mul i128 %a, %b
  store i128 %mul, i128 addrspace(1)* %out
  ret void
}
251
252; FUNC-LABEL: {{^}}v_mul_i128:
253; GCN: {{buffer|flat}}_load_dwordx4
254; GCN: {{buffer|flat}}_load_dwordx4
255
256; SI-DAG: v_mul_lo_u32
257; SI-DAG: v_mul_hi_u32
258; SI-DAG: v_mul_hi_u32
259; SI-DAG: v_mul_lo_u32
260; SI-DAG: v_mul_hi_u32
261; SI-DAG: v_mul_hi_u32
262; SI-DAG: v_mul_lo_u32
263; SI-DAG: v_mul_lo_u32
264; SI-DAG: v_add_i32_e32
265
266; SI-DAG: v_mul_hi_u32
267; SI-DAG: v_mul_lo_u32
268; SI-DAG: v_mul_hi_u32
269; SI-DAG: v_mul_lo_u32
270; SI-DAG: v_mul_lo_u32
271; SI-DAG: v_mul_lo_u32
272; SI-DAG: v_mul_lo_u32
273; SI-DAG: v_mul_lo_u32
274
275; VI-DAG: v_mad_u64_u32
276; VI-DAG: v_mad_u64_u32
277; VI-DAG: v_mad_u64_u32
278; VI-DAG: v_mad_u64_u32
279; VI-DAG: v_mad_u64_u32
280; VI-DAG: v_mad_u64_u32
281; VI-DAG: v_mul_lo_u32
282; VI-DAG: v_mul_lo_u32
283; VI-DAG: v_mul_lo_u32
284
285; GCN: {{buffer|flat}}_store_dwordx4
; 128x128 multiply of divergent (per-lane loaded) operands; the expansion
; uses v_mul_lo/v_mul_hi (SI) or v_mad_u64_u32 (VI) per the checks above.
define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
  %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid
  ; Fixed copy-paste bug: the output GEP was based on %bptr, making the
  ; kernel overwrite its second input; the result belongs in %out.
  %gep.out = getelementptr inbounds i128, i128 addrspace(1)* %out, i32 %tid
  %a = load i128, i128 addrspace(1)* %gep.a
  %b = load i128, i128 addrspace(1)* %gep.b
  %mul = mul i128 %a, %b
  store i128 %mul, i128 addrspace(1)* %gep.out
  ret void
}
297
298declare i32 @llvm.amdgcn.workitem.id.x() #1
299
300attributes #0 = { nounwind }
301attributes #1 = { nounwind readnone}
302