; Codegen test for selecting v_madak_f32 / v_fmaak_f32 (multiply-add whose
; addend is a 32-bit literal constant) on AMDGPU. Each llc invocation below
; selects a subtarget together with the FileCheck prefixes that apply to it;
; the last two invocations add -fp-contract=fast and use the fma-based
; prefixes instead of the mad-based ones.
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA,GFX10-FMA %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9_10,FMA,GFX940-FMA %s

; Intrinsics used by the kernels in this file.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone

; Basic case. Both multiplicands are loaded per work-item and the addend is
; the constant 10.0 (0x41200000), so every run is expected to select the
; literal-constant form (v_madak_f32, or v_fmaak_f32 for the runs that pass
; -fp-contract=fast).
; GCN-LABEL: {{^}}madak_f32:
; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA:   v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  ; Index all three buffers by the work-item id.
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  ; out[tid] = a * b + 10.0
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure this is only folded with one use. This is a code size
; optimization and if we fold the immediate multiple times, we'll undo
; it.
;
; The kernel computes a * b + 10.0 and a * c + 10.0, so the constant 10.0
; (0x41200000) has two users. The checks below spell out, per run
; configuration, which of the two users may carry the literal and which one
; must read it from a register.

; GCN-LABEL: {{^}}madak_2_use_f32:
; GFX9:         v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6-DAG:     buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
; GFX6-DAG:     buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GFX6-DAG:     buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GFX8_9_10:    {{flat|global}}_load_dword [[VA:v[0-9]+]],
; GFX8_9_10:    {{flat|global}}_load_dword [[VB:v[0-9]+]],
; GFX8_9_10:    {{flat|global}}_load_dword [[VC:v[0-9]+]],
; GFX6-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX8-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA-DAG:      v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; MAD-DAG:      v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
; GFX10-FMA-DAG:v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
; GCN:          s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

  ; Three consecutive input floats starting at in[tid].
  %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2

  %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  ; NOTE(review) %out.gep.1 is derived from %in.gep.0 rather than from %out.
  ; That looks unintentional, but the checks above were written against this
  ; exact IR, so confirm the expected output before changing it.
  %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1

  ; Volatile accesses keep all three loads and both stores in the output.
  %a = load volatile float, float addrspace(1)* %in.gep.0, align 4
  %b = load volatile float, float addrspace(1)* %in.gep.1, align 4
  %c = load volatile float, float addrspace(1)* %in.gep.2, align 4

  ; Two multiply-adds sharing the same constant addend.
  %mul0 = fmul float %a, %b
  %mul1 = fmul float %a, %c
  %madak0 = fadd float %mul0, 10.0
  %madak1 = fadd float %mul1, 10.0

  store volatile float %madak0, float addrspace(1)* %out.gep.0, align 4
  store volatile float %madak1, float addrspace(1)* %out.gep.1, align 4
  ret void
}

; One multiplicand is the constant 4.0 and the addend is 10.0 (0x41200000).
; The checks expect 4.0 to appear directly as the first source operand while
; 10.0 is still carried as the trailing literal.
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4

  ; out[tid] = 4.0 * a + 10.0
  %mul = fmul float 4.0, %a
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Make sure nothing weird happens with a value that is also allowed as
; an inline immediate.
;
; Here the addend is 4.0, and the checks expect the plain three-operand
; v_mad_f32 / v_fma_f32 with 4.0 written inline rather than the
; literal-constant madak/fmaak form.

; GCN-LABEL: {{^}}madak_inline_imm_f32:
; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; MAD:   v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; GFX10-MAD:   v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
; FMA:   v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  ; out[tid] = a * b + 4.0
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 4.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Before GFX10 we can't use an SGPR when forming madak. For tahiti, tonga and
; gfx900 the checks expect the constant to be moved into a VGPR and a
; v_mac_f32 that reads the SGPR instead. The gfx1010 runs expect
; v_madak_f32 / v_fmaak_f32 with the SGPR as a source operand, and the gfx940
; run expects v_fmac_f32.
; Operand order in the IR is (loaded value) * (kernel argument).
; GCN-LABEL: {{^}}s_v_madak_f32:
; GCN-DAG:      s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG:      {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GCN-NOT:      v_madak_f32
; GFX6_8_9:     v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD:    v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX10-FMA:    v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX940-FMA:   v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4

  ; out[tid] = a * b + 10.0, where %b is a kernel argument.
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Same as s_v_madak_f32 but with the fmul operands swapped in the IR, i.e.
; (kernel argument) * (loaded value). The expected instructions are the same
; per run configuration.
; The label check is anchored with {{^}} and the trailing colon, like every
; other label in this file, so it matches the symbol definition line.
; GCN-LABEL: {{^}}v_s_madak_f32:
; GCN-DAG:       s_load_dword [[SB:s[0-9]+]]
; GFX6_8_9-DAG:  v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GCN-DAG:       {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
; GFX6_8_9-NOT:  v_madak_f32
; GFX6_8_9:      v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
; GFX10-MAD:     v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX10-FMA:     v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
; GFX940-FMA:    v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %b = load float, float addrspace(1)* %in.b.gep, align 4

  ; out[tid] = a * b + 10.0, where %a is a kernel argument.
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; Both multiplicands are kernel arguments. No run may produce v_madak_f32;
; the checks expect a v_mac_f32 / v_fmac_f32 instead (the tahiti run only has
; the negative check).
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
; GFX8_9:  v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; GFX10-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; GFX940-FMA: v_fmac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) #0 {
  ; *out = a * b + 10.0
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out, align 4
  ret void
}

; The first multiplicand goes through fabs. The checks expect the full
; v_mad_f32 / v_fma_f32 with the |...| modifier on the first source operand,
; not the madak/fmaak form. On gfx1010 the constant is expected as a literal
; operand; the other runs expect it in a register.
; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
; GFX6:      buffer_load_dword [[VA:v[0-9]+]]
; GFX6:      buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9:  v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{s[0-9]+}}
; GCN:       s_endpgm
define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone

  ; out[tid] = |a| * b + 10.0
  %mul = fmul float %a.fabs, %b
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; The second multiplicand goes through fabs. The checks expect the full
; v_mad_f32 / v_fma_f32 with the |...| modifier on the second source operand,
; not the madak/fmaak form. On gfx1010 the constant is expected as a literal
; operand; the other runs expect it in a register.
; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
; GFX6:      buffer_load_dword [[VA:v[0-9]+]]
; GFX6:      buffer_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX6_8_9:  v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; GFX10-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
; GCN:       s_endpgm
define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
  %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load float, float addrspace(1)* %in.a.gep, align 4
  %b = load float, float addrspace(1)* %in.b.gep, align 4

  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone

  ; out[tid] = a * |b| + 10.0
  %mul = fmul float %a, %b.fabs
  %madak = fadd float %mul, 10.0
  store float %madak, float addrspace(1)* %out.gep, align 4
  ret void
}

; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
; because the implicit immediate already uses the constant bus.
; On GFX10+ we can use two scalar operands.
;
; The kernel computes (%sgpr0 * 0.5 + 42.0) * %vgpr, where 42.0 is 0x42280000
; and %vgpr comes from a volatile load.
; GCN-LABEL: {{^}}madak_constant_bus_violation:
; GCN:       s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}

; GCN:       {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
; MAD:       v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
; MAD:       v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
; GFX10:     v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GFX10-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GFX940-FMA: v_fmac_f32_e64 [[MADAK:v[0-9]+]], [[SGPR0]], 0.5
; GCN:       v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GFX6:      buffer_store_dword [[MUL]]
; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
bb:
  ; Conditional branch on a kernel argument; bb3 only does a volatile store.
  %tmp = icmp eq i32 %arg1, 0
  br i1 %tmp, label %bb3, label %bb4

bb3:
  store volatile float 0.0, float addrspace(1)* undef
  br label %bb4

bb4:
  %vgpr = load volatile float, float addrspace(1)* undef
  ; The multiply-add under test, followed by a multiply with the loaded value.
  %tmp0 = fmul float %sgpr0, 0.5
  %tmp1 = fadd float %tmp0, 42.0
  %tmp2 = fmul float %tmp1, %vgpr
  store volatile float %tmp2, float addrspace(1)* undef, align 4
  ret void
}

; Attribute group shared by every kernel in this file. Sets the f32 denormal
; mode to preserve-sign for both outputs and inputs.
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
