1; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
2; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
3; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
4; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on   < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
5
6; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
7; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
8; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
9; RUN: llc -verify-machineinstrs -mcpu=verde  -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
10
11; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
12
13target triple = "amdgcn--"
14
15
; Intrinsics used by the test functions in this file. All share attribute
; group #1 (nounwind readnone).
; NOTE(review): llvm.fmuladd.f16 is declared here but not called by any
; function in the visible part of this file - confirm whether it is still needed.
16declare i32 @llvm.amdgcn.workitem.id.x() #1
17declare float @llvm.fmuladd.f32(float, float, float) #1
18declare half @llvm.fmuladd.f16(half, half, half) #1
19declare float @llvm.fabs.f32(float) #1
20
21; GCN-LABEL: {{^}}fmuladd_f32:
22; GCN-FLUSH: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
23
24; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
25
26; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
27; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; Loads one float from each of %in1, %in2 and %in3 (plain, non-volatile loads),
; combines them with llvm.fmuladd.f32 as %r0 * %r1 + %r2 and stores the result
; to %out.
28define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
29                         float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
30  %r0 = load float, float addrspace(1)* %in1
31  %r1 = load float, float addrspace(1)* %in2
32  %r2 = load float, float addrspace(1)* %in3
33  %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
34  store float %r3, float addrspace(1)* %out
35  ret void
36}
37
38; GCN-LABEL: {{^}}fmul_fadd_f32:
39; GCN-FLUSH: v_mac_f32
40
41; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32
42
43; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
44; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
45
46; GCN-DENORM-STRICT: v_mul_f32_e32
47; GCN-DENORM-STRICT: v_add_f32_e32
; Same data flow as the fmuladd intrinsic test, but written as a separate
; fmul followed by fadd (no fast-math flags on either instruction), so any
; fusion seen in the output comes from the contraction mode of the run line.
; The three inputs are read with volatile loads from %in1, %in2 and %in3.
48define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
49                           float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
50  %r0 = load volatile float, float addrspace(1)* %in1
51  %r1 = load volatile float, float addrspace(1)* %in2
52  %r2 = load volatile float, float addrspace(1)* %in3
53  %mul = fmul float %r0, %r1
54  %add = fadd float %mul, %r2
55  store float %add, float addrspace(1)* %out
56  ret void
57}
58
59; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32:
60; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
61; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
62
63; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
64; SI-FLUSH: buffer_store_dword [[R2]]
65; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
66
67; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
68
69; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
70; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
71
72; SI-DENORM: buffer_store_dword [[RESULT]]
73; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; llvm.fmuladd.f32 with the constant 2.0 as the first multiplicand
; (2.0 * %r1 + %r2). Both inputs are read with volatile loads from %out at
; elements %tid and %tid + 1, and the result is stored back to %out at
; element %tid; the %in argument is not referenced.
; NOTE(review): the volatile loads presumably keep the load order that the
; R1 / R2 register captures above rely on - confirm.
74define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
75  %tid = call i32 @llvm.amdgcn.workitem.id.x()
76  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
77  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
78  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
79
80  %r1 = load volatile float, float addrspace(1)* %gep.0
81  %r2 = load volatile float, float addrspace(1)* %gep.1
82
83  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
84  store float %r3, float addrspace(1)* %gep.out
85  ret void
86}
87
88; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32:
89; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
90; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
91
92; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
93; SI-FLUSH: buffer_store_dword [[R2]]
94; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
95
96; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
97
98; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
99; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
100
101; SI-DENORM: buffer_store_dword [[RESULT]]
102; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; llvm.fmuladd.f32 with the constant 2.0 as the second multiplicand
; (%r1 * 2.0 + %r2). Both inputs are read with volatile loads from %out at
; elements %tid and %tid + 1, and the result is stored back to %out at
; element %tid; the %in argument is not referenced.
103define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
104  %tid = call i32 @llvm.amdgcn.workitem.id.x()
105  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
106  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
107  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
108
109  %r1 = load volatile float, float addrspace(1)* %gep.0
110  %r2 = load volatile float, float addrspace(1)* %gep.1
111
112  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
113  store float %r3, float addrspace(1)* %gep.out
114  ret void
115}
116
117; GCN-LABEL: {{^}}fadd_a_a_b_f32:
118; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
119; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
120
121; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
122; SI-FLUSH: buffer_store_dword [[R2]]
123; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
124
125; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
126
127; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
128; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
129
130; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
131; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
132
133; SI-DENORM: buffer_store_dword [[RESULT]]
134; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes (%r0 + %r0) + %r1 with two plain fadd instructions, i.e. the
; a + a form of 2.0 * a feeding an add. Inputs are read with volatile loads
; from %out at elements %tid and %tid + 1; the result is stored to %out at
; element %tid. The %in1 and %in2 arguments are not referenced.
135define void @fadd_a_a_b_f32(float addrspace(1)* %out,
136                            float addrspace(1)* %in1,
137                            float addrspace(1)* %in2) #0 {
138  %tid = call i32 @llvm.amdgcn.workitem.id.x()
139  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
140  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
141  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
142
143  %r0 = load volatile float, float addrspace(1)* %gep.0
144  %r1 = load volatile float, float addrspace(1)* %gep.1
145
146  %add.0 = fadd float %r0, %r0
147  %add.1 = fadd float %add.0, %r1
148  store float %add.1, float addrspace(1)* %gep.out
149  ret void
150}
151
152; GCN-LABEL: {{^}}fadd_b_a_a_f32:
153; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
154; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
155
156; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
157; SI-FLUSH: buffer_store_dword [[R2]]
158; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
159
160; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
161
162; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
163; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
164
165; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
166; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
167
168; SI-DENORM: buffer_store_dword [[RESULT]]
169; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes %r1 + (%r0 + %r0), the commuted form of the a + a + b test, so the
; doubled value is the right-hand operand of the final fadd. Inputs are read
; with volatile loads from %out at elements %tid and %tid + 1; the result is
; stored to %out at element %tid. The %in1 and %in2 arguments are not
; referenced.
170define void @fadd_b_a_a_f32(float addrspace(1)* %out,
171                            float addrspace(1)* %in1,
172                            float addrspace(1)* %in2) #0 {
173  %tid = call i32 @llvm.amdgcn.workitem.id.x()
174  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
175  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
176  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
177
178  %r0 = load volatile float, float addrspace(1)* %gep.0
179  %r1 = load volatile float, float addrspace(1)* %gep.1
180
181  %add.0 = fadd float %r0, %r0
182  %add.1 = fadd float %r1, %add.0
183  store float %add.1, float addrspace(1)* %gep.out
184  ret void
185}
186
187; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32:
188; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
189; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
190; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
191
192; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
193
194; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]]
195; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
196
197; SI-DENORM: buffer_store_dword [[RESULT]]
198; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; llvm.fmuladd.f32 with the constant -2.0 as the first multiplicand
; (-2.0 * %r1 + %r2). Both inputs are read with volatile loads from %out at
; elements %tid and %tid + 1, and the result is stored back to %out at
; element %tid; the %in argument is not referenced.
199define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
200  %tid = call i32 @llvm.amdgcn.workitem.id.x()
201  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
202  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
203  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
204
205  %r1 = load volatile float, float addrspace(1)* %gep.0
206  %r2 = load volatile float, float addrspace(1)* %gep.1
207
208  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
209  store float %r3, float addrspace(1)* %gep.out
210  ret void
211}
212
213; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32:
214; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
215; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
216
217; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
218; SI-FLUSH: buffer_store_dword [[R2]]
219; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
220
221; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
222
223; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
224; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
225
226; SI-DENORM: buffer_store_dword [[RESULT]]
227; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; llvm.fmuladd.f32 where both multiplicands are negated, -2.0 * (-%r1) + %r2.
; The negation of %r1 is written as fsub -0.0, %r1. Both inputs are read with
; volatile loads from %out at elements %tid and %tid + 1, and the result is
; stored back to %out at element %tid; the %in argument is not referenced.
228define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
229  %tid = call i32 @llvm.amdgcn.workitem.id.x()
230  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
231  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
232  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
233
234  %r1 = load volatile float, float addrspace(1)* %gep.0
235  %r2 = load volatile float, float addrspace(1)* %gep.1
236
237  %r1.fneg = fsub float -0.000000e+00, %r1
238
239  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
240  store float %r3, float addrspace(1)* %gep.out
241  ret void
242}
243
244; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
245; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
246; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
247
248; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
249; SI-FLUSH: buffer_store_dword [[R2]]
250; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
251
252; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
253
254; GCN-DENORM-SLOWFMA: v_mul_f32_e32 [[TMP:v[0-9]+]], -2.0, [[R1]]
255; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
256
257; SI-DENORM: buffer_store_dword [[RESULT]]
258; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; llvm.fmuladd.f32 computing 2.0 * (-%r1) + %r2, with the negation of %r1
; written as fsub -0.0, %r1. Both inputs are read with volatile loads from
; %out at elements %tid and %tid + 1, and the result is stored back to %out
; at element %tid; the %in argument is not referenced.
259define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
260  %tid = call i32 @llvm.amdgcn.workitem.id.x()
261  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
262  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
263  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
264
265  %r1 = load volatile float, float addrspace(1)* %gep.0
266  %r2 = load volatile float, float addrspace(1)* %gep.1
267
268  %r1.fneg = fsub float -0.000000e+00, %r1
269
270  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
271  store float %r3, float addrspace(1)* %gep.out
272  ret void
273}
274
275; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
276; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
277; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
278; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
279; SI-FLUSH: buffer_store_dword [[RESULT]]
280; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
281
282; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
283
284; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
285; GCN-DENORM-SLOWFMA: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
286
287; SI-DENORM: buffer_store_dword [[RESULT]]
288; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; llvm.fmuladd.f32 computing 2.0 * %r1 + (-%r2), with the negation of the
; addend written as fsub -0.0, %r2. Both inputs are read with volatile loads
; from %out at elements %tid and %tid + 1, and the result is stored back to
; %out at element %tid; the %in argument is not referenced.
289define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
290  %tid = call i32 @llvm.amdgcn.workitem.id.x()
291  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
292  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
293  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
294
295  %r1 = load volatile float, float addrspace(1)* %gep.0
296  %r2 = load volatile float, float addrspace(1)* %gep.1
297
298  %r2.fneg = fsub float -0.000000e+00, %r2
299
300  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
301  store float %r3, float addrspace(1)* %gep.out
302  ret void
303}
304
305; GCN-LABEL: {{^}}mad_sub_f32:
306; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
307; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
308; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
309; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
310
311; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
312
313; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
314; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
315
316; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
317; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
318
319; SI: buffer_store_dword [[RESULT]]
320; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes %a * %b - %c with a separate fmul and fsub. The three inputs are
; read with volatile loads from %ptr at elements %tid, %tid + 1 and %tid + 2
; (index sign-extended to i64); the result is stored to %out at element %tid.
321define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
322  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
323  %tid.ext = sext i32 %tid to i64
324  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
325  %add1 = add i64 %tid.ext, 1
326  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
327  %add2 = add i64 %tid.ext, 2
328  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
329  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
330  %a = load volatile float, float addrspace(1)* %gep0, align 4
331  %b = load volatile float, float addrspace(1)* %gep1, align 4
332  %c = load volatile float, float addrspace(1)* %gep2, align 4
333  %mul = fmul float %a, %b
334  %sub = fsub float %mul, %c
335  store float %sub, float addrspace(1)* %outgep, align 4
336  ret void
337}
338
339; GCN-LABEL: {{^}}mad_sub_inv_f32:
340; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
341; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
342; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
343
344; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
345
346; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
347
348; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
349; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
350
351; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
352; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
353
354; SI: buffer_store_dword [[RESULT]]
355; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes %c - %a * %b, i.e. the subtraction operands are swapped relative to
; the product-minus-addend form. The three inputs are read with volatile loads
; from %ptr at elements %tid, %tid + 1 and %tid + 2; the result is stored to
; %out at element %tid.
356define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
357  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
358  %tid.ext = sext i32 %tid to i64
359  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
360  %add1 = add i64 %tid.ext, 1
361  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
362  %add2 = add i64 %tid.ext, 2
363  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
364  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
365  %a = load volatile float, float addrspace(1)* %gep0, align 4
366  %b = load volatile float, float addrspace(1)* %gep1, align 4
367  %c = load volatile float, float addrspace(1)* %gep2, align 4
368  %mul = fmul float %a, %b
369  %sub = fsub float %c, %mul
370  store float %sub, float addrspace(1)* %outgep, align 4
371  ret void
372}
373
374; GCN-LABEL: {{^}}mad_sub_fabs_f32:
375; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
376; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
377; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
378; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
379
380; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
381
382; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
383; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]],  [[TMP]], |[[REGC]]|
384
385; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
386; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]],  [[TMP]], |[[REGC]]|
387
388; SI: buffer_store_dword [[RESULT]]
389; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes %a * %b - fabs(%c), so the subtracted operand carries an absolute
; value. The three inputs are read with volatile loads from %ptr at elements
; %tid, %tid + 1 and %tid + 2; the result is stored to %out at element %tid.
390define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
391  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
392  %tid.ext = sext i32 %tid to i64
393  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
394  %add1 = add i64 %tid.ext, 1
395  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
396  %add2 = add i64 %tid.ext, 2
397  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
398  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
399  %a = load volatile float, float addrspace(1)* %gep0, align 4
400  %b = load volatile float, float addrspace(1)* %gep1, align 4
401  %c = load volatile float, float addrspace(1)* %gep2, align 4
402  %c.abs = call float @llvm.fabs.f32(float %c) #0
403  %mul = fmul float %a, %b
404  %sub = fsub float %mul, %c.abs
405  store float %sub, float addrspace(1)* %outgep, align 4
406  ret void
407}
408
409; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
410; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
411; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
412; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
413; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
414
415; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
416
417; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
418; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
419
420; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
421; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
422
423; SI: buffer_store_dword [[RESULT]]
424; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes fabs(%c) - %a * %b, the swapped-operand variant of the
; product-minus-absolute-value test. The three inputs are read with volatile
; loads from %ptr at elements %tid, %tid + 1 and %tid + 2; the result is
; stored to %out at element %tid.
425define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
426  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
427  %tid.ext = sext i32 %tid to i64
428  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
429  %add1 = add i64 %tid.ext, 1
430  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
431  %add2 = add i64 %tid.ext, 2
432  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
433  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
434  %a = load volatile float, float addrspace(1)* %gep0, align 4
435  %b = load volatile float, float addrspace(1)* %gep1, align 4
436  %c = load volatile float, float addrspace(1)* %gep2, align 4
437  %c.abs = call float @llvm.fabs.f32(float %c) #0
438  %mul = fmul float %a, %b
439  %sub = fsub float %c.abs, %mul
440  store float %sub, float addrspace(1)* %outgep, align 4
441  ret void
442}
443
444; GCN-LABEL: {{^}}neg_neg_mad_f32:
445; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
446; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
447; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
448
449; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGB]], [[REGA]]
450; SI-FLUSH: buffer_store_dword [[REGC]]
451; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
452
453; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
454
455; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
456; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
457
458; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
459; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
460
461; SI-DENORM: buffer_store_dword [[RESULT]]
462; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes (-%a) * (-%b) + %c, with each negation written as fsub -0.0, x.
; The value named %sub is actually produced by an fadd. The three inputs are
; read with volatile loads from %ptr at elements %tid, %tid + 1 and %tid + 2;
; the result is stored to %out at element %tid.
463define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
464  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
465  %tid.ext = sext i32 %tid to i64
466  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
467  %add1 = add i64 %tid.ext, 1
468  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
469  %add2 = add i64 %tid.ext, 2
470  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
471  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
472  %a = load volatile float, float addrspace(1)* %gep0, align 4
473  %b = load volatile float, float addrspace(1)* %gep1, align 4
474  %c = load volatile float, float addrspace(1)* %gep2, align 4
475  %nega = fsub float -0.000000e+00, %a
476  %negb = fsub float -0.000000e+00, %b
477  %mul = fmul float %nega, %negb
478  %sub = fadd float %mul, %c
479  store float %sub, float addrspace(1)* %outgep, align 4
480  ret void
481}
482
483; GCN-LABEL: {{^}}mad_fabs_sub_f32:
484; GCN: {{buffer|flat}}_load_dword [[REGA:v[0-9]+]]
485; GCN: {{buffer|flat}}_load_dword [[REGB:v[0-9]+]]
486; GCN: {{buffer|flat}}_load_dword [[REGC:v[0-9]+]]
487; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
488
489; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
490
491; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
492; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
493
494; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
495; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
496
497; SI: buffer_store_dword [[RESULT]]
498; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes %a * fabs(%b) - %c, so the absolute value sits on a multiplicand
; rather than on the subtracted operand. The three inputs are read with
; volatile loads from %ptr at elements %tid, %tid + 1 and %tid + 2; the result
; is stored to %out at element %tid.
499define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
500  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
501  %tid.ext = sext i32 %tid to i64
502  %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
503  %add1 = add i64 %tid.ext, 1
504  %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1
505  %add2 = add i64 %tid.ext, 2
506  %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2
507  %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext
508  %a = load volatile float, float addrspace(1)* %gep0, align 4
509  %b = load volatile float, float addrspace(1)* %gep1, align 4
510  %c = load volatile float, float addrspace(1)* %gep2, align 4
511  %b.abs = call float @llvm.fabs.f32(float %b) #0
512  %mul = fmul float %a, %b.abs
513  %sub = fsub float %mul, %c
514  store float %sub, float addrspace(1)* %outgep, align 4
515  ret void
516}
517
518; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
519; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
520; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
521; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
522; SI-FLUSH: buffer_store_dword [[R2]]
523; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
524
525; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
526
527; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
528; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
529
530; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
531; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
532
533; SI-DENORM: buffer_store_dword [[RESULT]]
534; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes %r2 - (%r1 + %r1) with a plain fadd followed by fsub. Both inputs
; are read with volatile loads from %out at elements %tid and %tid + 1, and
; the result is stored back to %out at element %tid; the %in argument is not
; referenced. The workitem-id call carries its attributes inline
; (nounwind readnone) instead of through an attribute group.
535define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
536  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
537  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
538  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
539  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
540
541  %r1 = load volatile float, float addrspace(1)* %gep.0
542  %r2 = load volatile float, float addrspace(1)* %gep.1
543
544  %add = fadd float %r1, %r1
545  %r3 = fsub float %r2, %add
546
547  store float %r3, float addrspace(1)* %gep.out
548  ret void
549}
550
551; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
552; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
553; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
554; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
555
556; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
557
558; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
559; GCN-DENORM-SLOWFMA-CONTRACT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
560
561; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
562; GCN-DENORM-STRICT: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
563
564; SI: buffer_store_dword [[RESULT]]
565; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Computes (%r1 + %r1) - %r2 with a plain fadd followed by fsub. Both inputs
; are read with volatile loads from %out at elements %tid and %tid + 1, and
; the result is stored back to %out at element %tid; the %in argument is not
; referenced. The workitem-id call carries its attributes inline
; (nounwind readnone) instead of through an attribute group.
566define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
567  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
568  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
569  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
570  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid
571
572  %r1 = load volatile float, float addrspace(1)* %gep.0
573  %r2 = load volatile float, float addrspace(1)* %gep.1
574
575  %add = fadd float %r1, %r1
576  %r3 = fsub float %r2, %add
577
578  store float %r3, float addrspace(1)* %gep.out
579  ret void
580}
581
582attributes #0 = { nounwind }
583attributes #1 = { nounwind readnone }
584