1; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32FLUSH %s
2; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32DENORM %s
3; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
4; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
5
6;  fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
7
8; GCN-LABEL: {{^}}fadd_fpext_fmul_f16_to_f32:
9; GCN: s_waitcnt
10; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
11; GFX9-F32FLUSH-NEXT: s_setpc_b64
12
13; GFX9-F32DENORM-NEXT: v_mul_f16
14; GFX9-F32DENORM-NEXT: v_cvt_f32_f16
15; GFX9-F32DENORM-NEXT: v_add_f32
; IR under test: the half product of %x and %y is extended to float and used
; as the left operand of a float add with %z. Neither the fmul nor the fadd
; carries fast-math flags.
16define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
17entry:
18  %mul = fmul half %x, %y
19  %mul.ext = fpext half %mul to float
20  %add = fadd float %mul.ext, %z
21  ret float %add
22}
23
24; f16->f64 is not free.
25; GCN-LABEL: {{^}}fadd_fpext_fmul_f16_to_f64:
26; GFX89: v_mul_f16
27; GFX89: v_cvt_f32_f16
28; GFX89: v_cvt_f64_f32
29; GFX89: v_add_f64
; IR under test: same multiply/extend/add shape as the f16-to-f32 case, but
; the half product is extended all the way to double before being added to
; the double %z.
30define double @fadd_fpext_fmul_f16_to_f64(half %x, half %y, double %z) #0 {
31entry:
32  %mul = fmul half %x, %y
33  %mul.ext = fpext half %mul to double
34  %add = fadd double %mul.ext, %z
35  ret double %add
36}
37
38; f32->f64 is not free.
39; GCN-LABEL: {{^}}fadd_fpext_fmul_f32_to_f64:
40; GCN: v_mul_f32
41; GCN: v_cvt_f64_f32
42; GCN: v_add_f64
; IR under test: a float product of %x and %y is extended to double and used
; as the left operand of a double add with %z.
43define double @fadd_fpext_fmul_f32_to_f64(float %x, float %y, double %z) #0 {
44entry:
45  %mul = fmul float %x, %y
46  %mul.ext = fpext float %mul to double
47  %add = fadd double %mul.ext, %z
48  ret double %add
49}
50
51; fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
52; GCN-LABEL: {{^}}fadd_fpext_fmul_f16_to_f32_commute:
53; GCN: s_waitcnt
54; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
55; GFX9-F32FLUSH-NEXT: s_setpc_b64
56
57; GFX9-F32DENORM-NEXT: v_mul_f16
58; GFX9-F32DENORM-NEXT: v_cvt_f32_f16
59; GFX9-F32DENORM-NEXT: v_add_f32
60; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: commuted form of fadd_fpext_fmul_f16_to_f32. The extended
; half product is the right operand of the fadd and %z is the left operand.
61define float @fadd_fpext_fmul_f16_to_f32_commute(half %x, half %y, float %z) #0 {
62entry:
63  %mul = fmul half %x, %y
64  %mul.ext = fpext half %mul to float
65  %add = fadd float %z, %mul.ext  ; operands swapped relative to the non-commuted test
66  ret float %add
67}
68
69; fold (fadd (fma x, y, (fpext (fmul u, v))), z)
70;   -> (fma x, y, (fma (fpext u), (fpext v), z))
71
72; GCN-LABEL: {{^}}fadd_muladd_fpext_fmul_f16_to_f32:
73; GCN: s_waitcnt
74; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
75; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
76; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
77; GFX9-F32FLUSH-NEXT: s_setpc_b64
78
79; GFX9-F32DENORM-NEXT: v_mul_f16
80; GFX9-F32DENORM-NEXT: v_cvt_f32_f16
81; GFX9-F32DENORM-NEXT: v_fma_f32
82; GFX9-F32DENORM-NEXT: v_add_f32
83; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the half product of %u and %v is extended to float and
; passed as the addend (third operand) of llvm.fmuladd.f32(%x, %y, ...).
; The fmuladd result is then the left operand of a float add with %z.
84define float @fadd_muladd_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
85entry:
86  %mul = fmul half %u, %v
87  %mul.ext = fpext half %mul to float
88  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.ext)
89  %add = fadd float %fma, %z
90  ret float %add
91}
92
93; fold (fadd x, (fma y, z, (fpext (fmul u, v))))
94;   -> (fma y, z, (fma (fpext u), (fpext v), x))
95; GCN-LABEL: {{^}}fadd_muladd_fpext_fmul_f16_to_f32_commute:
96; GCN: s_waitcnt
97; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
98; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
99; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
100; GFX9-F32FLUSH-NEXT: s_setpc_b64
101
102; GFX9-F32DENORM-NEXT: v_mul_f16
103; GFX9-F32DENORM-NEXT: v_cvt_f32_f16
104; GFX9-F32DENORM-NEXT: v_fma_f32
105; GFX9-F32DENORM-NEXT: v_add_f32
106; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: commuted form of fadd_muladd_fpext_fmul_f16_to_f32. The
; llvm.fmuladd.f32 result is the right operand of the final fadd and %z is
; the left operand.
107define float @fadd_muladd_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
108entry:
109  %mul = fmul half %u, %v
110  %mul.ext = fpext half %mul to float
111  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.ext)
112  %add = fadd float %z, %fma  ; operands swapped relative to the non-commuted test
113  ret float %add
114}
115
116; GCN-LABEL: {{^}}fadd_fmad_fpext_fmul_f16_to_f32:
117; GCN: s_waitcnt
118; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
119; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
120; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
121; GFX9-F32FLUSH-NEXT: s_setpc_b64
122
123; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
124; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
125; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
; IR under test: like the fmuladd variant, but the multiply-add of %x and %y
; is spelled as a separate float fmul and fadd that both carry the
; `contract` flag. The half fmul and the final fadd with %z carry no flags.
126define float @fadd_fmad_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
127entry:
128  %mul = fmul half %u, %v
129  %mul.ext = fpext half %mul to float
130  %mul1 = fmul contract float %x, %y
131  %fmad = fadd contract float %mul1, %mul.ext  ; extended half product is the addend
132  %add = fadd float %fmad, %z
133  ret float %add
134}
135
136; fold (fadd (fma x, y, (fpext (fmul u, v))), z)
137;   -> (fma x, y, (fma (fpext u), (fpext v), z))
138
139; GCN-LABEL: {{^}}fadd_fma_fpext_fmul_f16_to_f32:
140; GCN: s_waitcnt
141; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
142; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
143; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
144; GFX9-F32FLUSH-NEXT: s_setpc_b64
145
146; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
147; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
148; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
149; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v0, v4
150; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the half product of %u and %v (fmul marked `contract`) is
; extended to float and passed as the addend of llvm.fma.f32(%x, %y, ...).
; The fma result is the left operand of a flag-free float add with %z.
151define float @fadd_fma_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
152entry:
153  %mul = fmul contract half %u, %v
154  %mul.ext = fpext half %mul to float
155  %fma = call float @llvm.fma.f32(float %x, float %y, float %mul.ext)
156  %add = fadd float %fma, %z
157  ret float %add
158}
159
160; GCN-LABEL: {{^}}fadd_fma_fpext_fmul_f16_to_f32_commute:
161; GCN: s_waitcnt
162; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
163; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
164; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
165; GFX9-F32FLUSH-NEXT: s_setpc_b64
166
167; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v2, v2, v3
168; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
169; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v2
170; GFX9-F32DENORM-NEXT: v_add_f32_e32 v0, v4, v0
171; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: commuted form of fadd_fma_fpext_fmul_f16_to_f32. The
; llvm.fma.f32 result is the right operand of the final fadd and %z is the
; left operand.
172define float @fadd_fma_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
173entry:
174  %mul = fmul contract half %u, %v
175  %mul.ext = fpext half %mul to float
176  %fma = call float @llvm.fma.f32(float %x, float %y, float %mul.ext)
177  %add = fadd float %z, %fma  ; operands swapped relative to the non-commuted test
178  ret float %add
179}
180
181; fold (fadd x, (fpext (fma y, z, (fmul u, v))))
182;   -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
183
184; GCN-LABEL: {{^}}fadd_fpext_fmuladd_f16_to_f32:
185; GCN: s_waitcnt
186; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
187; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
188; GFX9-F32FLUSH-NEXT: s_setpc_b64
189
190; GFX9-F32DENORM-NEXT: v_mul_f16
191; GFX9-F32DENORM-NEXT: v_fma_f16
192; GFX9-F32DENORM-NEXT: v_cvt_f32_f16
193; GFX9-F32DENORM-NEXT: v_add_f32
194; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the whole multiply-add is done in half. A `contract` half
; product of %u and %v is the addend of llvm.fmuladd.f16(%y, %z, ...), and
; only the fmuladd result is extended to float. The extended value is the
; right operand of a float add whose left operand is %x.
195define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
196entry:
197  %mul = fmul contract half %u, %v
198  %fma = call half @llvm.fmuladd.f16(half %y, half %z, half %mul)
199  %ext.fma = fpext half %fma to float
200  %add = fadd float %x, %ext.fma
201  ret float %add
202}
203
204; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32:
205; GCN: s_waitcnt
206; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
207; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
208; GFX9-F32FLUSH-NEXT: s_setpc_b64
209
210; GFX9-F32DENORM-NEXT: v_mul_f16
211; GFX9-F32DENORM-NEXT: v_fma_f16
212; GFX9-F32DENORM-NEXT: v_cvt_f32_f16
213; GFX9-F32DENORM-NEXT: v_add_f32
214; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: same shape as fadd_fpext_fmuladd_f16_to_f32, but the half
; multiply-add is a call to llvm.fma.f16 instead of llvm.fmuladd.f16.
215define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
216entry:
217  %mul = fmul contract half %u, %v
218  %fma = call half @llvm.fma.f16(half %y, half %z, half %mul)
219  %ext.fma = fpext half %fma to float
220  %add = fadd float %x, %ext.fma
221  ret float %add
222}
223
224; GCN-LABEL: {{^}}fadd_fpext_fma_f16_to_f32_commute:
225; GCN: s_waitcnt
226; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
227; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
228; GFX9-F32FLUSH-NEXT: s_setpc_b64
229
230; GFX9-F32DENORM-NEXT: v_mul_f16
231; GFX9-F32DENORM-NEXT: v_fma_f16
232; GFX9-F32DENORM-NEXT: v_cvt_f32_f16
233; GFX9-F32DENORM-NEXT: v_add_f32_e32
234; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: commuted form of fadd_fpext_fma_f16_to_f32. The extended
; llvm.fma.f16 result is the left operand of the fadd and %x is the right
; operand.
235define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
236entry:
237  %mul = fmul contract half %u, %v
238  %fma = call half @llvm.fma.f16(half %y, half %z, half %mul)
239  %ext.fma = fpext half %fma to float
240  %add = fadd float %ext.fma, %x  ; operands swapped relative to the non-commuted test
241  ret float %add
242}
243
244; fold (fsub (fpext (fmul x, y)), z)
245;   -> (fma (fpext x), (fpext y), (fneg z))
246
247; GCN-LABEL: {{^}}fsub_fpext_fmul_f16_to_f32:
248; GCN: s_waitcnt
249; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]{{$}}
250; GFX9-F32FLUSH-NEXT: s_setpc_b64
251
252; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
253; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
254; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2
255; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the half product of %x and %y is extended to float and %z
; is subtracted from it. Neither the fmul nor the fsub carries fast-math
; flags.
256define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
257entry:
258  %mul = fmul half %x, %y
259  %mul.ext = fpext half %mul to float
260  %add = fsub float %mul.ext, %z  ; named %add, but this is a subtraction
261  ret float %add
262}
263
264; fold (fsub x, (fpext (fmul y, z)))
265;   -> (fma (fneg (fpext y)), (fpext z), x)
266
267; GCN-LABEL: {{^}}fsub_fpext_fmul_f16_to_f32_commute:
268; GCN: s_waitcnt
269; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0]
270; GFX9-F32FLUSH-NEXT: s_setpc_b64
271
272; GFX9-F32DENORM-NEXT: v_mul_f16_e32
273; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32
274; GFX9-F32DENORM-NEXT: v_sub_f32_e32
275; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the extended half product of %y and %z is the subtrahend;
; it is subtracted from the float %x. Both the fmul and the fsub carry the
; `contract` flag.
276define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0 {
277entry:
278  %mul = fmul contract half %y, %z
279  %mul.ext = fpext half %mul to float
280  %add = fsub contract float %x, %mul.ext  ; named %add, but this is a subtraction
281  ret float %add
282}
283
284; fold (fsub (fpext (fneg (fmul x, y))), z)
285;   -> (fneg (fma (fpext x), (fpext y), z))
286
287; GCN-LABEL: {{^}}fsub_fpext_fneg_fmul_f16_to_f32:
288; GCN: s_waitcnt
289; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]{{$}}
290; GFX9-F32FLUSH-NEXT: s_setpc_b64
291
292; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1
293; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
294; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2
295; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the half product of %x and %y is negated while still in
; half, then extended to float, and %z is subtracted from the result.
296define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
297entry:
298  %mul = fmul half %x, %y
299  %neg.mul = fsub half -0.0, %mul  ; negation written as a subtraction from -0.0
300  %neg.mul.ext = fpext half %neg.mul to float
301  %add = fsub float %neg.mul.ext, %z  ; named %add, but this is a subtraction
302  ret float %add
303}
304
305; fold (fsub (fneg (fpext (fmul x, y))), z)
306;   -> (fneg (fma (fpext x), (fpext y), z))
307
308; GCN-LABEL: {{^}}fsub_fneg_fpext_fmul_f16_to_f32:
309; GCN: s_waitcnt
310; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]{{$}}
311; GFX9-F32FLUSH-NEXT: s_setpc_b64
312
313; GFX9-F32DENORM-NEXT: v_mul_f16_e64 v0, v0, -v1
314; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
315; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2
316; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the half product of %x and %y is extended to float first
; and negated afterwards, in float; %z is then subtracted from the negated
; value.
317define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
318entry:
319  %mul = fmul half %x, %y
320  %mul.ext = fpext half %mul to float
321  %neg.mul.ext = fneg float %mul.ext  ; negation applied after the extension
322  %add = fsub float %neg.mul.ext, %z  ; named %add, but this is a subtraction
323  ret float %add
324}
325
326; fold (fsub (fmad x, y, (fpext (fmul u, v))), z)
327;    -> (fmad x, y, (fmad (fpext u), (fpext v), (fneg z)))
328; GCN-LABEL: {{^}}fsub_muladd_fpext_mul_f16_to_f32:
329; GCN: s_waitcnt
330; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v2, v3, v4, -v2 op_sel_hi:[1,1,0]{{$}}
331; GFX9-F32FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
332; GFX9-F32FLUSH-NEXT: v_mov_b32_e32 v0, v2
333; GFX9-F32FLUSH-NEXT: s_setpc_b64
334
335; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
336; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3
337; GFX9-F32DENORM-NEXT: v_fma_f32 v0, v0, v1, v3
338; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2
339; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the half product of %u and %v is extended to float and
; passed as the addend of llvm.fmuladd.f32(%x, %y, ...); %z is then
; subtracted from the fmuladd result. The half fmul and the float fsub both
; carry the `reassoc` flag. Note that %z is the third parameter here, ahead
; of %u and %v.
340define float @fsub_muladd_fpext_mul_f16_to_f32(float %x, float %y, float %z, half %u, half %v) #0 {
341entry:
342  %mul = fmul reassoc half %u, %v
343  %mul.ext = fpext half %mul to float
344  %fma = call float @llvm.fmuladd.f32(float %x, float %y, float %mul.ext)
345  %add = fsub reassoc float %fma, %z  ; named %add, but this is a subtraction
346  ret float %add
347}
348
349;  fold (fsub (fpext (fmad x, y, (fmul u, v))), z)
350;    -> (fmad (fpext x), (fpext y),
351;            (fmad (fpext u), (fpext v), (fneg z)))
352
353; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32:
354; GFX9: v_mul_f16
355; GFX9: v_fma_f16
356; GFX9: v_cvt_f32_f16
357; GFX9: v_sub_f32
358; GCN: s_setpc_b64
; IR under test: the whole multiply-add is done in half. The half product
; of %u and %v is the addend of llvm.fmuladd.f16(%x, %y, ...), the fmuladd
; result is extended to float, and the float %z is subtracted from it. No
; instruction carries fast-math flags.
359define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half %u, half %v) #0 {
360entry:
361  %mul = fmul half %u, %v
362  %fma = call half @llvm.fmuladd.f16(half %x, half %y, half %mul)
363  %fma.ext = fpext half %fma to float
364  %add = fsub float %fma.ext, %z  ; named %add, but this is a subtraction
365  ret float %add
366}
367
368; fold (fsub x, (fmad y, z, (fpext (fmul u, v))))
369;   -> (fmad (fneg y), z, (fmad (fneg (fpext u)), (fpext v), x))
370; GCN-LABEL: {{^}}fsub_muladd_fpext_mul_f16_to_f32_commute:
371; GCN: s_waitcnt
372; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v3, v4, v0 op_sel_hi:[1,1,0]{{$}}
373; GFX9-F32FLUSH-NEXT: v_mad_f32 v0, -v1, v2, v0{{$}}
374; GFX9-F32FLUSH-NEXT: s_setpc_b64
375
376; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
377; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v3, v3
378; GFX9-F32DENORM-NEXT: v_fma_f32 v1, v1, v2, v3
379; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v1
380; GFX9-F32DENORM-NEXT: s_setpc_b64
; IR under test: the extended half product of %u and %v is the addend of
; llvm.fmuladd.f32(%y, %z, ...), and the fmuladd result is the subtrahend:
; it is subtracted from %x. The half fmul and the float fsub both carry the
; `reassoc` flag.
381define float @fsub_muladd_fpext_mul_f16_to_f32_commute(float %x, float %y, float %z, half %u, half %v) #0 {
382entry:
383  %mul = fmul reassoc half %u, %v
384  %mul.ext = fpext half %mul to float
385  %fma = call float @llvm.fmuladd.f32(float %y, float %z, float %mul.ext)
386  %add = fsub reassoc float %x, %fma  ; named %add, but this is a subtraction
387  ret float %add
388}
389
390; fold (fsub x, (fpext (fma y, z, (fmul u, v))))
391;    -> (fma (fneg (fpext y)), (fpext z),
392;            (fma (fneg (fpext u)), (fpext v), x))
393; GCN-LABEL: {{^}}fsub_fpext_muladd_mul_f16_to_f32_commute:
394; GCN: s_waitcnt
395; GFX9-NEXT: v_mul_f16_e32 v3, v3, v4
396; GFX9-NEXT: v_fma_f16 v1, v1, v2, v3
397; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
398; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
399; GFX9-NEXT: s_setpc_b64
; IR under test: the half product of %u and %v is the addend of
; llvm.fmuladd.f16(%y, %z, ...); the half result is extended to float and
; is the subtrahend, subtracted from the float %x. No instruction carries
; fast-math flags.
400define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
401entry:
402  %mul = fmul half %u, %v
403  %fma = call half @llvm.fmuladd.f16(half %y, half %z, half %mul)
404  %fma.ext = fpext half %fma to float
405  %add = fsub float %x, %fma.ext  ; named %add, but this is a subtraction
406  ret float %add
407}
408
; Declarations of the multiply-add intrinsics called by the test functions
; in this file, in float and half variants.
409declare float @llvm.fmuladd.f32(float, float, float) #0
410declare float @llvm.fma.f32(float, float, float) #0
411declare half @llvm.fmuladd.f16(half, half, half) #0
412declare half @llvm.fma.f16(half, half, half) #0
413
; Attribute group referenced as #0 by the defines and declares visible in
; this file.
414attributes #0 = { nounwind readnone speculatable }
415