1; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s
2; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s
3
4; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM,VI %s
5; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
6
7; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
9; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
10; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
11
; Intrinsic declarations used by the kernels below.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1
15
16; GCN-LABEL: {{^}}fmuladd_f16:
17; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
18
19; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
20
21; GFX10-FLUSH:  v_mul_f16_e32
22; GFX10-FLUSH:  v_add_f16_e32
23; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
24
; Basic f16 fmuladd with all three operands loaded from global memory.
; Lowering differs per subtarget/denormal mode (see CHECK lines above).
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  ; r3 = r0 * r1 + r2
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}
34
35; GCN-LABEL: {{^}}fmul_fadd_f16:
36; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
37
38; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
39
40; GFX10-FLUSH:  v_mul_f16_e32
41; GFX10-FLUSH:  v_add_f16_e32
42; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
43
; Separate fmul + fadd with no 'contract' flag: fusing into fma is only
; checked for the -fp-contract=fast runs (the *-CONTRACT prefixes).
define amdgpu_kernel void @fmul_fadd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul half %r0, %r1
  %add = fadd half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}
54
55; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
56; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
57
58; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
59
60; GFX10-FLUSH:  v_mul_f16_e32
61; GFX10-FLUSH:  v_add_f16_e32
62; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
63
; Same as fmul_fadd_f16 but both ops carry the 'contract' fast-math flag,
; so fusion is checked under the plain VI-DENORM/GFX10-DENORM prefixes
; (i.e. independent of the -fp-contract setting).
define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul contract half %r0, %r1
  %add = fadd contract half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}
74
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16:
76; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
77; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
78; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
79; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
80
81; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
82; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]
83
84; GFX10-FLUSH:  v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
85; GFX10-FLUSH:  v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
86
87; VI-DENORM:    flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
88; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
89; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
90
; fmuladd(2.0, a, b): the FLUSH paths expand the multiply-by-2.0 into
; an add (a + a) per the GFX10-FLUSH checks above.
; Note: %in is unused; both loads read through %out. %gep.out aliases %gep.0.
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
104
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16:
106; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
107; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
108; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
109; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
110
111; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
112; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
113
114; GFX10-FLUSH:  v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
115; GFX10-FLUSH:  v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
116
117; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
118; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
119; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
120
; Same as fmuladd_2.0_a_b_f16 but with the 2.0 as the second multiplicand;
; the lowering checked above is identical (operands are commuted).
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
134
135; GCN-LABEL: {{^}}fadd_a_a_b_f16:
136; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
137; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
138; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
139; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
140
141; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
142; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
143
144; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
145; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
146
147; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
148
149; GFX10-FLUSH:           v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
150; GFX10-FLUSH:           v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
151; GFX10-FLUSH:           global_store_short v{{[0-9]+}}, [[RESULT]]
152; GFX10-DENORM-STRICT:   global_store_short v{{[0-9]+}}, [[RESULT]]
153; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
154
; (a + a) + b with no contract flag: strict denormal runs must keep the
; two adds (GCN-DENORM-STRICT); -fp-contract=fast runs may form a*2.0+b
; as mac/fma (the *-CONTRACT prefixes above).
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
171
172; GCN-LABEL: {{^}}fadd_b_a_a_f16:
173; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
174; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
175; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
176; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
177
178; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
179; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
180
181; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
183
184; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
185
186; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
187; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
188; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
189; GFX10-DENORM-STRICT:   global_store_short v{{[0-9]+}}, [[RESULT]]
190; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
191
; b + (a + a): same as fadd_a_a_b_f16 with the outer add's operands
; swapped, checking the commuted operand order in the strict adds.
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
208
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16:
210; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
211; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
212; VI-FLUSH:     v_mac_f16_e32 [[R2]], -2.0, [[R1]]
213; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
214; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
215; VI-FLUSH:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
216; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
217; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
218; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
219; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
220; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; fmuladd(-2.0, a, b): -2.0 is a legal inline constant for mac/fma;
; the GFX10-FLUSH path expands to (a + a) then b - that sum.
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
234
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16:
236; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
237; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
238; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
239; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
240
241; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
242; VI-DENORM:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
243
244; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
245; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
246; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
247
248; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
249; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; fmuladd(-2.0, -a, b): the fneg on %r1 folds with the -2.0 constant,
; so the checks above expect a plain 2.0 multiplier.
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
265
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16:
267; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
268; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
269; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
270; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
271
272; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
273; VI-DENORM:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
274
275; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
276; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
277; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
278
279; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
280; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; fmuladd(2.0, -a, b): the fneg on %r1 folds into the constant, so the
; checks above expect a -2.0 multiplier (and sub on the GFX10-FLUSH path).
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
296
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16:
298; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
299; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
300; VI-FLUSH:   v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
301; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
302; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
303; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
304; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
305; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; fmuladd(2.0, a, -b): the fneg on the addend becomes a negated source
; modifier on the mad/fma (checked as -[[R2]] above).
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fneg half %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
321
322; GCN-LABEL: {{^}}mad_sub_f16:
323; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
324; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
325; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
326
327; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
328
329; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
330
331; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
332; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
333
334; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
335
336; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
337; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
338; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; a*b - c: fsub of a multiply folds the subtrahend into a negated addend
; (mad/fma with -[[REGC]]) on the FLUSH/CONTRACT paths.
; NOTE(review): this kernel is marked #1 (nounwind readnone) yet performs
; loads and a store - confirm the attribute group is intended here.
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
356
357; GCN-LABEL: {{^}}mad_sub_inv_f16:
358; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
359; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
360; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
361; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
362
363; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
364
365; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
366; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
367
368; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
369
370; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
371; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
372; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
; c - a*b: the multiply is negated instead of the addend, checked as
; mad/fma with -[[REGA]] on the FLUSH/CONTRACT paths.
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
390
391; GCN-LABEL: {{^}}mad_sub_fabs_f16:
392; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
393; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
394; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
395; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
396
397; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
398
399; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
400; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
401
402; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
403
404; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
405; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
406; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; a*b - |c|: the fabs on the subtrahend becomes a combined -| | source
; modifier on the mad/fma addend (checked as -|[[REGC]]| above).
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
425
426; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
427; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
428; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
429; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
430
431; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
432
433; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
434
435; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
436; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
437
438; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
439
440; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
441; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
442; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; |c| - a*b: fabs on the minuend; the multiply is negated instead,
; checked as mad/fma with -[[REGA]] and |[[REGC]]| above.
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
461
462; GCN-LABEL: {{^}}neg_neg_mad_f16:
463; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
464; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
465; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
466
467; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
468; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
469
470; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
471; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
472
473; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
474; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
475; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
476
477; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
478; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
479; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
480; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
481; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
; (-a)*(-b) + c: the two fnegs cancel, so the checks above expect a plain
; mac/fma (or mul+add in strict denorm mode) with no negate modifiers.
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fneg half %a
  %negb = fneg half %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
501
502; GCN-LABEL: {{^}}mad_fabs_sub_f16:
503; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
504; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
505; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
506
507; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
508
509; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
510
511; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
512; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
513
514; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
515
516; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
517; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
518; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; a*|b| - c: fabs on a multiplicand becomes a | | source modifier and the
; subtrahend a negated addend (checked as |[[REGB]]|, -[[REGC]] above).
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
537
538; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
539; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
540; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
541; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
542; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
543
544; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
545; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
546
547; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
548; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
549
550; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
551
552; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
553; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
554; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
555; GFX10-DENORM-STRICT:   global_store_short v{{[0-9]+}}, [[RESULT]]
556; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
; r2 - (r1 + r1): contract-capable runs fold to r1*-2.0 + r2 (mac/fma with
; -2.0); strict/flush paths keep the add followed by a sub.
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
572
573; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
574; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
575; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
576
577; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
578
579; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
580
581; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
582; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
583
584; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
585
586; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
587; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
588; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; (r1 + r1) - r2: contract-capable runs fold to r1*2.0 - r2 (mad/fma with
; -[[R2]]); strict/flush paths keep the add followed by a sub.
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  ; Volatile keeps the two loads distinct and ordered.
  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
604
; Attribute groups referenced by the declarations and kernels above.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
607