1; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
2; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
4; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
5
6; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
7; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
8; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
9; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
10
11declare i32 @llvm.amdgcn.workitem.id.x() #1
12declare half @llvm.fmuladd.f16(half, half, half) #1
13declare half @llvm.fabs.f16(half) #1
14
15; GCN-LABEL: {{^}}fmuladd_f16:
16; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
17
; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
; Baseline f16 fmuladd: load three halves, apply llvm.fmuladd, store the
; result. The prefix checks above assert mac selection when f16 denormals
; are flushed and fma selection when they are preserved.
define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}
28
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16:
30; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
31; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
32; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
33; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
34
35; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
36; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fmuladd(2.0, a, b): the constant multiplicand should become an inline
; immediate operand of mac/fma. Loads are volatile so neither load nor
; their relative order can be optimized away.
define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out are the same address (%out + %tid); %gep.1 is the
  ; following element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
50
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16:
52; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
53; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
54; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
55; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
56
57; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
58; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Same as fmuladd_2.0_a_b_f16 but with the constant as the second operand;
; checks that the commuted form folds to the identical instruction.
define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
72
73; GCN-LABEL: {{^}}fadd_a_a_b_f16:
74; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
75; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
76; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
77; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
78
79; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
80
81; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
82; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
83
84; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; (a + a) + b: (a + a) may be treated as a * 2.0 and contracted into
; mac/fma only when contraction is permitted; under strict denormal-safe
; codegen it must stay as two adds (see the strict-prefix checks above).
define void @fadd_a_a_b_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
101
102; GCN-LABEL: {{^}}fadd_b_a_a_f16:
103; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
104; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
105; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
106; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
107
108; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
109
110; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
111; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
112
113; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; b + (a + a): commuted variant of fadd_a_a_b_f16; same contraction
; expectations, with the operand order of the final add swapped.
define void @fadd_b_a_a_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
130
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16:
132; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
133; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
134; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
135; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
136; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; fmuladd(-2.0, a, b): a negated inline-immediate multiplicand should fold
; directly into the mac/fma encoding.
define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
150
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16:
152; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
153; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
154; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
155; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
156
157; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
158; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fmuladd(-2.0, -a, b) == 2*a + b: both negations should cancel in the
; flush path (mac with +2.0); the fneg is expressed as fsub from -0.0,
; the canonical IR negation before the fneg instruction existed.
define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; -a, written as (-0.0) - a
  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
174
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16:
176; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
177; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
178; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
179; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
180
181; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
182; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fmuladd(2.0, -a, b) == -2*a + b: the negation should be folded into the
; immediate (mac with -2.0) or into a source-modifier on the fma operand.
define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; -a, written as (-0.0) - a
  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
198
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16:
200; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
201; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
202; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
203; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
204; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; fmuladd(2.0, a, -b) == 2*a - b: negating the addend forces the
; three-address mad/fma form (mac cannot negate its accumulator operand).
define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; -b, written as (-0.0) - b
  %r2.fneg = fsub half -0.000000e+00, %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
220
221; GCN-LABEL: {{^}}mad_sub_f16:
222; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
223; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
224; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
225
226; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
227
228; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
229
230; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
231; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
232
233; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; a*b - c: contracts to mad/fma with a negated addend when contraction is
; allowed, otherwise stays a separate mul + subtract.
; Fix: this define carried attributes #1 (nounwind readnone), but the body
; stores to %out; readnone licenses deleting that store, so the attribute
; set was wrong. Use #0 (nounwind) like the other functions in this file.
define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  ; a, b, c live at %ptr[tid], %ptr[tid+1], %ptr[tid+2]
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
251
252; GCN-LABEL: {{^}}mad_sub_inv_f16:
253; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
254; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
255; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
256; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
257
258; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
259
260; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
261; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
262
263; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; c - a*b: contracts to mad/fma with a negated multiplicand when
; contraction is allowed, otherwise mul + reverse-subtract.
; Fix: this define carried attributes #1 (nounwind readnone), but the body
; stores to %out; readnone licenses deleting that store, so the attribute
; set was wrong. Use #0 (nounwind) like the other functions in this file.
define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  ; a, b, c live at %ptr[tid], %ptr[tid+1], %ptr[tid+2]
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
281
282; GCN-LABEL: {{^}}mad_sub_fabs_f16:
283; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
284; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
285; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
286; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
287
288; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
289
290; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
291; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
292
293; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; a*b - |c|: the fabs should fold into a source modifier (-|c|) on the
; contracted mad/fma addend.
; Fix: this define carried attributes #1 (nounwind readnone), but the body
; stores to %out; readnone licenses deleting that store, so the attribute
; set was wrong. Use #0 (nounwind) like the other functions in this file.
define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  ; a, b, c live at %ptr[tid], %ptr[tid+1], %ptr[tid+2]
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
312
313; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
314; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
315; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
316; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
317
318; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
319
320; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
321
322; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
323; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
324
325; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; |c| - a*b: the fabs folds into a |c| modifier while the product is
; negated on the contracted mad/fma.
; Fix: this define carried attributes #1 (nounwind readnone), but the body
; stores to %out; readnone licenses deleting that store, so the attribute
; set was wrong. Use #0 (nounwind) like the other functions in this file.
define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  ; a, b, c live at %ptr[tid], %ptr[tid+1], %ptr[tid+2]
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
344
345; GCN-LABEL: {{^}}neg_neg_mad_f16:
346; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
347; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
348; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
349
350; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGB]], [[REGA]]
351; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
352
353; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
354
355; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
356; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
357; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; (-a)*(-b) + c == a*b + c: the two fnegs must cancel, producing a plain
; mac/fma (or mul + add under strict denormal-safe codegen).
; Fix: this define carried attributes #1 (nounwind readnone), but the body
; stores to %out; readnone licenses deleting that store, so the attribute
; set was wrong. Use #0 (nounwind) like the other functions in this file.
define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  ; a, b, c live at %ptr[tid], %ptr[tid+1], %ptr[tid+2]
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fsub half -0.000000e+00, %a
  %negb = fsub half -0.000000e+00, %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
377
378; GCN-LABEL: {{^}}mad_fabs_sub_f16:
379; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
380; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
381; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
382
383; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
384
385; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
386
387; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
388; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
389
390; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; a*|b| - c: the fabs folds into a |b| source modifier on the multiplicand
; of the contracted mad/fma (or on the mul in the strict path).
; Fix: this define carried attributes #1 (nounwind readnone), but the body
; stores to %out; readnone licenses deleting that store, so the attribute
; set was wrong. Use #0 (nounwind) like the other functions in this file.
define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #0 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  ; a, b, c live at %ptr[tid], %ptr[tid+1], %ptr[tid+2]
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
409
410; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
411; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
412; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
413; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
414; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
415
416; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
417
418; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
419; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
420
421; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; b - (a + a) == -2*a + b: contracts to mac/fma with -2.0 when allowed;
; under strict denormal-safe codegen it must stay add + reverse-subtract.
define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
437
438; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
439; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
440; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
441
442; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
443
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
445
446; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
447; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
448
449; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; (a + a) - b == 2*a - b: contracts to mad/fma with a negated addend when
; allowed; under strict denormal-safe codegen it stays add + subtract.
define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 and %gep.out alias (%out + %tid); %gep.1 is the next element.
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
465
466attributes #0 = { nounwind }
467attributes #1 = { nounwind readnone }
468