; RUN: llc -march=amdgcn -mcpu=hawaii -start-before=amdgpu-unify-divergent-exit-nodes -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s

; RUN: llc -march=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s
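; GCN-SAFE prefixes check the default output, which must preserve signed
; zeros; GCN-NSZ prefixes check the same code compiled with
; -enable-no-signed-zeros-fp-math, which permits folding the fneg into the
; operation that feeds it. SI and VI cover the two subtargets, which differ
; mainly in f16 support and the available inline constants.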

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------
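;
; With nsz, fneg (fadd a, b) folds to fsub (fneg a), b (and similar forms
; when an operand is already negated); without nsz the fadd is kept and the
; sign bit of its result is flipped with a v_xor_b32 of 0x80000000, as the
; GCN-SAFE checks expect.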

; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %add = fadd float %a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; This one asserted with -enable-no-signed-zeros-fp-math
; GCN-LABEL: {{^}}fneg_fadd_0:
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
  %tmp7 = fdiv float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; This is a workaround because -enable-no-signed-zeros-fp-math does not
; automatically set the unsafe-fp-math function attribute. Merge this test
; with the previous one when that is fixed.
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------
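;
; fneg (fmul a, b) folds to fmul a, -b for both the SAFE and NSZ runs:
; negating one multiply operand is exact, so no fast-math flag is needed.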

; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = fmul float %a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------
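;
; fneg (minnum a, b) becomes maxnum (fneg a), (fneg b). Kernels run in IEEE
; mode, so each input must first be quieted; the v_mul_f32 by -1.0 both
; quiets a signaling NaN and applies the negation. amdgpu_ps functions run
; with IEEE mode off and skip the quieting.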

; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  store float %min.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  ret float %min.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

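; 1/(2*pi) is an inline constant on VI but not on SI, so the SI checks look
; for the 0x3e22f983 / 0xbe22f983 bit patterns in a register while the VI
; checks use the 0.15915494 literal. The f16 and f64 variants below split
; the same way.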
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]

; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; VI: v_min_f16_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MIN]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_min_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[A]], 0.15915494
; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]

; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  ret float %mul
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------

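; Mirror of the minnum tests: fneg (maxnum a, b) becomes
; minnum (fneg a), (fneg b), with the same IEEE-mode quieting in kernels.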
; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  store float %max.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  ret float %max.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call nnan float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  ret float %mul
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[MIN0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MIN0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MIN0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
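;
; With nsz, fneg (fma a, b, c) folds to fma a, -b, -c; without nsz the fma
; is kept and the sign bit of its result is flipped with v_xor_b32 instead.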

; GCN-LABEL: {{^}}v_fneg_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]

; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fma, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]

; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.b = fneg float %b
  %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_xor_b32
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
; GCN-SAFE: v_xor_b32

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]

; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fneg.a, %d
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmad tests
; --------------------------------------------------------------------------------
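; llvm.fmuladd selects to v_mac/v_mad here. As with fma, the nsz runs
; are expected to fold the fneg into the mad's source modifiers, while
; the safe runs negate the result with a sign-bit xor.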

; GCN-LABEL: {{^}}v_fneg_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]

; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:

; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
  %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
  %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  %fneg = fneg <4 x float> %fma
  store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]

; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fp_extend tests
; --------------------------------------------------------------------------------
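; fpext preserves the sign exactly, so the fneg is expected to fold
; into a source modifier on the conversion in all runs, and
; fneg(fpext(fneg(x))) cancels to a plain convert.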

; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]]
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[CVT_LO]]:[[CVT_HI]]]
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %fpext, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v[[[CVT_LO]]:[[CVT_HI]]], 4.0
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  %mul = fmul double %fpext, 4.0
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %mul, double addrspace(1)* %out.gep
  ret void
}

; FIXME: Source modifiers not folded for f16->f32
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpext, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  %mul = fmul float %fpext, 4.0
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fp_round tests
; --------------------------------------------------------------------------------
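; Like fpext, fptrunc is exact with respect to the sign, so the fneg
; folds into a source modifier on the f64->f32 and f32->f16 converts,
; and a pair of negations cancels.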

; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v[[[A_LO]]:[[A_HI]]]
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[A_LO]]:[[NEG_A_HI]]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %fneg.a, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s[

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  %use1 = fmul double %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %use1, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpround = fptrunc float %a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpround, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp tests
; --------------------------------------------------------------------------------
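; rcp(-x) == -rcp(x), so the fneg is expected to fold into a source
; modifier on v_rcp_f32, and a double negation cancels.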

; GCN-LABEL: {{^}}v_fneg_rcp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; fmul_legacy tests
; --------------------------------------------------------------------------------
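; The fneg is expected to fold into a source modifier on one operand of
; v_mul_legacy_f32, mirroring the handling of an ordinary fmul.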

; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], 4.0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------
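; llvm.sin expands to a scale by 1/(2*pi), a fract, and v_sin. Since
; sin is odd, the outer fneg is expected to fold into the scale
; constant: 0xbe22f983 is -(1/(2*pi)). llvm.amdgcn.sin maps directly to
; v_sin, where the fneg becomes a source modifier.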

; GCN-LABEL: {{^}}v_fneg_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.amdgcn.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; ftrunc tests
; --------------------------------------------------------------------------------
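; trunc commutes exactly with negation, so the fneg folds into a source
; modifier on v_trunc_f32.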

; GCN-LABEL: {{^}}v_fneg_trunc_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %trunc = call float @llvm.trunc.f32(float %a)
  %fneg = fneg float %trunc
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fround tests
; --------------------------------------------------------------------------------
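; llvm.round expands to a trunc/sub/cndmask sequence ending in an add.
; The nsz runs fold the fneg through that final add into a sub with a
; negated operand; the safe runs keep the add and xor the sign bit.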

; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %round = call float @llvm.round.f32(float %a)
  %fneg = fneg float %round
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; rint tests
; --------------------------------------------------------------------------------
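; llvm.rint selects to v_rndne_f32; round-to-nearest-even commutes
; exactly with negation, so the fneg folds into a source modifier.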

; GCN-LABEL: {{^}}v_fneg_rint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rint = call float @llvm.rint.f32(float %a)
  %fneg = fneg float %rint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; nearbyint tests
; --------------------------------------------------------------------------------
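; llvm.nearbyint lowers to the same v_rndne_f32 as llvm.rint, so the
; fneg again folds into a source modifier.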

; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %nearbyint = call float @llvm.nearbyint.f32(float %a)
  %fneg = fneg float %nearbyint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fcanonicalize tests
; --------------------------------------------------------------------------------
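; canonicalize is implemented as a multiply by 1.0, so folding in the
; fneg simply flips the constant to -1.0.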

; GCN-LABEL: {{^}}v_fneg_canonicalize_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %canon = call float @llvm.canonicalize.f32(float %a)
  %fneg = fneg float %canon
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; vintrp tests
; --------------------------------------------------------------------------------
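; The vintrp instructions do not accept source modifiers, so the
; negation is instead pushed into the v_mul that feeds both
; v_interp operations.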

; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; CopyToReg tests
; --------------------------------------------------------------------------------
2236
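; Note: the negated value is live across the branch via CopyToReg. Since the
; un-negated %mul is also stored, the negation is materialized as a v_xor of the
; sign bit in the block that needs it rather than folded into the multiply.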
; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc0

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN: s_endpgm

; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]

define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %cmp0 = icmp eq i32 %d, 0
  br i1 %cmp0, label %if, label %endif

if:
  %mul1 = fmul float %fneg, %c
  store volatile float %mul1, float addrspace(1)* %out.gep
  br label %endif

endif:
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; inlineasm tests
; --------------------------------------------------------------------------------

; Can't fold into use, so should fold into source
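; Note: the "v" constraint forces the negated value into a VGPR, so the fneg
; can't fold into the asm use and is folded into the producing multiply instead.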
; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: ; use [[MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %fneg, float addrspace(1)* %out.gep
  ret void
}

; Can't fold the fneg into the use, and the source mul has another use, so the
; negation must be materialized with a separate sign-bit xor.
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; code size regression tests
; --------------------------------------------------------------------------------

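; Note: fneg source modifiers are free in the 64-bit VOP3 encoding, but using
; one on a VOP2 instruction forces the larger VOP3 (_e64) form, growing the
; instruction from 4 to 8 bytes.
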
; There are multiple users of the fneg, but all of them are VOP3
; instructions, so folding the modifier carries no code size penalty.
; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %fma1, float addrspace(1)* %out
  ret void
}

; There are multiple users, but both require using a larger encoding
; for the modifier.

; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %mul0 = fmul float %fneg.a, %b
  %mul1 = fmul float %fneg.a, %c

  store volatile float %mul0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; One user is VOP3, so folding the modifier into it is free; folding into the
; other user would require the larger encoding.
; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
  %mul1 = fmul float %fneg.a, %c

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The use of the fneg requires a code size increase, but folding it into
; the source does not.
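; Note: with signed zeros honored (GCN-SAFE) the fneg stays on the fma result
; and forces VOP3 multiplies; under -enable-no-signed-zeros-fp-math (GCN-NSZ)
; it distributes into the fma as fma(a, -b, -2.0), leaving the multiplies in
; their VOP2 form.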

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]

; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
  %fneg.fma0 = fneg float %fma0
  %mul1 = fmul float %fneg.fma0, %c
  %mul2 = fmul float %fneg.fma0, %d

  store volatile float %mul1, float addrspace(1)* %out
  store volatile float %mul2, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]

; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %b = load volatile double, double addrspace(1)* %b.gep
  %c = load volatile double, double addrspace(1)* %c.gep
  %d = load volatile double, double addrspace(1)* %d.gep

  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  %fneg.fma0 = fneg double %fma0
  %mul1 = fmul double %fneg.fma0, %c
  %mul2 = fmul double %fneg.fma0, %d

  store volatile double %mul1, double addrspace(1)* %out
  store volatile double %mul2, double addrspace(1)* %out
  ret void
}

; %trunc.a has one fneg use, but folding it there requires a code size increase,
; and the fneg can instead be folded for free into the fma.

; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  store volatile float %fma0, float addrspace(1)* %out
  ret void
}

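; Note: here %trunc.a also has a non-negated use, so the fneg folds into the
; fma (where the modifier is free) and the multiply consumes the positive value
; directly.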
; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  %mul1 = fmul float %trunc.a, %d
  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The AMDGPU combine to pull fneg into the FMA operands was being
; undone by the generic combine to pull the fneg out of the fma if
; !isFNegFree. We were reporting false for v2f32 even though it will
; be split into f32 where it will be free.
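; Note: s_brev_b32 of 1 materializes 0x80000000, i.e. -0.0. The fneg distributes
; as -(a*b + 0.0) = a*(-b) + (-0.0), and the negated add of %arg becomes the
; v_sub instructions below.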
; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop:
; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}}
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]]
; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]]
; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0
; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1
; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4
; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5
; GCN: s_setpc_b64
define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
bb:
  %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
  %i4 = fadd fast <2 x float> %i3, %arg
  %i5 = fneg <2 x float> %i4
  %i6 = fmul fast <2 x float> %i5, %arg2
  ret <2 x float> %i6
}

; This function expects denormal flushing, so the fmul by -1.0 can't simply be
; turned into an fneg.
; TODO: Keeping this as fmul saves encoding size
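; Note: the v_sub below computes -0.0 - x (0x80000000 is the bit pattern of
; -0.0), which negates x while still flushing denormals like the original fmul.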
; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %mul = fmul float %x, -1.0
  %add = fmul nnan float %mul, %y
  ret float %add
}

; It's legal to turn this fmul into an fneg: denormals are preserved, and the
; nnan flag guarantees the source can't be an snan.
; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  %mul = fmul nnan float %x, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

; The canonicalizing multiply means we know the source can't be an snan.
; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
; GCN: v_mul_f32_e32 v0, [[TMP]], v1
; GCN-NEXT: s_setpc_b64
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  %canonical = fmul float %x, %x
  %mul = fmul float %canonical, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %quiet = call float @llvm.canonicalize.f32(float %x)
  %mul = fmul float %quiet, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.fma.f64(double, double, double) #1

declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
