1; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
2; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s
3
4; RUN: llc -march=amdgcn -mcpu=fiji -start-after=sink --verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
5; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s
6
7; --------------------------------------------------------------------------------
8; fadd tests
9; --------------------------------------------------------------------------------
10
11; GCN-LABEL: {{^}}v_fneg_add_f32:
12; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
13; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
14
15; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
16; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
17
18; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
19; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
20define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fadd whose only user is the fneg.
  ; Each work-item (index from llvm.amdgcn.workitem.id.x) does a volatile load
  ; of one float from %a.ptr and %b.ptr, adds them, negates the sum and stores
  ; the result to the same index of %out.
21  %tid = call i32 @llvm.amdgcn.workitem.id.x()
22  %tid.ext = sext i32 %tid to i64
23  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
24  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
25  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
26  %a = load volatile float, float addrspace(1)* %a.gep
27  %b = load volatile float, float addrspace(1)* %b.gep
28  %add = fadd float %a, %b
29  %fneg = fneg float %add
30  store float %fneg, float addrspace(1)* %out.gep
31  ret void
32}
33
34; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
35; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
36; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
37; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
38; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
39; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
40; GCN-NEXT: s_waitcnt vmcnt(0)
41; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
42; GCN-NEXT: s_waitcnt vmcnt(0)
43define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fadd where the un-negated sum is also stored.
  ; Both -(a + b) and (a + b) are written with volatile stores, in that order.
  ; Note that the stores target %out itself; %out.gep is computed but unused.
44  %tid = call i32 @llvm.amdgcn.workitem.id.x()
45  %tid.ext = sext i32 %tid to i64
46  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
47  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
48  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
49  %a = load volatile float, float addrspace(1)* %a.gep
50  %b = load volatile float, float addrspace(1)* %b.gep
51  %add = fadd float %a, %b
52  %fneg = fneg float %add
53  store volatile float %fneg, float addrspace(1)* %out
54  store volatile float %add, float addrspace(1)* %out
55  ret void
56}
57
58; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
59; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
60; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
61
62; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
63; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
64; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
65
66; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
67; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
68
69; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
70; GCN-NEXT: s_waitcnt vmcnt(0)
71; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
72; GCN-NEXT: s_waitcnt vmcnt(0)
73define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fadd whose sum has a second arithmetic user.
  ; Stores -(a + b) and then (a + b) * 4.0, both as volatile stores to %out.
  ; %out.gep is computed but unused.
74  %tid = call i32 @llvm.amdgcn.workitem.id.x()
75  %tid.ext = sext i32 %tid to i64
76  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
77  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
78  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
79  %a = load volatile float, float addrspace(1)* %a.gep
80  %b = load volatile float, float addrspace(1)* %b.gep
81  %add = fadd float %a, %b
82  %fneg = fneg float %add
83  %use1 = fmul float %add, 4.0
84  store volatile float %fneg, float addrspace(1)* %out
85  store volatile float %use1, float addrspace(1)* %out
86  ret void
87}
88
89; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
90; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
91; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
92
93; GCN-SAFE: v_sub_f32_e32
94; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,
95
96; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
97
98; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
99define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fadd whose first operand is itself negated: -((-a) + b).
  ; The result is written with a volatile store to %out; %out.gep is computed
  ; but unused.
100  %tid = call i32 @llvm.amdgcn.workitem.id.x()
101  %tid.ext = sext i32 %tid to i64
102  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
103  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
104  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
105  %a = load volatile float, float addrspace(1)* %a.gep
106  %b = load volatile float, float addrspace(1)* %b.gep
107  %fneg.a = fneg float %a
108  %add = fadd float %fneg.a, %b
109  %fneg = fneg float %add
110  store volatile float %fneg, float addrspace(1)* %out
111  ret void
112}
113
114; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
115; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
116; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
117
118; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
119; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
120
121; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
122; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
123define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fadd whose second operand is itself negated: -(a + (-b)).
  ; The result is written with a volatile store to %out; %out.gep is computed
  ; but unused.
124  %tid = call i32 @llvm.amdgcn.workitem.id.x()
125  %tid.ext = sext i32 %tid to i64
126  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
127  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
128  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
129  %a = load volatile float, float addrspace(1)* %a.gep
130  %b = load volatile float, float addrspace(1)* %b.gep
131  %fneg.b = fneg float %b
132  %add = fadd float %a, %fneg.b
133  %fneg = fneg float %add
134  store volatile float %fneg, float addrspace(1)* %out
135  ret void
136}
137
138; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
139; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
140; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
141
142; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
143; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
144
145; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
146; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
147define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fadd with both operands negated: -((-a) + (-b)).
  ; The result is written with a volatile store to %out; %out.gep is computed
  ; but unused.
148  %tid = call i32 @llvm.amdgcn.workitem.id.x()
149  %tid.ext = sext i32 %tid to i64
150  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
151  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
152  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
153  %a = load volatile float, float addrspace(1)* %a.gep
154  %b = load volatile float, float addrspace(1)* %b.gep
155  %fneg.a = fneg float %a
156  %fneg.b = fneg float %b
157  %add = fadd float %fneg.a, %fneg.b
158  %fneg = fneg float %add
159  store volatile float %fneg, float addrspace(1)* %out
160  ret void
161}
162
163; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
164; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}}
165; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
166; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
167
168; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]]
169; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
170; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]]
171
172; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
173; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
174; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
175; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
176; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
177; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
178define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; -((-a) + b) where the negated operand (-a) is also stored, so the inner
  ; fneg has a user other than the fadd.
  ; Volatile stores to %out write the negated sum first, then -a.
  ; %out.gep is computed but unused.
179  %tid = call i32 @llvm.amdgcn.workitem.id.x()
180  %tid.ext = sext i32 %tid to i64
181  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
182  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
183  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
184  %a = load volatile float, float addrspace(1)* %a.gep
185  %b = load volatile float, float addrspace(1)* %b.gep
186  %fneg.a = fneg float %a
187  %add = fadd float %fneg.a, %b
188  %fneg = fneg float %add
189  store volatile float %fneg, float addrspace(1)* %out
190  store volatile float %fneg.a, float addrspace(1)* %out
191  ret void
192}
193
194; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
195; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
196; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
197
198; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
199; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
200; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
201
202; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
203; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
204; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
205; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
206; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
207; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
208define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  ; -((-a) + b) where the negated operand (-a) also feeds a multiply by the
  ; extra kernel argument %c.
  ; Volatile stores to %out write the negated sum first, then (-a) * %c.
  ; %out.gep is computed but unused.
209  %tid = call i32 @llvm.amdgcn.workitem.id.x()
210  %tid.ext = sext i32 %tid to i64
211  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
212  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
213  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
214  %a = load volatile float, float addrspace(1)* %a.gep
215  %b = load volatile float, float addrspace(1)* %b.gep
216  %fneg.a = fneg float %a
217  %add = fadd float %fneg.a, %b
218  %fneg = fneg float %add
219  %use1 = fmul float %fneg.a, %c
220  store volatile float %fneg, float addrspace(1)* %out
221  store volatile float %use1, float addrspace(1)* %out
222  ret void
223}
224
225; This one asserted with -enable-no-signed-zeros-fp-math
226; GCN-LABEL: {{^}}fneg_fadd_0:
227; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
228; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
229; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
230define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
  ; Computes s = (0 * (0 * (1 / %tmp6))) + 0, then picks %tmp2 when
  ; s >= %tmp2 (unordered compare) and -s otherwise. The return value is 0.0
  ; when that pick is <= 0.0 (unordered compare), else the constant
  ; 0x7FF8000000000000 (a NaN bit pattern).
  ; Only the second fmul carries fast-math flags; %arg is unused.
231.entry:
232  %tmp7 = fdiv float 1.000000e+00, %tmp6
233  %tmp8 = fmul float 0.000000e+00, %tmp7
234  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
235  %.i188 = fadd float %tmp9, 0.000000e+00
236  %tmp10 = fcmp uge float %.i188, %tmp2
237  %tmp11 = fneg float %.i188
238  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
239  %tmp12 = fcmp ule float %.i092, 0.000000e+00
240  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
241  ret float %.i198
242}
243
244; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
245; function attribute unsafe-fp-math automatically. Combine with the previous test
246; when that is done.
247; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
248; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
249; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
250; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
251; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
252; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
253define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
  ; Same computation as @fneg_fadd_0, with two differences visible here: the
  ; fdiv carries the afn flag, and the function uses attribute group #2
  ; instead of #0 (the group definitions are outside this part of the file).
  ; %arg is unused.
254.entry:
255  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
256  %tmp8 = fmul float 0.000000e+00, %tmp7
257  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
258  %.i188 = fadd float %tmp9, 0.000000e+00
259  %tmp10 = fcmp uge float %.i188, %tmp2
260  %tmp11 = fneg float %.i188
261  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
262  %tmp12 = fcmp ule float %.i092, 0.000000e+00
263  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
264  ret float %.i198
265}
266
267; --------------------------------------------------------------------------------
268; fmul tests
269; --------------------------------------------------------------------------------
270
271; GCN-LABEL: {{^}}v_fneg_mul_f32:
272; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
273; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
274; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
275; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
276define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fmul whose only user is the fneg.
  ; Each work-item (index from llvm.amdgcn.workitem.id.x) does a volatile load
  ; of one float from %a.ptr and %b.ptr, multiplies them, negates the product
  ; and stores the result to the same index of %out.
277  %tid = call i32 @llvm.amdgcn.workitem.id.x()
278  %tid.ext = sext i32 %tid to i64
279  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
280  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
281  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
282  %a = load volatile float, float addrspace(1)* %a.gep
283  %b = load volatile float, float addrspace(1)* %b.gep
284  %mul = fmul float %a, %b
285  %fneg = fneg float %mul
286  store float %fneg, float addrspace(1)* %out.gep
287  ret void
288}
289
290; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
291; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
292; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
293; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
294; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
295; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
296; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
297define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fmul where the un-negated product is also stored.
  ; Both -(a * b) and (a * b) are written with volatile stores, in that order.
  ; Note that the stores target %out itself; %out.gep is computed but unused.
298  %tid = call i32 @llvm.amdgcn.workitem.id.x()
299  %tid.ext = sext i32 %tid to i64
300  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
301  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
302  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
303  %a = load volatile float, float addrspace(1)* %a.gep
304  %b = load volatile float, float addrspace(1)* %b.gep
305  %mul = fmul float %a, %b
306  %fneg = fneg float %mul
307  store volatile float %fneg, float addrspace(1)* %out
308  store volatile float %mul, float addrspace(1)* %out
309  ret void
310}
311
312; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
313; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
314; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
315; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
316; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
317
318; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
319; GCN-NEXT: s_waitcnt vmcnt(0)
320; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
321; GCN-NEXT: s_waitcnt vmcnt(0)
322define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fmul whose product has a second arithmetic user.
  ; Stores -(a * b) and then (a * b) * 4.0, both as volatile stores to %out.
  ; %out.gep is computed but unused.
323  %tid = call i32 @llvm.amdgcn.workitem.id.x()
324  %tid.ext = sext i32 %tid to i64
325  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
326  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
327  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
328  %a = load volatile float, float addrspace(1)* %a.gep
329  %b = load volatile float, float addrspace(1)* %b.gep
330  %mul = fmul float %a, %b
331  %fneg = fneg float %mul
332  %use1 = fmul float %mul, 4.0
333  store volatile float %fneg, float addrspace(1)* %out
334  store volatile float %use1, float addrspace(1)* %out
335  ret void
336}
337
338; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
339; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
340; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
341; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
342; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
343define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fmul whose first operand is itself negated: -((-a) * b).
  ; The result is written with a volatile store to %out; %out.gep is computed
  ; but unused.
344  %tid = call i32 @llvm.amdgcn.workitem.id.x()
345  %tid.ext = sext i32 %tid to i64
346  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
347  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
348  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
349  %a = load volatile float, float addrspace(1)* %a.gep
350  %b = load volatile float, float addrspace(1)* %b.gep
351  %fneg.a = fneg float %a
352  %mul = fmul float %fneg.a, %b
353  %fneg = fneg float %mul
354  store volatile float %fneg, float addrspace(1)* %out
355  ret void
356}
357
358; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
359; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
360; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
361; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
362; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
363define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fmul whose second operand is itself negated: -(a * (-b)).
  ; The result is written with a volatile store to %out; %out.gep is computed
  ; but unused.
364  %tid = call i32 @llvm.amdgcn.workitem.id.x()
365  %tid.ext = sext i32 %tid to i64
366  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
367  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
368  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
369  %a = load volatile float, float addrspace(1)* %a.gep
370  %b = load volatile float, float addrspace(1)* %b.gep
371  %fneg.b = fneg float %b
372  %mul = fmul float %a, %fneg.b
373  %fneg = fneg float %mul
374  store volatile float %fneg, float addrspace(1)* %out
375  ret void
376}
377
378; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
379; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
380; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
381; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
382; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
383define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of an fmul with both operands negated: -((-a) * (-b)).
  ; The result is written with a volatile store to %out; %out.gep is computed
  ; but unused.
384  %tid = call i32 @llvm.amdgcn.workitem.id.x()
385  %tid.ext = sext i32 %tid to i64
386  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
387  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
388  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
389  %a = load volatile float, float addrspace(1)* %a.gep
390  %b = load volatile float, float addrspace(1)* %b.gep
391  %fneg.a = fneg float %a
392  %fneg.b = fneg float %b
393  %mul = fmul float %fneg.a, %fneg.b
394  %fneg = fneg float %mul
395  store volatile float %fneg, float addrspace(1)* %out
396  ret void
397}
398
399; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
400; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
401; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
402; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
403; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
404
405; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
406; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
407define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; -((-a) * b) where the negated operand (-a) is also stored, so the inner
  ; fneg has a user other than the fmul.
  ; Volatile stores to %out write the negated product first, then -a.
  ; %out.gep is computed but unused.
408  %tid = call i32 @llvm.amdgcn.workitem.id.x()
409  %tid.ext = sext i32 %tid to i64
410  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
411  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
412  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
413  %a = load volatile float, float addrspace(1)* %a.gep
414  %b = load volatile float, float addrspace(1)* %b.gep
415  %fneg.a = fneg float %a
416  %mul = fmul float %fneg.a, %b
417  %fneg = fneg float %mul
418  store volatile float %fneg, float addrspace(1)* %out
419  store volatile float %fneg.a, float addrspace(1)* %out
420  ret void
421}
422
423; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
424; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
425; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
426; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
427; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
428; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
429; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
430define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  ; -((-a) * b) where the negated operand (-a) also feeds a second multiply by
  ; the extra kernel argument %c.
  ; Volatile stores to %out write the negated product first, then (-a) * %c.
  ; %out.gep is computed but unused.
431  %tid = call i32 @llvm.amdgcn.workitem.id.x()
432  %tid.ext = sext i32 %tid to i64
433  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
434  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
435  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
436  %a = load volatile float, float addrspace(1)* %a.gep
437  %b = load volatile float, float addrspace(1)* %b.gep
438  %fneg.a = fneg float %a
439  %mul = fmul float %fneg.a, %b
440  %fneg = fneg float %mul
441  %use1 = fmul float %fneg.a, %c
442  store volatile float %fneg, float addrspace(1)* %out
443  store volatile float %use1, float addrspace(1)* %out
444  ret void
445}
446
447; --------------------------------------------------------------------------------
448; fminnum tests
449; --------------------------------------------------------------------------------
450
451; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
452; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
453; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
454; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
455; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
456; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
457; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
458define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; fneg of llvm.minnum.f32(a, b) in a compute kernel.
  ; a and b are volatile loads indexed by the work-item id; the negated
  ; minimum is stored to the same index of %out.
  ; NOTE(review): the _ieee suffix presumably refers to IEEE mode being on for
  ; amdgpu_kernel functions - confirm against the backend documentation.
459  %tid = call i32 @llvm.amdgcn.workitem.id.x()
460  %tid.ext = sext i32 %tid to i64
461  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
462  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
463  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
464  %a = load volatile float, float addrspace(1)* %a.gep
465  %b = load volatile float, float addrspace(1)* %b.gep
466  %min = call float @llvm.minnum.f32(float %a, float %b)
467  %fneg = fneg float %min
468  store float %fneg, float addrspace(1)* %out.gep
469  ret void
470}
471
472; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
473; GCN-NOT: v0
474; GCN-NOT: v1
475; GCN: v_max_f32_e64 v0, -v0, -v1
476; GCN-NEXT: ; return
477define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
  ; fneg of llvm.minnum.f32(a, b) using the amdgpu_ps calling convention:
  ; operands arrive as arguments and the result is returned, so there are no
  ; loads or stores.
  ; NOTE(review): the _no_ieee suffix presumably refers to IEEE mode being off
  ; for this calling convention - confirm against the backend documentation.
478  %min = call float @llvm.minnum.f32(float %a, float %b)
479  %fneg = fneg float %min
480  ret float %fneg
481}
482
483; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
484; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
485; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
486; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
487; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
488define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; fneg of llvm.minnum.f32(a, a): both minnum operands are the same
  ; volatile-loaded value. The negated result is stored to the work-item's
  ; index of %out.
489  %tid = call i32 @llvm.amdgcn.workitem.id.x()
490  %tid.ext = sext i32 %tid to i64
491  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
492  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
493  %a = load volatile float, float addrspace(1)* %a.gep
494  %min = call float @llvm.minnum.f32(float %a, float %a)
495  %min.fneg = fneg float %min
496  store float %min.fneg, float addrspace(1)* %out.gep
497  ret void
498}
499
500; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
501; GCN-NOT: v0
502; GCN: v_max_f32_e64 v0, -v0, -v0
503; GCN-NEXT: ; return
504define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
  ; amdgpu_ps variant of fneg(minnum(a, a)): the single argument is used for
  ; both minnum operands and the negated result is returned directly.
505  %min = call float @llvm.minnum.f32(float %a, float %a)
506  %min.fneg = fneg float %min
507  ret float %min.fneg
508}
509
510; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
511; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
512; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
513; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
514; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
515define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; fneg of llvm.minnum.f32(4.0, a): minnum against a positive constant.
  ; a is a volatile load indexed by the work-item id; the negated result is
  ; stored to the same index of %out.
516  %tid = call i32 @llvm.amdgcn.workitem.id.x()
517  %tid.ext = sext i32 %tid to i64
518  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
519  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
520  %a = load volatile float, float addrspace(1)* %a.gep
521  %min = call float @llvm.minnum.f32(float 4.0, float %a)
522  %fneg = fneg float %min
523  store float %fneg, float addrspace(1)* %out.gep
524  ret void
525}
526
527; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
528; GCN-NOT: v0
529; GCN: v_max_f32_e64 v0, -v0, -4.0
530; GCN-NEXT: ; return
531define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
  ; amdgpu_ps variant of fneg(minnum(4.0, a)): the operand arrives as an
  ; argument and the negated result is returned directly.
532  %min = call float @llvm.minnum.f32(float 4.0, float %a)
533  %fneg = fneg float %min
534  ret float %fneg
535}
536
537; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
538; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
539; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
540; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
541; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
542define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; fneg of llvm.minnum.f32(-4.0, a): minnum against a negative constant.
  ; a is a volatile load indexed by the work-item id; the negated result is
  ; stored to the same index of %out.
543  %tid = call i32 @llvm.amdgcn.workitem.id.x()
544  %tid.ext = sext i32 %tid to i64
545  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
546  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
547  %a = load volatile float, float addrspace(1)* %a.gep
548  %min = call float @llvm.minnum.f32(float -4.0, float %a)
549  %fneg = fneg float %min
550  store float %fneg, float addrspace(1)* %out.gep
551  ret void
552}
553
554; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
555; GCN-NOT: v0
556; GCN: v_max_f32_e64 v0, -v0, 4.0
557; GCN-NEXT: ; return
558define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
  ; amdgpu_ps variant of fneg(minnum(-4.0, a)): the operand arrives as an
  ; argument and the negated result is returned directly.
559  %min = call float @llvm.minnum.f32(float -4.0, float %a)
560  %fneg = fneg float %min
561  ret float %fneg
562}
563
564; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
565; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
566; GCN-NOT: [[A]]
567; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
568; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]
569; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
570define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; fneg of llvm.minnum.f32(0.0, a): minnum against positive zero.
  ; Unlike the neighbouring minnum tests, this call carries the nnan flag.
  ; a is a volatile load indexed by the work-item id; the negated result is
  ; stored to the same index of %out.
571  %tid = call i32 @llvm.amdgcn.workitem.id.x()
572  %tid.ext = sext i32 %tid to i64
573  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
574  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
575  %a = load volatile float, float addrspace(1)* %a.gep
576  %min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
577  %fneg = fneg float %min
578  store float %fneg, float addrspace(1)* %out.gep
579  ret void
580}
581
582; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
583; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
584; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
585; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
586; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
587define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; fneg of llvm.minnum.f32(-0.0, a): minnum against negative zero.
  ; a is a volatile load indexed by the work-item id; the negated result is
  ; stored to the same index of %out.
588  %tid = call i32 @llvm.amdgcn.workitem.id.x()
589  %tid.ext = sext i32 %tid to i64
590  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
591  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
592  %a = load volatile float, float addrspace(1)* %a.gep
593  %min = call float @llvm.minnum.f32(float -0.0, float %a)
594  %fneg = fneg float %min
595  store float %fneg, float addrspace(1)* %out.gep
596  ret void
597}
598
599; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
600; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
601
602; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
603; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
604
605; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
606; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
607; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
608
609; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
610define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; fneg of llvm.minnum.f32(K, a) with K = 0x3FC45F3060000000, the value that
  ; the fiji expectations above print as 0.15915494.
  ; a is a volatile load indexed by the work-item id; the negated result is
  ; stored to the same index of %out.
611  %tid = call i32 @llvm.amdgcn.workitem.id.x()
612  %tid.ext = sext i32 %tid to i64
613  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
614  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
615  %a = load volatile float, float addrspace(1)* %a.gep
616  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
617  %fneg = fneg float %min
618  store float %fneg, float addrspace(1)* %out.gep
619  ret void
620}
621
622; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
623; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
624
625; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
626; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]
627
628; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
629; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
630
631; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
632define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; fneg of llvm.minnum.f32(K, a) with K = 0xBFC45F3060000000, i.e. the
  ; constant of @v_fneg_inv2pi_minnum_f32 with its sign bit set.
  ; a is a volatile load indexed by the work-item id; the negated result is
  ; stored to the same index of %out.
633  %tid = call i32 @llvm.amdgcn.workitem.id.x()
634  %tid.ext = sext i32 %tid to i64
635  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
636  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
637  %a = load volatile float, float addrspace(1)* %a.gep
638  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
639  %fneg = fneg float %min
640  store float %fneg, float addrspace(1)* %out.gep
641  ret void
642}
643
644; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
645; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
646
647; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
648; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
649; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
650
651; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
652; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
653; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]
654
655; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Half-precision kernel: negates minnum(0xH3118, %a) and stores the result.
; The SI check lines expect the operation done in f32 via conversions; the
; VI check lines expect native f16 v_max/v_min followed by v_xor 0x8000.
656define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
657  %tid = call i32 @llvm.amdgcn.workitem.id.x()
658  %tid.ext = sext i32 %tid to i64
659  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
660  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
661  %a = load volatile half, half addrspace(1)* %a.gep
662  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
  ; Negation is spelled as a subtraction from -0.0 here, unlike the f32
  ; tests in this group, which use the fneg instruction.
663  %fneg = fsub half -0.000000e+00, %min
664  store half %fneg, half addrspace(1)* %out.gep
665  ret void
666}
667
668; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
669; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
670
671; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
672; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
673; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
674
675; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
676; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
677
678; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Half-precision kernel with a negative constant operand (0xHB118) to
; minnum. The VI check lines expect a v_max_f16 with negated sources
; followed by v_max_f16 against 0.15915494, with no v_xor.
679define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
680  %tid = call i32 @llvm.amdgcn.workitem.id.x()
681  %tid.ext = sext i32 %tid to i64
682  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
683  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
684  %a = load volatile half, half addrspace(1)* %a.gep
685  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
  ; Negation written as (-0.0 - x).
686  %fneg = fsub half -0.000000e+00, %min
687  store half %fneg, half addrspace(1)* %out.gep
688  ret void
689}
690
691; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
692; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
693
694; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
695; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
696; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
697; SI: v_max_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]
698
699; VI: v_min_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[A]], 0.15915494
700; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]
701
702; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
; Double-precision kernel: negates minnum(0x3fc45f306dc9c882, %a) and stores
; it. The SI check lines expect the constant materialized with two s_mov_b32
; (sign-flipped high half) and fed to v_max_f64; the VI check lines expect
; v_min_f64 against 0.15915494 and a v_xor on the high result register.
703define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
704  %tid = call i32 @llvm.amdgcn.workitem.id.x()
705  %tid.ext = sext i32 %tid to i64
706  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
707  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
708  %a = load volatile double, double addrspace(1)* %a.gep
709  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
  ; Negation written as (-0.0 - x).
710  %fneg = fsub double -0.000000e+00, %min
711  store double %fneg, double addrspace(1)* %out.gep
712  ret void
713}
714
715; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
716; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
717
718; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
719; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
720; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
721; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]
722
723; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
724; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494
725
726; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Double-precision kernel with a negative constant operand
; (0xbfc45f306dc9c882) to minnum. Both the SI and VI check lines expect a
; v_max_f64 with negated sources followed by a second v_max_f64 against the
; positive constant.
727define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
728  %tid = call i32 @llvm.amdgcn.workitem.id.x()
729  %tid.ext = sext i32 %tid to i64
730  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
731  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
732  %a = load volatile double, double addrspace(1)* %a.gep
733  %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
  ; Negation written as (-0.0 - x).
734  %fneg = fsub double -0.000000e+00, %min
735  store double %fneg, double addrspace(1)* %out.gep
736  ret void
737}
738
739; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee:
740; GCN-NOT: v0
741; GCN: v_max_f32_e64 v0, -v0, 0{{$}}
742; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(minnum(-0.0, %a)). The check lines
; expect a single v_max_f32_e64 v0, -v0, 0 before the return.
743define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
744  %min = call float @llvm.minnum.f32(float -0.0, float %a)
745  %fneg = fneg float %min
746  ret float %fneg
747}
748
749; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
750; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
751; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
752; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
753; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
754; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
755; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: computes fneg(minnum(0.0, %a)) * %b and stores it.
; The only user of the negated value is the fmul; the check lines expect
; the negation to appear as a source modifier on v_mul_f32_e64.
756define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
757  %tid = call i32 @llvm.amdgcn.workitem.id.x()
758  %tid.ext = sext i32 %tid to i64
759  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
760  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
761  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
762  %a = load volatile float, float addrspace(1)* %a.gep
763  %b = load volatile float, float addrspace(1)* %b.gep
764  %min = call float @llvm.minnum.f32(float 0.0, float %a)
765  %fneg = fneg float %min
766  %mul = fmul float %fneg, %b
767  store float %mul, float addrspace(1)* %out.gep
768  ret void
769}
770
771; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
772; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
773; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
774
775; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
776
777; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
778; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]
779
780; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
781; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
782; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
783
784; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: computes fneg(minnum(K, %a)) * %b with K the constant
; the VI check lines print as 0.15915494. The SI check lines expect the
; negation folded into v_max with a negated literal and a plain v_mul; the
; VI check lines expect v_min and a v_mul_f32_e64 with a negated source.
785define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
786  %tid = call i32 @llvm.amdgcn.workitem.id.x()
787  %tid.ext = sext i32 %tid to i64
788  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
789  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
790  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
791  %a = load volatile float, float addrspace(1)* %a.gep
792  %b = load volatile float, float addrspace(1)* %b.gep
793  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
794  %fneg = fneg float %min
795  %mul = fmul float %fneg, %b
796  store float %mul, float addrspace(1)* %out.gep
797  ret void
798}
799
800; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
801; GCN-NOT: v0
802; GCN-NOT: v1
803; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
804; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
805; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(minnum(0.0, %a)) * %b. The check lines
; expect v_min_f32 on v0 followed by v_mul_f32_e64 with the min result
; negated as a source modifier and v1 as the other operand.
806define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
807  %min = call float @llvm.minnum.f32(float 0.0, float %a)
808  %fneg = fneg float %min
809  %mul = fmul float %fneg, %b
810  ret float %mul
811}
812
813; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
814; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
815; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
816; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
817; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
818; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
819; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
820; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
821; GCN-NEXT: s_waitcnt vmcnt(0)
822; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
823; GCN-NEXT: s_waitcnt vmcnt(0)
; Kernel under test: minnum(%a, %b) has two users, an fneg and an fmul by
; 4.0, and both results are stored. The check lines expect a v_max of the
; negated inputs and the second use rewritten as a v_mul by -4.0.
824define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
825  %tid = call i32 @llvm.amdgcn.workitem.id.x()
826  %tid.ext = sext i32 %tid to i64
827  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
828  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; both stores below
  ; write through %out directly.
829  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
830  %a = load volatile float, float addrspace(1)* %a.gep
831  %b = load volatile float, float addrspace(1)* %b.gep
832  %min = call float @llvm.minnum.f32(float %a, float %b)
833  %fneg = fneg float %min
834  %use1 = fmul float %min, 4.0
835  store volatile float %fneg, float addrspace(1)* %out
836  store volatile float %use1, float addrspace(1)* %out
837  ret void
838}
839
840; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
841; GCN-NOT: v0
842; GCN-NOT: v1
843; GCN: v_max_f32_e64 v0, -v0, -v1
844; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
845; GCN-NEXT: ; return
; amdgpu_ps function returning <fneg(min), min * 4.0> for
; min = minnum(%a, %b). The check lines expect v_max_f32_e64 v0, -v0, -v1
; followed by a v_mul by -4.0 into v1.
846define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
847  %min = call float @llvm.minnum.f32(float %a, float %b)
848  %fneg = fneg float %min
849  %use1 = fmul float %min, 4.0
  ; Pack the two results into the returned vector: lane 0 = negated min,
  ; lane 1 = scaled min.
850  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
851  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
852  ret <2 x float> %ins1
853}
854
855; --------------------------------------------------------------------------------
856; fmaxnum tests
857; --------------------------------------------------------------------------------
858
859
860; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
861; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
862; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
863; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
864; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
865; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
866; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: stores fneg(maxnum(%a, %b)). The check lines expect
; each input multiplied by -1.0 and the two results combined with v_min.
867define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
868  %tid = call i32 @llvm.amdgcn.workitem.id.x()
869  %tid.ext = sext i32 %tid to i64
870  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
871  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
872  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
873  %a = load volatile float, float addrspace(1)* %a.gep
874  %b = load volatile float, float addrspace(1)* %b.gep
875  %max = call float @llvm.maxnum.f32(float %a, float %b)
876  %fneg = fneg float %max
877  store float %fneg, float addrspace(1)* %out.gep
878  ret void
879}
880
881; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
882; GCN-NOT: v0
883; GCN-NOT: v1
884; GCN: v_min_f32_e64 v0, -v0, -v1
885; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(maxnum(%a, %b)). The check lines expect
; a single v_min_f32_e64 v0, -v0, -v1 before the return.
886define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
887  %max = call float @llvm.maxnum.f32(float %a, float %b)
888  %fneg = fneg float %max
889  ret float %fneg
890}
891
892; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
893; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
894; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
895; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
896; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: stores fneg(maxnum(%a, %a)), i.e. both maxnum operands
; are the same value. The check lines expect one v_mul by -1.0 and a v_min
; that uses that result for both sources.
897define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
898  %tid = call i32 @llvm.amdgcn.workitem.id.x()
899  %tid.ext = sext i32 %tid to i64
900  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
901  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
902  %a = load volatile float, float addrspace(1)* %a.gep
903  %max = call float @llvm.maxnum.f32(float %a, float %a)
904  %max.fneg = fneg float %max
905  store float %max.fneg, float addrspace(1)* %out.gep
906  ret void
907}
908
909; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
910; GCN-NOT: v0
911; GCN: v_min_f32_e64 v0, -v0, -v0
912; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(maxnum(%a, %a)). The check lines expect
; a single v_min_f32_e64 v0, -v0, -v0 before the return.
913define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
914  %max = call float @llvm.maxnum.f32(float %a, float %a)
915  %max.fneg = fneg float %max
916  ret float %max.fneg
917}
918
919; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
920; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
921; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
922; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
923; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: stores fneg(maxnum(4.0, %a)). The check lines expect
; the input multiplied by -1.0 and then a v_min against -4.0.
924define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
925  %tid = call i32 @llvm.amdgcn.workitem.id.x()
926  %tid.ext = sext i32 %tid to i64
927  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
928  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
929  %a = load volatile float, float addrspace(1)* %a.gep
930  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
931  %fneg = fneg float %max
932  store float %fneg, float addrspace(1)* %out.gep
933  ret void
934}
935
936; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
937; GCN-NOT: v0
938; GCN: v_min_f32_e64 v0, -v0, -4.0
939; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(maxnum(4.0, %a)). The check lines expect
; a single v_min_f32_e64 v0, -v0, -4.0 before the return.
940define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
941  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
942  %fneg = fneg float %max
943  ret float %fneg
944}
945
946; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
947; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
948; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
949; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
950; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: stores fneg(maxnum(-4.0, %a)). The check lines expect
; the input multiplied by -1.0 and then a v_min against 4.0.
951define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
952  %tid = call i32 @llvm.amdgcn.workitem.id.x()
953  %tid.ext = sext i32 %tid to i64
954  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
955  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
956  %a = load volatile float, float addrspace(1)* %a.gep
957  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
958  %fneg = fneg float %max
959  store float %fneg, float addrspace(1)* %out.gep
960  ret void
961}
962
963; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
964; GCN-NOT: v0
965; GCN: v_min_f32_e64 v0, -v0, 4.0
966; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(maxnum(-4.0, %a)). The check lines
; expect a single v_min_f32_e64 v0, -v0, 4.0 before the return.
967define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
968  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
969  %fneg = fneg float %max
970  ret float %fneg
971}
972
973; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
974; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
975; GCN-NOT: [[A]]
976; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
977; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
978; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: stores fneg(maxnum(0.0, %a)) where the maxnum call
; carries the nnan flag. The check lines expect no other reference to the
; loaded register between the load and the v_max, then a separate v_xor of
; the sign bit.
979define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
980  %tid = call i32 @llvm.amdgcn.workitem.id.x()
981  %tid.ext = sext i32 %tid to i64
982  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
983  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
984  %a = load volatile float, float addrspace(1)* %a.gep
985  %max = call nnan float @llvm.maxnum.f32(float 0.0, float %a)
986  %fneg = fneg float %max
987  store float %fneg, float addrspace(1)* %out.gep
988  ret void
989}
990
991; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
992; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
993; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
994; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
995; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: stores fneg(maxnum(-0.0, %a)). The check lines expect
; the input multiplied by -1.0 and then a v_min against 0, with no v_xor.
996define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
997  %tid = call i32 @llvm.amdgcn.workitem.id.x()
998  %tid.ext = sext i32 %tid to i64
999  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1000  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1001  %a = load volatile float, float addrspace(1)* %a.gep
1002  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
1003  %fneg = fneg float %max
1004  store float %fneg, float addrspace(1)* %out.gep
1005  ret void
1006}
1007
1008; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
1009; GCN-NOT: v0
1010; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
1011; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(maxnum(-0.0, %a)). The check lines
; expect a single v_min_f32_e64 v0, -v0, 0 before the return.
1012define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
1013  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
1014  %fneg = fneg float %max
1015  ret float %fneg
1016}
1017
1018; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
1019; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1020; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1021; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
1022; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
1023; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
1024; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: computes fneg(maxnum(0.0, %a)) * %b and stores it.
; The only user of the negated value is the fmul; the check lines expect
; a v_mul by 1.0, a v_max against 0, and the negation applied as a source
; modifier on v_mul_f32_e64.
1025define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
1026  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1027  %tid.ext = sext i32 %tid to i64
1028  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1029  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1030  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1031  %a = load volatile float, float addrspace(1)* %a.gep
1032  %b = load volatile float, float addrspace(1)* %b.gep
1033  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
1034  %fneg = fneg float %max
1035  %mul = fmul float %fneg, %b
1036  store float %mul, float addrspace(1)* %out.gep
1037  ret void
1038}
1039
1040; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
1041; GCN-NOT: v0
1042; GCN-NOT: v1
1043; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
1044; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
1045; GCN-NEXT: ; return
; amdgpu_ps function returning fneg(maxnum(0.0, %a)) * %b. The check lines
; expect v_max_f32 on v0 followed by v_mul_f32_e64 with the max result
; negated as a source modifier and v1 as the other operand.
1046define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
1047  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
1048  %fneg = fneg float %max
1049  %mul = fmul float %fneg, %b
1050  ret float %mul
1051}
1052
1053; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
1054; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1055; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1056; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
1057; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
1058; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
1059; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
1060; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
1061; GCN-NEXT: s_waitcnt vmcnt(0)
1062; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
1063; GCN-NEXT: s_waitcnt vmcnt(0)
; Kernel under test: maxnum(%a, %b) has two users, an fneg and an fmul by
; 4.0, and both results are stored. The check lines expect a v_min of the
; negated inputs and the second use rewritten as a v_mul by -4.0.
1064define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
1065  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1066  %tid.ext = sext i32 %tid to i64
1067  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1068  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; both stores below
  ; write through %out directly.
1069  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1070  %a = load volatile float, float addrspace(1)* %a.gep
1071  %b = load volatile float, float addrspace(1)* %b.gep
1072  %max = call float @llvm.maxnum.f32(float %a, float %b)
1073  %fneg = fneg float %max
1074  %use1 = fmul float %max, 4.0
1075  store volatile float %fneg, float addrspace(1)* %out
1076  store volatile float %use1, float addrspace(1)* %out
1077  ret void
1078}
1079
1080; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
1081; GCN-NOT: v0
1082; GCN-NOT: v1
1083; GCN: v_min_f32_e64 v0, -v0, -v1
1084; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
1085; GCN-NEXT: ; return
; amdgpu_ps function returning <fneg(max), max * 4.0> for
; max = maxnum(%a, %b). The check lines expect v_min_f32_e64 v0, -v0, -v1
; followed by a v_mul by -4.0 into v1.
1086define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
1087  %max = call float @llvm.maxnum.f32(float %a, float %b)
1088  %fneg = fneg float %max
1089  %use1 = fmul float %max, 4.0
  ; Pack the two results into the returned vector: lane 0 = negated max,
  ; lane 1 = scaled max.
1090  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
1091  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
1092  ret <2 x float> %ins1
1093}
1094
1095; --------------------------------------------------------------------------------
1096; fma tests
1097; --------------------------------------------------------------------------------
1098
1099; GCN-LABEL: {{^}}v_fneg_fma_f32:
1100; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1101; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1102; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1103
1104; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
1105; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]
1106
1107; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
1108; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Kernel under test: stores fneg(fma(%a, %b, %c)). The GCN-SAFE check lines
; expect a plain v_fma followed by v_xor of the sign bit; the GCN-NSZ check
; lines expect the negation folded into the fma as -B and -C source
; modifiers, immediately followed by the store.
1109define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1110  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1111  %tid.ext = sext i32 %tid to i64
1112  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1113  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1114  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1115  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1116  %a = load volatile float, float addrspace(1)* %a.gep
1117  %b = load volatile float, float addrspace(1)* %b.gep
1118  %c = load volatile float, float addrspace(1)* %c.gep
1119  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
1120  %fneg = fneg float %fma
1121  store float %fneg, float addrspace(1)* %out.gep
1122  ret void
1123}
1124
1125; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
1126; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1127; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1128; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1129; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1130; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
1131; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
1132; GCN-NEXT: s_waitcnt vmcnt(0)
1133; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
1134; GCN-NEXT: s_waitcnt vmcnt(0)
; Kernel under test: stores both fneg(fma(%a, %b, %c)) and the un-negated
; fma result. The check lines expect the fma left unmodified and a separate
; v_xor producing the negated value, in both run configurations.
1135define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1136  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1137  %tid.ext = sext i32 %tid to i64
1138  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1139  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1140  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; both stores below
  ; write through %out directly.
1141  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1142  %a = load volatile float, float addrspace(1)* %a.gep
1143  %b = load volatile float, float addrspace(1)* %b.gep
1144  %c = load volatile float, float addrspace(1)* %c.gep
1145  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
1146  %fneg = fneg float %fma
1147  store volatile float %fneg, float addrspace(1)* %out
1148  store volatile float %fma, float addrspace(1)* %out
1149  ret void
1150}
1151
1152; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
1153; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1154; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1155; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1156
1157; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1158; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
1159; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
1160
1161; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
1162; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
1163
1164; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
1165; GCN-NEXT: s_waitcnt vmcnt(0)
1166; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
1167; GCN-NEXT: s_waitcnt vmcnt(0)
; Kernel under test: the fma result has two users, an fneg and an fmul by
; 4.0, and both results are stored. The GCN-SAFE check lines expect fma,
; v_xor and a v_mul by 4.0; the GCN-NSZ check lines expect the negation
; folded into the fma and the multiply rewritten as a v_mul by -4.0.
1168define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1169  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1170  %tid.ext = sext i32 %tid to i64
1171  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1172  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1173  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; both stores below
  ; write through %out directly.
1174  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1175  %a = load volatile float, float addrspace(1)* %a.gep
1176  %b = load volatile float, float addrspace(1)* %b.gep
1177  %c = load volatile float, float addrspace(1)* %c.gep
1178  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
1179  %fneg = fneg float %fma
1180  %use1 = fmul float %fma, 4.0
1181  store volatile float %fneg, float addrspace(1)* %out
1182  store volatile float %use1, float addrspace(1)* %out
1183  ret void
1184}
1185
1186; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
1187; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1188; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1189; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1190
1191; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
1192; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1193
1194; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
1195; GCN-NSZ-NOT: [[FMA]]
1196; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; Kernel under test: stores fneg(fma(fneg(%a), %b, %c)). The GCN-SAFE check
; lines expect v_fma with -A followed by v_xor; the GCN-NSZ check lines
; expect a single v_fma with only C negated and no further use of the
; result before the store.
1197define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1198  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1199  %tid.ext = sext i32 %tid to i64
1200  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1201  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1202  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; the store below
  ; writes through %out directly.
1203  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1204  %a = load volatile float, float addrspace(1)* %a.gep
1205  %b = load volatile float, float addrspace(1)* %b.gep
1206  %c = load volatile float, float addrspace(1)* %c.gep
1207  %fneg.a = fneg float %a
1208  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
1209  %fneg = fneg float %fma
1210  store volatile float %fneg, float addrspace(1)* %out
1211  ret void
1212}
1213
1214; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
1215; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1216; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1217; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1218
1219; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
1220; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1221
1222; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
1223; GCN-NSZ-NOT: [[FMA]]
1224; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; Kernel under test: stores fneg(fma(%a, fneg(%b), %c)). The GCN-SAFE check
; lines expect v_fma with -B followed by v_xor; the GCN-NSZ check lines
; expect a single v_fma with only C negated and no further use of the
; result before the store.
1225define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1226  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1227  %tid.ext = sext i32 %tid to i64
1228  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1229  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1230  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; the store below
  ; writes through %out directly.
1231  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1232  %a = load volatile float, float addrspace(1)* %a.gep
1233  %b = load volatile float, float addrspace(1)* %b.gep
1234  %c = load volatile float, float addrspace(1)* %c.gep
1235  %fneg.b = fneg float %b
1236  %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
1237  %fneg = fneg float %fma
1238  store volatile float %fneg, float addrspace(1)* %out
1239  ret void
1240}
1241
1242; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
1243; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1244; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1245; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1246
1247; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1248; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1249
1250; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
1251; GCN-NSZ-NOT: [[FMA]]
1252; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; Kernel under test: stores fneg(fma(fneg(%a), fneg(%b), %c)). The GCN-SAFE
; check lines expect the two operand negations to cancel (plain v_fma)
; followed by v_xor; the GCN-NSZ check lines expect a single v_fma with -B
; and -C and no further use of the result before the store.
1253define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1254  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1255  %tid.ext = sext i32 %tid to i64
1256  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1257  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1258  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; the store below
  ; writes through %out directly.
1259  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1260  %a = load volatile float, float addrspace(1)* %a.gep
1261  %b = load volatile float, float addrspace(1)* %b.gep
1262  %c = load volatile float, float addrspace(1)* %c.gep
1263  %fneg.a = fneg float %a
1264  %fneg.b = fneg float %b
1265  %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
1266  %fneg = fneg float %fma
1267  store volatile float %fneg, float addrspace(1)* %out
1268  ret void
1269}
1270
1271; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
1272; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1273; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1274; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1275
1276; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
1277; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1278
1279; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
1280; GCN-NSZ-NOT: [[FMA]]
1281; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; Kernel under test: stores fneg(fma(fneg(%a), %b, fneg(%c))). The GCN-SAFE
; check lines expect v_fma with -A and -C followed by v_xor; the GCN-NSZ
; check lines expect a single v_fma with no negated sources and no further
; use of the result before the store.
1282define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1283  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1284  %tid.ext = sext i32 %tid to i64
1285  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1286  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1287  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; NOTE(review): %out.gep is computed but never used; the store below
  ; writes through %out directly.
1288  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1289  %a = load volatile float, float addrspace(1)* %a.gep
1290  %b = load volatile float, float addrspace(1)* %b.gep
1291  %c = load volatile float, float addrspace(1)* %c.gep
1292  %fneg.a = fneg float %a
1293  %fneg.c = fneg float %c
1294  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
1295  %fneg = fneg float %fma
1296  store volatile float %fneg, float addrspace(1)* %out
1297  ret void
1298}
1299
1300; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
1301; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1302; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1303; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1304
1305; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
1306; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1307
1308; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
1309; GCN-NSZ-NOT: [[FMA]]
1310; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; Negates fma(a, b, -c): only the addend is fneg'd before the call.
; The volatile store goes through %out directly; %out.gep is computed but
; has no user in this function.
1311define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1312  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1313  %tid.ext = sext i32 %tid to i64
1314  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1315  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1316  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1317  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1318  %a = load volatile float, float addrspace(1)* %a.gep
1319  %b = load volatile float, float addrspace(1)* %b.gep
1320  %c = load volatile float, float addrspace(1)* %c.gep
1321  %fneg.c = fneg float %c
1322  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
1323  %fneg = fneg float %fma
1324  store volatile float %fneg, float addrspace(1)* %out
1325  ret void
1326}
1327
1328; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
1329; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1330; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1331; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1332
1333; GCN-SAFE: v_xor_b32
1334; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
1335; GCN-SAFE: v_xor_b32
1336
1337; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1338; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
1339
1340; GCN-NSZ-NOT: [[FMA]]
1341; GCN-NSZ-NOT: [[NEG_A]]
1342; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
1343; GCN-NSZ-NOT: [[NEG_A]]
1344; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; Negates fma(-a, b, c) and also stores -a itself, so the fneg of %a has a
; second user that is a plain store. Both volatile stores go through %out
; directly; %out.gep is computed but has no user in this function.
1345define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1346  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1347  %tid.ext = sext i32 %tid to i64
1348  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1349  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1350  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1351  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1352  %a = load volatile float, float addrspace(1)* %a.gep
1353  %b = load volatile float, float addrspace(1)* %b.gep
1354  %c = load volatile float, float addrspace(1)* %c.gep
1355  %fneg.a = fneg float %a
1356  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
1357  %fneg = fneg float %fma
1358  store volatile float %fneg, float addrspace(1)* %out
1359  store volatile float %fneg.a, float addrspace(1)* %out
1360  ret void
1361}
1362
1363; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
1364; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1365; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1366; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1367
1368; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
1369; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
1370; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
1371
1372; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
1373; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
1374; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
1375; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
1376; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; Negates fma(-a, b, c) while -a is also multiplied by the kernel argument
; %d, so the fneg of %a has a second arithmetic user. Both volatile stores
; go through %out directly; %out.gep is computed but has no user here.
1377define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
1378  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1379  %tid.ext = sext i32 %tid to i64
1380  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1381  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1382  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1383  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1384  %a = load volatile float, float addrspace(1)* %a.gep
1385  %b = load volatile float, float addrspace(1)* %b.gep
1386  %c = load volatile float, float addrspace(1)* %c.gep
1387  %fneg.a = fneg float %a
1388  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
1389  %fneg = fneg float %fma
1390  %use1 = fmul float %fneg.a, %d
1391  store volatile float %fneg, float addrspace(1)* %out
1392  store volatile float %use1, float addrspace(1)* %out
1393  ret void
1394}
1395
1396; --------------------------------------------------------------------------------
1397; fmad tests
1398; --------------------------------------------------------------------------------
1399
1400; GCN-LABEL: {{^}}v_fneg_fmad_f32:
1401; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1402; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1403; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1404
1405; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
1406; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]
1407
1408; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
1409; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates llvm.fmuladd(a, b, c) on values loaded per work-item and stores
; the result to %out indexed by the work-item id.
1410define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1411  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1412  %tid.ext = sext i32 %tid to i64
1413  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1414  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1415  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1416  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1417  %a = load volatile float, float addrspace(1)* %a.gep
1418  %b = load volatile float, float addrspace(1)* %b.gep
1419  %c = load volatile float, float addrspace(1)* %c.gep
1420  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
1421  %fneg = fneg float %fma
1422  store float %fneg, float addrspace(1)* %out.gep
1423  ret void
1424}
1425
1426; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:
1427
1428; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
1429; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
1430; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
1431; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; <4 x float> variant: negates llvm.fmuladd.v4f32(a, b, c) and stores the
; vector result to %out indexed by the work-item id.
1432define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
1433  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1434  %tid.ext = sext i32 %tid to i64
1435  %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
1436  %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
1437  %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
1438  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
1439  %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
1440  %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
1441  %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
1442  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
1443  %fneg = fneg <4 x float> %fma
1444  store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep
1445  ret void
1446}
1447
1448; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
1449; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1450; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1451; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
1452
1453; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
1454; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
1455; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
1456
1457; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
1458; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
1459
1460; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
1461; GCN-NEXT: s_waitcnt vmcnt(0)
1462; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
1463; GCN-NEXT: s_waitcnt vmcnt(0)
; Negates llvm.fmuladd(a, b, c) while the un-negated result is also
; multiplied by 4.0, so the fmuladd has two users. Both volatile stores go
; through %out directly; %out.gep is computed but has no user here.
1464define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
1465  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1466  %tid.ext = sext i32 %tid to i64
1467  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1468  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1469  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
1470  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1471  %a = load volatile float, float addrspace(1)* %a.gep
1472  %b = load volatile float, float addrspace(1)* %b.gep
1473  %c = load volatile float, float addrspace(1)* %c.gep
1474  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
1475  %fneg = fneg float %fma
1476  %use1 = fmul float %fma, 4.0
1477  store volatile float %fneg, float addrspace(1)* %out
1478  store volatile float %use1, float addrspace(1)* %out
1479  ret void
1480}
1481
1482; --------------------------------------------------------------------------------
1483; fp_extend tests
1484; --------------------------------------------------------------------------------
1485
1486; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
1487; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1488; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
1489; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates fpext(a) from float to double. The negation is spelled as
; "fsub double -0.0, x" rather than with the fneg instruction.
1490define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1491  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1492  %tid.ext = sext i32 %tid to i64
1493  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1494  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1495  %a = load volatile float, float addrspace(1)* %a.gep
1496  %fpext = fpext float %a to double
1497  %fneg = fsub double -0.000000e+00, %fpext
1498  store double %fneg, double addrspace(1)* %out.gep
1499  ret void
1500}
1501
1502; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
1503; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1504; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
1505; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates fpext(-a): the float source is fneg'd before the extension and the
; double result is negated again via "fsub double -0.0, x".
1506define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1507  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1508  %tid.ext = sext i32 %tid to i64
1509  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1510  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1511  %a = load volatile float, float addrspace(1)* %a.gep
1512  %fneg.a = fneg float %a
1513  %fpext = fpext float %fneg.a to double
1514  %fneg = fsub double -0.000000e+00, %fpext
1515  store double %fneg, double addrspace(1)* %out.gep
1516  ret void
1517}
1518
1519; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
1520; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1521; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
1522; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
1523; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1524; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]]
; Negates fpext(-a) and also stores -a itself (volatile, to an undef
; pointer), so the inner fneg has a second user that is a plain store.
1525define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1526  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1527  %tid.ext = sext i32 %tid to i64
1528  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1529  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1530  %a = load volatile float, float addrspace(1)* %a.gep
1531  %fneg.a = fneg float %a
1532  %fpext = fpext float %fneg.a to double
1533  %fneg = fsub double -0.000000e+00, %fpext
1534  store volatile double %fneg, double addrspace(1)* %out.gep
1535  store volatile float %fneg.a, float addrspace(1)* undef
1536  ret void
1537}
1538
1539; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
1540; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1541; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
1542; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
1543; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
1544; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[CVT_LO]]:[[CVT_HI]]]
; Stores both the negated and the un-negated fpext(a) (float to double), so
; the extension result has two users. The second volatile store targets an
; undef pointer.
1545define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1546  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1547  %tid.ext = sext i32 %tid to i64
1548  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1549  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1550  %a = load volatile float, float addrspace(1)* %a.gep
1551  %fpext = fpext float %a to double
1552  %fneg = fsub double -0.000000e+00, %fpext
1553  store volatile double %fneg, double addrspace(1)* %out.gep
1554  store volatile double %fpext, double addrspace(1)* undef
1555  ret void
1556}
1557
1558; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
1559; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1560; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
1561; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
1562; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v[[[CVT_LO]]:[[CVT_HI]]], 4.0
1563; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
1564; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; Negates fpext(a) (float to double) while the un-negated extension is also
; multiplied by 4.0; both results are stored volatile to %out.gep.
1565define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1566  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1567  %tid.ext = sext i32 %tid to i64
1568  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1569  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
1570  %a = load volatile float, float addrspace(1)* %a.gep
1571  %fpext = fpext float %a to double
1572  %fneg = fsub double -0.000000e+00, %fpext
1573  %mul = fmul double %fpext, 4.0
1574  store volatile double %fneg, double addrspace(1)* %out.gep
1575  store volatile double %mul, double addrspace(1)* %out.gep
1576  ret void
1577}
1578
1579; FIXME: Source modifiers not folded for f16->f32
1580; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
; half to float variant: stores both fneg(fpext(a)) and the un-negated
; fpext(a) volatile to %out.gep. No instruction checks accompany this
; function, only its label line.
1581define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
1582  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1583  %tid.ext = sext i32 %tid to i64
1584  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
1585  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1586  %a = load volatile half, half addrspace(1)* %a.gep
1587  %fpext = fpext half %a to float
1588  %fneg = fneg float %fpext
1589  store volatile float %fneg, float addrspace(1)* %out.gep
1590  store volatile float %fpext, float addrspace(1)* %out.gep
1591  ret void
1592}
1593
1594; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
; half to float variant: negates fpext(a) while the un-negated extension is
; also multiplied by 4.0; both results are stored volatile to %out.gep. No
; instruction checks accompany this function, only its label line.
1595define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
1596  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1597  %tid.ext = sext i32 %tid to i64
1598  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
1599  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1600  %a = load volatile half, half addrspace(1)* %a.gep
1601  %fpext = fpext half %a to float
1602  %fneg = fneg float %fpext
1603  %mul = fmul float %fpext, 4.0
1604  store volatile float %fneg, float addrspace(1)* %out.gep
1605  store volatile float %mul, float addrspace(1)* %out.gep
1606  ret void
1607}
1608
1609; --------------------------------------------------------------------------------
1610; fp_round tests
1611; --------------------------------------------------------------------------------
1612
1613; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
1614; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1615; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
1616; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates fptrunc(a) from double to float and stores the result to %out
; indexed by the work-item id.
1617define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
1618  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1619  %tid.ext = sext i32 %tid to i64
1620  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1621  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1622  %a = load volatile double, double addrspace(1)* %a.gep
1623  %fpround = fptrunc double %a to float
1624  %fneg = fneg float %fpround
1625  store float %fneg, float addrspace(1)* %out.gep
1626  ret void
1627}
1628
1629; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
1630; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1631; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
1632; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates fptrunc(-a) (double to float). The inner negation of the double
; source is spelled as "fsub double -0.0, x"; the outer one uses fneg.
1633define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
1634  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1635  %tid.ext = sext i32 %tid to i64
1636  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1637  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1638  %a = load volatile double, double addrspace(1)* %a.gep
1639  %fneg.a = fsub double -0.000000e+00, %a
1640  %fpround = fptrunc double %fneg.a to float
1641  %fneg = fneg float %fpround
1642  store float %fneg, float addrspace(1)* %out.gep
1643  ret void
1644}
1645
1646; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
1647; GCN: {{buffer|flat}}_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]]
1648; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v[[[A_LO]]:[[A_HI]]]
1649; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
1650; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1651; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[A_LO]]:[[NEG_A_HI]]]
; Negates fptrunc(-a) (double to float) and also stores the double -a itself
; (volatile, to an undef pointer), so the inner negation has a second user
; that is a plain store.
1652define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
1653  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1654  %tid.ext = sext i32 %tid to i64
1655  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1656  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1657  %a = load volatile double, double addrspace(1)* %a.gep
1658  %fneg.a = fsub double -0.000000e+00, %a
1659  %fpround = fptrunc double %fneg.a to float
1660  %fneg = fneg float %fpround
1661  store volatile float %fneg, float addrspace(1)* %out.gep
1662  store volatile double %fneg.a, double addrspace(1)* undef
1663  ret void
1664}
1665
1666; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
1667; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1668; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
1669; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s[
1670
1671; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1672; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
; Negates fptrunc(-a) (double to float) while the double -a is also
; multiplied by the kernel argument %c, so the inner negation has a second
; arithmetic user. The product is stored volatile to an undef pointer.
1673define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
1674  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1675  %tid.ext = sext i32 %tid to i64
1676  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1677  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1678  %a = load volatile double, double addrspace(1)* %a.gep
1679  %fneg.a = fsub double -0.000000e+00, %a
1680  %fpround = fptrunc double %fneg.a to float
1681  %fneg = fneg float %fpround
1682  %use1 = fmul double %fneg.a, %c
1683  store volatile float %fneg, float addrspace(1)* %out.gep
1684  store volatile double %use1, double addrspace(1)* undef
1685  ret void
1686}
1687
1688; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
1689; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1690; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1691; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates fptrunc(a) from float to half. The negation is spelled as
; "fsub half -0.0, x" rather than with the fneg instruction.
1692define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1693  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1694  %tid.ext = sext i32 %tid to i64
1695  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1696  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1697  %a = load volatile float, float addrspace(1)* %a.gep
1698  %fpround = fptrunc float %a to half
1699  %fneg = fsub half -0.000000e+00, %fpround
1700  store half %fneg, half addrspace(1)* %out.gep
1701  ret void
1702}
1703
1704; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
1705; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1706; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1707; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates fptrunc(-a) (float to half): the float source is fneg'd before the
; truncation and the half result is negated again via "fsub half -0.0, x".
1708define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1709  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1710  %tid.ext = sext i32 %tid to i64
1711  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1712  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1713  %a = load volatile float, float addrspace(1)* %a.gep
1714  %fneg.a = fneg float %a
1715  %fpround = fptrunc float %fneg.a to half
1716  %fneg = fsub half -0.000000e+00, %fpround
1717  store half %fneg, half addrspace(1)* %out.gep
1718  ret void
1719}
1720
1721; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
1722; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
1723; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
1724; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
1725; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]]
1726; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
; Stores both the negated and the un-negated fptrunc(a) (double to float)
; volatile to %out.gep, so the truncation result has two users.
1727define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
1728  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1729  %tid.ext = sext i32 %tid to i64
1730  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
1731  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1732  %a = load volatile double, double addrspace(1)* %a.gep
1733  %fpround = fptrunc double %a to float
1734  %fneg = fneg float %fpround
1735  store volatile float %fneg, float addrspace(1)* %out.gep
1736  store volatile float %fpround, float addrspace(1)* %out.gep
1737  ret void
1738}
1739
1740; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
1741; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1742; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1743; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1744; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1745; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; Negates fptrunc(-a) (float to half) and also stores the float -a itself
; (volatile, to an undef pointer), so the inner fneg has a second user that
; is a plain store.
1746define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1747  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1748  %tid.ext = sext i32 %tid to i64
1749  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1750  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1751  %a = load volatile float, float addrspace(1)* %a.gep
1752  %fneg.a = fneg float %a
1753  %fpround = fptrunc float %fneg.a to half
1754  %fneg = fsub half -0.000000e+00, %fpround
1755  store volatile half %fneg, half addrspace(1)* %out.gep
1756  store volatile float %fneg.a, float addrspace(1)* undef
1757  ret void
1758}
1759
1760; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
1761; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1762; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1763; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
1764; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1765; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
; Negates fptrunc(-a) (float to half) while the float -a is also multiplied
; by the kernel argument %c, so the inner fneg has a second arithmetic user.
; The product is stored volatile to an undef pointer.
1766define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
1767  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1768  %tid.ext = sext i32 %tid to i64
1769  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1770  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
1771  %a = load volatile float, float addrspace(1)* %a.gep
1772  %fneg.a = fneg float %a
1773  %fpround = fptrunc float %fneg.a to half
1774  %fneg = fsub half -0.000000e+00, %fpround
1775  %use1 = fmul float %fneg.a, %c
1776  store volatile half %fneg, half addrspace(1)* %out.gep
1777  store volatile float %use1, float addrspace(1)* undef
1778  ret void
1779}
1780
1781; --------------------------------------------------------------------------------
1782; rcp tests
1783; --------------------------------------------------------------------------------
1784
1785; GCN-LABEL: {{^}}v_fneg_rcp_f32:
1786; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1787; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
1788; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates llvm.amdgcn.rcp(a) and stores the result to %out indexed by the
; work-item id.
1789define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1790  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1791  %tid.ext = sext i32 %tid to i64
1792  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1793  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1794  %a = load volatile float, float addrspace(1)* %a.gep
1795  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
1796  %fneg = fneg float %rcp
1797  store float %fneg, float addrspace(1)* %out.gep
1798  ret void
1799}
1800
1801; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
1802; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1803; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1804; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates llvm.amdgcn.rcp(-a): the operand is fneg'd before the call and the
; result is fneg'd again.
1805define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1806  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1807  %tid.ext = sext i32 %tid to i64
1808  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1809  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1810  %a = load volatile float, float addrspace(1)* %a.gep
1811  %fneg.a = fneg float %a
1812  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
1813  %fneg = fneg float %rcp
1814  store float %fneg, float addrspace(1)* %out.gep
1815  ret void
1816}
1817
1818; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
1819; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1820; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1821; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1822; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1823; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; Negates llvm.amdgcn.rcp(-a) and also stores -a itself (volatile, to an
; undef pointer), so the inner fneg has a second user that is a plain store.
1824define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
1825  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1826  %tid.ext = sext i32 %tid to i64
1827  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1828  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1829  %a = load volatile float, float addrspace(1)* %a.gep
1830  %fneg.a = fneg float %a
1831  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
1832  %fneg = fneg float %rcp
1833  store volatile float %fneg, float addrspace(1)* %out.gep
1834  store volatile float %fneg.a, float addrspace(1)* undef
1835  ret void
1836}
1837
1838; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
1839; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1840; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
1841; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
1842; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1843; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; Negates llvm.amdgcn.rcp(-a) while -a is also multiplied by the kernel
; argument %c, so the inner fneg has a second arithmetic user. The product
; is stored volatile to an undef pointer.
1844define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
1845  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1846  %tid.ext = sext i32 %tid to i64
1847  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1848  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1849  %a = load volatile float, float addrspace(1)* %a.gep
1850  %fneg.a = fneg float %a
1851  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
1852  %fneg = fneg float %rcp
1853  %use1 = fmul float %fneg.a, %c
1854  store volatile float %fneg, float addrspace(1)* %out.gep
1855  store volatile float %use1, float addrspace(1)* undef
1856  ret void
1857}
1858
1859; --------------------------------------------------------------------------------
1860; fmul_legacy tests
1861; --------------------------------------------------------------------------------
1862
1863; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
1864; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1865; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1866; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
1867; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; Negates llvm.amdgcn.fmul.legacy(a, b) and stores the result to %out
; indexed by the work-item id.
1868define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
1869  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1870  %tid.ext = sext i32 %tid to i64
1871  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1872  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
1873  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1874  %a = load volatile float, float addrspace(1)* %a.gep
1875  %b = load volatile float, float addrspace(1)* %b.gep
1876  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
1877  %fneg = fneg float %mul
1878  store float %fneg, float addrspace(1)* %out.gep
1879  ret void
1880}
1881
; fneg(fmul.legacy(a, b)) where the un-negated product is stored as well.
; The checks expect one plain v_mul_legacy for the shared product plus a
; separate v_xor of the sign bit (0x80000000) for the negated copy, followed
; by the two stores in IR order (negated value first).
1882; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
1883; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1884; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1885; GCN-DAG: v_mul_legacy_f32_e32 [[MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
1886; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL_LEGACY]]
1887; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
1888; GCN-NEXT: s_waitcnt vmcnt(0)
1889; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL_LEGACY]]
1890; GCN-NEXT: s_waitcnt vmcnt(0)
1891define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
1892  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1893  %tid.ext = sext i32 %tid to i64
1894  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1895  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
1896  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1897  %a = load volatile float, float addrspace(1)* %a.gep
1898  %b = load volatile float, float addrspace(1)* %b.gep
1899  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
1900  %fneg = fneg float %mul
  ; %mul has two users: the fneg above and the second store.
1901  store volatile float %fneg, float addrspace(1)* %out
1902  store volatile float %mul, float addrspace(1)* %out
1903  ret void
1904}
1905
; fneg(fmul.legacy(a, b)) where the product also feeds a second fmul.legacy
; by 4.0. The checks expect the first multiply to produce the negated product
; directly (source modifier on B), and the second multiply to negate that
; value again through a source modifier, so no separate v_xor is matched.
1906; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
1907; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1908; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1909; GCN: v_mul_legacy_f32_e64 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], -[[B]]
1910; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[NEG_MUL_LEGACY]], 4.0
1911; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
1912; GCN-NEXT: s_waitcnt vmcnt(0)
1913; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
1914; GCN-NEXT: s_waitcnt vmcnt(0)
1915define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
1916  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1917  %tid.ext = sext i32 %tid to i64
1918  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1919  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
1920  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1921  %a = load volatile float, float addrspace(1)* %a.gep
1922  %b = load volatile float, float addrspace(1)* %b.gep
1923  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
1924  %fneg = fneg float %mul
  ; Second user of the un-negated product.
1925  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
1926  store volatile float %fneg, float addrspace(1)* %out
1927  store volatile float %use1, float addrspace(1)* %out
1928  ret void
1929}
1930
; fneg(fmul.legacy(fneg(a), b)): the outer and inner negations cancel, so the
; checks expect a plain _e32 v_mul_legacy of A and B with no source
; modifiers, and a store of that result.
1931; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
1932; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1933; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1934; GCN: v_mul_legacy_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[B]]
1935; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1936define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
1937  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1938  %tid.ext = sext i32 %tid to i64
1939  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1940  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; the store below writes through %out.
1941  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1942  %a = load volatile float, float addrspace(1)* %a.gep
1943  %b = load volatile float, float addrspace(1)* %b.gep
  ; Negated first operand; the result of the multiply is negated again below.
1944  %fneg.a = fneg float %a
1945  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
1946  %fneg = fneg float %mul
1947  store volatile float %fneg, float addrspace(1)* %out
1948  ret void
1949}
1950
; fneg(fmul.legacy(a, fneg(b))): mirror image of the fneg_x case with the
; inner negation on the second operand. The checks again expect a plain _e32
; v_mul_legacy of A and B and a store of that result.
1951; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
1952; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1953; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1954; GCN: v_mul_legacy_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[B]]
1955; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1956define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
1957  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1958  %tid.ext = sext i32 %tid to i64
1959  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1960  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; the store below writes through %out.
1961  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1962  %a = load volatile float, float addrspace(1)* %a.gep
1963  %b = load volatile float, float addrspace(1)* %b.gep
  ; Negated second operand; the result of the multiply is negated again below.
1964  %fneg.b = fneg float %b
1965  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
1966  %fneg = fneg float %mul
1967  store volatile float %fneg, float addrspace(1)* %out
1968  ret void
1969}
1970
; fneg(fmul.legacy(fneg(a), fneg(b))): three negations in total, so one
; survives. The checks expect a single _e64 v_mul_legacy carrying one
; negate modifier on B, and a store of that result.
1971; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
1972; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1973; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1974; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
1975; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
1976define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
1977  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1978  %tid.ext = sext i32 %tid to i64
1979  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
1980  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; the store below writes through %out.
1981  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
1982  %a = load volatile float, float addrspace(1)* %a.gep
1983  %b = load volatile float, float addrspace(1)* %b.gep
  ; Both multiply operands are negated, and so is the product.
1984  %fneg.a = fneg float %a
1985  %fneg.b = fneg float %b
1986  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
1987  %fneg = fneg float %mul
1988  store volatile float %fneg, float addrspace(1)* %out
1989  ret void
1990}
1991
; fneg(fmul.legacy(fneg(a), b)) where fneg(a) is additionally stored. The
; checks expect the multiply to use the un-negated A and B (the two negations
; cancel), while the stored -a is produced by a separate v_xor of the sign bit.
1992; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
1993; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
1994; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
1995; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
1996; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
1997; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
1998; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
1999define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2000  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2001  %tid.ext = sext i32 %tid to i64
2002  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2003  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
2004  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2005  %a = load volatile float, float addrspace(1)* %a.gep
2006  %b = load volatile float, float addrspace(1)* %b.gep
  ; %fneg.a has two users: the multiply and the second store.
2007  %fneg.a = fneg float %a
2008  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
2009  %fneg = fneg float %mul
2010  store volatile float %fneg, float addrspace(1)* %out
2011  store volatile float %fneg.a, float addrspace(1)* %out
2012  ret void
2013}
2014
; fneg(fmul.legacy(fneg(a), b)) where fneg(a) also feeds a second fmul.legacy
; by the scalar kernel argument %c. The checks expect the first multiply on
; un-negated A and B, and the second multiply to take -A as a source modifier
; with an SGPR as its other operand.
2015; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
2016; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2017; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2018; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
2019; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
2020; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
2021; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
2022define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2023  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2024  %tid.ext = sext i32 %tid to i64
2025  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2026  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
2027  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2028  %a = load volatile float, float addrspace(1)* %a.gep
2029  %b = load volatile float, float addrspace(1)* %b.gep
  ; %fneg.a is consumed by both fmul.legacy calls below.
2030  %fneg.a = fneg float %a
2031  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
2032  %fneg = fneg float %mul
2033  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
2034  store volatile float %fneg, float addrspace(1)* %out
2035  store volatile float %use1, float addrspace(1)* %out
2036  ret void
2037}
2038
2039; --------------------------------------------------------------------------------
2040; sin tests
2041; --------------------------------------------------------------------------------
2042
; fneg(llvm.sin(a)). The generic sin intrinsic is expected to expand to a
; scale (v_mul), v_fract and v_sin sequence. The expected scale constant
; 0xbe22f983 is the single-precision bit pattern of about -0.15915494
; (-1/(2*pi)), i.e. it has the sign bit set; presumably that is where the
; fneg ends up, since no other negation is matched.
2043; GCN-LABEL: {{^}}v_fneg_sin_f32:
2044; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2045; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
2046; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
2047; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
2048; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
2049define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2050  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2051  %tid.ext = sext i32 %tid to i64
2052  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2053  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2054  %a = load volatile float, float addrspace(1)* %a.gep
  ; Target-independent sin intrinsic (contrast with llvm.amdgcn.sin).
2055  %sin = call float @llvm.sin.f32(float %a)
2056  %fneg = fneg float %sin
2057  store float %fneg, float addrspace(1)* %out.gep
2058  ret void
2059}
2060
; fneg(llvm.amdgcn.sin(a)). The checks expect the negation to move onto the
; input as a source modifier of a single _e64 v_sin, with no scaling or
; fract sequence matched in between.
2061; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
2062; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2063; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2064; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
2065define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2066  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2067  %tid.ext = sext i32 %tid to i64
2068  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2069  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2070  %a = load volatile float, float addrspace(1)* %a.gep
  ; Target-specific sin intrinsic; its result is negated and stored.
2071  %sin = call float @llvm.amdgcn.sin.f32(float %a)
2072  %fneg = fneg float %sin
2073  store float %fneg, float addrspace(1)* %out.gep
2074  ret void
2075}
2076
2077; --------------------------------------------------------------------------------
2078; ftrunc tests
2079; --------------------------------------------------------------------------------
2080
; fneg(llvm.trunc(a)). The checks expect the negation to be applied to the
; input as a source modifier of one _e64 v_trunc.
2081; GCN-LABEL: {{^}}v_fneg_trunc_f32:
2082; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2083; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2084; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
2085define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2086  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2087  %tid.ext = sext i32 %tid to i64
2088  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2089  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2090  %a = load volatile float, float addrspace(1)* %a.gep
  ; trunc followed by its single user, the fneg.
2091  %trunc = call float @llvm.trunc.f32(float %a)
2092  %fneg = fneg float %trunc
2093  store float %fneg, float addrspace(1)* %out.gep
2094  ret void
2095}
2096
2097; --------------------------------------------------------------------------------
2098; fround tests
2099; --------------------------------------------------------------------------------
2100
; fneg(llvm.round(a)). llvm.round is expected to expand to a trunc / sub /
; cndmask sequence ending in an add. Without no-signed-zeros the checks expect
; that final v_add followed by a v_xor of the sign bit; with
; -enable-no-signed-zeros-fp-math they expect the negation folded into the
; final operation as a v_sub with a negated first operand.
2101; GCN-LABEL: {{^}}v_fneg_round_f32:
2102; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2103; GCN: v_trunc_f32_e32
2104; GCN: v_sub_f32_e32
2105; GCN: v_cndmask_b32
2106
2107; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
2108; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]
2109
2110; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
2111; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
2112define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2113  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2114  %tid.ext = sext i32 %tid to i64
2115  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2116  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2117  %a = load volatile float, float addrspace(1)* %a.gep
  ; round followed by its single user, the fneg.
2118  %round = call float @llvm.round.f32(float %a)
2119  %fneg = fneg float %round
2120  store float %fneg, float addrspace(1)* %out.gep
2121  ret void
2122}
2123
2124; --------------------------------------------------------------------------------
2125; rint tests
2126; --------------------------------------------------------------------------------
2127
; fneg(llvm.rint(a)). The checks expect a single _e64 v_rndne whose input
; carries the negate source modifier.
2128; GCN-LABEL: {{^}}v_fneg_rint_f32:
2129; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2130; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2131; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
2132define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2133  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2134  %tid.ext = sext i32 %tid to i64
2135  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2136  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2137  %a = load volatile float, float addrspace(1)* %a.gep
  ; rint followed by its single user, the fneg.
2138  %rint = call float @llvm.rint.f32(float %a)
2139  %fneg = fneg float %rint
2140  store float %fneg, float addrspace(1)* %out.gep
2141  ret void
2142}
2143
2144; --------------------------------------------------------------------------------
2145; nearbyint tests
2146; --------------------------------------------------------------------------------
2147
; fneg(llvm.nearbyint(a)). The expected output is the same as for the rint
; test: one _e64 v_rndne whose input carries the negate source modifier.
2148; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
2149; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2150; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
2151; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
2152define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2153  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2154  %tid.ext = sext i32 %tid to i64
2155  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2156  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2157  %a = load volatile float, float addrspace(1)* %a.gep
  ; nearbyint followed by its single user, the fneg.
2158  %nearbyint = call float @llvm.nearbyint.f32(float %a)
2159  %fneg = fneg float %nearbyint
2160  store float %fneg, float addrspace(1)* %out.gep
2161  ret void
2162}
2163
2164; --------------------------------------------------------------------------------
2165; fcanonicalize tests
2166; --------------------------------------------------------------------------------
2167
; fneg(llvm.canonicalize(a)). The checks expect the canonicalize and the
; negation to be emitted together as a single multiply of A by -1.0.
2168; GCN-LABEL: {{^}}v_fneg_canonicalize_f32:
2169; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2170; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]]
2171; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
2172define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2173  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2174  %tid.ext = sext i32 %tid to i64
2175  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2176  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2177  %a = load volatile float, float addrspace(1)* %a.gep
  ; canonicalize followed by its single user, the fneg.
2178  %canonicalize = call float @llvm.canonicalize.f32(float %a)
2179  %fneg = fneg float %canonicalize
2180  store float %fneg, float addrspace(1)* %out.gep
2181  ret void
2182}
2183
2184; --------------------------------------------------------------------------------
2185; vintrp tests
2186; --------------------------------------------------------------------------------
2187
; fneg(fmul(a, b)) used as the first operand of two llvm.amdgcn.interp.p1
; calls. The checks expect the negation to be folded into the multiply (negate
; modifier on B) and both v_interp_p1 instructions to consume that multiply
; result directly.
2188; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
2189; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2190; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2191; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
2192; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
2193; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
2194define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2195  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2196  %tid.ext = sext i32 %tid to i64
2197  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2198  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2199  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2200  %a = load volatile float, float addrspace(1)* %a.gep
2201  %b = load volatile float, float addrspace(1)* %b.gep
2202  %mul = fmul float %a, %b
2203  %fneg = fneg float %mul
  ; The two calls differ only in their first i32 immediate (0 vs 1).
2204  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
2205  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
2206  store volatile float %intrp0, float addrspace(1)* %out.gep
2207  store volatile float %intrp1, float addrspace(1)* %out.gep
2208  ret void
2209}
2210
; fneg(fmul(a, b)) used as the second operand of two llvm.amdgcn.interp.p2
; calls (the first operand is the constant 4.0). As in the p1 test, the checks
; expect the negation folded into the multiply and both v_interp_p2
; instructions to consume that multiply result.
2211; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
2212; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2213; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2214; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
2215; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
2216; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
2217define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2218  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2219  %tid.ext = sext i32 %tid to i64
2220  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2221  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2222  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2223  %a = load volatile float, float addrspace(1)* %a.gep
2224  %b = load volatile float, float addrspace(1)* %b.gep
2225  %mul = fmul float %a, %b
2226  %fneg = fneg float %mul
  ; The two calls differ only in their first i32 immediate (0 vs 1).
2227  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
2228  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
2229  store volatile float %intrp0, float addrspace(1)* %out.gep
2230  store volatile float %intrp1, float addrspace(1)* %out.gep
2231  ret void
2232}
2233
2234; --------------------------------------------------------------------------------
2235; CopyToReg tests
2236; --------------------------------------------------------------------------------
2237
; fneg(fmul(a, b)) whose only user sits in a different basic block (%if),
; while the un-negated product is stored in %endif. The checks expect the
; multiply to be emitted un-negated before the branch, the store of that
; product followed by s_endpgm, and then a separate v_xor of the sign bit
; feeding the second multiply and its store.
2238; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
2239; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2240; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2241; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2242; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
2243; GCN: s_cbranch_scc0
2244
2245; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
2246; GCN: s_endpgm
2247
2248; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
2249; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
2250; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2251
2252define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2253  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2254  %tid.ext = sext i32 %tid to i64
2255  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2256  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2257  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2258  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2259  %a = load volatile float, float addrspace(1)* %a.gep
2260  %b = load volatile float, float addrspace(1)* %b.gep
2261  %c = load volatile float, float addrspace(1)* %c.gep
  ; Both %mul and %fneg are defined here in the entry block but used only in
  ; the successor blocks.
2262  %mul = fmul float %a, %b
2263  %fneg = fneg float %mul
  ; The branch condition depends only on the scalar kernel argument %d.
2264  %cmp0 = icmp eq i32 %d, 0
2265  br i1 %cmp0, label %if, label %endif
2266
2267if:
2268  %mul1 = fmul float %fneg, %c
2269  store volatile float %mul1, float addrspace(1)* %out.gep
2270  br label %endif
2271
2272endif:
2273  store volatile float %mul, float addrspace(1)* %out.gep
2274  ret void
2275}
2276
2277; --------------------------------------------------------------------------------
2278; inlineasm tests
2279; --------------------------------------------------------------------------------
2280
2281; Can't fold into use, so should fold into source
; Here the fneg is consumed by an inline asm operand and by a store, and the
; fmul has no other user. The checks expect the negation as a source modifier
; on the multiply, whose result is then both the asm operand and the stored
; value.
2282; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
2283; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2284; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2285; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
2286; GCN: ; use [[MUL]]
2287; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
2288define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2289  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2290  %tid.ext = sext i32 %tid to i64
2291  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2292  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2293  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2294  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2295  %a = load volatile float, float addrspace(1)* %a.gep
2296  %b = load volatile float, float addrspace(1)* %b.gep
  ; %c is loaded (volatile) but its value is not used; the %d argument is
  ; unused as well.
2297  %c = load volatile float, float addrspace(1)* %c.gep
2298  %mul = fmul float %a, %b
2299  %fneg = fneg float %mul
  ; The asm body is only a comment that prints its VGPR ("v") operand.
2300  call void asm sideeffect "; use $0", "v"(float %fneg) #0
2301  store volatile float %fneg, float addrspace(1)* %out.gep
2302  ret void
2303}
2304
2305; --------------------------------------------------------------------------------
2306; inlineasm tests (fneg source has another use)
2307; --------------------------------------------------------------------------------
2308
2309; Can't fold into the inline asm use, and the fmul source is also stored
; un-negated, so the fneg is expected to stay a separate v_xor of the sign
; bit: the asm operand is the xor result and the store uses the plain product.
2310; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
2311; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2312; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2313; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
2314; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
2315; GCN: ; use [[NEG]]
2316; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
2317define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2318  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2319  %tid.ext = sext i32 %tid to i64
2320  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2321  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2322  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2323  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2324  %a = load volatile float, float addrspace(1)* %a.gep
2325  %b = load volatile float, float addrspace(1)* %b.gep
  ; %c is loaded (volatile) but its value is not used; the %d argument is
  ; unused as well.
2326  %c = load volatile float, float addrspace(1)* %c.gep
2327  %mul = fmul float %a, %b
2328  %fneg = fneg float %mul
  ; The asm consumes the negated value; the store below uses %mul itself.
2329  call void asm sideeffect "; use $0", "v"(float %fneg) #0
2330  store volatile float %mul, float addrspace(1)* %out.gep
2331  ret void
2332}
2333
2334; --------------------------------------------------------------------------------
2335; code size regression tests
2336; --------------------------------------------------------------------------------
2337
2338; There are multiple users of the fneg that must use a VOP3
2339; instruction, so there is no penalty
; Concretely: fneg(a) feeds two llvm.fma calls, and the checks expect both
; v_fma instructions to take -A as a source modifier, back to back, with no
; separate negation instruction between them and the stores.
2340; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
2341; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2342; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2343; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2344
2345; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
2346; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
2347
2348; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
2349; GCN-NEXT: s_waitcnt vmcnt(0)
2350; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
2351; GCN-NEXT: s_waitcnt vmcnt(0)
2352define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2353  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2354  %tid.ext = sext i32 %tid to i64
2355  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2356  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2357  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
2358  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2359  %a = load volatile float, float addrspace(1)* %a.gep
2360  %b = load volatile float, float addrspace(1)* %b.gep
2361  %c = load volatile float, float addrspace(1)* %c.gep
2362
  ; One fneg shared by both fma calls.
2363  %fneg.a = fneg float %a
2364  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
2365  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)
2366
2367  store volatile float %fma0, float addrspace(1)* %out
2368  store volatile float %fma1, float addrspace(1)* %out
2369  ret void
2370}
2371
2372; There are multiple users, but both require using a larger encoding
2373; for the modifier.
; Concretely: fneg(a) feeds two fmuls, and the checks expect both multiplies
; in their _e64 form with -A as a source modifier rather than one shared
; negation instruction.
2374
2375; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
2376; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2377; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2378; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2379
2380; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
2381; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
2382; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
2383; GCN-NEXT: s_waitcnt vmcnt(0)
2384; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2385; GCN-NEXT: s_waitcnt vmcnt(0)
2386define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2387  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2388  %tid.ext = sext i32 %tid to i64
2389  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2390  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2391  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
2392  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2393  %a = load volatile float, float addrspace(1)* %a.gep
2394  %b = load volatile float, float addrspace(1)* %b.gep
2395  %c = load volatile float, float addrspace(1)* %c.gep
2396
  ; One fneg shared by both fmuls.
2397  %fneg.a = fneg float %a
2398  %mul0 = fmul float %fneg.a, %b
2399  %mul1 = fmul float %fneg.a, %c
2400
2401  store volatile float %mul0, float addrspace(1)* %out
2402  store volatile float %mul1, float addrspace(1)* %out
2403  ret void
2404}
2405
2406; One user is VOP3 so has no cost to folding the modifier, the other does.
; Concretely: fneg(a) feeds one llvm.fma call and one fmul. The checks expect
; -A as a source modifier on both the v_fma and an _e64 v_mul.
2407; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
2408; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2409; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2410; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2411
2412; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
2413; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
2414
2415; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
2416; GCN-NEXT: s_waitcnt vmcnt(0)
2417; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2418; GCN-NEXT: s_waitcnt vmcnt(0)
2419define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2420  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2421  %tid.ext = sext i32 %tid to i64
2422  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2423  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2424  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
2425  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2426  %a = load volatile float, float addrspace(1)* %a.gep
2427  %b = load volatile float, float addrspace(1)* %b.gep
2428  %c = load volatile float, float addrspace(1)* %c.gep
2429
  ; One fneg shared by the fma and the fmul.
2430  %fneg.a = fneg float %a
2431  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
2432  %mul1 = fmul float %fneg.a, %c
2433
2434  store volatile float %fma0, float addrspace(1)* %out
2435  store volatile float %mul1, float addrspace(1)* %out
2436  ret void
2437}
2438
2439; The use of the fneg requires a code size increase, but folding into
2440; the source does not
; Concretely: fneg(fma(a, b, 2.0)) feeds two fmuls. Without no-signed-zeros
; the checks expect the fma un-negated and both multiplies in _e64 form
; with a negated first operand. With -enable-no-signed-zeros-fp-math they
; expect the negation folded into the fma (negated B and a -2.0 addend) and
; both multiplies in plain _e32 form.
2441
2442; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
2443; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2444; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2445; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2446; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
2447
2448; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
2449; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
2450; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
2451
2452; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
2453; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
2454; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]
2455
2456; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2457; GCN-NEXT: s_waitcnt vmcnt(0)
2458; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
2459; GCN-NEXT: s_waitcnt vmcnt(0)
2460define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2461  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2462  %tid.ext = sext i32 %tid to i64
2463  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2464  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2465  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2466  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
2467  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2468  %a = load volatile float, float addrspace(1)* %a.gep
2469  %b = load volatile float, float addrspace(1)* %b.gep
2470  %c = load volatile float, float addrspace(1)* %c.gep
2471  %d = load volatile float, float addrspace(1)* %d.gep
2472
  ; The fma has a single user (the fneg); the fneg has two users.
2473  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
2474  %fneg.fma0 = fneg float %fma0
2475  %mul1 = fmul float %fneg.fma0, %c
2476  %mul2 = fmul float %fneg.fma0, %d
2477
2478  store volatile float %mul1, float addrspace(1)* %out
2479  store volatile float %mul2, float addrspace(1)* %out
2480  ret void
2481}
2482
; Double-precision variant of the f32 test of the same name. Here the same
; expectations apply to every run configuration: the fma stays un-negated and
; both v_mul_f64 instructions take the negated fma result as a source
; modifier. The captures MUL0/MUL1 correspond to the IR values %mul1/%mul2.
2483; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
2484; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
2485; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
2486; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
2487; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
2488
2489; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
2490; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
2491; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
2492
2493; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
2494; GCN-NEXT: s_waitcnt vmcnt(0)
2495; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2496; GCN-NEXT: s_waitcnt vmcnt(0)
2497define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
  ; Element addresses are indexed by the sign-extended workitem id.
2498  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2499  %tid.ext = sext i32 %tid to i64
2500  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
2501  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
2502  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
2503  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
  ; %out.gep is computed but not used; both stores below write through %out.
2504  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
2505  %a = load volatile double, double addrspace(1)* %a.gep
2506  %b = load volatile double, double addrspace(1)* %b.gep
2507  %c = load volatile double, double addrspace(1)* %c.gep
2508  %d = load volatile double, double addrspace(1)* %d.gep
2509
2510  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  ; The negation is spelled as "fsub -0.0, x" here, whereas the f32 variant
  ; uses the fneg instruction. NOTE(review): presumably intentional, to keep
  ; the legacy spelling covered - confirm before normalizing.
2511  %fneg.fma0 = fsub double -0.0, %fma0
2512  %mul1 = fmul double %fneg.fma0, %c
2513  %mul2 = fmul double %fneg.fma0, %d
2514
2515  store volatile double %mul1, double addrspace(1)* %out
2516  store volatile double %mul2, double addrspace(1)* %out
2517  ret void
2518}
2519
2520; %trunc.a has one fneg use, but it requires a code size increase and
2521; the fneg can instead be folded for free into the fma.
2522
; Single-use case: the truncated value is negated and the negation feeds only
; the fma. The checks expect a plain v_trunc followed by an fma whose first
; source is the negated trunc result.
2523; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
2524; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2525; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2526; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2527; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
2528; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
2529; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
2530define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
2531  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2532  %tid.ext = sext i32 %tid to i64
  ; Per-workitem element addresses, indexed by the sign-extended workitem id.
2533  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2534  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2535  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2536  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  ; %out.gep is computed but never used; the store below goes through %out.
2537  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2538  %a = load volatile float, float addrspace(1)* %a.gep
2539  %b = load volatile float, float addrspace(1)* %b.gep
2540  %c = load volatile float, float addrspace(1)* %c.gep
  ; %d is loaded (volatile) but has no users in this function.
2541  %d = load volatile float, float addrspace(1)* %d.gep
2542
2543  %trunc.a = call float @llvm.trunc.f32(float %a)
  ; The fneg is the only user of %trunc.a.
2544  %trunc.fneg.a = fneg float %trunc.a
2545  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
2546  store volatile float %fma0, float addrspace(1)* %out
2547  ret void
2548}
2549
; Multi-use case: %trunc.a is used both through an fneg (feeding the fma) and
; directly by an fmul. The checks expect the fma to take the negated trunc
; result as a source while the multiply uses the un-negated trunc result.
2550; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
2551; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
2552; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
2553; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
2554; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
2555; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
2556; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
2557; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
2558; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
2559; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
2560define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
2561  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2562  %tid.ext = sext i32 %tid to i64
  ; Per-workitem element addresses, indexed by the sign-extended workitem id.
2563  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
2564  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
2565  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
2566  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  ; %out.gep is computed but never used; the stores below go through %out.
2567  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
2568  %a = load volatile float, float addrspace(1)* %a.gep
2569  %b = load volatile float, float addrspace(1)* %b.gep
2570  %c = load volatile float, float addrspace(1)* %c.gep
2571  %d = load volatile float, float addrspace(1)* %d.gep
2572
2573  %trunc.a = call float @llvm.trunc.f32(float %a)
  ; First use of %trunc.a: negated, then passed to the fma.
2574  %trunc.fneg.a = fneg float %trunc.a
2575  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  ; Second use of %trunc.a: multiplied without negation.
2576  %mul1 = fmul float %trunc.a, %d
  ; Both results are volatile-stored to the same address, %out.
2577  store volatile float %fma0, float addrspace(1)* %out
2578  store volatile float %mul1, float addrspace(1)* %out
2579  ret void
2580}
2581
2582; The AMDGPU combine to pull fneg into the FMA operands was being
2583; undone by the generic combine to pull the fneg out of the fma if
2584; !isFNegFree. We were reporting false for v2f32 even though it will
2585; be split into f32 where it will be free.
;
; The checks below expect two scalar f32 fmas (one per vector lane), each with
; a negated second source and a shared register holding the value produced by
; s_brev_b32 of 1, followed by per-lane subtracts and multiplies.
2586; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop:
2587; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}}
2588; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]]
2589; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]]
2590; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0
2591; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1
2592; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4
2593; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5
2594; GCN: s_setpc_b64
2595define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
2596bb:
  ; fma(arg1, arg2, 0) + arg, all with the fast flag.
2597  %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
2598  %i4 = fadd fast <2 x float> %i3, %arg
  ; The fneg itself carries no fast-math flags; its result is multiplied by
  ; %arg2 again.
2599  %i5 = fneg <2 x float> %i4
2600  %i6 = fmul fast <2 x float> %i5, %arg2
2601  ret <2 x float> %i6
2602}
2603
2604; This expects denormal flushing, so can't turn this fmul into fneg
2605; TODO: Keeping this as fmul saves encoding size
;
; The checks expect the multiply by -1.0 to be emitted as a v_sub_f32 from
; 0x80000000 rather than as a negated source operand on the second multiply.
2606; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
2607; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0
2608; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
2609define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  ; The multiply by -1.0 has no fast-math flags; nnan is only on the second
  ; multiply.
2610  %mul = fmul float %x, -1.0
  ; Despite its name, %add is an fmul.
2611  %add = fmul nnan float %mul, %y
2612  ret float %add
2613}
2614
2615; It's legal to turn this fmul into an fneg since denormals are
2616; preserved and we know an snan can't happen from the flag.
;
; The checks expect a single v_mul_f32 with a negated first source and no
; separate instruction for the multiply by -1.0.
2617; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
2618; GCN: v_mul_f32_e64 v0, -v0, v1
2619; GCN-NEXT: s_setpc_b64
; This function has no attribute group, unlike the functions that use #0.
2620define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  ; nnan is on the multiply by -1.0 itself.
2621  %mul = fmul nnan float %x, -1.0
  ; Despite its name, %add is an fmul.
2622  %add = fmul float %mul, %y
2623  ret float %add
2624}
2625
2626; We know the source can't be an snan since it is the result of an fmul.
; The value multiplied by -1.0 is itself the result of x * x. The checks
; expect the negation to appear as a source modifier on that first multiply
; (v0 * -v0), followed by a plain multiply by the second argument.
2627; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
2628; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
2629; GCN: v_mul_f32_e32 v0, [[TMP]], v1
2630; GCN-NEXT: s_setpc_b64
; This function has no attribute group, unlike the functions that use #0.
2631define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  ; x * x; no fast-math flags on any instruction in this function.
2632  %canonical = fmul float %x, %x
2633  %mul = fmul float %canonical, -1.0
  ; Despite its name, %add is an fmul.
2634  %add = fmul float %mul, %y
2635  ret float %add
2636}
2637
; The input is passed through llvm.canonicalize before being multiplied by
; -1.0, and the function uses attribute group #0. The checks expect the
; canonicalize as a multiply by 1.0, the multiply by -1.0 as a v_sub_f32 from
; 0x80000000, and then a plain multiply by the second argument.
2638; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
2639; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
2640; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
2641; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
2642define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
2643  %quiet = call float @llvm.canonicalize.f32(float %x)
  ; No fast-math flags on either multiply.
2644  %mul = fmul float %quiet, -1.0
  ; Despite its name, %add is an fmul.
2645  %add = fmul float %mul, %y
2646  ret float %add
2647}
2648
; Generic LLVM intrinsics. All are declared with attribute group #1 except
; llvm.fma.v2f32, which has no attribute group.
2649declare i32 @llvm.amdgcn.workitem.id.x() #1
2650declare float @llvm.fma.f32(float, float, float) #1
2651declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
2652declare float @llvm.fmuladd.f32(float, float, float) #1
2653declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
2654declare float @llvm.sin.f32(float) #1
2655declare float @llvm.trunc.f32(float) #1
2656declare float @llvm.round.f32(float) #1
2657declare float @llvm.rint.f32(float) #1
2658declare float @llvm.nearbyint.f32(float) #1
2659declare float @llvm.canonicalize.f32(float) #1
2660declare float @llvm.minnum.f32(float, float) #1
2661declare float @llvm.maxnum.f32(float, float) #1
2662declare half @llvm.minnum.f16(half, half) #1
2663declare double @llvm.minnum.f64(double, double) #1
2664declare double @llvm.fma.f64(double, double, double) #1
2665
; AMDGPU-specific intrinsics. The two interp intrinsics are declared with
; attribute group #0; the rest use #1.
2666declare float @llvm.amdgcn.sin.f32(float) #1
2667declare float @llvm.amdgcn.rcp.f32(float) #1
2668declare float @llvm.amdgcn.rcp.legacy(float) #1
2669declare float @llvm.amdgcn.fmul.legacy(float, float) #1
2670declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
2671declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
2672
; Attribute groups.
;   #0 - nounwind, with the f32 denormal mode string set to
;        "preserve-sign,preserve-sign" (the LangRef describes this string as
;        an output mode followed by an input mode).
;   #1 - nounwind readnone; used on the intrinsic declarations above.
;   #2 - nounwind with "unsafe-fp-math" enabled.
;   #3 - nounwind with "no-signed-zeros-fp-math" enabled.
2673attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2674attributes #1 = { nounwind readnone }
2675attributes #2 = { nounwind "unsafe-fp-math"="true" }
2676attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
2677