; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s

; RUN: llc -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------
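;
; fneg(fadd x, y) may only fold to fsub(-x, y) under no-signed-zeros: with
; x = +0.0 and y = -0.0, fneg (fadd +0.0, -0.0) is -0.0, but
; fadd (fneg +0.0), (fneg -0.0) is +0.0. So the GCN-SAFE runs keep the add
; and materialize the negation as a sign-bit xor, while the GCN-NSZ runs
; fold the fneg into the add's source modifiers.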

; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %add, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %add = fadd float %a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}}
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[SIGNBIT]], [[ADD]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; This one asserted with -enable-no-signed-zeros-fp-math
; GCN-LABEL: {{^}}fneg_fadd_0:
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
  %tmp7 = fdiv float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; This is a workaround because -enable-no-signed-zeros-fp-math does not set the
; unsafe-fp-math function attribute automatically. Combine this with the
; previous test when that is done.
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]],
; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]],
; GCN-NSZ-DAG: v_cmp_nlt_f32_e64 {{.*}}, -[[D]]
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------
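;
; Unlike fadd, fneg(fmul x, y) folds unconditionally: the sign of an IEEE
; product is the xor of the operand signs, so -(x * y) is exactly x * (-y)
; for every input. No nsz flag is needed, which is why the folded form below
; is checked with the plain GCN prefix rather than GCN-NSZ.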

; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = fmul float %a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------
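;
; fneg is folded through minnum as fneg(minnum(x, y)) = maxnum(-x, -y). In
; the kernels below the default IEEE mode is enabled, so the operands must
; also be quieted first; the v_mul_f32 by -1.0 combines that
; canonicalization with the negation. The amdgpu_ps (_no_ieee) variants run
; with IEEE mode disabled, so no quieting is needed and the negations fold
; straight into source modifiers (-v0, -v1).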

; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  store float %min.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  ret float %min.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

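; 1/(2*pi) (0x3e22f983) is an inline constant on VI but not on SI. The SI
; checks therefore match the negated literal 0xbe22f983 folded into the max,
; while VI keeps the positive inline constant in the min and negates
; afterwards with a sign-bit xor.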
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]

; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

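; SI has no f16 min/max instructions, so the f16 case is promoted to f32;
; the v_cvt_f32_f16 with a -[[A]] source modifier folds the negation into
; the conversion. VI operates on f16 directly, using the f16 1/(2*pi)
; inline constant and a 0x8000 sign-bit xor for the negation.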
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

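; The f64 1/(2*pi) value is an inline constant on VI but not on SI, so the
; SI checks build the 64-bit constant from two s_mov_b32 halves, while VI
; folds it directly into v_min_f64/v_max_f64.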
; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}

; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494
; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}

; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

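; When the fneg's only user can absorb a source modifier, no separate
; negation instruction is needed: the min result is negated directly in the
; multiply (-[[MIN]]) rather than through a sign-bit xor.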
; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]

; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  %mul = fmul float %fneg, %b
  ret float %mul
}

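; With multiple users, the negated min is computed once as max(-a, -b) and
; the non-negated use (4.0 * min) is rewritten against it as -4.0 * max0.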
; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  %use1 = fmul float %min, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fmaxnum tests
; --------------------------------------------------------------------------------
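;
; These mirror the fminnum tests: fneg(maxnum(x, y)) = minnum(-x, -y), with
; the same quieting requirement in IEEE mode and the same source-modifier
; folding when IEEE mode is off.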


; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  store float %max.fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %a)
  %max.fneg = fneg float %max
  ret float %max.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
  %fneg = fneg float %max
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  store float %mul, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
  %fneg = fneg float %max
  %mul = fmul float %fneg, %b
  ret float %mul
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e64 v0, -v0, -v1
; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
; GCN-NEXT: ; return
define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
  %max = call float @llvm.maxnum.f32(float %a, float %b)
  %fneg = fneg float %max
  %use1 = fmul float %max, 4.0
  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
  ret <2 x float> %ins1
}

; --------------------------------------------------------------------------------
; fma tests
; --------------------------------------------------------------------------------
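;
; Like fadd, folding fneg through fma requires no-signed-zeros:
; fneg(fma(a, b, c)) becomes fma(a, -b, -c), negating one multiplicand and
; the addend. When an operand was already negated, the two negations cancel
; into a plain source modifier.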

; GCN-LABEL: {{^}}v_fneg_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]

; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fma, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]

; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.b = fneg float %b
  %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_xor_b32
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
; GCN-SAFE: v_xor_b32

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]

; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fneg.a, %d
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fmad tests
; --------------------------------------------------------------------------------
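; Same folding as fma, but through llvm.fmuladd, which selects to
; v_mac_f32/v_mad_f32.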

; GCN-LABEL: {{^}}v_fneg_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]

; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:

; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
  %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
  %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  %fneg = fneg <4 x float> %fma
  store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]

; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; fp_extend tests
; --------------------------------------------------------------------------------
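; fneg of an fp_extend folds into the conversion's source modifier
; (v_cvt_f64_f32_e64 with a negated source), and a negated source
; cancels the outer fneg entirely.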

; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, double addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]]
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %fpext, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]{{\]}}, [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  %mul = fmul double %fpext, 4.0
  store volatile double %fneg, double addrspace(1)* %out.gep
  store volatile double %mul, double addrspace(1)* %out.gep
  ret void
}

; FIXME: Source modifiers are not folded for f16->f32, so the following
; tests only check the labels.
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpext, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  %mul = fmul float %fpext, 4.0
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fp_round tests
; --------------------------------------------------------------------------------
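; As with fp_extend, the fneg folds into the conversion's source
; modifier (v_cvt_f32_f64_e64/v_cvt_f16_f32_e64 with a negated source).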

; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %fneg.a, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  %use1 = fmul double %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile double %use1, double addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fpround = fptrunc float %a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, half addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fpround, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
  store volatile half %fneg, half addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp tests
; --------------------------------------------------------------------------------
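; fneg(rcp(a)) folds to rcp(-a) via a source modifier, and
; fneg(rcp(-a)) cancels to a plain rcp.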

; GCN-LABEL: {{^}}v_fneg_rcp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %fneg.a, float addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, float addrspace(1)* %out.gep
  store volatile float %use1, float addrspace(1)* undef
  ret void
}

; --------------------------------------------------------------------------------
; fmul_legacy tests
; --------------------------------------------------------------------------------
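; v_mul_legacy_f32 accepts source modifiers in its VOP3 (_e64) form, so
; the fneg folds onto one multiplicand and double negations cancel.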

; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[MUL]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %mul, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[NEG_MUL:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[NEG_MUL]], 4.0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %fneg.a, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, float addrspace(1)* %out
  store volatile float %use1, float addrspace(1)* %out
  ret void
}

; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------
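; llvm.sin expands to a range-reduction multiply followed by v_fract and
; v_sin; the fneg is absorbed into the multiply constant (0xbe22f983 is
; -1/(2*pi)). llvm.amdgcn.sin takes the negation directly as a source
; modifier.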

; GCN-LABEL: {{^}}v_fneg_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %sin = call float @llvm.amdgcn.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; ftrunc tests
; --------------------------------------------------------------------------------
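; fneg(trunc(a)) = trunc(-a), so the negation becomes a source modifier.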

; GCN-LABEL: {{^}}v_fneg_trunc_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %trunc = call float @llvm.trunc.f32(float %a)
  %fneg = fneg float %trunc
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fround tests
; --------------------------------------------------------------------------------
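; llvm.round is expanded inline using v_trunc, v_sub and v_cndmask; only
; the final add can absorb the fneg, and folding it into a subtract
; requires no-signed-zeros.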

; GCN-LABEL: {{^}}v_fneg_round_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_trunc_f32_e32
; GCN: v_sub_f32_e32
; GCN: v_cndmask_b32

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %round = call float @llvm.round.f32(float %a)
  %fneg = fneg float %round
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; rint tests
; --------------------------------------------------------------------------------
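; llvm.rint selects to v_rndne_f32, which takes the fneg as a source
; modifier.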

; GCN-LABEL: {{^}}v_fneg_rint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %rint = call float @llvm.rint.f32(float %a)
  %fneg = fneg float %rint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; nearbyint tests
; --------------------------------------------------------------------------------
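; nearbyint lowers the same way as rint on this target (v_rndne_f32).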

; GCN-LABEL: {{^}}v_fneg_nearbyint_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %nearbyint = call float @llvm.nearbyint.f32(float %a)
  %fneg = fneg float %nearbyint
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fcanonicalize tests
; --------------------------------------------------------------------------------
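; canonicalize is implemented as a multiply by 1.0; folding the fneg
; turns it into a multiply by -1.0.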

; GCN-LABEL: {{^}}v_fneg_canonicalize_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %canon = call float @llvm.canonicalize.f32(float %a)
  %fneg = fneg float %canon
  store float %fneg, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; vintrp tests
; --------------------------------------------------------------------------------
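; The interp instructions take no source modifiers, so the fneg is
; folded into the multiply feeding the interpolation instead.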

; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
  store volatile float %intrp0, float addrspace(1)* %out.gep
  store volatile float %intrp1, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; CopyToReg tests
; --------------------------------------------------------------------------------
2233
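; Here the negated value crosses a basic block boundary through a
; CopyToReg while %mul is also used un-negated, so the fneg is not pushed
; into the multiply. The expected code instead materializes the sign flip
; with a v_xor_b32 of 0x80000000 only on the path that uses the negated
; value.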
; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]]
; GCN: s_cbranch_scc0

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN: s_endpgm

; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]

define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %cmp0 = icmp eq i32 %d, 0
  br i1 %cmp0, label %if, label %endif

if:
  %mul1 = fmul float %fneg, %c
  store volatile float %mul1, float addrspace(1)* %out.gep
  br label %endif

endif:
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; inlineasm tests
; --------------------------------------------------------------------------------

; Can't fold the fneg into the inline asm use (asm operands can't take
; source modifiers), so it should be folded into the source mul.
; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
; GCN: ; use [[MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %fneg, float addrspace(1)* %out.gep
  ret void
}

; The inline asm use can't take a source modifier, and %mul has another
; use, so the fneg must be materialized separately with a v_xor_b32.
; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]]
; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
; GCN: ; use [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  call void asm sideeffect "; use $0", "v"(float %fneg) #0
  store volatile float %mul, float addrspace(1)* %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; code size regression tests
; --------------------------------------------------------------------------------

; There are multiple users of the fneg, but both are fmas that must use a
; VOP3 encoding anyway, so folding the modifier into each has no penalty.
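; A rough encoding comparison (illustrative only, not checked by this
; test):
;   v_mul_f32_e32 v0, v1, v2     VOP2, 4 bytes, no source modifiers
;   v_mul_f32_e64 v0, -v1, v2    VOP3, 8 bytes, neg/abs modifiers
; v_fma_f32 only has a VOP3 encoding, so negating one of its source
; operands costs nothing extra.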
; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %fma1, float addrspace(1)* %out
  ret void
}

; There are multiple users, and each would require the larger VOP3
; encoding to fold the modifier.
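; Rough size arithmetic (illustrative only): folding into both VOP2
; multiplies promotes each to the 8-byte VOP3 form (+8 bytes total),
; while materializing the negation once would take a v_xor_b32 with a
; 32-bit literal (also 8 bytes) plus an extra instruction, so folding is
; at least as small and one instruction shorter.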

; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %mul0 = fmul float %fneg.a, %b
  %mul1 = fmul float %fneg.a, %c

  store volatile float %mul0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; One user is VOP3, so folding the modifier there is free; the other is
; VOP2 and requires the larger encoding.
; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep

  %fneg.a = fneg float %a
  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
  %mul1 = fmul float %fneg.a, %c

  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The uses of the fneg require a code size increase, but folding it into
; the source fma does not.
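; Under no-signed-zeros the fold is plain sign distribution:
;   -(fma(a, b, 2.0)) = fma(a, -b, -2.0)
; fma is VOP3, so both the negate modifier and the -2.0 inline constant
; are free, and the two multiply users keep their 4-byte VOP2 forms.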

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]

; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]

; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]]
; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
  %fneg.fma0 = fneg float %fma0
  %mul1 = fmul float %fneg.fma0, %c
  %mul2 = fmul float %fneg.fma0, %d

  store volatile float %mul1, float addrspace(1)* %out
  store volatile float %mul2, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]

; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %a = load volatile double, double addrspace(1)* %a.gep
  %b = load volatile double, double addrspace(1)* %b.gep
  %c = load volatile double, double addrspace(1)* %c.gep
  %d = load volatile double, double addrspace(1)* %d.gep

  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
  %fneg.fma0 = fneg double %fma0
  %mul1 = fmul double %fneg.fma0, %c
  %mul2 = fmul double %fneg.fma0, %d

  store volatile double %mul1, double addrspace(1)* %out
  store volatile double %mul2, double addrspace(1)* %out
  ret void
}

; %trunc.a has one fneg use. Folding the fneg into the trunc source would
; require the larger VOP3 encoding, but it can instead be folded for free
; into the VOP3 fma use.

; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  store volatile float %fma0, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %a = load volatile float, float addrspace(1)* %a.gep
  %b = load volatile float, float addrspace(1)* %b.gep
  %c = load volatile float, float addrspace(1)* %c.gep
  %d = load volatile float, float addrspace(1)* %d.gep

  %trunc.a = call float @llvm.trunc.f32(float %a)
  %trunc.fneg.a = fneg float %trunc.a
  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
  %mul1 = fmul float %trunc.a, %d
  store volatile float %fma0, float addrspace(1)* %out
  store volatile float %mul1, float addrspace(1)* %out
  ret void
}

; The AMDGPU combine to pull fneg into the FMA operands was being
; undone by the generic combine to pull the fneg out of the fma if
; !isFNegFree. We were reporting false for v2f32 even though it will
; be split into f32 where it will be free.
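; A sketch of the two combines that were fighting each other (the CHECK
; lines below only pin the fixed output):
;   fneg (fma a, b, c)  -->  fma a, -b, -c       (AMDGPU combine)
;   fma a, -b, -c       -->  fneg (fma a, b, c)  (generic combine, when
;                                                 fneg is reported as not
;                                                 free)
; Reporting fneg as free for v2f32 breaks the cycle, since the vector is
; split into f32 operations where the modifier really is free.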
; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop:
; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}}
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]]
; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]]
; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0
; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1
; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4
; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5
; GCN: s_setpc_b64
define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
bb:
  %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
  %i4 = fadd fast <2 x float> %i3, %arg
  %i5 = fneg <2 x float> %i4
  %i6 = fmul fast <2 x float> %i5, %arg2
  ret <2 x float> %i6
}

; This function expects denormal flushing, so the fmul by -1.0 can't be
; turned into an fneg.
; TODO: Keeping this as an fmul would save encoding size.
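; A worked bit-pattern example of the difference (assuming f32
; preserve-sign flushing): for the smallest positive denormal
; x = 0x00000001,
;   fneg x   = 0x80000001  (still a denormal)
;   x * -1.0 = 0x80000000  (-0.0; the denormal is flushed)
; so the two are not equivalent when denormals are flushed.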
; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg:
; GCN: v_sub_f32_e32 [[TMP:v[0-9]+]], 0x80000000, v0
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %mul = fmul float %x, -1.0
  %mul2 = fmul nnan float %mul, %y
  ret float %mul2
}

; It's legal to turn this fmul into an fneg: denormals are preserved
; here, and the nnan flag tells us the source can't be an snan.
; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  %mul = fmul nnan float %x, -1.0
  %mul2 = fmul float %mul, %y
  ret float %mul2
}

; The canonicalizing multiply quiets any snan, so we know the source
; can't be an snan.
; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
; GCN: v_mul_f32_e32 v0, [[TMP]], v1
; GCN-NEXT: s_setpc_b64
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  %canonical = fmul float %x, %x
  %mul = fmul float %canonical, -1.0
  %mul2 = fmul float %mul, %y
  ret float %mul2
}

; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], 1.0, v0
; GCN: v_sub_f32_e32 [[TMP1:v[0-9]+]], 0x80000000, [[TMP0]]
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP1]], v1
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %quiet = call float @llvm.canonicalize.f32(float %x)
  %mul = fmul float %quiet, -1.0
  %mul2 = fmul float %mul, %y
  ret float %mul2
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.fma.f64(double, double, double) #1

declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }