1; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
2; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
3; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,FUNC %s
5; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
6
7; These tests check that fdiv is expanded correctly and also test that the
8; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
9; instruction groups.
10
11; These tests check fdiv using unsafe_fp_math, coarse fp div, and IEEE754 fp div.
12
13; FUNC-LABEL: {{^}}fdiv_f32:
14; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
15; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
16
17; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
18; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
19; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
20
21; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
22; GFX10: s_denorm_mode 15
23; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
24; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
25; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
26; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
27; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
28; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
29; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
30; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
31; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
32; GFX10: s_denorm_mode 12
33; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
34; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
; Kernel under test: divides the two scalar float arguments with only the
; `ninf` fast-math flag set and stores the quotient to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect the v_div_scale / v_rcp / v_fma /
; v_div_fmas / v_div_fixup sequence, with an s_setreg_imm32_b32 pair (or an
; s_denorm_mode pair on gfx1010) bracketing the FMA chain.
35define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
36entry:
37  %fdiv = fdiv ninf float %a, %b
38  store float %fdiv, float addrspace(1)* %out
39  ret void
40}
41
42; FUNC-LABEL: {{^}}fdiv_f32_denormals:
43; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
44; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
45
46; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
47; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
48
49; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
50; PREGFX10-NOT: s_setreg
51; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
52; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
53; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
54; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
55; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
56; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
57; PREGFX10-NOT: s_setreg
58
59; GFX10-NOT: s_denorm_mode
60; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
61; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
62; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
63; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
64; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
65; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
66; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
67; GFX10-NOT: s_denorm_mode
68
69; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
70; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
; Kernel under test: plain scalar float division (no fast-math flags) stored
; to %out.
; Uses attribute group #2 (unsafe-fp-math off, f32 denormals "ieee").
; The amdgcn check lines above expect the v_div_scale / v_rcp / v_fma /
; v_div_fmas / v_div_fixup sequence and reject any s_setreg or s_denorm_mode
; around the FMA chain.
71define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
72entry:
73  %fdiv = fdiv float %a, %b
74  store float %fdiv, float addrspace(1)* %out
75  ret void
76}
77
78; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
79; GCN: v_cndmask_b32
80; GCN: v_mul_f32
81; GCN: v_rcp_f32
82; GCN: v_mul_f32
83; GCN: v_mul_f32
; Kernel under test: scalar float division annotated with !fpmath !0
; (!0 is the float 2.5 metadata node at the bottom of the file), stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect v_cndmask_b32, v_mul_f32, v_rcp_f32 and
; two further v_mul_f32, in that order.
84define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
85entry:
86  %fdiv = fdiv float %a, %b, !fpmath !0
87  store float %fdiv, float addrspace(1)* %out
88  ret void
89}
90
91; Use correct fdiv
92; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
93; GCN: v_fma_f32
94; GCN: v_div_fmas_f32
95; GCN: v_div_fixup_f32
; Kernel under test: same !fpmath !0 annotated scalar division as
; @fdiv_25ulp_f32, but with attribute group #2 (f32 denormals "ieee").
; The amdgcn check lines above expect v_fma_f32, v_div_fmas_f32 and
; v_div_fixup_f32 here instead of the rcp/mul form.
96define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
97entry:
98  %fdiv = fdiv float %a, %b, !fpmath !0
99  store float %fdiv, float addrspace(1)* %out
100  ret void
101}
102
103; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
104; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
105; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
106; GCN-NOT: [[RESULT]]
107; PREGFX10-NOT: s_setreg
108; GFX10-NOT: s_denorm_mode
109; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: scalar float division carrying the `fast` flag, stored
; to %out.
; Uses attribute group #2 (unsafe-fp-math off, f32 denormals "ieee").
; The amdgcn check lines above expect a v_rcp_f32_e32 followed by a
; v_mul_f32_e32 whose result is stored directly, with no s_setreg or
; s_denorm_mode before the store.
110define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
111entry:
112  %fdiv = fdiv fast float %a, %b
113  store float %fdiv, float addrspace(1)* %out
114  ret void
115}
116
117; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
118; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
119; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
120
121; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
122; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
123; GCN-NOT: [[RESULT]]
124; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: scalar float division carrying the `fast` flag, stored
; to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The check lines above expect a reciprocal followed by a multiply on both the
; r600 and amdgcn runs; on amdgcn the multiply result is stored directly.
125define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
126entry:
127  %fdiv = fdiv fast float %a, %b
128  store float %fdiv, float addrspace(1)* %out
129  ret void
130}
131
132; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
133; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
134; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
135
136; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
137; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
138; GCN-NOT: [[RESULT]]
139; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: scalar float division carrying both the `fast` flag and
; the !fpmath !0 annotation (!0 is float 2.5), stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The check lines above expect the same reciprocal-then-multiply form as
; @fdiv_f32_fast_math.
140define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
141entry:
142  %fdiv = fdiv fast float %a, %b, !fpmath !0
143  store float %fdiv, float addrspace(1)* %out
144  ret void
145}
146
147; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
148; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
149; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
150
151; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
152; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
153; GCN-NOT: [[RESULT]]
154; GCN: buffer_store_dword [[RESULT]]
; Kernel under test: scalar float division carrying the `arcp` and `ninf`
; flags (not the full `fast` set), stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The check lines above expect a reciprocal followed by a multiply on both the
; r600 and amdgcn runs.
155define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
156entry:
157  %fdiv = fdiv arcp ninf float %a, %b
158  store float %fdiv, float addrspace(1)* %out
159  ret void
160}
161
162; FUNC-LABEL: {{^}}fdiv_v2f32:
163; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
164; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
165; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
166; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
167
168; GCN: v_div_scale_f32
169; GCN: v_div_scale_f32
170; GCN: v_div_scale_f32
171; GCN: v_div_scale_f32
; Kernel under test: plain two-element float vector division (no fast-math
; flags), stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect four v_div_scale_f32 instructions; the
; r600 check lines expect two RECIP_IEEE / MUL_IEEE pairs.
172define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
173entry:
174  %fdiv = fdiv <2 x float> %a, %b
175  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
176  ret void
177}
178
179; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
180; GCN: v_rcp_f32
181; GCN: v_rcp_f32
182; GCN-NOT: v_cmp_gt_f32
; Kernel under test: two-element float vector division carrying the `arcp`
; flag and the !fpmath !0 annotation (!0 is float 2.5), stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect two v_rcp_f32 and no v_cmp_gt_f32.
183define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
184entry:
185  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
186  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
187  ret void
188}
189
190; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
191; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
192; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
193; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
194; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,
195
196; GCN: v_rcp_f32
197; GCN: v_rcp_f32
; Kernel under test: two-element float vector division carrying the `fast`
; flag, stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect two v_rcp_f32; the r600 check lines
; expect two RECIP_IEEE / MUL_IEEE pairs.
198define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
199entry:
200  %fdiv = fdiv fast <2 x float> %a, %b
201  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
202  ret void
203}
204
205; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
206; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
207; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
208; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
209; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,
210
211; GCN: v_rcp_f32
212; GCN: v_rcp_f32
; Kernel under test: two-element float vector division carrying the `arcp`
; and `ninf` flags, stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect two v_rcp_f32; the r600 check lines
; expect two RECIP_IEEE / MUL_IEEE pairs.
213define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
214entry:
215  %fdiv = fdiv arcp ninf <2 x float> %a, %b
216  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
217  ret void
218}
219
220; FUNC-LABEL: {{^}}fdiv_v4f32:
221; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
222; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
223; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
224; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
225; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
226; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
227; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
228; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
229
230; GCN: v_div_fixup_f32
231; GCN: v_div_fixup_f32
232; GCN: v_div_fixup_f32
233; GCN: v_div_fixup_f32
; Kernel under test: plain four-element float vector division (no fast-math
; flags). Unlike the scalar and v2 tests, the operands are loaded from memory:
; the numerator is the vector at %in and the denominator is the next vector
; (getelementptr index 1). The quotient is stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect four v_div_fixup_f32; the r600 check
; lines expect four RECIP_IEEE / MUL_IEEE pairs.
234define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
235  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
236  %a = load <4 x float>, <4 x float> addrspace(1) * %in
237  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
238  %result = fdiv <4 x float> %a, %b
239  store <4 x float> %result, <4 x float> addrspace(1)* %out
240  ret void
241}
242
243; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
244; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
245; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
246; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
247; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
248; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
249; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
250; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
251; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
252
253; GCN: v_rcp_f32
254; GCN: v_rcp_f32
255; GCN: v_rcp_f32
256; GCN: v_rcp_f32
; Kernel under test: four-element float vector division carrying the `fast`
; flag. The numerator is loaded from %in and the denominator from the next
; vector (getelementptr index 1); the quotient is stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect four v_rcp_f32; the r600 check lines
; expect four RECIP_IEEE / MUL_IEEE pairs.
257define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
258  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
259  %a = load <4 x float>, <4 x float> addrspace(1) * %in
260  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
261  %result = fdiv fast <4 x float> %a, %b
262  store <4 x float> %result, <4 x float> addrspace(1)* %out
263  ret void
264}
265
266; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
267; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
268; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
269; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
270; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
271; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
272; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
273; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
274; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
275
276; GCN: v_rcp_f32
277; GCN: v_rcp_f32
278; GCN: v_rcp_f32
279; GCN: v_rcp_f32
; Kernel under test: four-element float vector division carrying the `arcp`
; and `ninf` flags. The numerator is loaded from %in and the denominator from
; the next vector (getelementptr index 1); the quotient is stored to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect four v_rcp_f32; the r600 check lines
; expect four RECIP_IEEE / MUL_IEEE pairs.
280define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
281  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
282  %a = load <4 x float>, <4 x float> addrspace(1) * %in
283  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
284  %result = fdiv arcp ninf <4 x float> %a, %b
285  store <4 x float> %result, <4 x float> addrspace(1)* %out
286  ret void
287}
288
289; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:
290
291; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
292; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
293; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
294
295; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
296; GFX10: s_denorm_mode 15
297; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
298; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
299; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
300; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
301; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
302; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
303; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
304; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
305; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
306; GFX10: s_denorm_mode 12
307; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
308; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
309
; Kernel under test: divides the constant 1.0 by the scalar float argument
; with no fast-math flags and no !fpmath annotation, storing the result to %out.
; Uses attribute group #0 (unsafe-fp-math off, f32 denormals "preserve-sign").
; The amdgcn check lines above expect the v_div_scale / v_rcp / v_fma /
; v_div_fmas / v_div_fixup sequence, with an s_setreg_imm32_b32 pair (or an
; s_denorm_mode pair on gfx1010) bracketing the FMA chain.
310define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 {
311entry:
312  %fdiv = fdiv float 1.000000e+00, %a
313  store float %fdiv, float addrspace(1)* %out
314  ret void
315}
316
317
318; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:
319
320; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
321; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
322
323; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
324; PREGFX10-NOT: s_setreg
325; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
326; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
327; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
328; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
329; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
330; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
331; PREGFX10-NOT: s_setreg
332
333; GFX10-NOT: s_denorm_mode
334; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
335; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
336; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
337; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
338; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
339; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
340; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
341; GFX10-NOT: s_denorm_mode
342
343; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
344; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
; Kernel under test: divides the constant 1.0 by the scalar float argument
; with no fast-math flags and no !fpmath annotation, storing the result to %out.
; Uses attribute group #2 (unsafe-fp-math off, f32 denormals "ieee").
; The amdgcn check lines above expect the v_div_scale / v_rcp / v_fma /
; v_div_fmas / v_div_fixup sequence and reject any s_setreg or s_denorm_mode
; around the FMA chain.
345define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 {
346entry:
347  %fdiv = fdiv float 1.000000e+00, %a
348  store float %fdiv, float addrspace(1)* %out
349  ret void
350}
351
; Attribute groups shared by the kernels above. All three are nounwind and
; pass "-flat-for-global" as a target feature; they differ only in the
; unsafe-fp-math and f32 denormal settings:
;   #0 - unsafe-fp-math off, f32 denormals "preserve-sign,preserve-sign"
;   #1 - unsafe-fp-math on,  f32 denormals "preserve-sign,preserve-sign"
;        NOTE(review): no function visible in this file references #1.
;   #2 - unsafe-fp-math off, f32 denormals "ieee,ieee"
352attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" }
353attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" }
354attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="ieee,ieee" "target-features"="-flat-for-global" }
355
; Operand of the !fpmath annotations on the 25ulp / ulp25 tests above.
356!0 = !{float 2.500000e+00}
357