1; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
2; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
3; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,FUNC %s
5; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,FUNC %s
6; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
7
8; These tests check that fdiv is expanded correctly and also test that the
9; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
10; instruction groups.
11
12; These tests check fdiv using unsafe_fp_math, coarse fp div, and IEEE754 fp div.
13
14; FUNC-LABEL: {{^}}fdiv_f32:
15; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
16; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
17
18; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
19; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
20; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
21
22; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
23; GFX10: s_denorm_mode 15
24; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
25; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
26; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
27; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
28; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
29; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
30; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
31; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
32; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
33; GFX10: s_denorm_mode 12
34; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
35; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
; Scalar f32 divide carrying only the ninf flag, built with attribute group #0
; (f32 denormal mode preserve-sign). The amdgcn checks above expect the
; div_scale / rcp / fma / div_fmas / div_fixup sequence, with the denormal
; mode written before and after the fma chain.
36define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
37body:
38  %quot = fdiv ninf float %a, %b
39  store float %quot, float addrspace(1)* %out
40  ret void
41}
42
43; FUNC-LABEL: {{^}}fdiv_f32_denormals:
44; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
45; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
46
47; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
48; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
49
50; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
51; PREGFX10-NOT: s_setreg
52; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
53; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
54; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
55; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
56; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
57; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
58; PREGFX10-NOT: s_setreg
59
60; GFX10-NOT: s_denorm_mode
61; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
62; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
63; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
64; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
65; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
66; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
67; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
68; GFX10-NOT: s_denorm_mode
69
70; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
71; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
; Scalar f32 divide with no fast-math flags, built with attribute group #2
; (f32 denormal mode ieee). The amdgcn checks above expect the full
; div_scale / rcp / fma / div_fmas / div_fixup sequence and no s_setreg or
; s_denorm_mode around the fma chain.
72define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
73body:
74  %quot = fdiv float %a, %b
75  store float %quot, float addrspace(1)* %out
76  ret void
77}
78
79; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
80; GCN: v_cndmask_b32
81; GCN: v_mul_f32
82; GCN: v_rcp_f32
83; GCN: v_mul_f32
84; GCN: v_mul_f32
; Scalar f32 divide tagged with !fpmath !0 (2.5), built with attribute group #0.
; The amdgcn checks above expect, in order: v_cndmask_b32, v_mul_f32,
; v_rcp_f32, v_mul_f32, v_mul_f32.
85define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
86body:
87  %quot = fdiv float %a, %b, !fpmath !0
88  store float %quot, float addrspace(1)* %out
89  ret void
90}
91
92; With f32 denormals enabled (#2), expect the accurate fdiv expansion even though !fpmath allows 2.5 ulp.
93; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
94; GCN: v_fma_f32
95; GCN: v_div_fmas_f32
96; GCN: v_div_fixup_f32
; Scalar f32 divide tagged with !fpmath !0 (2.5), built with attribute group #2
; (f32 denormal mode ieee). The amdgcn checks above expect v_fma_f32,
; v_div_fmas_f32 and v_div_fixup_f32.
97define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
98body:
99  %quot = fdiv float %a, %b, !fpmath !0
100  store float %quot, float addrspace(1)* %out
101  ret void
102}
103
104; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
105; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
106; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
107; GCN-NOT: [[RESULT]]
108; PREGFX10-NOT: s_setreg
109; GFX10-NOT: s_denorm_mode
110; GCN: buffer_store_{{dword|b32}} [[RESULT]]
; Scalar f32 divide with the fast flag, built with attribute group #2
; (f32 denormal mode ieee). The amdgcn checks above expect v_rcp_f32_e32
; followed by v_mul_f32_e32 whose result is stored, with no s_setreg or
; s_denorm_mode in between.
111define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
112body:
113  %quot = fdiv fast float %a, %b
114  store float %quot, float addrspace(1)* %out
115  ret void
116}
117
118; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
119; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
120; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
121
122; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
123; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
124; GCN-NOT: [[RESULT]]
125; GCN: buffer_store_{{dword|b32}} [[RESULT]]
; Scalar f32 divide with the fast flag, built with attribute group #0.
; The checks above expect a reciprocal followed by a multiply
; (v_rcp_f32_e32 + v_mul_f32_e32 on amdgcn, RECIP_IEEE + MUL_IEEE on r600).
126define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
127body:
128  %quot = fdiv fast float %a, %b
129  store float %quot, float addrspace(1)* %out
130  ret void
131}
132
133; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
134; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
135; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
136
137; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
138; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
139; GCN-NOT: [[RESULT]]
140; GCN: buffer_store_{{dword|b32}} [[RESULT]]
; Scalar f32 divide with both the fast flag and !fpmath !0 (2.5), built with
; attribute group #0. The checks above expect a reciprocal followed by a
; multiply (v_rcp_f32_e32 + v_mul_f32_e32 on amdgcn).
141define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
142body:
143  %quot = fdiv fast float %a, %b, !fpmath !0
144  store float %quot, float addrspace(1)* %out
145  ret void
146}
147
148; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
149; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
150; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
151
152; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
153; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
154; GCN-NOT: [[RESULT]]
155; GCN: buffer_store_{{dword|b32}} [[RESULT]]
; Scalar f32 divide with the arcp and ninf flags, built with attribute group #0.
; The checks above expect a reciprocal followed by a multiply
; (v_rcp_f32_e32 + v_mul_f32_e32 on amdgcn, RECIP_IEEE + MUL_IEEE on r600).
156define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
157body:
158  %quot = fdiv arcp ninf float %a, %b
159  store float %quot, float addrspace(1)* %out
160  ret void
161}
162
163; FUNC-LABEL: {{^}}fdiv_v2f32:
164; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
165; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
166; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
167; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
168
169; GCN: v_div_scale_f32
170; GCN: v_div_scale_f32
171; GCN: v_div_scale_f32
172; GCN: v_div_scale_f32
; <2 x float> divide with no fast-math flags, built with attribute group #0.
; The amdgcn checks above expect four v_div_scale_f32 instructions.
173define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
174body:
175  %quot = fdiv <2 x float> %a, %b
176  store <2 x float> %quot, <2 x float> addrspace(1)* %out
177  ret void
178}
179
180; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
181; GCN: v_rcp_f32
182; GCN: v_rcp_f32
183; GCN-NOT: v_cmp_gt_f32
; <2 x float> divide with the arcp flag and !fpmath !0 (2.5), built with
; attribute group #0. The amdgcn checks above expect two v_rcp_f32 and no
; v_cmp_gt_f32 afterwards.
184define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
185body:
186  %quot = fdiv arcp <2 x float> %a, %b, !fpmath !0
187  store <2 x float> %quot, <2 x float> addrspace(1)* %out
188  ret void
189}
190
191; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
192; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
193; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
194; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
195; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,
196
197; GCN: v_rcp_f32
198; GCN: v_rcp_f32
; <2 x float> divide with the fast flag, built with attribute group #0.
; The amdgcn checks above expect two v_rcp_f32; the r600 checks expect two
; RECIP_IEEE / MUL_IEEE pairs.
199define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
200body:
201  %quot = fdiv fast <2 x float> %a, %b
202  store <2 x float> %quot, <2 x float> addrspace(1)* %out
203  ret void
204}
205
206; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
207; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
208; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
209; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
210; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,
211
212; GCN: v_rcp_f32
213; GCN: v_rcp_f32
; <2 x float> divide with the arcp and ninf flags, built with attribute
; group #0. The amdgcn checks above expect two v_rcp_f32; the r600 checks
; expect two RECIP_IEEE / MUL_IEEE pairs.
214define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
215body:
216  %quot = fdiv arcp ninf <2 x float> %a, %b
217  store <2 x float> %quot, <2 x float> addrspace(1)* %out
218  ret void
219}
220
221; FUNC-LABEL: {{^}}fdiv_v4f32:
222; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
223; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
224; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
225; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
226; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
227; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
228; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
229; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
230
231; GCN: v_div_fixup_f32
232; GCN: v_div_fixup_f32
233; GCN: v_div_fixup_f32
234; GCN: v_div_fixup_f32
; <4 x float> divide with no fast-math flags, built with attribute group #0.
; The numerator is loaded from %in and the denominator from the next
; <4 x float> element after it. The amdgcn checks above expect four
; v_div_fixup_f32 instructions.
235define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
236  %den_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
237  %num = load <4 x float>, <4 x float> addrspace(1)* %in
238  %den = load <4 x float>, <4 x float> addrspace(1)* %den_ptr
239  %quot = fdiv <4 x float> %num, %den
240  store <4 x float> %quot, <4 x float> addrspace(1)* %out
241  ret void
242}
243
244; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
245; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
246; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
247; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
248; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
249; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
250; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
251; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
252; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
253
254; GCN: v_rcp_f32
255; GCN: v_rcp_f32
256; GCN: v_rcp_f32
257; GCN: v_rcp_f32
; <4 x float> divide with the fast flag, built with attribute group #0.
; The numerator is loaded from %in and the denominator from the next
; <4 x float> element after it. The amdgcn checks above expect four
; v_rcp_f32 instructions.
258define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
259  %den_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
260  %num = load <4 x float>, <4 x float> addrspace(1)* %in
261  %den = load <4 x float>, <4 x float> addrspace(1)* %den_ptr
262  %quot = fdiv fast <4 x float> %num, %den
263  store <4 x float> %quot, <4 x float> addrspace(1)* %out
264  ret void
265}
266
267; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
268; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
269; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
270; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
271; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
272; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
273; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
274; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
275; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
276
277; GCN: v_rcp_f32
278; GCN: v_rcp_f32
279; GCN: v_rcp_f32
280; GCN: v_rcp_f32
; <4 x float> divide with the arcp and ninf flags, built with attribute
; group #0. The numerator is loaded from %in and the denominator from the
; next <4 x float> element after it. The amdgcn checks above expect four
; v_rcp_f32 instructions.
281define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
282  %den_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
283  %num = load <4 x float>, <4 x float> addrspace(1)* %in
284  %den = load <4 x float>, <4 x float> addrspace(1)* %den_ptr
285  %quot = fdiv arcp ninf <4 x float> %num, %den
286  store <4 x float> %quot, <4 x float> addrspace(1)* %out
287  ret void
288}
289
290; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:
291
292; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
293; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
294; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
295
296; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
297; GFX10: s_denorm_mode 15
298; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
299; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
300; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
301; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
302; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
303; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
304; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
305; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
306; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
307; GFX10: s_denorm_mode 12
308; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
309; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
310
; Divide of the constant 1.0 by %a with no fast-math flags, built with
; attribute group #0 (f32 denormal mode preserve-sign). The amdgcn checks
; above expect the full div_scale / rcp / fma / div_fmas / div_fixup
; sequence, with the denormal mode written before and after the fma chain.
311define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 {
312body:
313  %recip = fdiv float 1.000000e+00, %a
314  store float %recip, float addrspace(1)* %out
315  ret void
316}
317
318
319; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:
320
321; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
322; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
323
324; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
325; PREGFX10-NOT: s_setreg
326; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
327; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
328; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
329; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
330; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
331; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
332; PREGFX10-NOT: s_setreg
333
334; GFX10-NOT: s_denorm_mode
335; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
336; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
337; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
338; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
339; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
340; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
341; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
342; GFX10-NOT: s_denorm_mode
343
344; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
345; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
; Divide of the constant 1.0 by %a with no fast-math flags, built with
; attribute group #2 (f32 denormal mode ieee). The amdgcn checks above
; expect the full div_scale / rcp / fma / div_fmas / div_fixup sequence and
; no s_setreg or s_denorm_mode around the fma chain.
346define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 {
347body:
348  %recip = fdiv float 1.000000e+00, %a
349  store float %recip, float addrspace(1)* %out
350  ret void
351}
352
; Attribute groups shared by the kernels in this file. All three set nounwind
; and disable the flat-for-global target feature; they differ only in the
; unsafe-fp-math setting and the f32 denormal mode.
;
; #0: unsafe-fp-math off, f32 denormal mode "preserve-sign,preserve-sign".
353attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" }
; #1: unsafe-fp-math on, f32 denormal mode "preserve-sign,preserve-sign".
; NOTE(review): no function in the visible source references #1 - confirm
; whether it is still needed.
354attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" }
; #2: unsafe-fp-math off, f32 denormal mode "ieee,ieee".
355attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="ieee,ieee" "target-features"="-flat-for-global" }
356
; Operand of the !fpmath annotations on the 25ulp / ulp25 test cases: 2.5.
357!0 = !{float 2.500000e+00}
358