1; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
3; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
4; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
5
6; These tests check that fdiv is expanded correctly and also test that the
7; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
8; instruction groups.
9
; These tests check fdiv under unsafe fp math, coarse (reduced-precision) fp div, and IEEE754 fp div.
11
12; FUNC-LABEL: {{^}}fdiv_f32:
13; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
14; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
15
16; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
17; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
18; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
19
20; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
21; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
22; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
23; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
24; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
25; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
26; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
27; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
28; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
29; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
30define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
31entry:
32  %fdiv = fdiv float %a, %b
33  store float %fdiv, float addrspace(1)* %out
34  ret void
35}
36
37; FUNC-LABEL: {{^}}fdiv_f32_denormals:
38; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
39; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
40
41; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
42; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
43; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
44
45; GCN-NOT: s_setreg
46; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
47; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
48; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
49; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
50; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
51; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
52; GCN-NOT: s_setreg
53; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
54; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
55define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
56entry:
57  %fdiv = fdiv float %a, %b
58  store float %fdiv, float addrspace(1)* %out
59  ret void
60}
61
62; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
63; GCN: v_cndmask_b32
64; GCN: v_mul_f32
65; GCN: v_rcp_f32
66; GCN: v_mul_f32
67; GCN: v_mul_f32
68define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
69entry:
70  %fdiv = fdiv float %a, %b, !fpmath !0
71  store float %fdiv, float addrspace(1)* %out
72  ret void
73}
74
; Enabled denormals force the correct (full) fdiv expansion even with relaxed !fpmath.
76; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
77; GCN: v_fma_f32
78; GCN: v_div_fmas_f32
79; GCN: v_div_fixup_f32
80define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
81entry:
82  %fdiv = fdiv float %a, %b, !fpmath !0
83  store float %fdiv, float addrspace(1)* %out
84  ret void
85}
86
87; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
88; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
89; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
90; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]
91
92; GCN-NOT: s_setreg
93; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
94; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
95; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
96; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
97; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
98; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
99; GCN-NOT: s_setreg
100; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
101; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
102define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
103entry:
104  %fdiv = fdiv fast float %a, %b
105  store float %fdiv, float addrspace(1)* %out
106  ret void
107}
108
109; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
110; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
111; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
112
113; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
114; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
115; GCN-NOT: [[RESULT]]
116; GCN: buffer_store_dword [[RESULT]]
117define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
118entry:
119  %fdiv = fdiv fast float %a, %b
120  store float %fdiv, float addrspace(1)* %out
121  ret void
122}
123
124; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
125; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
126; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
127
128; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
129; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
130; GCN-NOT: [[RESULT]]
131; GCN: buffer_store_dword [[RESULT]]
132define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
133entry:
134  %fdiv = fdiv arcp float %a, %b
135  store float %fdiv, float addrspace(1)* %out
136  ret void
137}
138
139; FUNC-LABEL: {{^}}fdiv_v2f32:
140; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
141; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
142; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
143; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
144
145; GCN: v_div_scale_f32
146; GCN: v_div_scale_f32
147; GCN: v_div_scale_f32
148; GCN: v_div_scale_f32
149define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
150entry:
151  %fdiv = fdiv <2 x float> %a, %b
152  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
153  ret void
154}
155
156; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
157; GCN: v_cmp_gt_f32
158; GCN: v_cmp_gt_f32
159define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
160entry:
161  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
162  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
163  ret void
164}
165
166; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
167; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
168; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
169; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
170; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
171
172; GCN: v_rcp_f32
173; GCN: v_rcp_f32
174define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
175entry:
176  %fdiv = fdiv fast <2 x float> %a, %b
177  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
178  ret void
179}
180
181; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
182; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
183; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
184; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
185; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
186
187; GCN: v_rcp_f32
188; GCN: v_rcp_f32
189define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
190entry:
191  %fdiv = fdiv arcp <2 x float> %a, %b
192  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
193  ret void
194}
195
196; FUNC-LABEL: {{^}}fdiv_v4f32:
197; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
198; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
199; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
200; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
201; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
202; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
203; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
204; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
205
206; GCN: v_div_fixup_f32
207; GCN: v_div_fixup_f32
208; GCN: v_div_fixup_f32
209; GCN: v_div_fixup_f32
210define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
211  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
212  %a = load <4 x float>, <4 x float> addrspace(1) * %in
213  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
214  %result = fdiv <4 x float> %a, %b
215  store <4 x float> %result, <4 x float> addrspace(1)* %out
216  ret void
217}
218
219; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
220; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
221; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
222; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
223; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
224; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
225; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
226; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
227; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
228
229; GCN: v_rcp_f32
230; GCN: v_rcp_f32
231; GCN: v_rcp_f32
232; GCN: v_rcp_f32
233define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
234  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
235  %a = load <4 x float>, <4 x float> addrspace(1) * %in
236  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
237  %result = fdiv fast <4 x float> %a, %b
238  store <4 x float> %result, <4 x float> addrspace(1)* %out
239  ret void
240}
241
242; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
243; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
244; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
245; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
246; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
247; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
248; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
249; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
250; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
251
252; GCN: v_rcp_f32
253; GCN: v_rcp_f32
254; GCN: v_rcp_f32
255; GCN: v_rcp_f32
256define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
257  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
258  %a = load <4 x float>, <4 x float> addrspace(1) * %in
259  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
260  %result = fdiv arcp <4 x float> %a, %b
261  store <4 x float> %result, <4 x float> addrspace(1)* %out
262  ret void
263}
264
265attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,-flat-for-global" }
266attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
267attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }
268
269!0 = !{float 2.500000e+00}
270