; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s

; These tests check that fdiv is expanded correctly and also test that the
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
; instruction groups.

; This tests fdiv using unsafe_fp_math, coarse fp div, and IEEE754 fp div.

; FUNC-LABEL: {{^}}fdiv_f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_f32_denormals:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; GCN-NOT: s_setreg
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GCN-NOT: s_setreg
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
; GCN: v_cndmask_b32
; GCN: v_mul_f32
; GCN: v_rcp_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Use correct fdiv
; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
; GCN: v_fma_f32
; GCN: v_div_fmas_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; GCN-NOT: s_setreg
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[B]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GCN-NOT: s_setreg
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv arcp float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v2f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
; GCN: v_cmp_gt_f32
; GCN: v_cmp_gt_f32
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv fast <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v4f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv fast <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv arcp <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }

!0 = !{float 2.500000e+00}