; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,FUNC %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=R600,FUNC %s

; These tests check that fdiv is expanded correctly and also test that the
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
; instruction groups.

; These tests check fdiv using unsafe_fp_math, coarse fp div, and IEEE754 fp div.

; Prefix usage in this file: FUNC lines are shared by every invocation above,
; GCN lines by all amdgcn invocations, PREGFX10 lines by tahiti/hawaii/fiji,
; GFX10 lines by gfx1010/gfx1100, and R600 lines by the redwood invocation.

; Scalar f32 fdiv (ninf) under attribute #0 (f32 denormal mode preserve-sign).
; The amdgcn checks expect the div_scale / rcp / fma chain to be bracketed by a
; mode-register write before it and another one after it.
; FUNC-LABEL: {{^}}fdiv_f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX10: s_denorm_mode 15
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX10: s_denorm_mode 12
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv ninf float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Scalar f32 fdiv under attribute #2 (f32 denormal mode ieee). The same
; expansion is expected, but with no mode-register writes around it.
; FUNC-LABEL: {{^}}fdiv_f32_denormals:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; PREGFX10-NOT: s_setreg
; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10-NOT: s_setreg

; GFX10-NOT: s_denorm_mode
; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GFX10-NOT: s_denorm_mode

; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; fdiv carrying !fpmath !0 (2.5) under attribute #0. A cndmask / mul / rcp /
; mul / mul sequence is expected on amdgcn.
; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
; GCN: v_cndmask_b32
; GCN: v_mul_f32
; GCN: v_rcp_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Use correct fdiv
; Same !fpmath !0 division under attribute #2 (ieee f32 denormals). The checks
; expect the fma / div_fmas / div_fixup expansion here.
; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
; GCN: v_fma_f32
; GCN: v_div_fmas_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; fdiv fast under attribute #2. A single rcp followed by a mul is expected,
; with no mode-register writes before the store of the result.
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; PREGFX10-NOT: s_setreg
; GFX10-NOT: s_denorm_mode
; GCN: buffer_store_{{dword|b32}} [[RESULT]]
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; fdiv fast under attribute #0. rcp + mul is expected, and the mul result is
; stored without being touched in between.
; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_{{dword|b32}} [[RESULT]]
define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; fdiv fast combined with !fpmath !0 under attribute #0. The expected output is
; the same rcp + mul pair as for plain fdiv fast.
; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_{{dword|b32}} [[RESULT]]
define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv fast float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; fdiv with the arcp and ninf flags under attribute #0. rcp + mul is expected.
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_{{dword|b32}} [[RESULT]]
define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv arcp ninf float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; <2 x float> fdiv with no fast-math flags. Four v_div_scale_f32 instructions
; are expected on amdgcn.
; FUNC-LABEL: {{^}}fdiv_v2f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; <2 x float> fdiv arcp with !fpmath !0. Two rcp instructions are expected and
; no v_cmp_gt_f32 may follow them.
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN-NOT: v_cmp_gt_f32
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; <2 x float> fdiv fast. One rcp per element is expected.
; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv fast <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; <2 x float> fdiv with the arcp and ninf flags. One rcp per element is expected.
; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[3].X,
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].W,

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp ninf <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; <4 x float> fdiv with no fast-math flags; both operands are loaded from
; consecutive vectors at %in. Four v_div_fixup_f32 instructions are expected.
; FUNC-LABEL: {{^}}fdiv_v4f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; <4 x float> fdiv fast on loaded operands. One rcp per element is expected.
; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv fast <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; <4 x float> fdiv with the arcp and ninf flags on loaded operands. One rcp per
; element is expected.
; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], PS, T[0-9]+\.[XYZW]}},

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv arcp ninf <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; 1.0 / %a with no fast-math flags under attribute #0. The full expansion with
; mode-register writes around the fma chain is expected, as for fdiv_f32.
; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt:

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX10: s_denorm_mode 15
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX10: s_denorm_mode 12
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],

define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #0 {
entry:
  %fdiv = fdiv float 1.000000e+00, %a
  store float %fdiv, float addrspace(1)* %out
  ret void
}


; 1.0 / %a with no fast-math flags under attribute #2 (ieee f32 denormals). The
; full expansion is expected with no mode-register writes.
; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt:

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; PREGFX10-NOT: s_setreg
; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; PREGFX10-NOT: s_setreg

; GFX10-NOT: s_denorm_mode
; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GFX10-NOT: s_denorm_mode

; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #2 {
entry:
  %fdiv = fdiv float 1.000000e+00, %a
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Function attribute sets. #0 and #2 differ only in the f32 denormal mode
; (preserve-sign vs. ieee); #1 is #0 with enable-unsafe-fp-math set to true.
attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" }
attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" }
attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="ieee,ieee" "target-features"="-flat-for-global" }

; Operand of the !fpmath annotations used above.
!0 = !{float 2.500000e+00}