; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10

; Check combining of 64-bit DPP movs with their uses: on targets with 64-bit
; DPP support (gfx90a) the two 32-bit DPP movs fold into a single f64 DPP
; instruction, while other targets (gfx1010) keep the pair of v_mov_b32_dpp.
; DPP controls with no 64-bit equivalent must not be combined.

; GCN-LABEL: {{^}}dpp64_ceil:
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
  %load = load i64, i64 addrspace(1)* %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
  %tmp1 = bitcast i64 %tmp0 to double
  %round = tail call double @llvm.ceil.f64(double %tmp1)
  %tmp2 = bitcast double %round to i64
  store i64 %tmp2, i64 addrspace(1)* %gep
  ret void
}

; GCN-LABEL: {{^}}dpp64_rcp:
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_rcp(i64 addrspace(1)* %arg, i64 %in1) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
  %load = load i64, i64 addrspace(1)* %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
  %tmp1 = bitcast i64 %tmp0 to double
  %rcp = call double @llvm.amdgcn.rcp.f64(double %tmp1)
  %tmp2 = bitcast double %rcp to i64
  store i64 %tmp2, i64 addrspace(1)* %gep
  ret void
}

; quad_perm has no 64-bit DPP form, so the movs stay split on all targets.
; GCN-LABEL: {{^}}dpp64_rcp_unsupported_ctl:
; GCN-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GCN: v_rcp_f64_e32
define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(i64 addrspace(1)* %arg, i64 %in1) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
  %load = load i64, i64 addrspace(1)* %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 15, i32 15, i1 1) #0
  %tmp1 = bitcast i64 %tmp0 to double
  %rcp = fdiv fast double 1.0, %tmp1
  %tmp2 = bitcast double %rcp to i64
  store i64 %tmp2, i64 addrspace(1)* %gep
  ret void
}

; The f64 division expansion uses the DPP value more than once, so the movs
; are not combined even where 64-bit DPP is available.
; GCN-LABEL: {{^}}dpp64_div:
; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]],
; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX10-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GCN: v_div_scale_f64
; GCN: v_rcp_f64_e32
define amdgpu_kernel void @dpp64_div(i64 addrspace(1)* %arg, i64 %in1) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id
  %load = load i64, i64 addrspace(1)* %gep
  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0
  %tmp1 = bitcast i64 %tmp0 to double
  %rcp = fdiv double 15.0, %tmp1
  %tmp2 = bitcast double %rcp to i64
  store i64 %tmp2, i64 addrspace(1)* %gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
declare double @llvm.ceil.f64(double)
declare double @llvm.amdgcn.rcp.f64(double)

attributes #0 = { nounwind readnone convergent }