1 //===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // 10 //===----------------------------------------------------------------------===// 11 12 #include "Mapping.h" 13 #include "Interface.h" 14 #include "State.h" 15 #include "Types.h" 16 #include "Utils.h" 17 18 #pragma omp declare target 19 20 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 21 22 using namespace _OMP; 23 24 namespace _OMP { 25 namespace impl { 26 27 /// AMDGCN Implementation 28 /// 29 ///{ 30 #pragma omp begin declare variant match(device = {arch(amdgcn)}) 31 32 constexpr const llvm::omp::GV &getGridValue() { 33 return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>(); 34 } 35 36 uint32_t getGridDim(uint32_t n, uint16_t d) { 37 uint32_t q = n / d; 38 return q + (n > q * d); 39 } 40 41 uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size, 42 uint16_t group_size) { 43 uint32_t r = grid_size - group_id * group_size; 44 return (r < group_size) ? r : group_size; 45 } 46 47 uint32_t getNumHardwareThreadsInBlock() { 48 return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), 49 __builtin_amdgcn_grid_size_x(), 50 __builtin_amdgcn_workgroup_size_x()); 51 } 52 53 LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } 54 55 LaneMaskTy lanemaskLT() { 56 uint32_t Lane = mapping::getThreadIdInWarp(); 57 int64_t Ballot = mapping::activemask(); 58 uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; 59 return Mask & Ballot; 60 } 61 62 LaneMaskTy lanemaskGT() { 63 uint32_t Lane = mapping::getThreadIdInWarp(); 64 if (Lane == (mapping::getWarpSize() - 1)) 65 return 0; 66 int64_t Ballot = mapping::activemask(); 67 uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); 68 return Mask & Ballot; 69 } 70 71 uint32_t getThreadIdInWarp() { 72 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); 73 } 74 75 uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } 76 77 uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); } 78 79 uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); } 80 81 uint32_t getNumberOfBlocks() { 82 return getGridDim(__builtin_amdgcn_grid_size_x(), 83 __builtin_amdgcn_workgroup_size_x()); 84 } 85 86 uint32_t getWarpId() { 87 return impl::getThreadIdInBlock() / mapping::getWarpSize(); 88 } 89 90 uint32_t getNumberOfWarpsInBlock() { 91 return mapping::getBlockSize() / mapping::getWarpSize(); 92 } 93 94 #pragma omp end declare variant 95 ///} 96 97 /// NVPTX Implementation 98 /// 99 ///{ 100 #pragma omp begin declare variant match( \ 101 device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) 102 103 uint32_t getNumHardwareThreadsInBlock() { 104 return __nvvm_read_ptx_sreg_ntid_x(); 105 } 106 107 constexpr const llvm::omp::GV &getGridValue() { 108 return llvm::omp::NVPTXGridValues; 109 } 110 111 LaneMaskTy activemask() { 112 unsigned int Mask; 113 asm("activemask.b32 %0;" : "=r"(Mask)); 114 return Mask; 115 } 116 117 LaneMaskTy lanemaskLT() { 118 __kmpc_impl_lanemask_t Res; 119 asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res)); 120 return Res; 121 } 122 123 LaneMaskTy lanemaskGT() { 124 __kmpc_impl_lanemask_t Res; 125 asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res)); 126 return Res; 127 } 128 129 uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } 130 131 uint32_t getThreadIdInWarp() { 132 return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1); 133 } 134 135 uint32_t getKernelSize() { 136 return __nvvm_read_ptx_sreg_nctaid_x() * 137 mapping::getNumberOfProcessorElements(); 138 } 139 140 uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } 141 142 uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } 143 144 uint32_t getWarpId() { 145 return impl::getThreadIdInBlock() / mapping::getWarpSize(); 146 } 147 148 uint32_t getNumberOfWarpsInBlock() { 149 return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / 150 mapping::getWarpSize(); 151 } 152 153 #pragma omp end declare variant 154 ///} 155 156 uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; } 157 158 } // namespace impl 159 } // namespace _OMP 160 161 /// We have to be deliberate about the distinction of `mapping::` and `impl::` 162 /// below to avoid repeating assumptions or including irrelevant ones. 163 ///{ 164 165 static bool isInLastWarp() { 166 uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) & 167 ~(mapping::getWarpSize() - 1); 168 return mapping::getThreadIdInBlock() == MainTId; 169 } 170 171 bool mapping::isMainThreadInGenericMode(bool IsSPMD) { 172 if (IsSPMD || icv::Level) 173 return false; 174 175 // Check if this is the last warp in the block. 176 return isInLastWarp(); 177 } 178 179 bool mapping::isMainThreadInGenericMode() { 180 return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); 181 } 182 183 bool mapping::isInitialThreadInLevel0(bool IsSPMD) { 184 if (IsSPMD) 185 return mapping::getThreadIdInBlock() == 0; 186 return isInLastWarp(); 187 } 188 189 bool mapping::isLeaderInWarp() { 190 __kmpc_impl_lanemask_t Active = mapping::activemask(); 191 __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); 192 return utils::popc(Active & LaneMaskLT) == 0; 193 } 194 195 LaneMaskTy mapping::activemask() { return impl::activemask(); } 196 197 LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } 198 199 LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } 200 201 uint32_t mapping::getThreadIdInWarp() { 202 uint32_t ThreadIdInWarp = impl::getThreadIdInWarp(); 203 ASSERT(ThreadIdInWarp < impl::getWarpSize()); 204 return ThreadIdInWarp; 205 } 206 207 uint32_t mapping::getThreadIdInBlock() { 208 uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(); 209 ASSERT(ThreadIdInBlock < impl::getNumHardwareThreadsInBlock()); 210 return ThreadIdInBlock; 211 } 212 213 uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } 214 215 uint32_t mapping::getBlockSize() { 216 uint32_t BlockSize = mapping::getNumberOfProcessorElements() - 217 (!mapping::isSPMDMode() * impl::getWarpSize()); 218 return BlockSize; 219 } 220 221 uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } 222 223 uint32_t mapping::getWarpId() { 224 uint32_t WarpID = impl::getWarpId(); 225 ASSERT(WarpID < impl::getNumberOfWarpsInBlock()); 226 return WarpID; 227 } 228 229 uint32_t mapping::getBlockId() { 230 uint32_t BlockId = impl::getBlockId(); 231 ASSERT(BlockId < impl::getNumberOfBlocks()); 232 return BlockId; 233 } 234 235 uint32_t mapping::getNumberOfWarpsInBlock() { 236 uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); 237 ASSERT(impl::getWarpId() < NumberOfWarpsInBlocks); 238 return NumberOfWarpsInBlocks; 239 } 240 241 uint32_t mapping::getNumberOfBlocks() { 242 uint32_t NumberOfBlocks = impl::getNumberOfBlocks(); 243 ASSERT(impl::getBlockId() < NumberOfBlocks); 244 return NumberOfBlocks; 245 } 246 247 uint32_t mapping::getNumberOfProcessorElements() { 248 uint32_t NumberOfProcessorElements = impl::getNumHardwareThreadsInBlock(); 249 ASSERT(impl::getThreadIdInBlock() < NumberOfProcessorElements); 250 return NumberOfProcessorElements; 251 } 252 253 ///} 254 255 /// Execution mode 256 /// 257 ///{ 258 static int SHARED(IsSPMDMode); 259 260 void mapping::init(bool IsSPMD) { 261 if (mapping::isInitialThreadInLevel0(IsSPMD)) 262 IsSPMDMode = IsSPMD; 263 } 264 265 bool mapping::isSPMDMode() { return IsSPMDMode; } 266 267 bool mapping::isGenericMode() { return !isSPMDMode(); } 268 ///} 269 270 extern "C" { 271 __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { 272 FunctionTracingRAII(); 273 return mapping::getThreadIdInBlock(); 274 } 275 276 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { 277 FunctionTracingRAII(); 278 return impl::getNumHardwareThreadsInBlock(); 279 } 280 281 __attribute__((noinline)) uint32_t __kmpc_get_warp_size() { 282 FunctionTracingRAII(); 283 return impl::getWarpSize(); 284 } 285 } 286 #pragma omp end declare target 287