1 //===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // 10 //===----------------------------------------------------------------------===// 11 12 #include "Mapping.h" 13 #include "Interface.h" 14 #include "State.h" 15 #include "Types.h" 16 #include "Utils.h" 17 18 #pragma omp begin declare target device_type(nohost) 19 20 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 21 22 using namespace _OMP; 23 24 namespace _OMP { 25 namespace impl { 26 27 // Forward declarations defined to be defined for AMDGCN and NVPTX. 28 const llvm::omp::GV &getGridValue(); 29 uint32_t getGridDim(uint32_t n, uint16_t d); 30 uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size, 31 uint16_t group_size); 32 uint32_t getNumHardwareThreadsInBlock(); 33 LaneMaskTy activemask(); 34 LaneMaskTy lanemaskLT(); 35 LaneMaskTy lanemaskGT(); 36 uint32_t getThreadIdInWarp(); 37 uint32_t getThreadIdInBlock(); 38 uint32_t getKernelSize(); 39 uint32_t getBlockId(); 40 uint32_t getNumberOfBlocks(); 41 uint32_t getWarpId(); 42 uint32_t getNumberOfWarpsInBlock(); 43 44 /// AMDGCN Implementation 45 /// 46 ///{ 47 #pragma omp begin declare variant match(device = {arch(amdgcn)}) 48 49 const llvm::omp::GV &getGridValue() { 50 return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>(); 51 } 52 53 uint32_t getGridDim(uint32_t n, uint16_t d) { 54 uint32_t q = n / d; 55 return q + (n > q * d); 56 } 57 58 uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size, 59 uint16_t group_size) { 60 uint32_t r = grid_size - group_id * group_size; 61 return (r < group_size) ? r : group_size; 62 } 63 64 uint32_t getNumHardwareThreadsInBlock() { 65 return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), 66 __builtin_amdgcn_grid_size_x(), 67 __builtin_amdgcn_workgroup_size_x()); 68 } 69 70 LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } 71 72 LaneMaskTy lanemaskLT() { 73 uint32_t Lane = mapping::getThreadIdInWarp(); 74 int64_t Ballot = mapping::activemask(); 75 uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; 76 return Mask & Ballot; 77 } 78 79 LaneMaskTy lanemaskGT() { 80 uint32_t Lane = mapping::getThreadIdInWarp(); 81 if (Lane == (mapping::getWarpSize() - 1)) 82 return 0; 83 int64_t Ballot = mapping::activemask(); 84 uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); 85 return Mask & Ballot; 86 } 87 88 uint32_t getThreadIdInWarp() { 89 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); 90 } 91 92 uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } 93 94 uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); } 95 96 uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); } 97 98 uint32_t getNumberOfBlocks() { 99 return getGridDim(__builtin_amdgcn_grid_size_x(), 100 __builtin_amdgcn_workgroup_size_x()); 101 } 102 103 uint32_t getWarpId() { 104 return impl::getThreadIdInBlock() / mapping::getWarpSize(); 105 } 106 107 uint32_t getNumberOfWarpsInBlock() { 108 return mapping::getBlockSize() / mapping::getWarpSize(); 109 } 110 111 #pragma omp end declare variant 112 ///} 113 114 /// NVPTX Implementation 115 /// 116 ///{ 117 #pragma omp begin declare variant match( \ 118 device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) 119 120 uint32_t getNumHardwareThreadsInBlock() { 121 return __nvvm_read_ptx_sreg_ntid_x(); 122 } 123 124 const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; } 125 126 LaneMaskTy activemask() { 127 unsigned int Mask; 128 asm("activemask.b32 %0;" : "=r"(Mask)); 129 return Mask; 130 } 131 132 LaneMaskTy lanemaskLT() { 133 __kmpc_impl_lanemask_t Res; 134 asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res)); 135 return Res; 136 } 137 138 LaneMaskTy lanemaskGT() { 139 __kmpc_impl_lanemask_t Res; 140 asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res)); 141 return Res; 142 } 143 144 uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } 145 146 uint32_t getThreadIdInWarp() { 147 return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1); 148 } 149 150 uint32_t getKernelSize() { 151 return __nvvm_read_ptx_sreg_nctaid_x() * 152 mapping::getNumberOfProcessorElements(); 153 } 154 155 uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } 156 157 uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } 158 159 uint32_t getWarpId() { 160 return impl::getThreadIdInBlock() / mapping::getWarpSize(); 161 } 162 163 uint32_t getNumberOfWarpsInBlock() { 164 return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / 165 mapping::getWarpSize(); 166 } 167 168 #pragma omp end declare variant 169 ///} 170 171 uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; } 172 173 } // namespace impl 174 } // namespace _OMP 175 176 /// We have to be deliberate about the distinction of `mapping::` and `impl::` 177 /// below to avoid repeating assumptions or including irrelevant ones. 178 ///{ 179 180 static bool isInLastWarp() { 181 uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) & 182 ~(mapping::getWarpSize() - 1); 183 return mapping::getThreadIdInBlock() == MainTId; 184 } 185 186 bool mapping::isMainThreadInGenericMode(bool IsSPMD) { 187 if (IsSPMD || icv::Level) 188 return false; 189 190 // Check if this is the last warp in the block. 191 return isInLastWarp(); 192 } 193 194 bool mapping::isMainThreadInGenericMode() { 195 return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); 196 } 197 198 bool mapping::isInitialThreadInLevel0(bool IsSPMD) { 199 if (IsSPMD) 200 return mapping::getThreadIdInBlock() == 0; 201 return isInLastWarp(); 202 } 203 204 bool mapping::isLeaderInWarp() { 205 __kmpc_impl_lanemask_t Active = mapping::activemask(); 206 __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); 207 return utils::popc(Active & LaneMaskLT) == 0; 208 } 209 210 LaneMaskTy mapping::activemask() { return impl::activemask(); } 211 212 LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } 213 214 LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } 215 216 uint32_t mapping::getThreadIdInWarp() { 217 uint32_t ThreadIdInWarp = impl::getThreadIdInWarp(); 218 ASSERT(ThreadIdInWarp < impl::getWarpSize()); 219 return ThreadIdInWarp; 220 } 221 222 uint32_t mapping::getThreadIdInBlock() { 223 uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(); 224 ASSERT(ThreadIdInBlock < impl::getNumHardwareThreadsInBlock()); 225 return ThreadIdInBlock; 226 } 227 228 uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } 229 230 uint32_t mapping::getBlockSize(bool IsSPMD) { 231 uint32_t BlockSize = mapping::getNumberOfProcessorElements() - 232 (!IsSPMD * impl::getWarpSize()); 233 return BlockSize; 234 } 235 uint32_t mapping::getBlockSize() { 236 return mapping::getBlockSize(mapping::isSPMDMode()); 237 } 238 239 uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } 240 241 uint32_t mapping::getWarpId() { 242 uint32_t WarpID = impl::getWarpId(); 243 ASSERT(WarpID < impl::getNumberOfWarpsInBlock()); 244 return WarpID; 245 } 246 247 uint32_t mapping::getBlockId() { 248 uint32_t BlockId = impl::getBlockId(); 249 ASSERT(BlockId < impl::getNumberOfBlocks()); 250 return BlockId; 251 } 252 253 uint32_t mapping::getNumberOfWarpsInBlock() { 254 uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); 255 ASSERT(impl::getWarpId() < NumberOfWarpsInBlocks); 256 return NumberOfWarpsInBlocks; 257 } 258 259 uint32_t mapping::getNumberOfBlocks() { 260 uint32_t NumberOfBlocks = impl::getNumberOfBlocks(); 261 ASSERT(impl::getBlockId() < NumberOfBlocks); 262 return NumberOfBlocks; 263 } 264 265 uint32_t mapping::getNumberOfProcessorElements() { 266 uint32_t NumberOfProcessorElements = impl::getNumHardwareThreadsInBlock(); 267 ASSERT(impl::getThreadIdInBlock() < NumberOfProcessorElements); 268 return NumberOfProcessorElements; 269 } 270 271 ///} 272 273 /// Execution mode 274 /// 275 ///{ 276 277 // TODO: This is a workaround for initialization coming from kernels outside of 278 // the TU. We will need to solve this more correctly in the future. 279 int __attribute__((weak)) KEEP_ALIVE SHARED(IsSPMDMode); 280 281 void mapping::init(bool IsSPMD) { 282 if (mapping::isInitialThreadInLevel0(IsSPMD)) 283 IsSPMDMode = IsSPMD; 284 } 285 286 bool mapping::isSPMDMode() { return IsSPMDMode; } 287 288 bool mapping::isGenericMode() { return !isSPMDMode(); } 289 ///} 290 291 extern "C" { 292 __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { 293 FunctionTracingRAII(); 294 return mapping::getThreadIdInBlock(); 295 } 296 297 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { 298 FunctionTracingRAII(); 299 return impl::getNumHardwareThreadsInBlock(); 300 } 301 302 __attribute__((noinline)) uint32_t __kmpc_get_warp_size() { 303 FunctionTracingRAII(); 304 return impl::getWarpSize(); 305 } 306 } 307 #pragma omp end declare target 308