1 //===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // 10 //===----------------------------------------------------------------------===// 11 12 #include "Mapping.h" 13 #include "Interface.h" 14 #include "State.h" 15 #include "Types.h" 16 #include "Utils.h" 17 18 #pragma omp declare target 19 20 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 21 22 using namespace _OMP; 23 24 namespace _OMP { 25 namespace impl { 26 27 /// AMDGCN Implementation 28 /// 29 ///{ 30 #pragma omp begin declare variant match(device = {arch(amdgcn)}) 31 32 static const llvm::omp::GV &getGridValue() { 33 return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>(); 34 } 35 36 uint32_t getGridDim(uint32_t n, uint16_t d) { 37 uint32_t q = n / d; 38 return q + (n > q * d); 39 } 40 41 uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size, 42 uint16_t group_size) { 43 uint32_t r = grid_size - group_id * group_size; 44 return (r < group_size) ? r : group_size; 45 } 46 47 uint32_t getNumHardwareThreadsInBlock() { 48 return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(), 49 __builtin_amdgcn_grid_size_x(), 50 __builtin_amdgcn_workgroup_size_x()); 51 } 52 53 LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } 54 55 LaneMaskTy lanemaskLT() { 56 uint32_t Lane = mapping::getThreadIdInWarp(); 57 int64_t Ballot = mapping::activemask(); 58 uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; 59 return Mask & Ballot; 60 } 61 62 LaneMaskTy lanemaskGT() { 63 uint32_t Lane = mapping::getThreadIdInWarp(); 64 if (Lane == (mapping::getWarpSize() - 1)) 65 return 0; 66 int64_t Ballot = mapping::activemask(); 67 uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); 68 return Mask & Ballot; 69 } 70 71 uint32_t getThreadIdInWarp() { 72 return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); 73 } 74 75 uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } 76 77 uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); } 78 79 uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); } 80 81 uint32_t getNumberOfBlocks() { 82 return getGridDim(__builtin_amdgcn_grid_size_x(), 83 __builtin_amdgcn_workgroup_size_x()); 84 } 85 86 uint32_t getWarpId() { 87 return impl::getThreadIdInBlock() / mapping::getWarpSize(); 88 } 89 90 uint32_t getNumberOfWarpsInBlock() { 91 return mapping::getBlockSize() / mapping::getWarpSize(); 92 } 93 94 #pragma omp end declare variant 95 ///} 96 97 /// NVPTX Implementation 98 /// 99 ///{ 100 #pragma omp begin declare variant match( \ 101 device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) 102 103 uint32_t getNumHardwareThreadsInBlock() { 104 return __nvvm_read_ptx_sreg_ntid_x(); 105 } 106 107 static const llvm::omp::GV &getGridValue() { 108 return llvm::omp::NVPTXGridValues; 109 } 110 111 LaneMaskTy activemask() { 112 unsigned int Mask; 113 asm("activemask.b32 %0;" : "=r"(Mask)); 114 return Mask; 115 } 116 117 LaneMaskTy lanemaskLT() { 118 __kmpc_impl_lanemask_t Res; 119 asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res)); 120 return Res; 121 } 122 123 LaneMaskTy lanemaskGT() { 124 __kmpc_impl_lanemask_t Res; 125 asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res)); 126 return Res; 127 } 128 129 uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } 130 131 uint32_t getThreadIdInWarp() { 132 return impl::getThreadIdInBlock() & (mapping::getWarpSize() - 1); 133 } 134 135 uint32_t getKernelSize() { 136 return __nvvm_read_ptx_sreg_nctaid_x() * 137 mapping::getNumberOfProcessorElements(); 138 } 139 140 uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } 141 142 uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } 143 144 uint32_t getWarpId() { 145 return impl::getThreadIdInBlock() / mapping::getWarpSize(); 146 } 147 148 uint32_t getNumberOfWarpsInBlock() { 149 return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / 150 mapping::getWarpSize(); 151 } 152 153 #pragma omp end declare variant 154 ///} 155 156 uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; } 157 158 } // namespace impl 159 } // namespace _OMP 160 161 /// We have to be deliberate about the distinction of `mapping::` and `impl::` 162 /// below to avoid repeating assumptions or including irrelevant ones. 163 ///{ 164 165 static bool isInLastWarp() { 166 uint32_t MainTId = (mapping::getNumberOfProcessorElements() - 1) & 167 ~(mapping::getWarpSize() - 1); 168 return mapping::getThreadIdInBlock() == MainTId; 169 } 170 171 bool mapping::isMainThreadInGenericMode(bool IsSPMD) { 172 if (IsSPMD || icv::Level) 173 return false; 174 175 // Check if this is the last warp in the block. 176 return isInLastWarp(); 177 } 178 179 bool mapping::isMainThreadInGenericMode() { 180 return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); 181 } 182 183 bool mapping::isInitialThreadInLevel0(bool IsSPMD) { 184 if (IsSPMD) 185 return mapping::getThreadIdInBlock() == 0; 186 return isInLastWarp(); 187 } 188 189 bool mapping::isLeaderInWarp() { 190 __kmpc_impl_lanemask_t Active = mapping::activemask(); 191 __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); 192 return utils::popc(Active & LaneMaskLT) == 0; 193 } 194 195 LaneMaskTy mapping::activemask() { return impl::activemask(); } 196 197 LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } 198 199 LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } 200 201 uint32_t mapping::getThreadIdInWarp() { 202 uint32_t ThreadIdInWarp = impl::getThreadIdInWarp(); 203 ASSERT(ThreadIdInWarp < impl::getWarpSize()); 204 return ThreadIdInWarp; 205 } 206 207 uint32_t mapping::getThreadIdInBlock() { 208 uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(); 209 ASSERT(ThreadIdInBlock < impl::getNumHardwareThreadsInBlock()); 210 return ThreadIdInBlock; 211 } 212 213 uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } 214 215 uint32_t mapping::getBlockSize(bool IsSPMD) { 216 uint32_t BlockSize = mapping::getNumberOfProcessorElements() - 217 (!IsSPMD * impl::getWarpSize()); 218 return BlockSize; 219 } 220 uint32_t mapping::getBlockSize() { 221 return mapping::getBlockSize(mapping::isSPMDMode()); 222 } 223 224 uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } 225 226 uint32_t mapping::getWarpId() { 227 uint32_t WarpID = impl::getWarpId(); 228 ASSERT(WarpID < impl::getNumberOfWarpsInBlock()); 229 return WarpID; 230 } 231 232 uint32_t mapping::getBlockId() { 233 uint32_t BlockId = impl::getBlockId(); 234 ASSERT(BlockId < impl::getNumberOfBlocks()); 235 return BlockId; 236 } 237 238 uint32_t mapping::getNumberOfWarpsInBlock() { 239 uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); 240 ASSERT(impl::getWarpId() < NumberOfWarpsInBlocks); 241 return NumberOfWarpsInBlocks; 242 } 243 244 uint32_t mapping::getNumberOfBlocks() { 245 uint32_t NumberOfBlocks = impl::getNumberOfBlocks(); 246 ASSERT(impl::getBlockId() < NumberOfBlocks); 247 return NumberOfBlocks; 248 } 249 250 uint32_t mapping::getNumberOfProcessorElements() { 251 uint32_t NumberOfProcessorElements = impl::getNumHardwareThreadsInBlock(); 252 ASSERT(impl::getThreadIdInBlock() < NumberOfProcessorElements); 253 return NumberOfProcessorElements; 254 } 255 256 ///} 257 258 /// Execution mode 259 /// 260 ///{ 261 static int SHARED(IsSPMDMode); 262 263 void mapping::init(bool IsSPMD) { 264 if (mapping::isInitialThreadInLevel0(IsSPMD)) 265 IsSPMDMode = IsSPMD; 266 } 267 268 bool mapping::isSPMDMode() { return IsSPMDMode; } 269 270 bool mapping::isGenericMode() { return !isSPMDMode(); } 271 ///} 272 273 extern "C" { 274 __attribute__((noinline)) uint32_t __kmpc_get_hardware_thread_id_in_block() { 275 FunctionTracingRAII(); 276 return mapping::getThreadIdInBlock(); 277 } 278 279 __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() { 280 FunctionTracingRAII(); 281 return impl::getNumHardwareThreadsInBlock(); 282 } 283 284 __attribute__((noinline)) uint32_t __kmpc_get_warp_size() { 285 FunctionTracingRAII(); 286 return impl::getWarpSize(); 287 } 288 } 289 #pragma omp end declare target 290