//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Synchronization.h"
#include "Types.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{

extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}

///}

/// AMDGCN implementations of the malloc/free interface.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of memory. The main thread in generic mode
/// is special and is given the storage of its entire warp.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is, the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(config::DebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
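
// Usage sketch (illustrative only, not verbatim frontend output): every
// allocation made through memory::allocShared / __kmpc_alloc_shared must be
// released by the *same* thread, in reverse allocation order, with the *same*
// size, or the per-thread stack above is corrupted. Assuming a 16-byte scratch
// buffer:
//
//   void *Scratch = __kmpc_alloc_shared(16); // push 16 (aligned) bytes
//   // ... use Scratch in the encountering thread ...
//   __kmpc_free_shared(Scratch, 16);         // pop must mirror the push
//
// Requests that do not fit into the thread's scratchpad slice transparently
// fall back to memory::allocGlobal / memory::freeGlobal.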

///}

bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
  ASSERT(HasThreadState == Other.HasThreadState);
}

state::TeamStateTy SHARED(_OMP::state::TeamState);

__attribute__((loader_uninitialized))
state::ThreadStateTy *_OMP::state::ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(_OMP::state::ThreadStates) allocator(omp_pteam_mem_alloc)

namespace {

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    DebugEntryRAII::init();
  }

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}
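
// Illustrative pairing (a sketch, not verbatim frontend output), assuming the
// icv:: accessors declared in State.h route ICV reads and writes through the
// innermost ThreadStateTy once one exists: scoped ICV changes are bracketed by
// pushing and popping a thread state. Ident and N are placeholders here:
//
//   state::enterDataEnvironment(Ident); // push a new ThreadStateTy
//   omp_set_num_threads(N);             // modification lands in the new state
//   // ... code observing the modified ICVs ...
//   state::exitDataEnvironment();       // pop; prior ICV values apply again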

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  FunctionTracingRAII();
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  FunctionTracingRAII();
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM, we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace) \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  FunctionTracingRAII();
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  FunctionTracingRAII();
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  FunctionTracingRAII();
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
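
// Protocol sketch (illustrative only): the main thread publishes an array of
// pointers for the workers of a generic-mode parallel region, the workers
// retrieve it, and the main thread tears it down once the region is done.
// A and B are placeholder variables:
//
//   // Main thread:
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
//   Args[0] = &A;
//   Args[1] = &B;
//   // ... run the parallel region ...
//   __kmpc_end_sharing_variables();
//
//   // Worker threads:
//   void **Args;
//   __kmpc_get_shared_variables(&Args);
//   // Args[0] and Args[1] now point to A and B.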

#pragma omp end declare target