//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{

extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}

///}

/// AMDGCN implementations of the malloc/free interface.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of memory. The main thread in generic mode
/// is special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all threads.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
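
/// Illustrative sketch (not part of the runtime): how the per-thread
/// partitioning of the scratchpad works out. The numbers below are
/// hypothetical; the real values come from state::SharedScratchpadSize and the
/// launch configuration. With a 2048-byte scratchpad and 128 threads per
/// block, every thread owns align_down(2048 / 128, Alignment) = 16 bytes, and
/// Usage[TId] tracks how many of those bytes are currently handed out.
///
/// \code
///   constexpr uint32_t HypotheticalScratchpadSize = 2048;
///   constexpr uint32_t HypotheticalThreadsPerBlock = 128;
///   constexpr uint32_t PerThreadStorage =
///       (HypotheticalScratchpadSize / HypotheticalThreadsPerBlock) /
///       Alignment * Alignment;
///   static_assert(PerThreadStorage == 16, "one aligned slot per thread");
/// \endcode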

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(config::DebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}
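
/// Illustrative sketch (not part of the runtime): the stack discipline the
/// smart stack expects from its callers. Each thread must free in reverse
/// order of allocation and pass the same size it requested; sizes are rounded
/// up to Alignment internally (24 -> 32, 8 -> 16 in this example). The reason
/// strings and sizes are arbitrary.
///
/// \code
///   void *Outer = memory::allocShared(24, "illustrative outer alloc");
///   void *Inner = memory::allocShared(8, "illustrative inner alloc");
///   // ... use Outer and Inner ...
///   memory::freeShared(Inner, 8, "illustrative inner alloc");
///   memory::freeShared(Outer, 24, "illustrative outer alloc");
/// \endcode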

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy *PreviousTS) {
    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
    PreviousThreadState = PreviousTS;
  }
};

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
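
/// Illustrative sketch (not part of the runtime): the lazy thread-state
/// lifecycle described above. ThreadStates[TId] stays nullptr until a thread
/// actually needs private ICV storage; a nested data environment is bracketed
/// roughly like this (the concrete trigger is compiler-generated code, the
/// null Ident and the value 64 are arbitrary):
///
/// \code
///   state::enterDataEnvironment(/*Ident=*/nullptr); // pushes a ThreadStateTy
///   icv::NThreads = 64; // with Level > 0 this writes the thread-local copy
///   // ... the nested region observes the private ICV values ...
///   state::exitDataEnvironment(); // pops back to the previous state
/// \endcode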

uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
  if (OMP_LIKELY(!config::mayUseThreadStates() ||
                 TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (!ThreadStates[TId]) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}
uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint64_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
  case state::VK_Level:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}
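
/// Illustrative sketch (not part of the runtime): how the read-only and the
/// modifying lookup paths above are exercised. Reads go through lookup32Impl
/// and only consult a thread state if one already exists; writes go through
/// lookupForModify32Impl and may lazily allocate one when Level > 0. The
/// chosen value 64 and the null Ident are arbitrary.
///
/// \code
///   uint32_t Cur = state::lookup32(state::VK_NThreads, /*IsReadonly=*/true,
///                                  /*Ident=*/nullptr); // fast, read-only
///   state::lookup32(state::VK_NThreads, /*IsReadonly=*/false,
///                   /*Ident=*/nullptr) = 64; // may allocate a thread state
/// \endcode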

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    DebugEntryRAII::init();
  }

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}
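
/// Illustrative sketch (not part of the runtime): what returnValIfLevelIsActive
/// yields through the two wrappers above, assuming a single active parallel
/// region (Level == ActiveLevel == 1):
///
/// \code
///   omp_get_ancestor_thread_num(0); // 0, the default for the initial level
///   omp_get_ancestor_thread_num(1); // the encountering thread's id in block
///   omp_get_ancestor_thread_num(2); // -1, beyond omp_get_level()
///   omp_get_team_size(1);           // the current parallel team size
/// \endcode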

int omp_get_num_threads(void) {
  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  FunctionTracingRAII();
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  FunctionTracingRAII();
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  FunctionTracingRAII();
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  FunctionTracingRAII();
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  FunctionTracingRAII();
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target
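
/// Illustrative sketch (not part of the runtime): the argument-sharing
/// protocol implemented by the __kmpc_*_sharing_variables entry points above.
/// In generic mode the main thread publishes pointers for the workers, which
/// read them back before running the outlined parallel region. The local
/// variables SomeLocalA/SomeLocalB are hypothetical captured values.
///
/// \code
///   // Main thread, before waking the workers:
///   void **Args;
///   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
///   Args[0] = &SomeLocalA;
///   Args[1] = &SomeLocalB;
///
///   // Each worker thread, inside the parallel region wrapper:
///   void **WorkerArgs;
///   __kmpc_get_shared_variables(&WorkerArgs);
///   // ... use WorkerArgs[0], WorkerArgs[1] ...
///
///   // Main thread, after the workers are done:
///   __kmpc_end_sharing_variables();
/// \endcode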