//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
constexpr const uint32_t Alignment = 8;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

namespace {

/// Fallback implementations are intentionally missing so that unsupported
/// devices trigger a link time error. Implementations for new devices,
/// including the host, should go into a dedicated begin/end declare variant.
///
///{

extern "C" {
void *malloc(uint64_t Size);
void free(void *Ptr);
}

///}

/// AMDGCN implementations of the malloc/free interface.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack
/// internally. In fact, it is a separate stack *per thread*. That means, each
/// thread must push and pop symmetrically or this breaks, badly. The
/// implementation will (aim to) detect asymmetric push/pop usage and fall back
/// to malloc/free. The same will happen if a thread runs out of stack space.
/// The main thread in generic mode is special and is given more memory than
/// the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all threads in the team.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  return memory::allocGlobal(AlignedBytes,
                             "Slow path shared memory allocation, insufficient "
                             "shared memory stack memory!");
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  return malloc(Bytes);
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize();
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy &PreviousTS) {
    ICVState = PreviousTS.ICVState;
    PreviousThreadState = &PreviousTS;
  }
};

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) {
  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (!ThreadStates[TId]) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint64_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

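// Illustrative sketch (not part of the runtime): the lookup helpers above give
// ICV accesses copy-on-write semantics per thread. Reads normally resolve to
// the shared TeamState, while the first write issued inside a parallel region
// (LevelVar != 0) lazily allocates a private ThreadStateTy for the
// encountering thread, e.g.
//
//   #pragma omp parallel
//   {
//     int Before = omp_get_max_threads(); // read, TeamState fast path
//     omp_set_num_threads(4);             // write, allocates ThreadStates[TId]
//     int After = omp_get_max_threads();  // read, now the thread-local copy
//   }
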
uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar);
  case state::VK_Level:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (!mapping::getThreadIdInBlock())
    TeamState.init(IsSPMD);

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(*ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}
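
// Illustrative usage (not part of the runtime): omp_get_schedule above and
// omp_set_schedule below simply round-trip through the RunSched and
// RunSchedChunk ICVs, e.g.
//
//   omp_set_schedule(omp_sched_dynamic, 8);
//   omp_sched_t Kind;
//   int Chunk;
//   omp_get_schedule(&Kind, &Chunk); // Kind == omp_sched_dynamic, Chunk == 8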

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) { return state::ParallelTeamSize; }

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target
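
// Illustrative sketch (not part of the runtime; variable names are
// hypothetical): the variable-sharing API above is used roughly as follows in
// generic mode. The main thread publishes pointers to the outlined region's
// arguments, the workers fetch them, and the main thread tears the buffer
// down once the region is done:
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2); // main thread
//   Args[0] = &A;
//   Args[1] = &B;
//   // ... workers are released; each worker executes:
//   void **GlobalArgs;
//   __kmpc_get_shared_variables(&GlobalArgs);
//   // ... after the workers are done, the main thread calls:
//   __kmpc_end_sharing_variables();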