//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

/// Memory implementation
///
///{

namespace {

/// Fallback implementations are intentionally missing so that a missing device
/// implementation triggers a link time error. Implementations for new devices,
/// including the host, should go into a dedicated begin/end declare variant.
///
///{

extern "C" {
void *malloc(uint64_t Size);
void free(void *Ptr);
}

///}

/// AMDGCN implementations of malloc and free.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}
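
// As described above, a new device (or the host) would provide its own malloc
// and free in a dedicated declare variant block. A minimal, hypothetical
// sketch mirroring the AMDGCN block is shown below; the architecture name and
// the __device_heap_* callees are placeholders, not part of this runtime:
//
//   #pragma omp begin declare variant match(device = {arch(new_device)})
//   extern "C" {
//   // Forward to a (hypothetical) vendor-provided device heap.
//   void *malloc(uint64_t Size) { return __device_heap_alloc(Size); }
//   void free(void *Ptr) { __device_heap_free(Ptr); }
//   }
//   #pragma omp end declare variant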

/// Add worst-case padding so that future allocations are properly aligned.
constexpr const uint32_t Alignment = 8;

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of memory. The main thread in generic mode
/// is special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and
  /// pointed to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all threads in the team.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads in the warp do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  return memory::allocGlobal(AlignedBytes,
                             "Slow path shared memory allocation, insufficient "
                             "shared memory stack memory!");
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  return malloc(Bytes);
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
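
// Worked example for the smart stack (illustrative; the concrete numbers are
// assumptions, not configuration values): with state::SharedScratchpadSize ==
// 2048 bytes and 256 processor elements, computeThreadStorageTotal() yields
// align_down(2048 / 256, 8) == 8 bytes per thread. A push(5) is rounded up to
// 8 bytes and served from the thread's slice of Data; a second push in the
// same scope exceeds the 8-byte slice and takes the memory::allocGlobal slow
// path. The matching pop() either rewinds Usage[TId] or forwards to
// memory::freeGlobal, depending on which range the pointer falls into.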

///}

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom value. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize();
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy &PreviousTS) {
    ICVState = PreviousTS.ICVState;
    PreviousThreadState = &PreviousTS;
  }
};

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) {
  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (!ThreadStates[TId]) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace
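
// Illustrative control flow of the ICV lookup helpers above (a sketch, not
// additional runtime code): reads served through lookup32Impl return the
// team-wide value unless the thread already owns a ThreadStateTy, while writes
// served through lookupForModify32Impl go to TeamState as long as LevelVar is
// 0 and otherwise lazily allocate a per-thread ICV copy. For example,
//
//   omp_set_num_threads(4); // routes to lookupForModify32Impl(&NThreadsVar)
//
// outside any parallel region updates TeamState.ICVState.NThreadsVar directly,
// whereas the same call inside a parallel region first triggers the
// "ICV modification outside data environment" allocation for the calling
// thread (if it has no ThreadStateTy yet) and then updates only that copy.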

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar);
  case state::VK_Level:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (!mapping::getThreadIdInBlock())
    TeamState.init(IsSPMD);

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(*ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}
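
// The data environment functions above implement a per-thread LIFO chained
// through ThreadStateTy::PreviousThreadState. A sketch of the intended,
// strictly paired usage (illustrative only):
//
//   state::enterDataEnvironment(); // push a fresh ThreadStateTy for this thread
//   // ... code that may give this thread its own ICV values ...
//   state::exitDataEnvironment();  // pop: free the state, restore the previous
//
// resetStateForThread(TId) performs the pop step and is a no-op for threads
// that never materialized a ThreadStateTy.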

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) { return state::ParallelTeamSize; }

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
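
// Illustrative pairing as the frontend is expected to emit it for a local
// variable that escapes into a parallel region (a sketch; the size and
// variable are made up):
//
//   void *Ptr = __kmpc_alloc_shared(8); // backing storage for the escaping var
//   // ... outlined parallel region uses Ptr ...
//   __kmpc_free_shared(Ptr, 8);         // released with the same size
//
// Because the underlying SharedMemorySmartStack keeps a per-thread stack,
// allocations must be released in reverse order with matching sizes or the
// accounting in Usage[] goes wrong and falls back to the slow global path.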

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target
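
// Usage sketch for the variable sharing interface above (illustrative only,
// not part of the runtime): in generic mode the main thread publishes the
// addresses of nArgs variables before the workers run, the workers fetch the
// same array, and the main thread releases it once the region is done.
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2); // main thread
//   Args[0] = &A;                                       // A, B: shared locals
//   Args[1] = &B;
//   // workers: __kmpc_get_shared_variables(&Args); read via Args[0], Args[1]
//   __kmpc_end_sharing_variables();                     // main thread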