//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the OpenMP state and internal control variable (ICV)
// interfaces of the device runtime, as well as its shared and global memory
// management.
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;
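// For example, with Alignment = 16 a 20-byte request is padded up to 32 bytes
// while a 16-byte request stays at 16 (see the utils::align_up calls below).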

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

namespace {

/// Fallback implementations are intentionally missing so that using them on an
/// unsupported device triggers a link-time error. Implementations for new
/// devices, including the host, should go into a dedicated begin/end declare
/// variant.
///
///{

extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}

///}

/// AMDGCN implementations of malloc and free.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of stack space. The main thread in generic
/// mode is special and is given the space of its entire warp.
///
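/// Illustrative usage sketch (not code from this runtime): every thread that
/// pushes must later pop the same allocation, with the same size, in reverse
/// order:
/// \code
///   void *Buf = SharedMemorySmartStack.push(Bytes);
///   // ... use Buf ...
///   SharedMemorySmartStack.pop(Buf, Bytes);
/// \endcode
///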
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
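    // Illustrative (hypothetical numbers): a 16 KiB scratchpad shared by 128
    // threads yields utils::align_down(16384 / 128, 16) = 128 bytes per thread.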
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all threads in the block and partitioned
  /// into one region per thread.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));

  /// The number of bytes each thread currently uses of its region in \p Data,
  /// tracked in a single byte per thread (see the static_assert below).
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads in its warp do not participate in any computation at all.
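  // (Illustrative, hypothetical numbers: with a 32-lane warp and a 128-byte
  // per-thread share, the main thread can use up to 32 * 128 = 4096 bytes.)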
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(config::DebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
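  // A pointer into Data was served from this thread's stack region; anything
  // else was allocated via the global memory fallback in push().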
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
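  // Note: the non-short-circuiting & keeps the comparison branch-free, which
  // presumably avoids divergent control flow on the device.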
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize();
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  ICVStateTy ICVState;

  /// The thread state that was active before this one, forming a chain of
  /// nested data environments that is unwound by state::resetStateForThread.
  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy *PreviousTS) {
    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
    PreviousThreadState = PreviousTS;
  }
};

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
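  // ICV writes at level 0 go straight to the team state; otherwise lazily
  // allocate a per-thread ICV copy the first time a thread modifies an ICV.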
  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (!ThreadStates[TId]) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
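  // Reads prefer the thread's private ICV copy, if one was ever allocated,
  // and fall back to the team-wide defaults otherwise.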
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
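  // Level 0 queries the (sequential) outermost level and always yields the
  // default; levels outside [0, omp_get_level()] are out of bounds; only the
  // active level reports the real value.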
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
  case state::VK_Level:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    DebugEntryRAII::init();
  }

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
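  // Push a fresh thread state that inherits the current ICV values; it is
  // popped again by exitDataEnvironment() / resetStateForThread().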
  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  FunctionTracingRAII();
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  FunctionTracingRAII();
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
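// Illustrative sketch of the sharing protocol (hypothetical caller, not part
// of this file): the main thread publishes pointers, the workers retrieve
// them, and the main thread releases any fallback allocation afterwards.
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, 2); // main thread
//   Args[0] = &A; Args[1] = &B;
//   /* workers: */ __kmpc_get_shared_variables(&Args);
//   __kmpc_end_sharing_variables();           // main thread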

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  FunctionTracingRAII();
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  FunctionTracingRAII();
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  FunctionTracingRAII();
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target