//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

namespace {

/// Fallback implementations are intentionally omitted so that a missing device
/// implementation triggers a link-time error. Implementations for new devices,
/// including the host, should go into a dedicated begin/end declare variant.
///
///{

extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}

///}

/// AMDGCN implementations of malloc and free.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of stack space. The main thread in generic
/// mode is special and is given the space of its entire warp, as the other
/// threads of that warp do not participate in any computation.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
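
// Illustrative use (a sketch, not itself part of this file's interface): the
// stack is reached through memory::allocShared/freeShared and, ultimately, the
// __kmpc_alloc_shared/__kmpc_free_shared entry points defined further below:
//   void *Buf = __kmpc_alloc_shared(Bytes);  // -> SharedMemorySmartStack.push
//   /* ... use Buf ... */
//   __kmpc_free_shared(Buf, Bytes);          // -> SharedMemorySmartStack.pop
// Each thread must release its allocations in reverse order of allocation.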

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads of that warp do not participate in any computation.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

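  // Serve the request from the encountering thread's stack if enough space
  // remains.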
  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

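  // Fall back to a global memory allocation if the thread's stack space is
  // exhausted.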
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
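  // A pointer inside the scratchpad was served from the stack; anything else
  // came from the global memory fallback in push().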
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

namespace {

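/// Storage for the internal control variables (ICVs) tracked by the device
/// runtime, kept once per team and, on demand, once per thread (see
/// ThreadStateTy below).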
struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
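  // Note: bitwise '&' rather than '&&', presumably to keep the comparison
  // branch-free and avoid divergent short-circuit evaluation on the GPU.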
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

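/// Team-wide state, kept in shared memory (see the SHARED(TeamState) instance
/// below) and visible to all threads of the team.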
struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values; it is used if the threads have not
  /// set a custom value. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize();
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy *PreviousTS) {
    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
    PreviousThreadState = PreviousTS;
  }
};

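/// Pointers to the per-thread state, or nullptr while a thread still uses the
/// team-wide defaults. The array itself lives in shared memory (see the
/// allocate pragma below); the pointed-to state is allocated on demand.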
__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

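/// Return a modifiable reference to the 32-bit ICV \p Var for the encountering
/// thread. Outside of any parallel region (LevelVar == 0) this is the team-wide
/// value; otherwise a thread-private ICV state is allocated on first use so
/// that the modification does not leak to other threads.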
uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) {
  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (!ThreadStates[TId]) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

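/// Return the 32-bit ICV \p Var for reading: the thread-private value if the
/// encountering thread has one, the team-wide value otherwise.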
uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}
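
/// Same as lookup32Impl, but for 64-bit ICV storage.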
uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

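/// Helper for the level-dependent OpenMP queries below: return \p Val if
/// \p Level is the active level, \p DefaultVal for level 0 or a serialized
/// (inactive) level, and \p OutOfBoundsVal if \p Level is negative or exceeds
/// the current nesting level.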
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar);
  case state::VK_Level:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    DebugEntryRAII::init();
  }

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  FunctionTracingRAII();
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  FunctionTracingRAII();
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

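// Typical pairing (a sketch of the expected call sequence): the main thread
// calls __kmpc_begin_sharing_variables before a parallel region and
// __kmpc_end_sharing_variables after it, while the workers retrieve the
// argument array via __kmpc_get_shared_variables.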
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  FunctionTracingRAII();
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  FunctionTracingRAII();
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  FunctionTracingRAII();
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target