1 //===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //===----------------------------------------------------------------------===//
10 
11 #include "State.h"
12 #include "Configuration.h"
13 #include "Debug.h"
14 #include "Interface.h"
15 #include "Mapping.h"
16 #include "Synchronization.h"
17 #include "Types.h"
18 #include "Utils.h"
19 
20 using namespace _OMP;
21 
22 #pragma omp declare target
23 
24 /// Memory implementation
25 ///
26 ///{
27 
28 /// Add worst-case padding so that future allocations are properly aligned.
29 constexpr const uint32_t Alignment = 8;
30 
31 /// External symbol to access dynamic shared memory.
32 extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
33 #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
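
/// Illustrative use (a sketch in comments, not part of the runtime): device
/// code reaches this buffer through the entry points defined at the end of
/// this file, e.g.
///
/// \code
///   // Returns DynamicSharedBuffer via memory::getDynamicBuffer().
///   int *Scratch = static_cast<int *>(llvm_omp_get_dynamic_shared());
/// \endcode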
34 
35 namespace {
36 
/// Fallback implementations are deliberately missing so that targets without
/// an implementation trigger a link time error. Implementations for new
/// devices, including the host, should go into a dedicated begin/end declare
/// variant.
40 ///
41 ///{
42 
43 extern "C" {
44 __attribute__((leaf)) void *malloc(uint64_t Size);
45 __attribute__((leaf)) void free(void *Ptr);
46 }
47 
48 ///}
49 
/// AMDGCN implementations of the malloc/free interface.
51 ///
52 ///{
53 #pragma omp begin declare variant match(device = {arch(amdgcn)})
54 
55 extern "C" {
56 void *malloc(uint64_t Size) {
57   // TODO: Use some preallocated space for dynamic malloc.
58   return nullptr;
59 }
60 
61 void free(void *Ptr) {}
62 }
63 
64 #pragma omp end declare variant
65 ///}
66 
67 /// A "smart" stack in shared memory.
68 ///
69 /// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of stack space. The main thread in generic
/// mode is special and is given more memory than the rest.
75 ///
76 struct SharedMemorySmartStackTy {
77   /// Initialize the stack. Must be called by all threads.
78   void init(bool IsSPMD);
79 
80   /// Allocate \p Bytes on the stack for the encountering thread. Each thread
81   /// can call this function.
82   void *push(uint64_t Bytes);
83 
84   /// Deallocate the last allocation made by the encountering thread and pointed
85   /// to by \p Ptr from the stack. Each thread can call this function.
86   void pop(void *Ptr, uint32_t Bytes);
87 
88 private:
89   /// Compute the size of the storage space reserved for a thread.
90   uint32_t computeThreadStorageTotal() {
91     uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
92     return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
93                              Alignment);
94   }
95 
  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
98   void *getThreadDataTop(uint32_t TId) {
99     return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
100   }
101 
  /// The actual storage, shared among all threads in the team.
103   unsigned char Data[state::SharedScratchpadSize]
104       __attribute__((aligned(Alignment)));
105   unsigned char Usage[mapping::MaxThreadsPerTeam]
106       __attribute__((aligned(Alignment)));
107 };
108 
109 static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
110               "Shared scratchpad of this size not supported yet.");
111 
112 /// The allocation of a single shared memory scratchpad.
113 static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
114 
115 void SharedMemorySmartStackTy::init(bool IsSPMD) {
116   Usage[mapping::getThreadIdInBlock()] = 0;
117 }
118 
119 void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
120   // First align the number of requested bytes.
121   uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
122 
123   uint32_t StorageTotal = computeThreadStorageTotal();
124 
  // The main thread in generic mode gets the space of its entire warp as the
  // other threads in its warp do not participate in any computation at all.
127   if (mapping::isMainThreadInGenericMode())
128     StorageTotal *= mapping::getWarpSize();
129 
130   int TId = mapping::getThreadIdInBlock();
131   if (Usage[TId] + AlignedBytes <= StorageTotal) {
132     void *Ptr = getThreadDataTop(TId);
133     Usage[TId] += AlignedBytes;
134     return Ptr;
135   }
136 
137   void *GlobalMemory = memory::allocGlobal(
138       AlignedBytes, "Slow path shared memory allocation, insufficient "
139                     "shared memory stack memory!");
140   ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");
141 
142   return GlobalMemory;
143 }
144 
145 void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
146   uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
147   if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
148     int TId = mapping::getThreadIdInBlock();
149     Usage[TId] -= AlignedBytes;
150     return;
151   }
152   memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
153 }
154 
155 } // namespace
156 
157 void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
158 
159 void *memory::allocShared(uint64_t Bytes, const char *Reason) {
160   return SharedMemorySmartStack.push(Bytes);
161 }
162 
163 void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
164   SharedMemorySmartStack.pop(Ptr, Bytes);
165 }
166 
167 void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
168   void *Ptr = malloc(Bytes);
169   if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
170     PRINT("nullptr returned by malloc!\n");
171   return Ptr;
172 }
173 
174 void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
175 
176 ///}
177 
178 namespace {
179 
180 struct ICVStateTy {
181   uint32_t NThreadsVar;
182   uint32_t LevelVar;
183   uint32_t ActiveLevelVar;
184   uint32_t MaxActiveLevelsVar;
185   uint32_t RunSchedVar;
186   uint32_t RunSchedChunkVar;
187 
188   bool operator==(const ICVStateTy &Other) const;
189 
190   void assertEqual(const ICVStateTy &Other) const;
191 };
192 
193 bool ICVStateTy::operator==(const ICVStateTy &Other) const {
194   return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
195          (ActiveLevelVar == Other.ActiveLevelVar) &
196          (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
197          (RunSchedVar == Other.RunSchedVar) &
198          (RunSchedChunkVar == Other.RunSchedChunkVar);
199 }
200 
201 void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
202   ASSERT(NThreadsVar == Other.NThreadsVar);
203   ASSERT(LevelVar == Other.LevelVar);
204   ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
205   ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
206   ASSERT(RunSchedVar == Other.RunSchedVar);
207   ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
208 }
209 
210 struct TeamStateTy {
211   /// TODO: provide a proper init function.
212   void init(bool IsSPMD);
213 
214   bool operator==(const TeamStateTy &) const;
215 
216   void assertEqual(TeamStateTy &Other) const;
217 
218   /// ICVs
219   ///
  /// Preallocated storage for ICV values that is used as long as the threads
  /// have not set a custom value. The latter is supported but unlikely and
  /// slow(er).
222   ///
223   ///{
224   ICVStateTy ICVState;
225   ///}
226 
227   uint32_t ParallelTeamSize;
228   ParallelRegionFnTy ParallelRegionFnVar;
229 };
230 
231 TeamStateTy SHARED(TeamState);
232 
233 void TeamStateTy::init(bool IsSPMD) {
234   ICVState.NThreadsVar = mapping::getBlockSize();
235   ICVState.LevelVar = 0;
236   ICVState.ActiveLevelVar = 0;
237   ICVState.MaxActiveLevelsVar = 1;
238   ICVState.RunSchedVar = omp_sched_static;
239   ICVState.RunSchedChunkVar = 1;
240   ParallelTeamSize = 1;
241   ParallelRegionFnVar = nullptr;
242 }
243 
244 bool TeamStateTy::operator==(const TeamStateTy &Other) const {
245   return (ICVState == Other.ICVState) &
246          (ParallelTeamSize == Other.ParallelTeamSize);
247 }
248 
249 void TeamStateTy::assertEqual(TeamStateTy &Other) const {
250   ICVState.assertEqual(Other.ICVState);
251   ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
252 }
253 
254 struct ThreadStateTy {
255 
256   /// ICVs have preallocated storage in the TeamStateTy which is used if a
257   /// thread has not set a custom value. The latter is supported but unlikely.
258   /// When it happens we will allocate dynamic memory to hold the values of all
259   /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
260   /// ICV struct to hold them all. This is slower than alternatives but allows
261   /// users to pay only for what they use.
262   ///
263   ICVStateTy ICVState;
264 
265   ThreadStateTy *PreviousThreadState;
266 
267   void init() {
268     ICVState = TeamState.ICVState;
269     PreviousThreadState = nullptr;
270   }
271 
272   void init(ThreadStateTy *PreviousTS) {
273     ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
274     PreviousThreadState = PreviousTS;
275   }
276 };
277 
278 __attribute__((loader_uninitialized))
279 ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
280 #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
281 
282 uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) {
283   if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
284     return TeamState.ICVState.*Var;
285   uint32_t TId = mapping::getThreadIdInBlock();
286   if (!ThreadStates[TId]) {
287     ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
288         sizeof(ThreadStateTy), "ICV modification outside data environment"));
289     ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
290     ThreadStates[TId]->init();
291   }
292   return ThreadStates[TId]->ICVState.*Var;
293 }
294 
295 uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
296   uint32_t TId = mapping::getThreadIdInBlock();
297   if (OMP_UNLIKELY(ThreadStates[TId]))
298     return ThreadStates[TId]->ICVState.*Var;
299   return TeamState.ICVState.*Var;
300 }
301 uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
303   if (OMP_UNLIKELY(ThreadStates[TId]))
304     return ThreadStates[TId]->ICVState.*Var;
305   return TeamState.ICVState.*Var;
306 }
307 
308 int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
309                              int OutOfBoundsVal = -1) {
310   if (Level == 0)
311     return DefaultVal;
312   int LevelVar = omp_get_level();
313   if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
314     return OutOfBoundsVal;
315   int ActiveLevel = icv::ActiveLevel;
316   if (OMP_UNLIKELY(Level != ActiveLevel))
317     return DefaultVal;
318   return Val;
319 }
320 
321 } // namespace
322 
323 uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) {
324   switch (Kind) {
325   case state::VK_NThreads:
326     if (IsReadonly)
327       return lookup32Impl(&ICVStateTy::NThreadsVar);
328     return lookupForModify32Impl(&ICVStateTy::NThreadsVar);
329   case state::VK_Level:
330     if (IsReadonly)
331       return lookup32Impl(&ICVStateTy::LevelVar);
332     return lookupForModify32Impl(&ICVStateTy::LevelVar);
333   case state::VK_ActiveLevel:
334     if (IsReadonly)
335       return lookup32Impl(&ICVStateTy::ActiveLevelVar);
336     return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar);
337   case state::VK_MaxActiveLevels:
338     if (IsReadonly)
339       return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
340     return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar);
341   case state::VK_RunSched:
342     if (IsReadonly)
343       return lookup32Impl(&ICVStateTy::RunSchedVar);
344     return lookupForModify32Impl(&ICVStateTy::RunSchedVar);
345   case state::VK_RunSchedChunk:
346     if (IsReadonly)
347       return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
348     return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar);
349   case state::VK_ParallelTeamSize:
350     return TeamState.ParallelTeamSize;
351   default:
352     break;
353   }
354   __builtin_unreachable();
355 }
356 
357 void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
358   switch (Kind) {
359   case state::VK_ParallelRegionFn:
360     return TeamState.ParallelRegionFnVar;
361   default:
362     break;
363   }
364   __builtin_unreachable();
365 }
366 
367 void state::init(bool IsSPMD) {
368   SharedMemorySmartStack.init(IsSPMD);
369   if (mapping::isInitialThreadInLevel0(IsSPMD))
370     TeamState.init(IsSPMD);
371 
372   ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
373 }
374 
375 void state::enterDataEnvironment() {
376   unsigned TId = mapping::getThreadIdInBlock();
377   ThreadStateTy *NewThreadState =
378       static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
379   NewThreadState->init(ThreadStates[TId]);
380   ThreadStates[TId] = NewThreadState;
381 }
382 
383 void state::exitDataEnvironment() {
384   unsigned TId = mapping::getThreadIdInBlock();
385   resetStateForThread(TId);
386 }
387 
388 void state::resetStateForThread(uint32_t TId) {
389   if (OMP_LIKELY(!ThreadStates[TId]))
390     return;
391 
392   ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
393   __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
394   ThreadStates[TId] = PreviousThreadState;
395 }
396 
397 void state::runAndCheckState(void(Func(void))) {
398   TeamStateTy OldTeamState = TeamState;
399   OldTeamState.assertEqual(TeamState);
400 
401   Func();
402 
403   OldTeamState.assertEqual(TeamState);
404 }
405 
406 void state::assumeInitialState(bool IsSPMD) {
407   TeamStateTy InitialTeamState;
408   InitialTeamState.init(IsSPMD);
409   InitialTeamState.assertEqual(TeamState);
410   ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
411   ASSERT(mapping::isSPMDMode() == IsSPMD);
412 }
413 
414 extern "C" {
415 void omp_set_dynamic(int V) {}
416 
417 int omp_get_dynamic(void) { return 0; }
418 
419 void omp_set_num_threads(int V) { icv::NThreads = V; }
420 
421 int omp_get_max_threads(void) { return icv::NThreads; }
422 
423 int omp_get_level(void) {
424   int LevelVar = icv::Level;
425   ASSERT(LevelVar >= 0);
426   return LevelVar;
427 }
428 
429 int omp_get_active_level(void) { return !!icv::ActiveLevel; }
430 
431 int omp_in_parallel(void) { return !!icv::ActiveLevel; }
432 
433 void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
434   *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
435   *ChunkSize = state::RunSchedChunk;
436 }
437 
438 void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
439   icv::RunSched = (int)ScheduleKind;
440   state::RunSchedChunk = ChunkSize;
441 }
442 
443 int omp_get_ancestor_thread_num(int Level) {
444   return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
445 }
446 
447 int omp_get_thread_num(void) {
448   return omp_get_ancestor_thread_num(omp_get_level());
449 }
450 
451 int omp_get_team_size(int Level) {
452   return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
453 }
454 
455 int omp_get_num_threads(void) {
456   return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
457 }
458 
459 int omp_get_thread_limit(void) { return mapping::getKernelSize(); }
460 
461 int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }
462 
463 void omp_set_nested(int) {}
464 
465 int omp_get_nested(void) { return false; }
466 
467 void omp_set_max_active_levels(int Levels) {
468   icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
469 }
470 
471 int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }
472 
473 omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }
474 
475 int omp_get_num_places(void) { return 0; }
476 
477 int omp_get_place_num_procs(int) { return omp_get_num_procs(); }
478 
479 void omp_get_place_proc_ids(int, int *) {
480   // TODO
481 }
482 
483 int omp_get_place_num(void) { return 0; }
484 
485 int omp_get_partition_num_places(void) { return 0; }
486 
487 void omp_get_partition_place_nums(int *) {
488   // TODO
489 }
490 
491 int omp_get_cancellation(void) { return 0; }
492 
493 void omp_set_default_device(int) {}
494 
495 int omp_get_default_device(void) { return -1; }
496 
497 int omp_get_num_devices(void) { return config::getNumDevices(); }
498 
499 int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }
500 
501 int omp_get_team_num() { return mapping::getBlockId(); }
502 
503 int omp_get_initial_device(void) { return -1; }
504 }
505 
506 extern "C" {
507 __attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
508   FunctionTracingRAII();
509   return memory::allocShared(Bytes, "Frontend alloc shared");
510 }
511 
512 __attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
513   FunctionTracingRAII();
514   memory::freeShared(Ptr, Bytes, "Frontend free shared");
515 }
516 
517 void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }
518 
519 void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
520 
521 /// Allocate storage in shared memory to communicate arguments from the main
522 /// thread to the workers in generic mode. If we exceed
523 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
524 constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
525 
526 [[clang::loader_uninitialized]] static void
527     *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
528 #pragma omp allocate(SharedMemVariableSharingSpace)                            \
529     allocator(omp_pteam_mem_alloc)
530 [[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
531 #pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
532     allocator(omp_pteam_mem_alloc)
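
// A sketch of the intended hand-off (comments only; the local names below are
// hypothetical): the main thread publishes pointers, the workers read them,
// and the main thread tears the space down again.
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
//   Args[0] = &LocalA;
//   Args[1] = &LocalB;
//   // ... workers: __kmpc_get_shared_variables(&Args); use Args[0], Args[1] ...
//   __kmpc_end_sharing_variables();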
533 
534 void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
535   FunctionTracingRAII();
536   if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
537     SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
538   } else {
539     SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
540         nArgs * sizeof(void *), "new extended args");
541     ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
542            "Nullptr returned by malloc!");
543   }
544   *GlobalArgs = SharedMemVariableSharingSpacePtr;
545 }
546 
547 void __kmpc_end_sharing_variables() {
548   FunctionTracingRAII();
549   if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
550     memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
551 }
552 
553 void __kmpc_get_shared_variables(void ***GlobalArgs) {
554   FunctionTracingRAII();
555   *GlobalArgs = SharedMemVariableSharingSpacePtr;
556 }
557 }
558 #pragma omp end declare target
559