//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// State, ICV, and shared/global memory management for the OpenMP device
// runtime, together with the corresponding OpenMP API entry points.
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
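// Sketch of how target code reaches this buffer (illustrative only): it is
// returned by the entry points defined near the end of this file, e.g.
//   int *Scratch = (int *)llvm_omp_target_dynamic_shared_alloc();
// The buffer's size is determined at kernel launch time, outside this file.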

namespace {

/// Fallback implementations are intentionally missing so that a device without
/// its own implementation triggers a link-time error. Implementations for new
/// devices, including the host, should go into a dedicated begin/end declare
/// variant.
///
///{

extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}

///}

/// AMDGCN implementations of the malloc/free interface.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of stack space. The main thread in generic
/// mode is special and is given the space of its entire warp.
///
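/// Usage sketch (illustrative, not a code path in this file): allocations must
/// be released by the same thread in LIFO order, e.g.
///   void *A = SharedMemorySmartStack.push(24); // rounded up to Alignment
///   void *B = SharedMemorySmartStack.push(8);
///   SharedMemorySmartStack.pop(B, 8);          // reverse order of the pushes
///   SharedMemorySmartStack.pop(A, 24);
///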
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
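  /// For example, with a hypothetical 2048-byte scratchpad and 128 processor
  /// elements, each thread is reserved align_down(2048 / 128, 16) = 16 bytes.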
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is, the first
  /// address at which this thread will allocate memory next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all threads in the team.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp because
  // the other threads in its warp do not participate in any computation.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(config::DebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used as long as a thread has
  /// not set a custom value. The latter is supported but unlikely and slower.
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used as long
  /// as a thread has not set a custom value. The latter is supported but
  /// unlikely. The first time a thread sets an ICV we allocate dynamic memory
  /// for a ThreadStateTy holding all ICV values of that thread. This is slower
  /// than alternatives but lets users pay only for what they use.
  ///
  ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy *PreviousTS) {
    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
    PreviousThreadState = PreviousTS;
  }
};

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

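/// Return a reference to the ICV \p Var for modification by the current
/// thread. While no thread state is in use (thread states disabled or level
/// zero) this is the team-level storage; otherwise the first write lazily
/// allocates ThreadStates[TId] (see ThreadStateTy above) and the thread-local
/// copy is returned.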
uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var, IdentTy *Ident) {
  if (OMP_LIKELY(!config::mayUseThreadStates() ||
                 TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(!ThreadStates[TId])) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ASSERT(ThreadStates[TId] != nullptr && "Nullptr returned by malloc!");
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

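/// Return a reference to the ICV \p Var for read-only use, preferring the
/// thread-local copy if one exists and thread states are enabled.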
template <typename IntTy> IntTy &lookupImpl(IntTy ICVStateTy::*Var) {
  IntTy TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(config::mayUseThreadStates() && ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

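/// Return \p Val if \p Level is the active level, \p OutOfBoundsVal if
/// \p Level is outside [0, omp_get_level()], and \p DefaultVal otherwise
/// (including for \p Level == 0).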
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident);
  case state::VK_Level:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookupImpl<uint32_t>(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    DebugEntryRAII::init();
  }

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  FunctionTracingRAII();
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  FunctionTracingRAII();
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If more variables than
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM are shared we fall back to dynamically
/// allocated global memory.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
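
// Usage sketch (illustrative): the main thread publishes a set of pointers
// before the workers run and releases the buffer afterwards, while the workers
// retrieve the same pointer array.
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
//   Args[0] = &X; Args[1] = &Y;            // X and Y are hypothetical locals
//   // ... workers: __kmpc_get_shared_variables(&Args); ...
//   __kmpc_end_sharing_variables();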

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  FunctionTracingRAII();
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  FunctionTracingRAII();
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  FunctionTracingRAII();
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target