//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
constexpr const uint32_t Alignment = 8;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

namespace {

/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{

extern "C" {
void *malloc(uint64_t Size);
void free(void *Ptr);
}

///}

/// AMDGCN implementations of malloc and free.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of memory. The main thread in generic mode
/// is special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is, the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all threads in the block.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");
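// Note: each Usage entry is a single unsigned char, so the per-thread share of
// the scratchpad must stay small enough to be tracked in it; the static_assert
// above guards this assumption.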

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  return memory::allocGlobal(AlignedBytes,
                             "Slow path shared memory allocation, insufficient "
                             "shared memory stack memory!");
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
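
// Example (illustrative sketch, not part of the runtime): the stack is used
// through the memory::allocShared/freeShared wrappers below, and allocations
// must be released in reverse order with matching sizes, e.g.:
//
//   void *P = memory::allocShared(16, "example");  // push 16 bytes
//   void *Q = memory::allocShared(32, "example");  // push 32 bytes
//   memory::freeShared(Q, 32, "example");          // pop in reverse order
//   memory::freeShared(P, 16, "example");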

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  return malloc(Bytes);
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

bool ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize();
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy &PreviousTS) {
    ICVState = PreviousTS.ICVState;
    PreviousThreadState = &PreviousTS;
  }
};
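
// Example (illustrative sketch, not part of the runtime): thread-local ICV
// storage only materializes once a thread modifies an ICV while inside a
// parallel region, e.g.:
//
//   #pragma omp parallel
//   { omp_set_num_threads(4); }
//
// The write goes through lookupForModify32Impl (below), which allocates a
// ThreadStateTy for the encountering thread on demand; threads that never set
// a custom value keep reading from the shared TeamState.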

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)

uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) {
  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (!ThreadStates[TId]) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}
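
// Example (illustrative sketch): for omp_get_ancestor_thread_num (defined
// further below), Level == 0 yields the default value 0 (the initial thread),
// a negative or too-large Level yields -1, and only the currently active level
// yields the encountering thread's id; other valid levels yield the default.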

} // namespace

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar);
  case state::VK_Level:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (!mapping::getThreadIdInBlock())
    TeamState.init(IsSPMD);

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(*ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) { return state::ParallelTeamSize; }

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
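
// Example (illustrative sketch, not part of the runtime): compiler-generated
// code for a generic-mode parallel region pairs these calls roughly as
// follows; the argument count and payloads are hypothetical.
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2);
//   Args[0] = &LocalA;  // main thread publishes its arguments,
//   Args[1] = &LocalB;  // workers retrieve them via
//                       // __kmpc_get_shared_variables(&Args)
//   __kmpc_end_sharing_variables();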
}
#pragma omp end declare target