//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp declare target

/// Memory implementation
///
///{

namespace {

/// Fallback implementations are intentionally missing so that unsupported
/// devices trigger a link time error. Implementations for new devices,
/// including the host, should go into a dedicated begin/end declare variant.
///
///{

extern "C" {
void *malloc(uint64_t Size);
void free(void *Ptr);
}

///}

/// AMDGCN implementations of the malloc/free interface.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}
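
// For other devices a similar variant can be provided. As an illustrative,
// hypothetical sketch (not part of this file), NVPTX targets could rely on the
// device-side malloc/free shipped with the CUDA toolchain by merely declaring
// them in a matching variant:
//
//   #pragma omp begin declare variant match(
//       device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
//   extern "C" {
//   void *malloc(uint64_t Size);
//   void free(void *Ptr);
//   }
//   #pragma omp end declare variant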

/// Pad all allocations to this worst-case alignment so that future allocations
/// remain properly aligned.
constexpr uint32_t Alignment = 8;

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of its share of the scratchpad. The main
/// thread in generic mode is special and is given more memory than the rest.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
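  /// For illustration only (actual numbers depend on the configuration): with
  /// a hypothetical scratchpad of 2048 bytes and 256 threads per block, each
  /// thread would get align_down(2048 / 256, Alignment) = 8 bytes of stack.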
  uint32_t computeThreadStorageTotal() {
    uint32_t NumThreadsInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumThreadsInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all threads in the block.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads of that warp do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  return memory::allocGlobal(AlignedBytes,
                             "Slow path shared memory allocation, insufficient "
                             "shared memory stack memory!");
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}
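
// Illustrative usage of the shared stack interface (hypothetical caller code):
// each thread must allocate and deallocate symmetrically, in LIFO order.
//
//   void *Buf = memory::allocShared(16, "scratch");  // push 16 bytes
//   // ... use Buf ...
//   memory::freeShared(Buf, 16, "scratch");          // pop the same 16 bytes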

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  return malloc(Bytes);
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

namespace {

struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};

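// Note: bitwise '&' rather than short-circuiting '&&' is used below, most
// likely to keep the comparison branch-free; divergent branches tend to be
// costly on GPU targets.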
bool ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

struct TeamStateTy {
  /// TODO: provide a proper init function.
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(const TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that is used as long as threads have
  /// not set a custom value. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  ParallelRegionFnTy ParallelRegionFnVar;
};

TeamStateTy SHARED(TeamState);

void TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize();
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  ParallelRegionFnVar = nullptr;
}

bool TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void TeamStateTy::assertEqual(const TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
}

struct ThreadStateTy {

  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy &PreviousTS) {
    ICVState = PreviousTS.ICVState;
    PreviousThreadState = &PreviousTS;
  }
};
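
// For illustration (hypothetical user code): the first ICV write from within
// an active data environment, e.g.
//
//   #pragma omp parallel
//   { omp_set_num_threads(4); }
//
// takes the slow path in lookupForModify32Impl below and allocates a
// ThreadStateTy for the encountering thread; reads stay on the fast path until
// then.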

__attribute__((loader_uninitialized))
ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
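
// Note: the omp_pteam_mem_alloc allocator above is expected to place the
// ThreadStates array itself in team-local (device shared) memory, while the
// entries are lazily allocated in global memory only when a thread diverges
// from the team-wide ICV state.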

uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) {
  if (OMP_LIKELY(TeamState.ICVState.LevelVar == 0))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (!ThreadStates[TId]) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}

uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) {
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}
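
// Example of the intended semantics (hypothetical values): with one active
// parallel level, omp_get_ancestor_thread_num(0) returns 0 (the initial
// thread), omp_get_ancestor_thread_num(1) returns the encountering thread's id
// in the block, and any Level outside [0, omp_get_level()] yields the
// out-of-bounds value (-1).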

} // namespace

uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::NThreadsVar);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar);
  case state::VK_Level:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::LevelVar);
    return lookupForModify32Impl(&ICVStateTy::LevelVar);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::ActiveLevelVar);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookup32Impl(&ICVStateTy::RunSchedChunkVar);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  default:
    break;
  }
  __builtin_unreachable();
}

void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (!mapping::getThreadIdInBlock())
    TeamState.init(IsSPMD);

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(*ThreadStates[TId]);
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}
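
// Note: enterDataEnvironment and exitDataEnvironment are expected to be called
// in matching pairs around a data environment (e.g., a parallel region) so
// that the per-thread state stack unwinds correctly; exitDataEnvironment
// simply pops the most recent entry via resetStateForThread.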

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) { return state::ParallelTeamSize; }

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}
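
// These entry points are emitted by the compiler for stack variables that must
// be visible to other threads of the team (so-called globalization); each
// __kmpc_alloc_shared is expected to be paired with a matching
// __kmpc_free_shared for the same number of bytes.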

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
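
// Typical call sequence (illustrative, generic mode): the main thread calls
// __kmpc_begin_sharing_variables and fills the returned buffer, the workers
// read it via __kmpc_get_shared_variables, and the main thread finally calls
// __kmpc_end_sharing_variables to release any fallback allocation.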

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target