//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Synchronization.h"
#include "Types.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)

namespace {

/// Fallback implementations are deliberately missing so that a link-time error
/// is triggered if no device-specific implementation is available.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{

extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}

///}

/// AMDGCN implementations of the malloc/free interface.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
/// In fact, it is a separate stack *per thread*. That means, each thread must
/// push and pop symmetrically or this breaks, badly. The implementation will
/// (aim to) detect non-lock-step warps and fall back to malloc/free. The same
/// will happen if a thread runs out of stack space. The main thread in generic
/// mode is special and is given the space of its entire warp.
///
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

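// Note: per-thread usage is tracked in a single byte (see Usage above), which
// presumably motivates this limit.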
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(config::DebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

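/// Implementations of the memory allocation interface; shared allocations go
/// through the smart stack above, global allocations through the device
/// malloc/free.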
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

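/// Compare all ICV fields; the bitwise & evaluates every comparison, i.e.,
/// there is no short-circuiting.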
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
  ASSERT(HasThreadState == Other.HasThreadState);
}

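/// The team state and the array of per-thread state pointers both live in
/// shared memory; the per-thread states themselves are allocated lazily.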
state::TeamStateTy SHARED(_OMP::state::TeamState);

__attribute__((loader_uninitialized))
state::ThreadStateTy *_OMP::state::ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(_OMP::state::ThreadStates) allocator(omp_pteam_mem_alloc)

namespace {

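/// Return \p Val if \p Level is the currently active parallel level,
/// \p DefaultVal if it is not (including \p Level == 0), and
/// \p OutOfBoundsVal if \p Level is negative or exceeds omp_get_level().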
int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

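/// Initialize the state for the encountering thread; team-wide state and the
/// debug machinery are only (re)initialized by the initial thread.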
void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    DebugEntryRAII::init();
  }

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

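/// Run \p Func and assert that the team state is unchanged afterwards.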
void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

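/// Device-side implementations of the OpenMP API routines (omp_*).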
extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

int omp_get_num_threads(void) {
  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

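/// Nested parallelism is not supported on the device; the number of active
/// levels is clamped to at most one.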
void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

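/// Runtime entry points for shared memory allocation and variable sharing.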
extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  FunctionTracingRAII();
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  FunctionTracingRAII();
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  FunctionTracingRAII();
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  FunctionTracingRAII();
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  FunctionTracingRAII();
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target