//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "State.h"
#include "Configuration.h"
#include "Debug.h"
#include "Interface.h"
#include "Synchronization.h"
#include "Types.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

/// Memory implementation
///
///{

/// Add worst-case padding so that future allocations are properly aligned.
/// FIXME: The stack shouldn't require worst-case padding. Alignment needs to be
/// passed in as an argument and the stack rewritten to support it.
constexpr const uint32_t Alignment = 16;

/// External symbol to access dynamic shared memory.
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
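//
// Illustrative note: device code reaches this buffer through the accessors
// defined later in this file, e.g.
//
//   void *Buf = llvm_omp_target_dynamic_shared_alloc();
//
// The buffer's actual size is determined at kernel launch time, outside this
// translation unit.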

namespace {

/// Fallback implementations are intentionally omitted to trigger a link-time
/// error. Implementations for new devices, including the host, should go into
/// a dedicated begin/end declare variant.
///
///{

extern "C" {
__attribute__((leaf)) void *malloc(uint64_t Size);
__attribute__((leaf)) void free(void *Ptr);
}

///}

/// AMDGCN implementations of the malloc/free interface.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

extern "C" {
void *malloc(uint64_t Size) {
  // TODO: Use some preallocated space for dynamic malloc.
  return nullptr;
}

void free(void *Ptr) {}
}

#pragma omp end declare variant
///}
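// A variant for another offload target would follow the same pattern, e.g.
// (hypothetical sketch, arch name chosen only for illustration):
//
//   #pragma omp begin declare variant match(device = {arch(nvptx64)})
//   extern "C" {
//   void *malloc(uint64_t Size) { /* target-specific allocation */ }
//   void free(void *Ptr) { /* target-specific deallocation */ }
//   }
//   #pragma omp end declare variant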

/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack
/// internally. In fact, it is a separate stack *per thread*. That means each
/// thread must push and pop symmetrically or this breaks, badly. The
/// implementation will (aim to) detect non-lock-step warps and fall back to
/// malloc/free. The same will happen if a thread runs out of memory. The main
/// thread in generic mode is special and is given more memory than the rest.
///
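/// A minimal usage sketch (illustrative only): allocations made through the
/// __kmpc_alloc_shared / __kmpc_free_shared entry points below go through this
/// stack and must be strictly nested per thread, e.g.
///
/// \code
///   void *A = __kmpc_alloc_shared(32);
///   void *B = __kmpc_alloc_shared(16);
///   __kmpc_free_shared(B, 16); // last allocation is freed first
///   __kmpc_free_shared(A, 32);
/// \endcode
///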
struct SharedMemorySmartStackTy {
  /// Initialize the stack. Must be called by all threads.
  void init(bool IsSPMD);

  /// Allocate \p Bytes on the stack for the encountering thread. Each thread
  /// can call this function.
  void *push(uint64_t Bytes);

  /// Deallocate the last allocation made by the encountering thread and pointed
  /// to by \p Ptr from the stack. Each thread can call this function.
  void pop(void *Ptr, uint32_t Bytes);

private:
  /// Compute the size of the storage space reserved for a thread.
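  /// For example (purely illustrative numbers): a 4096-byte scratchpad shared
  /// by 128 processor elements yields align_down(4096 / 128, 16) == 32 bytes
  /// of stack per thread.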
  uint32_t computeThreadStorageTotal() {
    uint32_t NumLanesInBlock = mapping::getNumberOfProcessorElements();
    return utils::align_down((state::SharedScratchpadSize / NumLanesInBlock),
                             Alignment);
  }

  /// Return the top address of the thread's data stack, that is the first
  /// address this thread will allocate memory at next.
  void *getThreadDataTop(uint32_t TId) {
    return &Data[computeThreadStorageTotal() * TId + Usage[TId]];
  }

  /// The actual storage, shared among all warps.
  unsigned char Data[state::SharedScratchpadSize]
      __attribute__((aligned(Alignment)));
  unsigned char Usage[mapping::MaxThreadsPerTeam]
      __attribute__((aligned(Alignment)));
};

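// Note: Usage[] stores one unsigned char per thread, so a thread's stack usage
// must be representable in 8 bits; the assert below presumably guards the
// configurations for which that holds.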
static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
              "Shared scratchpad of this size not supported yet.");

/// The allocation of a single shared memory scratchpad.
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);

void SharedMemorySmartStackTy::init(bool IsSPMD) {
  Usage[mapping::getThreadIdInBlock()] = 0;
}

void *SharedMemorySmartStackTy::push(uint64_t Bytes) {
  // First align the number of requested bytes.
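  // E.g., with Alignment == 16 a 20-byte request is rounded up to 32 bytes.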
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);

  uint32_t StorageTotal = computeThreadStorageTotal();

  // The main thread in generic mode gets the space of its entire warp as the
  // other threads do not participate in any computation at all.
  if (mapping::isMainThreadInGenericMode())
    StorageTotal *= mapping::getWarpSize();

  int TId = mapping::getThreadIdInBlock();
  if (Usage[TId] + AlignedBytes <= StorageTotal) {
    void *Ptr = getThreadDataTop(TId);
    Usage[TId] += AlignedBytes;
    return Ptr;
  }

  if (config::isDebugMode(config::DebugKind::CommonIssues))
    PRINT("Shared memory stack full, fallback to dynamic allocation of global "
          "memory will negatively impact performance.\n");
  void *GlobalMemory = memory::allocGlobal(
      AlignedBytes, "Slow path shared memory allocation, insufficient "
                    "shared memory stack memory!");
  ASSERT(GlobalMemory != nullptr && "nullptr returned by malloc!");

  return GlobalMemory;
}

void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
  uint64_t AlignedBytes = utils::align_up(Bytes, Alignment);
  if (Ptr >= &Data[0] && Ptr < &Data[state::SharedScratchpadSize]) {
    int TId = mapping::getThreadIdInBlock();
    Usage[TId] -= AlignedBytes;
    return;
  }
  memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }

void *memory::allocShared(uint64_t Bytes, const char *Reason) {
  return SharedMemorySmartStack.push(Bytes);
}

void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
  SharedMemorySmartStack.pop(Ptr, Bytes);
}

void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
  void *Ptr = malloc(Bytes);
  if (config::isDebugMode(config::DebugKind::CommonIssues) && Ptr == nullptr)
    PRINT("nullptr returned by malloc!\n");
  return Ptr;
}

void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }

///}

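// Note: the comparisons below use bitwise & rather than the short-circuiting
// && (presumably to avoid extra branches in device code); each operand is
// already a boolean value.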
bool state::ICVStateTy::operator==(const ICVStateTy &Other) const {
  return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) &
         (ActiveLevelVar == Other.ActiveLevelVar) &
         (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) &
         (RunSchedVar == Other.RunSchedVar) &
         (RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const {
  ASSERT(NThreadsVar == Other.NThreadsVar);
  ASSERT(LevelVar == Other.LevelVar);
  ASSERT(ActiveLevelVar == Other.ActiveLevelVar);
  ASSERT(MaxActiveLevelsVar == Other.MaxActiveLevelsVar);
  ASSERT(RunSchedVar == Other.RunSchedVar);
  ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar);
}

void state::TeamStateTy::init(bool IsSPMD) {
  ICVState.NThreadsVar = mapping::getBlockSize(IsSPMD);
  ICVState.LevelVar = 0;
  ICVState.ActiveLevelVar = 0;
  ICVState.MaxActiveLevelsVar = 1;
  ICVState.RunSchedVar = omp_sched_static;
  ICVState.RunSchedChunkVar = 1;
  ParallelTeamSize = 1;
  HasThreadState = false;
  ParallelRegionFnVar = nullptr;
}

bool state::TeamStateTy::operator==(const TeamStateTy &Other) const {
  return (ICVState == Other.ICVState) &
         (HasThreadState == Other.HasThreadState) &
         (ParallelTeamSize == Other.ParallelTeamSize);
}

void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
  ICVState.assertEqual(Other.ICVState);
  ASSERT(ParallelTeamSize == Other.ParallelTeamSize);
  ASSERT(HasThreadState == Other.HasThreadState);
}

state::TeamStateTy SHARED(_OMP::state::TeamState);

__attribute__((loader_uninitialized))
state::ThreadStateTy *_OMP::state::ThreadStates[mapping::MaxThreadsPerTeam];
#pragma omp allocate(_OMP::state::ThreadStates) allocator(omp_pteam_mem_alloc)

namespace {

int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
                             int OutOfBoundsVal = -1) {
  if (Level == 0)
    return DefaultVal;
  int LevelVar = omp_get_level();
  if (OMP_UNLIKELY(Level < 0 || Level > LevelVar))
    return OutOfBoundsVal;
  int ActiveLevel = icv::ActiveLevel;
  if (OMP_UNLIKELY(Level != ActiveLevel))
    return DefaultVal;
  return Val;
}

} // namespace

void state::init(bool IsSPMD) {
  SharedMemorySmartStack.init(IsSPMD);
  if (mapping::isInitialThreadInLevel0(IsSPMD)) {
    TeamState.init(IsSPMD);
    DebugEntryRAII::init();
  }

  ThreadStates[mapping::getThreadIdInBlock()] = nullptr;
}

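// Thread states form a per-thread singly linked list: enterDataEnvironment
// pushes a new ThreadStateTy that remembers the previous one, and
// exitDataEnvironment pops it again. Illustrative pairing (in practice these
// are invoked from other runtime entry points):
//
//   state::enterDataEnvironment(Ident);
//   // ... code that modifies thread-local ICVs ...
//   state::exitDataEnvironment();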
void state::enterDataEnvironment(IdentTy *Ident) {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  ThreadStateTy *NewThreadState =
      static_cast<ThreadStateTy *>(__kmpc_alloc_shared(sizeof(ThreadStateTy)));
  NewThreadState->init(ThreadStates[TId]);
  TeamState.HasThreadState = true;
  ThreadStates[TId] = NewThreadState;
}

void state::exitDataEnvironment() {
  ASSERT(config::mayUseThreadStates() &&
         "Thread state modified while explicitly disabled!");

  unsigned TId = mapping::getThreadIdInBlock();
  resetStateForThread(TId);
}

void state::resetStateForThread(uint32_t TId) {
  if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId]))
    return;

  ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState;
  __kmpc_free_shared(ThreadStates[TId], sizeof(ThreadStateTy));
  ThreadStates[TId] = PreviousThreadState;
}

void state::runAndCheckState(void(Func(void))) {
  TeamStateTy OldTeamState = TeamState;
  OldTeamState.assertEqual(TeamState);

  Func();

  OldTeamState.assertEqual(TeamState);
}

void state::assumeInitialState(bool IsSPMD) {
  TeamStateTy InitialTeamState;
  InitialTeamState.init(IsSPMD);
  InitialTeamState.assertEqual(TeamState);
  ASSERT(!ThreadStates[mapping::getThreadIdInBlock()]);
  ASSERT(mapping::isSPMDMode() == IsSPMD);
}

extern "C" {
void omp_set_dynamic(int V) {}

int omp_get_dynamic(void) { return 0; }

void omp_set_num_threads(int V) { icv::NThreads = V; }

int omp_get_max_threads(void) { return icv::NThreads; }

int omp_get_level(void) {
  int LevelVar = icv::Level;
  ASSERT(LevelVar >= 0);
  return LevelVar;
}

int omp_get_active_level(void) { return !!icv::ActiveLevel; }

int omp_in_parallel(void) { return !!icv::ActiveLevel; }

void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) {
  *ScheduleKind = static_cast<omp_sched_t>((int)icv::RunSched);
  *ChunkSize = state::RunSchedChunk;
}

void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) {
  icv::RunSched = (int)ScheduleKind;
  state::RunSchedChunk = ChunkSize;
}

int omp_get_ancestor_thread_num(int Level) {
  return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0);
}

int omp_get_thread_num(void) {
  return omp_get_ancestor_thread_num(omp_get_level());
}

int omp_get_team_size(int Level) {
  return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1);
}

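// Nested parallel regions on the device are (effectively) serialized, so past
// level 1 the reported number of threads is 1.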
int omp_get_num_threads(void) {
  return omp_get_level() > 1 ? 1 : state::ParallelTeamSize;
}

int omp_get_thread_limit(void) { return mapping::getKernelSize(); }

int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); }

void omp_set_nested(int) {}

int omp_get_nested(void) { return false; }

void omp_set_max_active_levels(int Levels) {
  icv::MaxActiveLevels = Levels > 0 ? 1 : 0;
}

int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; }

omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; }

int omp_get_num_places(void) { return 0; }

int omp_get_place_num_procs(int) { return omp_get_num_procs(); }

void omp_get_place_proc_ids(int, int *) {
  // TODO
}

int omp_get_place_num(void) { return 0; }

int omp_get_partition_num_places(void) { return 0; }

void omp_get_partition_place_nums(int *) {
  // TODO
}

int omp_get_cancellation(void) { return 0; }

void omp_set_default_device(int) {}

int omp_get_default_device(void) { return -1; }

int omp_get_num_devices(void) { return config::getNumDevices(); }

int omp_get_device_num(void) { return config::getDeviceNum(); }

int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); }

int omp_get_team_num() { return mapping::getBlockId(); }

int omp_get_initial_device(void) { return -1; }
}

extern "C" {
__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) {
  FunctionTracingRAII();
  return memory::allocShared(Bytes, "Frontend alloc shared");
}

__attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
  FunctionTracingRAII();
  memory::freeShared(Ptr, Bytes, "Frontend free shared");
}

void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); }

void *llvm_omp_target_dynamic_shared_alloc() {
  return __kmpc_get_dynamic_shared();
}

void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

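// Rough sketch of the intended call sequence (illustrative; these entry points
// are normally emitted by the compiler for generic-mode parallel regions):
//
//   void **Args;
//   __kmpc_begin_sharing_variables(&Args, /*nArgs=*/2); // main thread
//   Args[0] = &X; Args[1] = &Y;                         // publish arguments
//   // workers: __kmpc_get_shared_variables(&Args); read Args[0], Args[1]
//   __kmpc_end_sharing_variables();                     // main thread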
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
  FunctionTracingRAII();
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr = (void **)memory::allocGlobal(
        nArgs * sizeof(void *), "new extended args");
    ASSERT(SharedMemVariableSharingSpacePtr != nullptr &&
           "Nullptr returned by malloc!");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

void __kmpc_end_sharing_variables() {
  FunctionTracingRAII();
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args");
}

void __kmpc_get_shared_variables(void ***GlobalArgs) {
  FunctionTracingRAII();
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
}
#pragma omp end declare target