//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//
//    if (master) {
//      sequential code, decide which par loop to do, or if finished
//      __kmpc_kernel_prepare_parallel() // exec by master only
//    }
//    syncthreads // A
//    __kmpc_kernel_parallel() // exec by all
//    if (this thread is included in the parallel) {
//      switch () for all parallel loops
//      __kmpc_kernel_end_parallel() // exec only by threads in parallel
//    }
//
//    The reason we don't exec end_parallel for the threads not included in
//    the parallel loop is that for each barrier in the parallel region, these
//    non-included threads will cycle through the syncthreads at A. Thus they
//    must preserve their current threadId, which is larger than the number of
//    threads in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getBlockSize();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

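  // Illustrative values, assuming a warp size of 32: a request for 45 threads
  // is rounded down to 32, a request for 64 stays 64, and a request for 20,
  // being smaller than a full warp, yields a team of a single thread.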
  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function, unwrapping up to 32 arguments.
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
  switch (nargs) {
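  // The generated header below is expected to expand to one case per argument
  // count, along the lines of (an illustrative sketch, not the verbatim
  // generated code):
  //
  //   case 2:
  //     ((void (*)(int32_t *, int32_t *, void *, void *))fn)(
  //         &global_tid, &bound_tid, args[0], args[1]);
  //     break;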
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                        int32_t num_threads, int proc_bind, void *fn,
                        void *wrapper_fn, void **args, int64_t nargs) {
  FunctionTracingRAII();

  uint32_t TId = mapping::getThreadIdInBlock();
  // Handle the serialized case first, same for SPMD/non-SPMD.
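  // (This path is taken, e.g., for `#pragma omp parallel if(0)` or for a
  // parallel region nested inside an already active one.)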
  if (OMP_UNLIKELY(!if_expr || icv::Level)) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    state::exitDataEnvironment();
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned();
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread-specific state to be
      // created.
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                            1u, TId == 0, ident);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                       ident);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident);
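      // (ValueRAII, defined in State.h, installs the new value, guarded here
      // by `TId == 0` so that only the main thread writes the shared team
      // state, and restores the old value when the scope ends.)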

      // Synchronize all threads after the main thread (TId == 0) has set up
      // the team state properly.
      synchronize::threadsAligned();

      ASSERT(state::ParallelTeamSize == NumThreads);
      ASSERT(icv::ActiveLevel == 1u);
      ASSERT(icv::Level == 1u);

      if (TId < NumThreads)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned();
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned();

    ASSERT(state::ParallelTeamSize == 1u);
    ASSERT(icv::ActiveLevel == 0u);
    ASSERT(icv::Level == 0u);
    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially the
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point, a ThreadState will be
  // allocated for them.
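  // (For example, a call to omp_set_num_threads() from inside the region would
  // modify the nthreads-var ICV and thus trigger such an allocation for the
  // calling thread.)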

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
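    // Manually unrolled copy into the shared buffer for the common small
    // argument counts: the fallthrough chain copies args[nargs-1] down to
    // args[0] without loop overhead, while more than 16 arguments fall back
    // to the loop in the default case.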
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread-specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true, ident);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);

    // Master signals work to activate workers.
    synchronize::threads();
    // Master waits for workers to signal.
    synchronize::threads();
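    // (Between the two barriers the workers, waiting in the generic-mode state
    // machine, pick up wrapper_fn via __kmpc_kernel_parallel() and execute the
    // region; the ValueRAII destructors then restore the previous team state.)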
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

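// Called by the workers from the generic-mode state machine after the
// activation barrier; returns true iff the calling thread is part of the team
// that should execute *WorkFn.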
__attribute__((noinline)) bool
__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  FunctionTracingRAII();
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::ParallelTeamSize;
  return ThreadIsActive;
}

__attribute__((noinline)) void __kmpc_kernel_end_parallel() {
  FunctionTracingRAII();
  // In case an ICV was modified for this thread before its ThreadState was
  // created, drop the state now so it does not contaminate the next parallel
  // region.
  ASSERT(!mapping::isSPMDMode());
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode());
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
  FunctionTracingRAII();
  return omp_get_level();
}

int32_t __kmpc_global_thread_num(IdentTy *) {
  FunctionTracingRAII();
  return omp_get_thread_num();
}

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {
  FunctionTracingRAII();
  // Intentionally a no-op on the device; the number of teams is fixed at
  // kernel launch.
}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
  FunctionTracingRAII();
  // Intentionally a no-op on the device; proc_bind has no effect here.
}
}

#pragma omp end declare target