//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//      if (master) {
//        sequential code, decide which par loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
// The reason we don't execute end_parallel for the threads not included in
// the parallel region is that, for each barrier in that region, these
// non-included threads cycle back through syncthreads A. They must therefore
// preserve their thread ids, which are larger than the number of threads in
// the team.
//
// To make a long story short...
//
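// As an illustration, a construct such as
//
//    #pragma omp parallel num_threads(64) if(C)
//    { body(); }
//
// is lowered by the compiler into a call to the __kmpc_parallel_51 entry
// point defined below, roughly of the form (a sketch, not the exact codegen):
//
//    __kmpc_parallel_51(ident, gtid, /*if_expr=*/C, /*num_threads=*/64,
//                       proc_bind_default, fn, wrapper_fn, args, nargs);
//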
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getBlockSize();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
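
  // E.g., with a warp size of 32 and a block size of 96, num_threads(70)
  // first caps NumThreads at 70 and the mask then rounds it down to
  // 70 & ~31 == 64; a request below the warp size, such as num_threads(20),
  // yields a single thread.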

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                        int32_t num_threads, int proc_bind, void *fn,
                        void *wrapper_fn, void **args, int64_t nargs) {
  FunctionTracingRAII();

  uint32_t TId = mapping::getThreadIdInBlock();

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) nested parallel regions
  // 3) parallel in task or other thread state inducing construct
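  //
  // E.g., an inner "#pragma omp parallel" nested inside an already active
  // parallel region takes this path because icv::Level is non-zero by the
  // time it is reached.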
  if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false);

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of the `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned();
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread specific state to be
      // created.
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                            1u, TId == 0, ident,
                                            /* ForceTeamState */ true);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                       ident, /* ForceTeamState */ true);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                                 /* ForceTeamState */ true);

      // Synchronize all threads after the main thread (TId == 0) set up the
      // team state properly.
      synchronize::threadsAligned();

      state::ParallelTeamSize.assert_eq(NumThreads, ident,
                                        /* ForceTeamState */ true);
      icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
      icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);

      if (TId < NumThreads)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned();
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned();

    state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
    icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
    icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
    return;
  }

  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially an
  // active-level set, but they do not have individual ThreadStates yet. If
  // they ever modify the ICVs beyond this point (e.g., via
  // omp_set_num_threads), a ThreadState will be allocated.

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

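  // Copy the arguments into the shared-variable buffer so the workers can
  // read them. The descending fallthrough switch below unrolls the copy for
  // up to 16 arguments; higher argument counts fall back to the loop in the
  // default case.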
  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /* ForceTeamState */ true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /* ForceTeamState */ true);

    // Master signals work to activate workers.
    synchronize::threads();
    // Master waits for workers to signal.
    synchronize::threads();
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

__attribute__((noinline)) bool
__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  FunctionTracingRAII();
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::ParallelTeamSize;
  return ThreadIsActive;
}
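
// For reference, the generic-mode worker loop that drives the entry point
// above and __kmpc_kernel_end_parallel below pairs with the two
// synchronize::threads() calls in __kmpc_parallel_51 roughly as follows (a
// sketch modeled on the state machine in Kernel.cpp, not the exact code):
//
//   ParallelRegionFnTy WorkFn = nullptr;
//   do {
//     synchronize::threads();     // matches "master signals work"
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)
//       return;                   // termination signal from the master
//     if (IsActive) {
//       ((void (*)(uint32_t, uint32_t))WorkFn)(/*unused*/ 0,
//                                              mapping::getThreadIdInBlock());
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads();     // matches "master waits for workers"
//   } while (true);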

__attribute__((noinline)) void __kmpc_kernel_end_parallel() {
  FunctionTracingRAII();
  // If an ICV was modified in this parallel region, a ThreadState was created
  // for this thread; drop it now so it does not contaminate the next parallel
  // region.
  ASSERT(!mapping::isSPMDMode());
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode());
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
  FunctionTracingRAII();
  return omp_get_level();
}

int32_t __kmpc_global_thread_num(IdentTy *) {
  FunctionTracingRAII();
  return omp_get_thread_num();
}

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {
  FunctionTracingRAII();
  // Nothing to do: on the device the team configuration is fixed by the
  // kernel launch and cannot be changed here.
}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
  FunctionTracingRAII();
  // Nothing to do: processor binding is not supported on the device.
}
}

#pragma omp end declare target