167ab875fSJohannes Doerfert //===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
267ab875fSJohannes Doerfert //
367ab875fSJohannes Doerfert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
467ab875fSJohannes Doerfert // See https://llvm.org/LICENSE.txt for license information.
567ab875fSJohannes Doerfert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
667ab875fSJohannes Doerfert //
767ab875fSJohannes Doerfert //===----------------------------------------------------------------------===//
867ab875fSJohannes Doerfert //
967ab875fSJohannes Doerfert // Parallel implementation in the GPU. Here is the pattern:
1067ab875fSJohannes Doerfert //
// while (not finished) {
//
//   if (master) {
//     sequential code, decide which par loop to do, or if finished
//     __kmpc_kernel_prepare_parallel() // exec by master only
//   }
//   syncthreads // A
//   __kmpc_kernel_parallel() // exec by all
//   if (this thread is included in the parallel) {
//     switch () for all parallel loops
//     __kmpc_kernel_end_parallel() // exec only by threads in parallel
//   }
//
// }
2467ab875fSJohannes Doerfert //
// Threads that are not included in the parallel loop do not execute
// end_parallel. The reason is that, for each barrier in the parallel
// region, these non-included threads cycle through syncthreads A.
// Thus they must preserve their current threadId, which is larger than
// any thread id in the team.
3067ab875fSJohannes Doerfert //
3167ab875fSJohannes Doerfert // To make a long story short...
3267ab875fSJohannes Doerfert //
3367ab875fSJohannes Doerfert //===----------------------------------------------------------------------===//
3467ab875fSJohannes Doerfert
3567ab875fSJohannes Doerfert #include "Debug.h"
3667ab875fSJohannes Doerfert #include "Interface.h"
3767ab875fSJohannes Doerfert #include "Mapping.h"
3867ab875fSJohannes Doerfert #include "State.h"
3967ab875fSJohannes Doerfert #include "Synchronization.h"
4067ab875fSJohannes Doerfert #include "Types.h"
4167ab875fSJohannes Doerfert #include "Utils.h"
4267ab875fSJohannes Doerfert
4367ab875fSJohannes Doerfert using namespace _OMP;
4467ab875fSJohannes Doerfert
45b4f8443dSJoseph Huber #pragma omp begin declare target device_type(nohost)
4667ab875fSJohannes Doerfert
4767ab875fSJohannes Doerfert namespace {
4867ab875fSJohannes Doerfert
determineNumberOfThreads(int32_t NumThreadsClause)4967ab875fSJohannes Doerfert uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
5067ab875fSJohannes Doerfert uint32_t NThreadsICV =
5167ab875fSJohannes Doerfert NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
5267ab875fSJohannes Doerfert uint32_t NumThreads = mapping::getBlockSize();
5367ab875fSJohannes Doerfert
5467ab875fSJohannes Doerfert if (NThreadsICV != 0 && NThreadsICV < NumThreads)
5567ab875fSJohannes Doerfert NumThreads = NThreadsICV;
5667ab875fSJohannes Doerfert
5767ab875fSJohannes Doerfert // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
5867ab875fSJohannes Doerfert if (NumThreads < mapping::getWarpSize())
5967ab875fSJohannes Doerfert NumThreads = 1;
6067ab875fSJohannes Doerfert else
6167ab875fSJohannes Doerfert NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
6267ab875fSJohannes Doerfert
6367ab875fSJohannes Doerfert return NumThreads;
6467ab875fSJohannes Doerfert }
6567ab875fSJohannes Doerfert
6667ab875fSJohannes Doerfert // Invoke an outlined parallel function unwrapping arguments (up to 32).
invokeMicrotask(int32_t global_tid,int32_t bound_tid,void * fn,void ** args,int64_t nargs)6767ab875fSJohannes Doerfert void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
6867ab875fSJohannes Doerfert void **args, int64_t nargs) {
6974f91741SJoseph Huber DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
7067ab875fSJohannes Doerfert switch (nargs) {
7167ab875fSJohannes Doerfert #include "generated_microtask_cases.gen"
7267ab875fSJohannes Doerfert default:
7367ab875fSJohannes Doerfert PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
7467ab875fSJohannes Doerfert __builtin_trap();
7567ab875fSJohannes Doerfert }
7667ab875fSJohannes Doerfert }
7767ab875fSJohannes Doerfert
7867ab875fSJohannes Doerfert } // namespace
7967ab875fSJohannes Doerfert
8067ab875fSJohannes Doerfert extern "C" {
8167ab875fSJohannes Doerfert
__kmpc_parallel_51(IdentTy * ident,int32_t,int32_t if_expr,int32_t num_threads,int proc_bind,void * fn,void * wrapper_fn,void ** args,int64_t nargs)8267ab875fSJohannes Doerfert void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
8367ab875fSJohannes Doerfert int32_t num_threads, int proc_bind, void *fn,
8467ab875fSJohannes Doerfert void *wrapper_fn, void **args, int64_t nargs) {
8574f91741SJoseph Huber FunctionTracingRAII();
8667ab875fSJohannes Doerfert
8767ab875fSJohannes Doerfert uint32_t TId = mapping::getThreadIdInBlock();
88d1501526SJohannes Doerfert
89d1501526SJohannes Doerfert // Handle the serialized case first, same for SPMD/non-SPMD:
90d1501526SJohannes Doerfert // 1) if-clause(0)
91d1501526SJohannes Doerfert // 2) nested parallel regions
92d1501526SJohannes Doerfert // 3) parallel in task or other thread state inducing construct
93d1501526SJohannes Doerfert if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) {
941e121568SJohannes Doerfert state::DateEnvironmentRAII DERAII(ident);
9548877525SJohannes Doerfert ++icv::Level;
9667ab875fSJohannes Doerfert invokeMicrotask(TId, 0, fn, args, nargs);
9767ab875fSJohannes Doerfert return;
9867ab875fSJohannes Doerfert }
9967ab875fSJohannes Doerfert
100d1501526SJohannes Doerfert // From this point forward we know that there is no thread state used.
101d1501526SJohannes Doerfert ASSERT(state::HasThreadState == false);
102d1501526SJohannes Doerfert
10367ab875fSJohannes Doerfert uint32_t NumThreads = determineNumberOfThreads(num_threads);
10467ab875fSJohannes Doerfert if (mapping::isSPMDMode()) {
105b16aadf0SJohannes Doerfert // Avoid the race between the read of the `icv::Level` above and the write
106b16aadf0SJohannes Doerfert // below by synchronizing all threads here.
107b16aadf0SJohannes Doerfert synchronize::threadsAligned();
10867ab875fSJohannes Doerfert {
109b16aadf0SJohannes Doerfert // Note that the order here is important. `icv::Level` has to be updated
110b16aadf0SJohannes Doerfert // last or the other updates will cause a thread specific state to be
111b16aadf0SJohannes Doerfert // created.
11267ab875fSJohannes Doerfert state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
113d1501526SJohannes Doerfert 1u, TId == 0, ident,
114d1501526SJohannes Doerfert /* ForceTeamState */ true);
1151e121568SJohannes Doerfert state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
116d1501526SJohannes Doerfert ident, /* ForceTeamState */ true);
117d1501526SJohannes Doerfert state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
118d1501526SJohannes Doerfert /* ForceTeamState */ true);
119b16aadf0SJohannes Doerfert
120b16aadf0SJohannes Doerfert // Synchronize all threads after the main thread (TId == 0) set up the
121b16aadf0SJohannes Doerfert // team state properly.
122b16aadf0SJohannes Doerfert synchronize::threadsAligned();
123b16aadf0SJohannes Doerfert
124d1501526SJohannes Doerfert state::ParallelTeamSize.assert_eq(NumThreads, ident,
125d1501526SJohannes Doerfert /* ForceTeamState */ true);
126d1501526SJohannes Doerfert icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
127d1501526SJohannes Doerfert icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
12867ab875fSJohannes Doerfert
12967ab875fSJohannes Doerfert if (TId < NumThreads)
13067ab875fSJohannes Doerfert invokeMicrotask(TId, 0, fn, args, nargs);
131b16aadf0SJohannes Doerfert
132b16aadf0SJohannes Doerfert // Synchronize all threads at the end of a parallel region.
133b16aadf0SJohannes Doerfert synchronize::threadsAligned();
13444710940SJohannes Doerfert }
135b16aadf0SJohannes Doerfert
136025f5492SShilei Tian // Synchronize all threads to make sure every thread exits the scope above;
137025f5492SShilei Tian // otherwise the following assertions and the assumption in
138025f5492SShilei Tian // __kmpc_target_deinit may not hold.
139025f5492SShilei Tian synchronize::threadsAligned();
140025f5492SShilei Tian
141d1501526SJohannes Doerfert state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
142d1501526SJohannes Doerfert icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
143d1501526SJohannes Doerfert icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
14467ab875fSJohannes Doerfert return;
14567ab875fSJohannes Doerfert }
14667ab875fSJohannes Doerfert
14767ab875fSJohannes Doerfert // We do *not* create a new data environment because all threads in the team
14867ab875fSJohannes Doerfert // that are active are now running this parallel region. They share the
14967ab875fSJohannes Doerfert // TeamState, which has an increase level-var and potentially active-level
15067ab875fSJohannes Doerfert // set, but they do not have individual ThreadStates yet. If they ever
15167ab875fSJohannes Doerfert // modify the ICVs beyond this point a ThreadStates will be allocated.
15267ab875fSJohannes Doerfert
15367ab875fSJohannes Doerfert bool IsActiveParallelRegion = NumThreads > 1;
15467ab875fSJohannes Doerfert if (!IsActiveParallelRegion) {
1551e121568SJohannes Doerfert state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
15667ab875fSJohannes Doerfert invokeMicrotask(TId, 0, fn, args, nargs);
15767ab875fSJohannes Doerfert return;
15867ab875fSJohannes Doerfert }
15967ab875fSJohannes Doerfert
16067ab875fSJohannes Doerfert void **GlobalArgs = nullptr;
16167ab875fSJohannes Doerfert if (nargs) {
16267ab875fSJohannes Doerfert __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
163a619072cSJoseph Huber switch (nargs) {
164a619072cSJoseph Huber default:
16567ab875fSJohannes Doerfert for (int I = 0; I < nargs; I++)
16667ab875fSJohannes Doerfert GlobalArgs[I] = args[I];
167a619072cSJoseph Huber break;
168a619072cSJoseph Huber case 16:
169a619072cSJoseph Huber GlobalArgs[15] = args[15];
170ce0caf41SJoseph Huber [[fallthrough]];
171a619072cSJoseph Huber case 15:
172a619072cSJoseph Huber GlobalArgs[14] = args[14];
173ce0caf41SJoseph Huber [[fallthrough]];
174a619072cSJoseph Huber case 14:
175a619072cSJoseph Huber GlobalArgs[13] = args[13];
176ce0caf41SJoseph Huber [[fallthrough]];
177a619072cSJoseph Huber case 13:
178a619072cSJoseph Huber GlobalArgs[12] = args[12];
179ce0caf41SJoseph Huber [[fallthrough]];
180a619072cSJoseph Huber case 12:
181a619072cSJoseph Huber GlobalArgs[11] = args[11];
182ce0caf41SJoseph Huber [[fallthrough]];
183a619072cSJoseph Huber case 11:
184a619072cSJoseph Huber GlobalArgs[10] = args[10];
185ce0caf41SJoseph Huber [[fallthrough]];
186a619072cSJoseph Huber case 10:
187a619072cSJoseph Huber GlobalArgs[9] = args[9];
188ce0caf41SJoseph Huber [[fallthrough]];
189a619072cSJoseph Huber case 9:
190a619072cSJoseph Huber GlobalArgs[8] = args[8];
191ce0caf41SJoseph Huber [[fallthrough]];
192a619072cSJoseph Huber case 8:
193a619072cSJoseph Huber GlobalArgs[7] = args[7];
194ce0caf41SJoseph Huber [[fallthrough]];
195a619072cSJoseph Huber case 7:
196a619072cSJoseph Huber GlobalArgs[6] = args[6];
197ce0caf41SJoseph Huber [[fallthrough]];
198a619072cSJoseph Huber case 6:
199a619072cSJoseph Huber GlobalArgs[5] = args[5];
200ce0caf41SJoseph Huber [[fallthrough]];
201a619072cSJoseph Huber case 5:
202a619072cSJoseph Huber GlobalArgs[4] = args[4];
203ce0caf41SJoseph Huber [[fallthrough]];
204a619072cSJoseph Huber case 4:
205a619072cSJoseph Huber GlobalArgs[3] = args[3];
206ce0caf41SJoseph Huber [[fallthrough]];
207a619072cSJoseph Huber case 3:
208a619072cSJoseph Huber GlobalArgs[2] = args[2];
209ce0caf41SJoseph Huber [[fallthrough]];
210a619072cSJoseph Huber case 2:
211a619072cSJoseph Huber GlobalArgs[1] = args[1];
212ce0caf41SJoseph Huber [[fallthrough]];
213a619072cSJoseph Huber case 1:
214a619072cSJoseph Huber GlobalArgs[0] = args[0];
215ce0caf41SJoseph Huber [[fallthrough]];
216a619072cSJoseph Huber case 0:
217a619072cSJoseph Huber break;
218a619072cSJoseph Huber }
21967ab875fSJohannes Doerfert }
22067ab875fSJohannes Doerfert
22167ab875fSJohannes Doerfert {
222b16aadf0SJohannes Doerfert // Note that the order here is important. `icv::Level` has to be updated
223b16aadf0SJohannes Doerfert // last or the other updates will cause a thread specific state to be
224b16aadf0SJohannes Doerfert // created.
22567ab875fSJohannes Doerfert state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
226d1501526SJohannes Doerfert 1u, true, ident,
227d1501526SJohannes Doerfert /* ForceTeamState */ true);
22867ab875fSJohannes Doerfert state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
229d1501526SJohannes Doerfert (void *)nullptr, true, ident,
230d1501526SJohannes Doerfert /* ForceTeamState */ true);
231d1501526SJohannes Doerfert state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
232d1501526SJohannes Doerfert /* ForceTeamState */ true);
233d1501526SJohannes Doerfert state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
234d1501526SJohannes Doerfert /* ForceTeamState */ true);
23567ab875fSJohannes Doerfert
23667ab875fSJohannes Doerfert // Master signals work to activate workers.
23767ab875fSJohannes Doerfert synchronize::threads();
23867ab875fSJohannes Doerfert // Master waits for workers to signal.
23967ab875fSJohannes Doerfert synchronize::threads();
24067ab875fSJohannes Doerfert }
24167ab875fSJohannes Doerfert
24267ab875fSJohannes Doerfert if (nargs)
243e3ee7624SJoseph Huber __kmpc_end_sharing_variables();
24467ab875fSJohannes Doerfert }
24567ab875fSJohannes Doerfert
246*fd8fd9e5SJoseph Huber __attribute__((noinline)) bool
__kmpc_kernel_parallel(ParallelRegionFnTy * WorkFn)247*fd8fd9e5SJoseph Huber __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
24874f91741SJoseph Huber FunctionTracingRAII();
24967ab875fSJohannes Doerfert // Work function and arguments for L1 parallel region.
25067ab875fSJohannes Doerfert *WorkFn = state::ParallelRegionFn;
25167ab875fSJohannes Doerfert
25267ab875fSJohannes Doerfert // If this is the termination signal from the master, quit early.
25367ab875fSJohannes Doerfert if (!*WorkFn)
25467ab875fSJohannes Doerfert return false;
25567ab875fSJohannes Doerfert
25667ab875fSJohannes Doerfert // Set to true for workers participating in the parallel region.
25767ab875fSJohannes Doerfert uint32_t TId = mapping::getThreadIdInBlock();
25867ab875fSJohannes Doerfert bool ThreadIsActive = TId < state::ParallelTeamSize;
25967ab875fSJohannes Doerfert return ThreadIsActive;
26067ab875fSJohannes Doerfert }
26167ab875fSJohannes Doerfert
__kmpc_kernel_end_parallel()262*fd8fd9e5SJoseph Huber __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
26374f91741SJoseph Huber FunctionTracingRAII();
26467ab875fSJohannes Doerfert // In case we have modified an ICV for this thread before a ThreadState was
26567ab875fSJohannes Doerfert // created. We drop it now to not contaminate the next parallel region.
26667ab875fSJohannes Doerfert ASSERT(!mapping::isSPMDMode());
26767ab875fSJohannes Doerfert uint32_t TId = mapping::getThreadIdInBlock();
26867ab875fSJohannes Doerfert state::resetStateForThread(TId);
26967ab875fSJohannes Doerfert ASSERT(!mapping::isSPMDMode());
27067ab875fSJohannes Doerfert }
27167ab875fSJohannes Doerfert
__kmpc_parallel_level(IdentTy *,uint32_t)27274f91741SJoseph Huber uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
27374f91741SJoseph Huber FunctionTracingRAII();
27474f91741SJoseph Huber return omp_get_level();
27574f91741SJoseph Huber }
27667ab875fSJohannes Doerfert
__kmpc_global_thread_num(IdentTy *)27774f91741SJoseph Huber int32_t __kmpc_global_thread_num(IdentTy *) {
27874f91741SJoseph Huber FunctionTracingRAII();
27974f91741SJoseph Huber return omp_get_thread_num();
28074f91741SJoseph Huber }
28167ab875fSJohannes Doerfert
__kmpc_push_num_teams(IdentTy * loc,int32_t tid,int32_t num_teams,int32_t thread_limit)28267ab875fSJohannes Doerfert void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
28374f91741SJoseph Huber int32_t thread_limit) {
28474f91741SJoseph Huber FunctionTracingRAII();
28574f91741SJoseph Huber }
28667ab875fSJohannes Doerfert
__kmpc_push_proc_bind(IdentTy * loc,uint32_t tid,int proc_bind)28774f91741SJoseph Huber void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
28874f91741SJoseph Huber FunctionTracingRAII();
28974f91741SJoseph Huber }
29067ab875fSJohannes Doerfert }
29167ab875fSJohannes Doerfert
29267ab875fSJohannes Doerfert #pragma omp end declare target
293