1 //===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Parallel implementation in the GPU. Here is the pattern:
10 //
11 //    while (not finished) {
12 //
13 //    if (master) {
14 //      sequential code, decide which par loop to do, or if finished
15 //     __kmpc_kernel_prepare_parallel() // exec by master only
16 //    }
17 //    syncthreads // A
18 //    __kmpc_kernel_parallel() // exec by all
19 //    if (this thread is included in the parallel) {
20 //      switch () for all parallel loops
21 //      __kmpc_kernel_end_parallel() // exec only by threads in parallel
22 //    }
23 //
24 //
25 //    The reason we don't exec end_parallel for the threads not included
26 //    in the parallel loop is that for each barrier in the parallel
27 //    region, these non-included threads will cycle through the
28 //    syncthread A. Thus they must preserve their current threadId that
29 //    is larger than thread in team.
30 //
31 //    To make a long story short...
32 //
33 //===----------------------------------------------------------------------===//
34 
35 #include "Debug.h"
36 #include "Interface.h"
37 #include "Mapping.h"
38 #include "State.h"
39 #include "Synchronization.h"
40 #include "Types.h"
41 #include "Utils.h"
42 
43 using namespace _OMP;
44 
45 #pragma omp declare target
46 
47 namespace {
48 
49 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
50   uint32_t NThreadsICV =
51       NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
52   uint32_t NumThreads = mapping::getBlockSize();
53 
54   if (NThreadsICV != 0 && NThreadsICV < NumThreads)
55     NumThreads = NThreadsICV;
56 
57   // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
58   if (NumThreads < mapping::getWarpSize())
59     NumThreads = 1;
60   else
61     NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
62 
63   return NumThreads;
64 }
65 
// Invoke an outlined parallel function unwrapping arguments (up to 32).
//
// \p global_tid / \p bound_tid are forwarded as the first two arguments of
// the outlined function. \p args holds \p nargs opaque pointers that the
// generated cases unpack. Dispatch is a switch on the argument count whose
// cases come from the generated include below; anything beyond the generated
// maximum traps.
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  switch (nargs) {
// One `case N:` per supported argument count, generated at build time.
#include "generated_microtask_cases.gen"
  default:
    // No generated case matched: unsupported arity, abort the kernel.
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}
76 
77 } // namespace
78 
79 extern "C" {
80 
/// Entry point for a `parallel` region on the device.
///
/// \p if_expr is the evaluated if-clause (0 => serialized execution),
/// \p num_threads the num_threads clause value (-1 if absent), \p fn the
/// outlined region, \p wrapper_fn the wrapper published to generic-mode
/// workers, and \p args/\p nargs the captured argument pointers.
void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                        int32_t num_threads, int proc_bind, void *fn,
                        void *wrapper_fn, void **args, int64_t nargs) {

  uint32_t TId = mapping::getThreadIdInBlock();
  // Handle the serialized case first, same for SPMD/non-SPMD: the region is
  // serialized when the if-clause evaluated to false or we are already inside
  // a parallel region (icv::Level != 0). The calling thread runs the region
  // body itself inside a fresh data environment.
  if (OMP_UNLIKELY(!if_expr || icv::Level)) {
    __kmpc_serialized_parallel(ident, TId);
    invokeMicrotask(TId, 0, fn, args, nargs);
    __kmpc_end_serialized_parallel(ident, TId);
    return;
  }

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
    // SPMD mode: all threads are already executing this function. Thread 0
    // alone (last RAII argument `TId == 0`) publishes the team size and
    // level ICVs; the barrier makes the update visible to everyone before
    // the participating threads run the region. The inner scope ends before
    // the second barrier so thread 0 restores the previous state first.
    {
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                            1u, TId == 0);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);
      synchronize::threads();

      if (TId < NumThreads)
        invokeMicrotask(TId, 0, fn, args, nargs);
    }
    synchronize::threads();
    return;
  }

  // Generic mode from here on; the caller is the main thread.
  //
  // We do *not* create a new data environment because all threads in the team
  // that are active are now running this parallel region. They share the
  // TeamState, which has an increased level-var and potentially active-level
  // set, but they do not have individual ThreadStates yet. If they ever
  // modify the ICVs beyond this point a ThreadState will be allocated.

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    // Degenerate team of one: just bump the level and run the region on the
    // main thread, no workers are woken.
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  // Copy the captured arguments into shared storage so the worker threads
  // can read them when they execute the wrapper function.
  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
#pragma unroll
    for (int I = 0; I < nargs; I++)
      GlobalArgs[I] = args[I];
  }

  {
    // Publish team size, the worker entry point (wrapper_fn), and the level
    // ICVs; all are restored by the RAII destructors at the end of this
    // scope, after the workers have finished.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true);

    // Master signals work to activate workers.
    synchronize::threads();
    // Master waits for workers to signal.
    synchronize::threads();
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}
148 
149 __attribute__((noinline)) bool
150 __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
151   // Work function and arguments for L1 parallel region.
152   *WorkFn = state::ParallelRegionFn;
153 
154   // If this is the termination signal from the master, quit early.
155   if (!*WorkFn)
156     return false;
157 
158   // Set to true for workers participating in the parallel region.
159   uint32_t TId = mapping::getThreadIdInBlock();
160   bool ThreadIsActive = TId < state::ParallelTeamSize;
161   return ThreadIsActive;
162 }
163 
164 __attribute__((noinline)) void __kmpc_kernel_end_parallel() {
165   // In case we have modified an ICV for this thread before a ThreadState was
166   // created. We drop it now to not contaminate the next parallel region.
167   ASSERT(!mapping::isSPMDMode());
168   uint32_t TId = mapping::getThreadIdInBlock();
169   state::resetStateForThread(TId);
170   ASSERT(!mapping::isSPMDMode());
171 }
172 
/// Begin a serialized (inactive) parallel region: push a new data
/// environment for ICV modifications and increment the level-var. The
/// calling thread executes the region body itself; see __kmpc_parallel_51.
void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) {
  state::enterDataEnvironment();
  ++icv::Level;
}
177 
/// End a serialized parallel region: pop the data environment pushed by
/// __kmpc_serialized_parallel and decrement the level-var.
void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) {
  state::exitDataEnvironment();
  --icv::Level;
}
182 
183 uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); }
184 
185 int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); }
186 
/// Record the requested thread count for the next parallel region in the
/// nthreads-var ICV (consumed by determineNumberOfThreads when no
/// num_threads clause is present).
void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) {
  icv::NThreads = NumThreads;
}
190 
/// Intentionally a no-op in the device runtime; the teams configuration is
/// not adjustable from here.
void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {}
193 
194 void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
195 }
196 
197 #pragma omp end declare target
198