//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation on the GPU. Here is the pattern:
//
//    while (not finished) {
//      if (master) {
//        sequential code, decide which par loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
//    The reason we don't exec end_parallel for the threads not included in
//    the parallel loop is that for each barrier in the parallel region,
//    these non-included threads will cycle through the syncthreads at A.
//    Thus they must preserve their current threadId, which is larger than
//    any thread ID in the team.
//
//    To make a long story short...
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

#pragma omp begin declare target device_type(nohost)

namespace {

uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
  uint32_t NThreadsICV =
      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
  uint32_t NumThreads = mapping::getBlockSize();

  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
    NumThreads = NThreadsICV;

  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP;
  // e.g., with a warp size of 32, a request for 70 threads yields 64, and any
  // request below the warp size yields 1.
  if (NumThreads < mapping::getWarpSize())
    NumThreads = 1;
  else
    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));

  return NumThreads;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn,
                     void **args, int64_t nargs) {
  DebugEntryRAII Entry(__FILE__, __LINE__, "<OpenMP Outlined Function>");
  switch (nargs) {
#include "generated_microtask_cases.gen"
  default:
    PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

} // namespace

extern "C" {

void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                        int32_t num_threads, int proc_bind, void *fn,
                        void *wrapper_fn, void **args, int64_t nargs) {
  FunctionTracingRAII();

  uint32_t TId = mapping::getThreadIdInBlock();

  // Handle the serialized case first, same for SPMD/non-SPMD:
  // 1) if-clause(0)
  // 2) nested parallel regions
  // 3) parallel in a task or other thread-state-inducing construct
  if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) {
    state::DateEnvironmentRAII DERAII(ident);
    ++icv::Level;
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }
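
  // For example (hypothetical user code, not from this file): both
  //
  //   #pragma omp parallel if(0)
  //
  // and a `parallel` nested inside an already active `parallel` take the
  // serialized path above: the encountering thread runs the body itself with
  // `icv::Level` incremented for the duration.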

  // From this point forward we know that there is no thread state used.
  ASSERT(state::HasThreadState == false);

  uint32_t NumThreads = determineNumberOfThreads(num_threads);
  if (mapping::isSPMDMode()) {
    // Avoid the race between the read of `icv::Level` above and the write
    // below by synchronizing all threads here.
    synchronize::threadsAligned();
    {
      // Note that the order here is important. `icv::Level` has to be updated
      // last or the other updates will cause a thread specific state to be
      // created.
      state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                            1u, TId == 0, ident,
                                            /* ForceTeamState */ true);
      state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0,
                                       ident, /* ForceTeamState */ true);
      state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                                 /* ForceTeamState */ true);

      // Synchronize all threads after the main thread (TId == 0) has set up
      // the team state properly.
      synchronize::threadsAligned();

      state::ParallelTeamSize.assert_eq(NumThreads, ident,
                                        /* ForceTeamState */ true);
      icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
      icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);

      if (TId < NumThreads)
        invokeMicrotask(TId, 0, fn, args, nargs);

      // Synchronize all threads at the end of a parallel region.
      synchronize::threadsAligned();
    }

    // Synchronize all threads to make sure every thread exits the scope above;
    // otherwise the following assertions and the assumption in
    // __kmpc_target_deinit may not hold.
    synchronize::threadsAligned();

    state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
    icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
    icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
    return;
  }

  // We do *not* create a new data environment because all active threads in
  // the team are now running this parallel region. They share the TeamState,
  // which has an increased level-var and potentially an active-level set, but
  // they do not have individual ThreadStates yet. If they ever modify the ICVs
  // beyond this point, a ThreadState will be allocated.
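
  // For example (a sketch, not from the source): `num_threads(1)` yields
  // NumThreads == 1, so the inactive branch below runs the region on the
  // encountering thread alone, with only `icv::Level` raised for its duration.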

  bool IsActiveParallelRegion = NumThreads > 1;
  if (!IsActiveParallelRegion) {
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
    invokeMicrotask(TId, 0, fn, args, nargs);
    return;
  }

  void **GlobalArgs = nullptr;
  if (nargs) {
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    switch (nargs) {
    default:
      for (int I = 0; I < nargs; I++)
        GlobalArgs[I] = args[I];
      break;
    case 16:
      GlobalArgs[15] = args[15];
      [[fallthrough]];
    case 15:
      GlobalArgs[14] = args[14];
      [[fallthrough]];
    case 14:
      GlobalArgs[13] = args[13];
      [[fallthrough]];
    case 13:
      GlobalArgs[12] = args[12];
      [[fallthrough]];
    case 12:
      GlobalArgs[11] = args[11];
      [[fallthrough]];
    case 11:
      GlobalArgs[10] = args[10];
      [[fallthrough]];
    case 10:
      GlobalArgs[9] = args[9];
      [[fallthrough]];
    case 9:
      GlobalArgs[8] = args[8];
      [[fallthrough]];
    case 8:
      GlobalArgs[7] = args[7];
      [[fallthrough]];
    case 7:
      GlobalArgs[6] = args[6];
      [[fallthrough]];
    case 6:
      GlobalArgs[5] = args[5];
      [[fallthrough]];
    case 5:
      GlobalArgs[4] = args[4];
      [[fallthrough]];
    case 4:
      GlobalArgs[3] = args[3];
      [[fallthrough]];
    case 3:
      GlobalArgs[2] = args[2];
      [[fallthrough]];
    case 2:
      GlobalArgs[1] = args[1];
      [[fallthrough]];
    case 1:
      GlobalArgs[0] = args[0];
      [[fallthrough]];
    case 0:
      break;
    }
  }

  {
    // Note that the order here is important. `icv::Level` has to be updated
    // last or the other updates will cause a thread specific state to be
    // created.
    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                          1u, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                          (void *)nullptr, true, ident,
                                          /* ForceTeamState */ true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /* ForceTeamState */ true);
    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                               /* ForceTeamState */ true);

    // Master signals work to activate workers.
    synchronize::threads();
    // Master waits for workers to signal.
    synchronize::threads();
  }

  if (nargs)
    __kmpc_end_sharing_variables();
}

bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
  FunctionTracingRAII();
  // Work function and arguments for L1 parallel region.
  *WorkFn = state::ParallelRegionFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn)
    return false;

  // Set to true for workers participating in the parallel region.
  uint32_t TId = mapping::getThreadIdInBlock();
  bool ThreadIsActive = TId < state::ParallelTeamSize;
  return ThreadIsActive;
}
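
// Sketch (an assumption, mirroring the generic-mode worker state machine;
// names are illustrative): workers drive __kmpc_kernel_parallel (above) and
// __kmpc_kernel_end_parallel (below) roughly as follows:
//
//   while (true) {
//     synchronize::threads();           // wait for the master's signal
//     ParallelRegionFnTy WorkFn = nullptr;
//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//     if (!WorkFn)                      // termination signal from the master
//       return;
//     if (IsActive) {
//       ((void (*)(uint32_t, uint32_t))WorkFn)(0, TId);
//       __kmpc_kernel_end_parallel();
//     }
//     synchronize::threads();           // signal completion to the master
//   }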

void __kmpc_kernel_end_parallel() {
  FunctionTracingRAII();
  // In case we modified an ICV for this thread before a ThreadState was
  // created, we drop it now to not contaminate the next parallel region.
  ASSERT(!mapping::isSPMDMode());
  uint32_t TId = mapping::getThreadIdInBlock();
  state::resetStateForThread(TId);
  ASSERT(!mapping::isSPMDMode());
}

uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
  FunctionTracingRAII();
  return omp_get_level();
}

int32_t __kmpc_global_thread_num(IdentTy *) {
  FunctionTracingRAII();
  return omp_get_thread_num();
}

void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                           int32_t thread_limit) {
  FunctionTracingRAII();
}

void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
  FunctionTracingRAII();
}
}

#pragma omp end declare target
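
//===----------------------------------------------------------------------===//
//
// Usage sketch (illustration only, not part of the runtime; names such as
// Ident, GTId, Cond, ProcBind, OutlinedFn, OutlinedFnWrapper, Args, and NArgs
// are hypothetical): on the device, a directive such as
//
//   #pragma omp parallel num_threads(64) if(Cond)
//   { /* body */ }
//
// is lowered by the compiler into an outlined function plus a call to the
// entry point above:
//
//   __kmpc_parallel_51(Ident, GTId, Cond, /*num_threads=*/64,
//                      /*proc_bind=*/ProcBind, OutlinedFn, OutlinedFnWrapper,
//                      Args, NArgs);
//
//===----------------------------------------------------------------------===//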