1 //===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Parallel implementation in the GPU. Here is the pattern: 10 // 11 // while (not finished) { 12 // 13 // if (master) { 14 // sequential code, decide which par loop to do, or if finished 15 // __kmpc_kernel_prepare_parallel() // exec by master only 16 // } 17 // syncthreads // A 18 // __kmpc_kernel_parallel() // exec by all 19 // if (this thread is included in the parallel) { 20 // switch () for all parallel loops 21 // __kmpc_kernel_end_parallel() // exec only by threads in parallel 22 // } 23 // 24 // 25 // The reason we don't exec end_parallel for the threads not included 26 // in the parallel loop is that for each barrier in the parallel 27 // region, these non-included threads will cycle through the 28 // syncthread A. Thus they must preserve their current threadId that 29 // is larger than thread in team. 30 // 31 // To make a long story short... 32 // 33 //===----------------------------------------------------------------------===// 34 35 #include "Debug.h" 36 #include "Interface.h" 37 #include "Mapping.h" 38 #include "State.h" 39 #include "Synchronization.h" 40 #include "Types.h" 41 #include "Utils.h" 42 43 using namespace _OMP; 44 45 #pragma omp declare target 46 47 namespace { 48 49 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { 50 uint32_t NThreadsICV = 51 NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads; 52 uint32_t NumThreads = mapping::getBlockSize(); 53 54 if (NThreadsICV != 0 && NThreadsICV < NumThreads) 55 NumThreads = NThreadsICV; 56 57 // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP. 58 if (NumThreads < mapping::getWarpSize()) 59 NumThreads = 1; 60 else 61 NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1)); 62 63 return NumThreads; 64 } 65 66 // Invoke an outlined parallel function unwrapping arguments (up to 32). 67 void invokeMicrotask(int32_t global_tid, int32_t bound_tid, void *fn, 68 void **args, int64_t nargs) { 69 switch (nargs) { 70 #include "generated_microtask_cases.gen" 71 default: 72 PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n"); 73 __builtin_trap(); 74 } 75 } 76 77 } // namespace 78 79 extern "C" { 80 81 void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, 82 int32_t num_threads, int proc_bind, void *fn, 83 void *wrapper_fn, void **args, int64_t nargs) { 84 85 uint32_t TId = mapping::getThreadIdInBlock(); 86 // Handle the serialized case first, same for SPMD/non-SPMD. 87 if (OMP_UNLIKELY(!if_expr || icv::Level)) { 88 __kmpc_serialized_parallel(ident, TId); 89 invokeMicrotask(TId, 0, fn, args, nargs); 90 __kmpc_end_serialized_parallel(ident, TId); 91 return; 92 } 93 94 uint32_t NumThreads = determineNumberOfThreads(num_threads); 95 if (mapping::isSPMDMode()) { 96 synchronize::threads(); 97 { 98 state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, 99 1u, TId == 0); 100 state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0); 101 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0); 102 synchronize::threads(); 103 104 if (TId < NumThreads) 105 invokeMicrotask(TId, 0, fn, args, nargs); 106 synchronize::threads(); 107 } 108 return; 109 } 110 111 // We do *not* create a new data environment because all threads in the team 112 // that are active are now running this parallel region. They share the 113 // TeamState, which has an increase level-var and potentially active-level 114 // set, but they do not have individual ThreadStates yet. If they ever 115 // modify the ICVs beyond this point a ThreadStates will be allocated. 116 117 bool IsActiveParallelRegion = NumThreads > 1; 118 if (!IsActiveParallelRegion) { 119 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true); 120 invokeMicrotask(TId, 0, fn, args, nargs); 121 return; 122 } 123 124 void **GlobalArgs = nullptr; 125 if (nargs) { 126 __kmpc_begin_sharing_variables(&GlobalArgs, nargs); 127 #pragma unroll 128 for (int I = 0; I < nargs; I++) 129 GlobalArgs[I] = args[I]; 130 } 131 132 { 133 state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads, 134 1u, true); 135 state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, 136 (void *)nullptr, true); 137 state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true); 138 state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true); 139 140 // Master signals work to activate workers. 141 synchronize::threads(); 142 // Master waits for workers to signal. 143 synchronize::threads(); 144 } 145 146 if (nargs) 147 __kmpc_end_sharing_variables(); 148 } 149 150 __attribute__((noinline)) bool 151 __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { 152 // Work function and arguments for L1 parallel region. 153 *WorkFn = state::ParallelRegionFn; 154 155 // If this is the termination signal from the master, quit early. 156 if (!*WorkFn) 157 return false; 158 159 // Set to true for workers participating in the parallel region. 160 uint32_t TId = mapping::getThreadIdInBlock(); 161 bool ThreadIsActive = TId < state::ParallelTeamSize; 162 return ThreadIsActive; 163 } 164 165 __attribute__((noinline)) void __kmpc_kernel_end_parallel() { 166 // In case we have modified an ICV for this thread before a ThreadState was 167 // created. We drop it now to not contaminate the next parallel region. 168 ASSERT(!mapping::isSPMDMode()); 169 uint32_t TId = mapping::getThreadIdInBlock(); 170 state::resetStateForThread(TId); 171 ASSERT(!mapping::isSPMDMode()); 172 } 173 174 void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) { 175 state::enterDataEnvironment(); 176 ++icv::Level; 177 } 178 179 void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) { 180 state::exitDataEnvironment(); 181 --icv::Level; 182 } 183 184 uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } 185 186 int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } 187 188 void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) { 189 icv::NThreads = NumThreads; 190 } 191 192 void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, 193 int32_t thread_limit) {} 194 195 void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} 196 } 197 198 #pragma omp end declare target 199