//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

namespace {

#pragma omp declare target

// Shuffle-based tree reduction over a full warp; the reduced value ends up in
// lane 0 of the warp.
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used=*/0,
            /*Offset=*/mask, /*AlgoVersion=*/0);
  }
}

// Reduction over `size` contiguous values held by the first `size` lanes of
// the calling warp; `tid` is the calling lane's position within that group.
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
// Reduction over a warp whose active lanes may be dispersed (non-contiguous),
// as can happen for L2 parallel regions on the non-Volta code path below.
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
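   *
   * For illustration, assuming a warp size of 32: with 64 threads in SPMD
   * mode, step 1 leaves each warp's partial result in that warp's lane 0,
   * step 2 copies these two values into warp 0 via the inter-warp copy
   * function (cpyFct), and step 3 has warp 0 combine them, so the thread
   * with BlockThreadId 0 returns 1.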
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpId();

  // Volta execution model:
  // In Generic execution mode a parallel region has either a single thread or
  // a multiple of 32 threads. In SPMD execution mode it may have any number
  // of threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode takes this path.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When there are more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2 parallel region may enter
         // here; return early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When there are more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Use the OMP thread Id. This is different from BlockThreadId in the case
  // of an L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}

// Round `s` down to a multiple of the warp size (or to 1 if it is smaller
// than a warp).
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
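
// IterCnt and Cnt implement the teams-reduction protocol below: IterCnt holds
// the lower bound of the window of team ids currently allowed to write their
// partial result into the global buffer (at most num_of_records teams at a
// time), and Cnt counts how many teams of the current chunk have already
// contributed. The last team to finish a reduction resets both counters.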
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;

} // namespace

extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
  FunctionTracingRAII();
  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                      shflFct, cpyFct, mapping::isSPMDMode(),
                                      false);
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  FunctionTracingRAII();

  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block the progress of teams beyond the current upper limit. At most
  // num_of_records teams, i.e. the number of slots in the buffer, are
  // allowed to proceed at any time.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBockId, reduce_data);

    fence::system(__ATOMIC_SEQ_CST);

    // Increment the team counter. This counter is incremented by all teams
    // in the current chunk of num_of_records teams.
    ChunkTeamCount =
        atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
  }
  // Synchronize.
  if (mapping::isSPMDMode())
    __kmpc_barrier(Loc, TId);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  //   local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //  num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
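    // The last team to contribute in this chunk loads the partial results
    // left in the global buffer by the other teams, combines them locally
    // (glcpyFct / glredFct), reduces across the participating warps, and its
    // master finally resets the counters for the next reduction.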
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When there are more than [mapping::getWarpSize()] threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records),
                __ATOMIC_SEQ_CST);
  }

  return 0;
}

void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }

void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
}

#pragma omp end declare target
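
// For reference, a rough sketch of how compiler-generated code is expected to
// drive the parallel entry point above. This is illustrative only: RedList,
// shuffle_and_reduce_fn and inter_warp_copy_fn are placeholder names for
// values and outlined functions emitted by the compiler, not symbols defined
// in this file.
//
//   sum_private = ...;                 // thread-local partial result
//   void *RedList[] = {&sum_private};
//   if (__kmpc_nvptx_parallel_reduce_nowait_v2(
//           Loc, TId, /*num_vars=*/1, sizeof(sum_private), RedList,
//           shuffle_and_reduce_fn, inter_warp_copy_fn) == 1) {
//     sum += sum_private;              // only the winning thread accumulates
//     __kmpc_nvptx_end_reduce_nowait(TId);
//   }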