//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

namespace {

#pragma omp declare target

void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used=*/0,
            /*Offset=*/mask, /*AlgoVersion=*/0);
  }
}

void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */
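
  // Illustrative note (not part of this runtime's logic): shflFct and cpyFct
  // are compiler-generated callbacks. For a single 32-bit reduction variable,
  // a shuffle-reduce callback conceptually resembles the hypothetical sketch
  // below: it pulls the partial value from a remote lane and folds it into
  // the local copy, so each shuffle pass halves the number of live partial
  // results.
  //
  //   static void example_shuffle_reduce(void *reduce_data, int16_t lane_id,
  //                                      int16_t offset, int16_t algo) {
  //     int32_t *elem = reinterpret_cast<int32_t *>(reduce_data);
  //     int32_t remote =
  //         __kmpc_shuffle_int32(*elem, offset, mapping::getWarpSize());
  //     *elem += remote; // apply the user's reduction operator
  //   }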

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpId();

  // Volta execution model:
  // For the Generic execution mode a parallel region has either 1 thread or,
  // beyond that, always a multiple of 32 threads. For the SPMD execution mode
  // we may have any number of threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When we have more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only the L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in the L2 parallel region may enter
         // here; return early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only the L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case
  // of an L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}

uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
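
// IterCnt and Cnt coordinate the chunked teams reduction below: team masters
// spin until TeamId < IterCnt + num_of_records before writing their partial
// result into the global buffer, and Cnt counts how many teams of the current
// chunk have committed theirs. The last team of a full chunk advances IterCnt
// by num_of_records to release the next chunk; the overall last team resets
// both counters to zero.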
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;

} // namespace

extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                      shflFct, cpyFct, mapping::isSPMDMode(),
                                      false);
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We only allow a number of teams less than or equal
  // to the number of slots in the buffer.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    fence::system(__ATOMIC_SEQ_CST);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount =
        atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
  }
  // Synchronize
  if (mapping::isSPMDMode())
    __kmpc_barrier(Loc, TId);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  //   local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  // num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //   1. do reduction within each warp.
  //   2. do reduction across warps.
  //   3. write the final result to the main reduction variable
  //      by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
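    // Illustrative walk-through with hypothetical numbers: in SPMD mode with
    // NumTeams = 200, num_of_records = 128, and a 64-thread team, NumRecs is
    // 128 and NumThreads is rounded down to 64. Thread t first loads buffer
    // slot t, the strided loop below then folds slot t + 64 into it, and the
    // warp/block reduction that follows leaves the combined result with
    // thread 0, which also resets the counters and returns 1.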
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed writing their
    // intermediate results to the global buffer.
    atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records),
                __ATOMIC_SEQ_CST);
  }

  return 0;
}

void __kmpc_nvptx_end_reduce(int32_t TId) {}

void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
}

#pragma omp end declare target