//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

namespace {

#pragma omp begin declare target device_type(nohost)

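// Tree reduction over a full warp: the compiler-provided shuffle-reduce
// function combines values across lanes, halving the offset each step.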
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

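// Reduction over a partially populated warp whose active lanes are
// contiguous: `size` is the number of participating lanes and `tid` the
// calling lane's logical id within that group.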
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
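// Pairwise reduction over the currently active (possibly non-contiguous)
// lanes of a warp; returns 1 for the lane left holding the reduced value.
// Only needed below sm_70, where dispersed lanes of an L2 parallel region
// take this path.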
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

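// In-team reduction for a parallel region. Returns 1 in the thread that ends
// up holding the reduced value; all other participating threads return 0.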
static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpId();

  // Volta execution model:
  // In Generic execution mode a parallel region has either 1 thread or a
  // multiple of 32 threads. In SPMD execution mode we may have any number of
  // threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When we have more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] threads, a block
  // reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Get the OMP thread id. This is different from BlockThreadId in the case
  // of an L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}

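// Round s down to the nearest multiple of the warp size, but never below 1.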
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

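// Global state for the teams reduction below: IterCnt is the lower bound of
// the window of teams currently allowed to write into the global buffer, and
// Cnt counts the teams of the current chunk that have finished writing.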
static uint32_t IterCnt = 0;
static uint32_t Cnt = 0;

} // namespace

extern "C" {
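// Entry point for the in-team (parallel) reduction. The comment below is a
// sketch of the typical compiler-generated call pattern, for illustration
// only (names are placeholders, not part of this interface):
//
//   int32_t Res = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       Loc, TId, NumVars, SizeInBytes, &ReduceList, ShuffleFn, InterWarpFn);
//   if (Res == 1) {
//     // This thread holds the team-reduced values: fold them into the
//     // original reduction variables, then finish the reduction.
//     __kmpc_nvptx_end_reduce_nowait(TId);
//   }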
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
  FunctionTracingRAII();
  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                      shflFct, cpyFct, mapping::isSPMDMode(),
                                      false);
}

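// Cross-team (teams) reduction: teams funnel their partial results through a
// fixed-size global buffer of num_of_records slots, and the last team to
// arrive reduces the buffered values, returning 1 in its master thread.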
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  FunctionTracingRAII();

  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block the progress of teams whose id is beyond the current upper limit:
  // at any time at most num_of_records teams, one per slot in the buffer,
  // are allowed to proceed.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, __ATOMIC_SEQ_CST);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    fence::system(__ATOMIC_SEQ_CST);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // num_of_records chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
  }
  // Synchronize
  if (mapping::isSPMDMode())
    __kmpc_barrier(Loc, TId);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //  num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  // 1. do reduction within each warp.
  // 2. do reduction across warps.
  // 3. write the final result to the main reduction variable
  //    by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] threads, a block
      // reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow the next num_of_records teams to proceed with writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), __ATOMIC_SEQ_CST);
  }

  return 0;
}

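// The end-reduce entry points are kept for interface compatibility; all of
// the work happens in the *_reduce_nowait_v2 entry points above.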
void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }

void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
}

#pragma omp end declare target