1 //===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of reduction with KMPC interface.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "Debug.h"
14 #include "Interface.h"
15 #include "Mapping.h"
16 #include "State.h"
17 #include "Synchronization.h"
18 #include "Types.h"
19 #include "Utils.h"
20 
21 using namespace _OMP;
22 
23 namespace {
24 
25 #pragma omp declare target
26 
27 void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
28   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
29     shflFct(reduce_data, /*LaneId - not used= */ 0,
30             /*Offset = */ mask, /*AlgoVersion=*/0);
31   }
32 }
33 
34 void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
35                                uint32_t size, uint32_t tid) {
36   uint32_t curr_size;
37   uint32_t mask;
38   curr_size = size;
39   mask = curr_size / 2;
40   while (mask > 0) {
41     shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
42     curr_size = (curr_size + 1) / 2;
43     mask = curr_size / 2;
44   }
45 }
46 
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
/// Reduce over a set of possibly non-contiguous ("dispersed") active lanes of
/// the current warp — pre-Volta fallback path only. Each round a lane pairs
/// with the nearest live lane above it; lanes whose logical id becomes odd
/// drop out of the loop, until a single lane holds the combined value.
///
/// \return 1 in the lane that ends up holding the result (logical lane 0),
///         0 in every other lane.
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  // Hardware position of this thread within its warp.
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  // Rank of this lane among the live lanes, pre-multiplied by 2 so the
  // halving at the top of each round yields the per-round logical lane id.
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    // Re-query liveness every round: dropped-out lanes vanish from the mask.
    Liveness = mapping::activemask();
    // 1-based index of the nearest live lane above us — our shuffle partner.
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  // Only the lane that absorbed all other values reports success.
  return (logical_lane_id == 0);
}
#endif
67 
/// Intra-team reduction without an implied barrier.
///
/// Combines each thread's private copy of the reduction variables (pointed to
/// by \p reduce_data) so that a single thread holds the team-wide result.
/// \p num_vars and \p reduce_size are unused here but part of the interface.
///
/// \return 1 in exactly the thread that holds the final combined value (that
///         thread is expected to store it back), 0 in every other thread.
static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  // NOTE(review): IsSPMD is hard-coded to false here even though
  // isSPMDExecutionMode is available — presumably only the generic-mode main
  // thread needs its id remapped to 0; confirm this is intentional.
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  // A lone thread already holds the reduced value; nothing to combine.
  if (NumThreads == 1)
    return 1;
    /*
     * This reduce function handles reduction within a team. It handles
     * parallel regions in both L1 and L2 parallelism levels. It also
     * supports Generic, SPMD, and NoOMP modes.
     *
     * 1. Reduce within a warp.
     * 2. Warp master copies value to warp 0 via shared memory.
     * 3. Warp 0 reduces to a single value.
     * 4. The reduced value is available in the thread that returns 1.
     */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpId();

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    // The trailing, partially-filled warp reduces over just its live lanes.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  // Pre-Volta path: pick the warp-level strategy from the shape of the
  // active-lane mask.
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}
161 
162 uint32_t roundToWarpsize(uint32_t s) {
163   if (s < mapping::getWarpSize())
164     return 1;
165   return (s & ~(unsigned)(mapping::getWarpSize() - 1));
166 }
167 
168 uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
169 
// Inter-team coordination counters for the buffered teams reduction below.
// IterCnt is the lower bound of the window of team ids currently allowed to
// write into the global buffer (advanced by num_of_records at a time); Cnt
// counts teams that have contributed to the current buffer chunk. Both are
// accessed with atomics; volatile forces re-reads in the master's wait loop.
static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;
172 
173 } // namespace
174 
175 extern "C" {
/// KMPC entry point for an intra-team (parallel-region) reduction with no
/// trailing barrier. Thin wrapper that forwards to
/// nvptx_parallel_reduce_nowait, supplying the current execution mode.
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
  FunctionTracingRAII();
  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                      shflFct, cpyFct, mapping::isSPMDMode(),
                                      false);
}
184 
/// KMPC entry point for a cross-team ("teams") reduction with no trailing
/// barrier.
///
/// Teams stream their partial results through \p GlobalBuffer, which holds
/// \p num_of_records slots per reduction variable (struct-of-arrays layout —
/// see the diagram in the body). The last team to contribute to a chunk loads
/// the buffered records and reduces them down to a single value.
///
/// \return 1 in exactly one thread of the whole launch — the master of the
///         team that performs the final combination — and 0 everywhere else.
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  FunctionTracingRAII();

  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  // Team-local copies of the coordination state, written by the master and
  // read by the whole team after the barrier below. NOTE(review): SHARED
  // presumably places these in team-shared storage — verify the macro.
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less or equal
  // to the number of slots in the buffer.
  bool IsMaster = (ThreadId == 0);
  // Master busy-waits until this team's id falls inside the window
  // [Bound, Bound + num_of_records) of teams allowed to use the buffer.
  while (IsMaster) {
    Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    // ("ModBockId" is a historical typo for ModBlockId — the buffer slot.)
    int ModBockId = TeamId % num_of_records;
    // First writer of a slot copies into it; subsequent teams reduce into it.
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBockId, reduce_data);

    // Make the buffer write visible device-wide before announcing it below.
    fence::system(__ATOMIC_SEQ_CST);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    ChunkTeamCount =
        atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
  }
  // Synchronize
  if (mapping::isSPMDMode())
    __kmpc_barrier(Loc, TId);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    // Use at most one thread per buffered record, rounded down to a warp
    // multiple so the warp-level reduction below stays regular.
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    // Strided sweep over any remaining records beyond the first NumThreads.
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] number of threads
      // a block reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      // Reset the global counters so the buffer can be reused by the next
      // reduction.
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow SIZE number of teams to proceed writing their
    // intermediate results to the global buffer.
    atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records),
                __ATOMIC_SEQ_CST);
  }

  return 0;
}
314 
315 void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }
316 
317 void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
318 }
319 
320 #pragma omp end declare target
321