//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include "Debug.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
#include "Synchronization.h"
#include "Types.h"
#include "Utils.h"

using namespace _OMP;

namespace {

#pragma omp begin declare target device_type(nohost)

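// Reduce the warp's values into a single lane: each iteration shuffles data
// across half the remaining width (a butterfly pattern), so after
// log2(WarpSize) rounds the first lane holds the fully reduced value.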
void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
  for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId (unused)=*/0,
            /*Offset=*/mask, /*AlgoVersion=*/0);
  }
}

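// Reduce the values of \p size contiguous lanes, where \p size need not be a
// power of two: the live element count is repeatedly halved with rounding up
// until a single lane holds the result.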
void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
                               uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId=*/tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId=*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif
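// Team-local reduction. \p shflFct and \p cpyFct are compiler-generated
// helpers: the former shuffles and reduces one element between two lanes,
// the latter copies each warp's partial result to the first warp through
// shared memory.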
static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  if (NumThreads == 1)
    return 1;

  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions at both the L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpId();

  // Volta execution model:
  // In Generic execution mode a parallel region has either 1 thread or a
  // multiple of 32 threads. In SPMD execution mode it may have any number of
  // threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // With more than [mapping::getWarpSize()] threads a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in an L2 parallel region may enter
         // here; return early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // With more than [mapping::getWarpSize()] threads a block reduction is
  // performed here.
  //
  // Only an L1 parallel region can enter this condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Use the OMP thread Id, which differs from BlockThreadId in the case of an
  // L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}
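// Round \p s down to a multiple of the warp size, except that sub-warp sizes
// map to 1 so at least one thread remains active.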
uint32_t roundToWarpsize(uint32_t s) {
  if (s < mapping::getWarpSize())
    return 1;
  return (s & ~(unsigned)(mapping::getWarpSize() - 1));
}

uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

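// Counters coordinating the chunked teams reduction below: IterCnt marks the
// base of the window of team ids currently allowed to write into the global
// buffer, and Cnt counts the teams that have finished writing in the current
// chunk.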
static uint32_t IterCnt = 0;
static uint32_t Cnt = 0;

} // namespace

extern "C" {
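// Illustrative sketch, not normative: for a directive such as
// `#pragma omp parallel for reduction(+ : Sum)` the compiler emits roughly
// the following call, where ShflFct and CpyFct are generated helpers and all
// names below are made up for the example:
//
//   void *RedList[] = {&PrivateSum};
//   if (__kmpc_nvptx_parallel_reduce_nowait_v2(
//           Loc, TId, /*num_vars=*/1, /*reduce_size=*/sizeof(PrivateSum),
//           RedList, ShflFct, CpyFct) == 1) {
//     Sum += PrivateSum; // The thread holding the result commits it.
//     __kmpc_nvptx_end_reduce_nowait(TId);
//   }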
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
  FunctionTracingRAII();
  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
                                      shflFct, cpyFct, mapping::isSPMDMode(),
                                      false);
}

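// Cross-team reduction through a global buffer with num_of_records slots:
// each team copies (first round) or reduces (later rounds) its value into
// slot TeamId % num_of_records, and the last team to arrive combines all
// buffered values and returns 1 so the caller commits the final result.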
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  FunctionTracingRAII();

  // In Generic mode, return early for all threads except the main thread,
  // which continues with ThreadId 0.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams beyond the current upper limit: at any time at
  // most as many teams as there are slots in the buffer may proceed.
  bool IsMaster = (ThreadId == 0);
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, __ATOMIC_SEQ_CST);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBlockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      lgcpyFct(GlobalBuffer, ModBlockId, reduce_data);
    } else
      lgredFct(GlobalBuffer, ModBlockId, reduce_data);

    fence::system(__ATOMIC_SEQ_CST);

    // Increment the team counter; every team in the current chunk of
    // num_of_records teams increments it, and it wraps back to zero after
    // reaching num_of_records - 1 (atomicInc semantics).
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
  }
  // Synchronize
  if (mapping::isSPMDMode())
    __kmpc_barrier(Loc, TId);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  //     num_of_records
  //
  // local_reduce_data layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team: ChunkTeamCount is this team's ticket
  // within the current chunk, so equality means every remaining team has
  // already written its result.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    // Last team processing.
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // With more than [mapping::getWarpSize()] threads a block reduction is
      // performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // The buffer is full: allow the next num_of_records teams to proceed with
    // writing their intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), __ATOMIC_SEQ_CST);
  }

  return 0;
}

void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }

void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
}

#pragma omp end declare target