167ab875fSJohannes Doerfert //===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===//
267ab875fSJohannes Doerfert //
367ab875fSJohannes Doerfert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
467ab875fSJohannes Doerfert // See https://llvm.org/LICENSE.txt for license information.
567ab875fSJohannes Doerfert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
667ab875fSJohannes Doerfert //
767ab875fSJohannes Doerfert //===----------------------------------------------------------------------===//
867ab875fSJohannes Doerfert //
967ab875fSJohannes Doerfert // This file contains the implementation of reduction with KMPC interface.
1067ab875fSJohannes Doerfert //
1167ab875fSJohannes Doerfert //===----------------------------------------------------------------------===//
1267ab875fSJohannes Doerfert
1367ab875fSJohannes Doerfert #include "Debug.h"
1467ab875fSJohannes Doerfert #include "Interface.h"
1567ab875fSJohannes Doerfert #include "Mapping.h"
1667ab875fSJohannes Doerfert #include "State.h"
1767ab875fSJohannes Doerfert #include "Synchronization.h"
1867ab875fSJohannes Doerfert #include "Types.h"
1967ab875fSJohannes Doerfert #include "Utils.h"
2067ab875fSJohannes Doerfert
2167ab875fSJohannes Doerfert using namespace _OMP;
2267ab875fSJohannes Doerfert
2367ab875fSJohannes Doerfert namespace {
2467ab875fSJohannes Doerfert
25b4f8443dSJoseph Huber #pragma omp begin declare target device_type(nohost)
2667ab875fSJohannes Doerfert
gpu_regular_warp_reduce(void * reduce_data,ShuffleReductFnTy shflFct)2767ab875fSJohannes Doerfert void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
2867ab875fSJohannes Doerfert for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
2967ab875fSJohannes Doerfert shflFct(reduce_data, /*LaneId - not used= */ 0,
3067ab875fSJohannes Doerfert /*Offset = */ mask, /*AlgoVersion=*/0);
3167ab875fSJohannes Doerfert }
3267ab875fSJohannes Doerfert }
3367ab875fSJohannes Doerfert
gpu_irregular_warp_reduce(void * reduce_data,ShuffleReductFnTy shflFct,uint32_t size,uint32_t tid)3467ab875fSJohannes Doerfert void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
3567ab875fSJohannes Doerfert uint32_t size, uint32_t tid) {
3667ab875fSJohannes Doerfert uint32_t curr_size;
3767ab875fSJohannes Doerfert uint32_t mask;
3867ab875fSJohannes Doerfert curr_size = size;
3967ab875fSJohannes Doerfert mask = curr_size / 2;
4067ab875fSJohannes Doerfert while (mask > 0) {
4167ab875fSJohannes Doerfert shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
4267ab875fSJohannes Doerfert curr_size = (curr_size + 1) / 2;
4367ab875fSJohannes Doerfert mask = curr_size / 2;
4467ab875fSJohannes Doerfert }
4567ab875fSJohannes Doerfert }
4667ab875fSJohannes Doerfert
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
/// Reduce \p reduce_data across the currently active — possibly
/// non-contiguous — lanes of a warp. Returns 1 in the lowest-ranked active
/// lane (logical lane 0) and 0 in all others. Only compiled for pre-Volta
/// targets; presumably the pre-Volta lockstep execution model is what keeps
/// the active mask stable across iterations — confirm against callers.
static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
                                          ShuffleReductFnTy shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  // Hardware lane index of this thread within its warp.
  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  // Rank of this thread among the active lanes, pre-doubled so the division
  // at the top of each loop iteration yields the current logical lane id.
  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
  do {
    Liveness = mapping::activemask();
    // Position (1-based) of the nearest active lane above this one: the
    // shuffle partner for this round.
    remote_id = utils::ffs(Liveness & lanemask_gt);
    size = utils::popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
    // Odd logical lanes have handed their value to a lower lane and drop
    // out; keep going while more than one lane remains live.
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif
6767ab875fSJohannes Doerfert
/// Reduce \p reduce_data across all threads of the current team's parallel
/// region. Returns 1 in exactly one thread — the one left holding the final
/// value — and 0 in all others. The trailing unnamed bool parameter is
/// accepted but unused. Two implementations are selected at preprocessing
/// time: a Volta-and-newer path (full/irregular warp reduce keyed off thread
/// counts) and a pre-Volta path (keyed off the active lane mask).
static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
                                            uint64_t reduce_size,
                                            void *reduce_data,
                                            ShuffleReductFnTy shflFct,
                                            InterWarpCopyFnTy cpyFct,
                                            bool isSPMDExecutionMode, bool) {
  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
  // The generic-mode main thread logically acts as thread 0 of the team.
  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
    BlockThreadId = 0;
  uint32_t NumThreads = omp_get_num_threads();
  // A single-threaded region already holds the reduced value.
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
  uint32_t WarpId = mapping::getWarpId();

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
  if (Liveness == lanes::All) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/utils::popc(Liveness),
                              /*LaneId=*/mapping::getThreadIdInBlock() %
                                  mapping::getWarpSize());
  else { // Dispersed lanes. Only threads in L2
         // parallel region may enter here; return
         // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);
  }

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    uint32_t WarpsNeeded =
        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return TId == 0;
#endif // __CUDA_ARCH__ >= 700
}
16167ab875fSJohannes Doerfert
roundToWarpsize(uint32_t s)16267ab875fSJohannes Doerfert uint32_t roundToWarpsize(uint32_t s) {
16367ab875fSJohannes Doerfert if (s < mapping::getWarpSize())
16467ab875fSJohannes Doerfert return 1;
16567ab875fSJohannes Doerfert return (s & ~(unsigned)(mapping::getWarpSize() - 1));
16667ab875fSJohannes Doerfert }
16767ab875fSJohannes Doerfert
kmpcMin(uint32_t x,uint32_t y)16867ab875fSJohannes Doerfert uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
16967ab875fSJohannes Doerfert
// Counters coordinating the cross-team reduction in
// __kmpc_nvptx_teams_reduce_nowait_v2 (accessed only via atomic::load /
// atomic::inc / atomic::add there): IterCnt bounds which teams may write
// their slot of the global buffer, Cnt counts the teams that have finished
// the current buffer chunk.
static uint32_t IterCnt = 0;
static uint32_t Cnt = 0;
17267ab875fSJohannes Doerfert
17367ab875fSJohannes Doerfert } // namespace
17467ab875fSJohannes Doerfert
17567ab875fSJohannes Doerfert extern "C" {
__kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy * Loc,int32_t TId,int32_t num_vars,uint64_t reduce_size,void * reduce_data,ShuffleReductFnTy shflFct,InterWarpCopyFnTy cpyFct)17667ab875fSJohannes Doerfert int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
17767ab875fSJohannes Doerfert IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
17867ab875fSJohannes Doerfert void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
17974f91741SJoseph Huber FunctionTracingRAII();
18067ab875fSJohannes Doerfert return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
18167ab875fSJohannes Doerfert shflFct, cpyFct, mapping::isSPMDMode(),
18267ab875fSJohannes Doerfert false);
18367ab875fSJohannes Doerfert }
18467ab875fSJohannes Doerfert
/// KMPC entry point for a cross-team (teams) reduction using a global buffer
/// of \p num_of_records slots. Teams stream their partial results into the
/// buffer in chunks gated by the IterCnt/Cnt counters; the last team of the
/// final chunk reduces the whole buffer. Returns 1 in exactly one thread of
/// one team — the holder of the final value — and 0 everywhere else.
int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
    ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
    ListGlobalFnTy glredFct) {
  FunctionTracingRAII();

  // Terminate all threads in non-SPMD mode except for the master thread.
  uint32_t ThreadId = mapping::getThreadIdInBlock();
  if (mapping::isGenericMode()) {
    if (!mapping::isMainThreadInGenericMode())
      return 0;
    ThreadId = 0;
  }

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads = omp_get_num_threads();
  uint32_t TeamId = omp_get_team_num();
  uint32_t NumTeams = omp_get_num_teams();
  // Team-local (SHARED) copies of the global counters, written by the team
  // master and read by the whole team after the barrier below.
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less or equal
  // to the number of slots in the buffer.
  bool IsMaster = (ThreadId == 0);
  // Spin until this team's slot in the circular buffer becomes writable.
  while (IsMaster) {
    Bound = atomic::load(&IterCnt, __ATOMIC_SEQ_CST);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    // Slot index within the circular buffer (name presumably a typo for
    // "ModBlockId").
    int ModBockId = TeamId % num_of_records;
    if (TeamId < num_of_records) {
      // First pass over the buffer: initialize the slot.
      lgcpyFct(GlobalBuffer, ModBockId, reduce_data);
    } else
      // Later passes: fold this team's value into the existing slot.
      lgredFct(GlobalBuffer, ModBockId, reduce_data);

    // Make the buffer write visible device-wide before counting this team.
    fence::system(__ATOMIC_SEQ_CST);

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST);
  }
  // Synchronize
  if (mapping::isSPMDMode())
    __kmpc_barrier(Loc, TId);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it in local memory:
  // local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  // a, a, ..., a, b, b, ... b, c, c, ... c
  // |__________|
  // num_of_records
  //
  // local_data_reduce layout (struct):
  // a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //      1. do reduction within each warp.
  //      2. do reduction across warps.
  //      3. write the final result to the main reduction variable
  //         by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    // Use at most one thread per occupied buffer slot, rounded to whole
    // warps so the warp reduction below operates on full warps.
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(GlobalBuffer, ThreadId, reduce_data);
    // Each thread strides over the remaining slots, folding them locally.
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(GlobalBuffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [mapping::getWarpSize()] number of threads
      // a block reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > mapping::getWarpSize()) {
        uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) /
                               mapping::getWarpSize();
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / mapping::getWarpSize();
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      // Reset the global counters for the next teams reduction and report
      // that this thread holds the final value.
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow SIZE number of teams to proceed writing their
    // intermediate results to the global buffer.
    atomic::add(&IterCnt, uint32_t(num_of_records), __ATOMIC_SEQ_CST);
  }

  return 0;
}
31267ab875fSJohannes Doerfert
__kmpc_nvptx_end_reduce(int32_t TId)31374f91741SJoseph Huber void __kmpc_nvptx_end_reduce(int32_t TId) { FunctionTracingRAII(); }
31467ab875fSJohannes Doerfert
__kmpc_nvptx_end_reduce_nowait(int32_t TId)31574f91741SJoseph Huber void __kmpc_nvptx_end_reduce_nowait(int32_t TId) { FunctionTracingRAII(); }
31667ab875fSJohannes Doerfert }
31767ab875fSJohannes Doerfert
31867ab875fSJohannes Doerfert #pragma omp end declare target
319