167ab875fSJohannes Doerfert //===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
267ab875fSJohannes Doerfert //
367ab875fSJohannes Doerfert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
467ab875fSJohannes Doerfert // See https://llvm.org/LICENSE.txt for license information.
567ab875fSJohannes Doerfert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
667ab875fSJohannes Doerfert //
767ab875fSJohannes Doerfert //===----------------------------------------------------------------------===//
//
// This file implements utility functions for the OpenMP device runtime:
// packing/unpacking of 64-bit values and warp-level shuffle operations.
//
1067ab875fSJohannes Doerfert //===----------------------------------------------------------------------===//
1167ab875fSJohannes Doerfert
1267ab875fSJohannes Doerfert #include "Utils.h"
1367ab875fSJohannes Doerfert
1474f91741SJoseph Huber #include "Debug.h"
1567ab875fSJohannes Doerfert #include "Interface.h"
1667ab875fSJohannes Doerfert #include "Mapping.h"
1767ab875fSJohannes Doerfert
18b4f8443dSJoseph Huber #pragma omp begin declare target device_type(nohost)
1967ab875fSJohannes Doerfert
2067ab875fSJohannes Doerfert using namespace _OMP;
2167ab875fSJohannes Doerfert
2267ab875fSJohannes Doerfert namespace _OMP {
2367ab875fSJohannes Doerfert /// Helper to keep code alive without introducing a performance penalty.
keepAlive()24*421b1f55SJoseph Huber __attribute__((weak, optnone, cold)) KEEP_ALIVE void keepAlive() {
2560a40cf3SJoseph Huber __kmpc_get_hardware_thread_id_in_block();
26bad44d5fSJoseph Huber __kmpc_get_hardware_num_threads_in_block();
27737c4a26SAtmn Patel __kmpc_get_warp_size();
2867ab875fSJohannes Doerfert __kmpc_barrier_simple_spmd(nullptr, 0);
2973720c80SJohannes Doerfert __kmpc_barrier_simple_generic(nullptr, 0);
3067ab875fSJohannes Doerfert }
3167ab875fSJohannes Doerfert } // namespace _OMP
3267ab875fSJohannes Doerfert
3367ab875fSJohannes Doerfert namespace impl {
3467ab875fSJohannes Doerfert
/// Forward declarations of the 64-bit split/combine helpers. The definitions
/// are provided per target inside the `declare variant` regions that follow.
///
/// Unpack stores the two 32-bit halves of \p Val through \p LowBits and
/// \p HighBits; Pack builds a 64-bit value from \p LowBits and \p HighBits.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
37b4f8443dSJoseph Huber
3867ab875fSJohannes Doerfert /// AMDGCN Implementation
3967ab875fSJohannes Doerfert ///
4067ab875fSJohannes Doerfert ///{
4167ab875fSJohannes Doerfert #pragma omp begin declare variant match(device = {arch(amdgcn)})
4267ab875fSJohannes Doerfert
/// Split the 64-bit value \p Val into two 32-bit halves.
///
/// \param Val      The value to split.
/// \param LowBits  Receives bits [31:0] of \p Val.
/// \param HighBits Receives bits [63:32] of \p Val.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  // Truncating and shifting the fixed-width uint64_t directly needs no mask
  // literals, so the result does not depend on the width of `unsigned long`.
  *LowBits = static_cast<uint32_t>(Val);
  *HighBits = static_cast<uint32_t>(Val >> 32);
}
4867ab875fSJohannes Doerfert
/// Combine two 32-bit halves into one 64-bit value.
///
/// \param LowBits  Becomes bits [31:0] of the result.
/// \param HighBits Becomes bits [63:32] of the result.
/// \returns The combined 64-bit value.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  const uint64_t Upper = static_cast<uint64_t>(HighBits) << 32;
  const uint64_t Lower = static_cast<uint64_t>(LowBits);
  return Upper | Lower;
}
5267ab875fSJohannes Doerfert
5367ab875fSJohannes Doerfert #pragma omp end declare variant
5467ab875fSJohannes Doerfert
///}

/// NVPTX Implementation
///
///{
5867ab875fSJohannes Doerfert #pragma omp begin declare variant match( \
5967ab875fSJohannes Doerfert device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
6067ab875fSJohannes Doerfert
/// Split the 64-bit value \p Val into two 32-bit values with a single
/// `mov.b64` inline-assembly instruction. The first destination operand is
/// stored to \p LowBits and the second to \p HighBits.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  // The asm writes into locals; the results are copied out through the
  // pointers afterwards.
  uint32_t Lo, Hi;
  asm("mov.b64 {%0,%1}, %2;" : "=r"(Lo), "=r"(Hi) : "l"(Val));
  *LowBits = Lo;
  *HighBits = Hi;
}
6967ab875fSJohannes Doerfert
/// Combine \p LowBits and \p HighBits into one 64-bit value with a single
/// `mov.b64` inline-assembly instruction. \p LowBits is the first source
/// operand of the pair and \p HighBits the second.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  uint64_t Packed;
  asm("mov.b64 %0, {%1,%2};"
      : "=l"(Packed)
      : "r"(LowBits), "r"(HighBits));
  return Packed;
}
7567ab875fSJohannes Doerfert
7667ab875fSJohannes Doerfert #pragma omp end declare variant
7767ab875fSJohannes Doerfert
///}

/// Forward declarations of the lane-shuffle helpers. The definitions are
/// provided per target inside the `declare variant` regions that follow.
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                    int32_t Width);
81b4f8443dSJoseph Huber
8267ab875fSJohannes Doerfert /// AMDGCN Implementation
8367ab875fSJohannes Doerfert ///
8467ab875fSJohannes Doerfert ///{
8567ab875fSJohannes Doerfert #pragma omp begin declare variant match(device = {arch(amdgcn)})
8667ab875fSJohannes Doerfert
shuffle(uint64_t Mask,int32_t Var,int32_t SrcLane)8767ab875fSJohannes Doerfert int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
8867ab875fSJohannes Doerfert int Width = mapping::getWarpSize();
897272982eSJon Chesterfield int Self = mapping::getThreadIdInWarp();
9067ab875fSJohannes Doerfert int Index = SrcLane + (Self & ~(Width - 1));
9167ab875fSJohannes Doerfert return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
9267ab875fSJohannes Doerfert }
9367ab875fSJohannes Doerfert
shuffleDown(uint64_t Mask,int32_t Var,uint32_t LaneDelta,int32_t Width)9467ab875fSJohannes Doerfert int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
9567ab875fSJohannes Doerfert int32_t Width) {
9667ab875fSJohannes Doerfert int Self = mapping::getThreadIdInWarp();
9767ab875fSJohannes Doerfert int Index = Self + LaneDelta;
9867ab875fSJohannes Doerfert Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
9967ab875fSJohannes Doerfert return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
10067ab875fSJohannes Doerfert }
10167ab875fSJohannes Doerfert
10267ab875fSJohannes Doerfert #pragma omp end declare variant
10367ab875fSJohannes Doerfert ///}
10467ab875fSJohannes Doerfert
10567ab875fSJohannes Doerfert /// NVPTX Implementation
10667ab875fSJohannes Doerfert ///
10767ab875fSJohannes Doerfert ///{
10867ab875fSJohannes Doerfert #pragma omp begin declare variant match( \
10967ab875fSJohannes Doerfert device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
11067ab875fSJohannes Doerfert
shuffle(uint64_t Mask,int32_t Var,int32_t SrcLane)11167ab875fSJohannes Doerfert int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
11267ab875fSJohannes Doerfert return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
11367ab875fSJohannes Doerfert }
11467ab875fSJohannes Doerfert
shuffleDown(uint64_t Mask,int32_t Var,uint32_t Delta,int32_t Width)11567ab875fSJohannes Doerfert int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
11667ab875fSJohannes Doerfert int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
11767ab875fSJohannes Doerfert return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
11867ab875fSJohannes Doerfert }
11967ab875fSJohannes Doerfert
12067ab875fSJohannes Doerfert #pragma omp end declare variant
12167ab875fSJohannes Doerfert } // namespace impl
12267ab875fSJohannes Doerfert
pack(uint32_t LowBits,uint32_t HighBits)12367ab875fSJohannes Doerfert uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
12467ab875fSJohannes Doerfert return impl::Pack(LowBits, HighBits);
12567ab875fSJohannes Doerfert }
12667ab875fSJohannes Doerfert
unpack(uint64_t Val,uint32_t & LowBits,uint32_t & HighBits)12767ab875fSJohannes Doerfert void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
12867ab875fSJohannes Doerfert impl::Unpack(Val, &LowBits, &HighBits);
12967ab875fSJohannes Doerfert }
13067ab875fSJohannes Doerfert
shuffle(uint64_t Mask,int32_t Var,int32_t SrcLane)13167ab875fSJohannes Doerfert int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
13267ab875fSJohannes Doerfert return impl::shuffle(Mask, Var, SrcLane);
13367ab875fSJohannes Doerfert }
13467ab875fSJohannes Doerfert
shuffleDown(uint64_t Mask,int32_t Var,uint32_t Delta,int32_t Width)13567ab875fSJohannes Doerfert int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
13667ab875fSJohannes Doerfert int32_t Width) {
13767ab875fSJohannes Doerfert return impl::shuffleDown(Mask, Var, Delta, Width);
13867ab875fSJohannes Doerfert }
13967ab875fSJohannes Doerfert
14067ab875fSJohannes Doerfert extern "C" {
__kmpc_shuffle_int32(int32_t Val,int16_t Delta,int16_t SrcLane)14167ab875fSJohannes Doerfert int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
14274f91741SJoseph Huber FunctionTracingRAII();
14367ab875fSJohannes Doerfert return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
14467ab875fSJohannes Doerfert }
14567ab875fSJohannes Doerfert
__kmpc_shuffle_int64(int64_t Val,int16_t Delta,int16_t Width)14667ab875fSJohannes Doerfert int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
14774f91741SJoseph Huber FunctionTracingRAII();
14867ab875fSJohannes Doerfert uint32_t lo, hi;
14967ab875fSJohannes Doerfert utils::unpack(Val, lo, hi);
15067ab875fSJohannes Doerfert hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
15167ab875fSJohannes Doerfert lo = impl::shuffleDown(lanes::All, lo, Delta, Width);
15267ab875fSJohannes Doerfert return utils::pack(lo, hi);
15367ab875fSJohannes Doerfert }
15467ab875fSJohannes Doerfert }
15567ab875fSJohannes Doerfert
15667ab875fSJohannes Doerfert #pragma omp end declare target
157