1 //===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
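//
// Utility helpers for the OpenMP device runtime: packing/unpacking of 64-bit
// values into 32-bit halves and warp shuffle primitives, with AMDGCN and NVPTX
// variants, plus the __kmpc_shuffle_int32/int64 entry points built on them.
//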
10 //===----------------------------------------------------------------------===//
11 
12 #include "Utils.h"
13 
14 #include "Debug.h"
15 #include "Interface.h"
16 #include "Mapping.h"
17 
18 #pragma omp declare target
19 
20 using namespace _OMP;
21 
22 namespace _OMP {
23 /// Helper to keep code alive without introducing a performance penalty.
24 __attribute__((used, weak, optnone)) void keepAlive() {
25   __kmpc_get_hardware_thread_id_in_block();
26   __kmpc_get_hardware_num_threads_in_block();
27   __kmpc_barrier_simple_spmd(nullptr, 0);
28 }
29 } // namespace _OMP
30 
31 namespace impl {
32 
33 /// AMDGCN Implementation
34 ///
35 ///{
36 #pragma omp begin declare variant match(device = {arch(amdgcn)})
37 
/// Split the 64-bit value \p Val into its two 32-bit halves.
///
/// \param Val      Value to split.
/// \param LowBits  Receives bits [31:0] of \p Val.
/// \param HighBits Receives bits [63:32] of \p Val.
///
/// The halves are extracted by truncation and a shift on the fixed-width
/// `uint64_t`, so the result does not depend on the width of `unsigned long`.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  *LowBits = static_cast<uint32_t>(Val);
  *HighBits = static_cast<uint32_t>(Val >> 32);
}
43 
/// Combine two 32-bit halves into one 64-bit value.
///
/// \param LowBits  Becomes bits [31:0] of the result.
/// \param HighBits Becomes bits [63:32] of the result.
/// \returns The packed 64-bit value.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  const uint64_t Upper = static_cast<uint64_t>(HighBits) << 32;
  const uint64_t Lower = static_cast<uint64_t>(LowBits);
  return Upper | Lower;
}
47 
48 #pragma omp end declare variant
49 
50 /// NVPTX Implementation
51 ///
52 ///{
53 #pragma omp begin declare variant match(                                       \
54     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
55 
/// NVPTX variant: split \p Val into two 32-bit values with a single
/// `mov.b64` inline-asm statement.
///
/// Output operand %0 is stored to \p LowBits and operand %1 to \p HighBits.
/// The asm writes into locals first; the results are then copied out through
/// the pointers.
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
  uint32_t Lo;
  uint32_t Hi;
  asm("mov.b64 {%0,%1}, %2;" : "=r"(Lo), "=r"(Hi) : "l"(Val));
  *LowBits = Lo;
  *HighBits = Hi;
}
64 
/// NVPTX variant: build a 64-bit value from \p LowBits and \p HighBits with a
/// single `mov.b64` inline-asm statement.
///
/// \p LowBits is input operand %1 and \p HighBits is input operand %2 of the
/// `{%1,%2}` source; the 64-bit output operand %0 is returned.
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
  uint64_t Packed;
  asm("mov.b64 %0, {%1,%2};"
      : "=l"(Packed)
      : "r"(LowBits), "r"(HighBits));
  return Packed;
}
70 
71 #pragma omp end declare variant
72 
73 /// AMDGCN Implementation
74 ///
75 ///{
76 #pragma omp begin declare variant match(device = {arch(amdgcn)})
77 
78 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
79   int Width = mapping::getWarpSize();
80   int Self = mapping::getThreadIdInWarp();
81   int Index = SrcLane + (Self & ~(Width - 1));
82   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
83 }
84 
85 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
86                     int32_t Width) {
87   int Self = mapping::getThreadIdInWarp();
88   int Index = Self + LaneDelta;
89   Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
90   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
91 }
92 
93 #pragma omp end declare variant
94 ///}
95 
96 /// NVPTX Implementation
97 ///
98 ///{
99 #pragma omp begin declare variant match(                                       \
100     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
101 
102 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
103   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
104 }
105 
106 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
107   int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
108   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
109 }
110 
111 #pragma omp end declare variant
112 } // namespace impl
113 
114 uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
115   return impl::Pack(LowBits, HighBits);
116 }
117 
118 void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
119   impl::Unpack(Val, &LowBits, &HighBits);
120 }
121 
122 int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
123   return impl::shuffle(Mask, Var, SrcLane);
124 }
125 
126 int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
127                            int32_t Width) {
128   return impl::shuffleDown(Mask, Var, Delta, Width);
129 }
130 
131 extern "C" {
132 int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
133   FunctionTracingRAII();
134   return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
135 }
136 
137 int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
138   FunctionTracingRAII();
139   uint32_t lo, hi;
140   utils::unpack(Val, lo, hi);
141   hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
142   lo = impl::shuffleDown(lanes::All, lo, Delta, Width);
143   return utils::pack(lo, hi);
144 }
145 }
146 
147 #pragma omp end declare target
148