19e05c084SJon Chesterfield //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2d0b31295SJon Chesterfield //
3d0b31295SJon Chesterfield // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4d0b31295SJon Chesterfield // See https://llvm.org/LICENSE.txt for license information.
5d0b31295SJon Chesterfield // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6d0b31295SJon Chesterfield //
7d0b31295SJon Chesterfield //===----------------------------------------------------------------------===//
8d0b31295SJon Chesterfield //
9d022f39dSRon Lieberman // RTL for AMD hsa machine
10d0b31295SJon Chesterfield //
11d0b31295SJon Chesterfield //===----------------------------------------------------------------------===//
12d0b31295SJon Chesterfield 
13d0b31295SJon Chesterfield #include <algorithm>
14d0b31295SJon Chesterfield #include <assert.h>
15d0b31295SJon Chesterfield #include <cstdio>
16d0b31295SJon Chesterfield #include <cstdlib>
17d0b31295SJon Chesterfield #include <cstring>
1820df2c70SPushpinder Singh #include <functional>
19d0b31295SJon Chesterfield #include <libelf.h>
20d0b31295SJon Chesterfield #include <list>
21d0b31295SJon Chesterfield #include <memory>
225d989fb3SJonChesterfield #include <mutex>
235d989fb3SJonChesterfield #include <shared_mutex>
24d0b31295SJon Chesterfield #include <unordered_map>
25d0b31295SJon Chesterfield #include <vector>
26d0b31295SJon Chesterfield 
2768ab93f4SJon Chesterfield #include "impl_runtime.h"
28d27d0a67SJoseph Huber #include "interop_hsa.h"
29d0b31295SJon Chesterfield 
30d0b31295SJon Chesterfield #include "internal.h"
313153bdd5SJon Chesterfield #include "rt.h"
32d0b31295SJon Chesterfield 
330c554a47SJon Chesterfield #include "DeviceEnvironment.h"
34cab9f692SJon Chesterfield #include "get_elf_mach_gfx_name.h"
35d0b31295SJon Chesterfield #include "omptargetplugin.h"
3630c0d5b4SRon Lieberman #include "print_tracing.h"
37d0b31295SJon Chesterfield 
384075a811SSaiyedul Islam #include "llvm/ADT/StringMap.h"
394075a811SSaiyedul Islam #include "llvm/ADT/StringRef.h"
40b75a7481SJon Chesterfield #include "llvm/Frontend/OpenMP/OMPConstants.h"
417d2ecef5SJonChesterfield #include "llvm/Frontend/OpenMP/OMPGridValues.h"
42d0b31295SJon Chesterfield 
434075a811SSaiyedul Islam using namespace llvm;
444075a811SSaiyedul Islam 
4571f46930SJon Chesterfield // hostrpc interface, FIXME: consider moving to its own include these are
4671f46930SJon Chesterfield // statically linked into amdgpu/plugin if present from hostrpc_services.a,
4771f46930SJon Chesterfield // linked as --whole-archive to override the weak symbols that are used to
4871f46930SJon Chesterfield // implement a fallback for toolchains that do not yet have a hostrpc library.
4971f46930SJon Chesterfield extern "C" {
50d27d0a67SJoseph Huber uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
51d27d0a67SJoseph Huber                                uint32_t DeviceId);
5271f46930SJon Chesterfield hsa_status_t hostrpc_init();
5371f46930SJon Chesterfield hsa_status_t hostrpc_terminate();
5471f46930SJon Chesterfield 
hostrpc_init()5571f46930SJon Chesterfield __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
hostrpc_terminate()5671f46930SJon Chesterfield __attribute__((weak)) hsa_status_t hostrpc_terminate() {
5771f46930SJon Chesterfield   return HSA_STATUS_SUCCESS;
5871f46930SJon Chesterfield }
hostrpc_assign_buffer(hsa_agent_t,hsa_queue_t *,uint32_t DeviceId)59a74826d3SJon Chesterfield __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
60d27d0a67SJoseph Huber                                                      uint32_t DeviceId) {
6171f46930SJon Chesterfield   DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
6271f46930SJon Chesterfield      "missing\n",
63d27d0a67SJoseph Huber      DeviceId);
6471f46930SJon Chesterfield   return 0;
6571f46930SJon Chesterfield }
6671f46930SJon Chesterfield }
6771f46930SJon Chesterfield 
68e0b713a0SDhruva Chakrabarti // Heuristic parameters used for kernel launch
69e0b713a0SDhruva Chakrabarti // Number of teams per CU to allow scheduling flexibility
70e0b713a0SDhruva Chakrabarti static const unsigned DefaultTeamsPerCU = 4;
71e0b713a0SDhruva Chakrabarti 
72d0b31295SJon Chesterfield int print_kernel_trace;
73d0b31295SJon Chesterfield 
74d0b31295SJon Chesterfield #ifdef OMPTARGET_DEBUG
75d0b31295SJon Chesterfield #define check(msg, status)                                                     \
7607f59baaSJon Chesterfield   if (status != HSA_STATUS_SUCCESS) {                                          \
77d0b31295SJon Chesterfield     DP(#msg " failed\n");                                                      \
78d0b31295SJon Chesterfield   } else {                                                                     \
79d0b31295SJon Chesterfield     DP(#msg " succeeded\n");                                                   \
80d0b31295SJon Chesterfield   }
81d0b31295SJon Chesterfield #else
82d0b31295SJon Chesterfield #define check(msg, status)                                                     \
83d0b31295SJon Chesterfield   {}
84d0b31295SJon Chesterfield #endif
85d0b31295SJon Chesterfield 
86a81c68aeSShilei Tian #include "elf_common.h"
87d0b31295SJon Chesterfield 
88cadcaf3fSPushpinder Singh namespace hsa {
iterate_agents(C Cb)89d27d0a67SJoseph Huber template <typename C> hsa_status_t iterate_agents(C Cb) {
90d27d0a67SJoseph Huber   auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
91d27d0a67SJoseph Huber     C *Unwrapped = static_cast<C *>(Data);
92d27d0a67SJoseph Huber     return (*Unwrapped)(Agent);
93cadcaf3fSPushpinder Singh   };
94d27d0a67SJoseph Huber   return hsa_iterate_agents(L, static_cast<void *>(&Cb));
95cadcaf3fSPushpinder Singh }
96cadcaf3fSPushpinder Singh 
9720df2c70SPushpinder Singh template <typename C>
amd_agent_iterate_memory_pools(hsa_agent_t Agent,C Cb)98d27d0a67SJoseph Huber hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
99d27d0a67SJoseph Huber   auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
100d27d0a67SJoseph Huber     C *Unwrapped = static_cast<C *>(Data);
101d27d0a67SJoseph Huber     return (*Unwrapped)(MemoryPool);
10220df2c70SPushpinder Singh   };
10320df2c70SPushpinder Singh 
104d27d0a67SJoseph Huber   return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
10520df2c70SPushpinder Singh }
10620df2c70SPushpinder Singh 
107cadcaf3fSPushpinder Singh } // namespace hsa
108cadcaf3fSPushpinder Singh 
109d0b31295SJon Chesterfield /// Keep entries table per device
110d0b31295SJon Chesterfield struct FuncOrGblEntryTy {
111d0b31295SJon Chesterfield   __tgt_target_table Table;
112d0b31295SJon Chesterfield   std::vector<__tgt_offload_entry> Entries;
113d0b31295SJon Chesterfield };
114d0b31295SJon Chesterfield 
115d0b31295SJon Chesterfield struct KernelArgPool {
116d0b31295SJon Chesterfield private:
117d27d0a67SJoseph Huber   static pthread_mutex_t Mutex;
118d0b31295SJon Chesterfield 
119d0b31295SJon Chesterfield public:
120d27d0a67SJoseph Huber   uint32_t KernargSegmentSize;
121d27d0a67SJoseph Huber   void *KernargRegion = nullptr;
122d27d0a67SJoseph Huber   std::queue<int> FreeKernargSegments;
123d0b31295SJon Chesterfield 
kernargSizeIncludingImplicitKernelArgPool124d27d0a67SJoseph Huber   uint32_t kernargSizeIncludingImplicit() {
125d27d0a67SJoseph Huber     return KernargSegmentSize + sizeof(impl_implicit_args_t);
126d0b31295SJon Chesterfield   }
127d0b31295SJon Chesterfield 
~KernelArgPoolKernelArgPool128d0b31295SJon Chesterfield   ~KernelArgPool() {
129d27d0a67SJoseph Huber     if (KernargRegion) {
130d27d0a67SJoseph Huber       auto R = hsa_amd_memory_pool_free(KernargRegion);
131d27d0a67SJoseph Huber       if (R != HSA_STATUS_SUCCESS) {
132d27d0a67SJoseph Huber         DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
1339934571eSJon Chesterfield       }
134d0b31295SJon Chesterfield     }
135d0b31295SJon Chesterfield   }
136d0b31295SJon Chesterfield 
137d0b31295SJon Chesterfield   // Can't really copy or move a mutex
138d0b31295SJon Chesterfield   KernelArgPool() = default;
139d0b31295SJon Chesterfield   KernelArgPool(const KernelArgPool &) = delete;
140d0b31295SJon Chesterfield   KernelArgPool(KernelArgPool &&) = delete;
141d0b31295SJon Chesterfield 
KernelArgPoolKernelArgPool142d27d0a67SJoseph Huber   KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
143d27d0a67SJoseph Huber       : KernargSegmentSize(KernargSegmentSize) {
144d0b31295SJon Chesterfield 
1453d853429SJon Chesterfield     // impl uses one pool per kernel for all gpus, with a fixed upper size
146d0b31295SJon Chesterfield     // preserving that exact scheme here, including the queue<int>
147d18fb09cSJon Chesterfield 
148d27d0a67SJoseph Huber     hsa_status_t Err = hsa_amd_memory_pool_allocate(
149d27d0a67SJoseph Huber         MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
150d27d0a67SJoseph Huber         &KernargRegion);
151d18fb09cSJon Chesterfield 
152d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
153d27d0a67SJoseph Huber       DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
154d27d0a67SJoseph Huber       KernargRegion = nullptr; // paranoid
155d18fb09cSJon Chesterfield       return;
1569934571eSJon Chesterfield     }
157d18fb09cSJon Chesterfield 
158d27d0a67SJoseph Huber     Err = core::allow_access_to_all_gpu_agents(KernargRegion);
159d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
160d18fb09cSJon Chesterfield       DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
161d27d0a67SJoseph Huber          get_error_string(Err));
162d27d0a67SJoseph Huber       auto R = hsa_amd_memory_pool_free(KernargRegion);
163d27d0a67SJoseph Huber       if (R != HSA_STATUS_SUCCESS) {
164d18fb09cSJon Chesterfield         // if free failed, can't do anything more to resolve it
165d27d0a67SJoseph Huber         DP("hsa memory poll free failed: %s\n", get_error_string(Err));
166d18fb09cSJon Chesterfield       }
167d27d0a67SJoseph Huber       KernargRegion = nullptr;
168d18fb09cSJon Chesterfield       return;
169d18fb09cSJon Chesterfield     }
170d0b31295SJon Chesterfield 
171d27d0a67SJoseph Huber     for (int I = 0; I < MAX_NUM_KERNELS; I++) {
172d27d0a67SJoseph Huber       FreeKernargSegments.push(I);
173d0b31295SJon Chesterfield     }
174d0b31295SJon Chesterfield   }
175d0b31295SJon Chesterfield 
allocateKernelArgPool176d27d0a67SJoseph Huber   void *allocate(uint64_t ArgNum) {
177d27d0a67SJoseph Huber     assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
178d27d0a67SJoseph Huber     Lock L(&Mutex);
179d27d0a67SJoseph Huber     void *Res = nullptr;
180d27d0a67SJoseph Huber     if (!FreeKernargSegments.empty()) {
181d0b31295SJon Chesterfield 
182d27d0a67SJoseph Huber       int FreeIdx = FreeKernargSegments.front();
183d27d0a67SJoseph Huber       Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
184d27d0a67SJoseph Huber                                 (FreeIdx * kernargSizeIncludingImplicit()));
185d27d0a67SJoseph Huber       assert(FreeIdx == pointerToIndex(Res));
186d27d0a67SJoseph Huber       FreeKernargSegments.pop();
187d0b31295SJon Chesterfield     }
188d27d0a67SJoseph Huber     return Res;
189d0b31295SJon Chesterfield   }
190d0b31295SJon Chesterfield 
deallocateKernelArgPool191d27d0a67SJoseph Huber   void deallocate(void *Ptr) {
192d27d0a67SJoseph Huber     Lock L(&Mutex);
193d27d0a67SJoseph Huber     int Idx = pointerToIndex(Ptr);
194d27d0a67SJoseph Huber     FreeKernargSegments.push(Idx);
195d0b31295SJon Chesterfield   }
196d0b31295SJon Chesterfield 
197d0b31295SJon Chesterfield private:
pointerToIndexKernelArgPool198d27d0a67SJoseph Huber   int pointerToIndex(void *Ptr) {
199d27d0a67SJoseph Huber     ptrdiff_t Bytes =
200d27d0a67SJoseph Huber         static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
201d27d0a67SJoseph Huber     assert(Bytes >= 0);
202d27d0a67SJoseph Huber     assert(Bytes % kernargSizeIncludingImplicit() == 0);
203d27d0a67SJoseph Huber     return Bytes / kernargSizeIncludingImplicit();
204d0b31295SJon Chesterfield   }
205d27d0a67SJoseph Huber   struct Lock {
LockKernelArgPool::Lock206d27d0a67SJoseph Huber     Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
~LockKernelArgPool::Lock207d27d0a67SJoseph Huber     ~Lock() { pthread_mutex_unlock(M); }
208d27d0a67SJoseph Huber     pthread_mutex_t *M;
209d0b31295SJon Chesterfield   };
210d0b31295SJon Chesterfield };
211d27d0a67SJoseph Huber pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
212d0b31295SJon Chesterfield 
213*046d5b91SJoseph Huber std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
214*046d5b91SJoseph Huber     KernelArgPoolMap;
215*046d5b91SJoseph Huber 
216d0b31295SJon Chesterfield /// Use a single entity to encode a kernel and a set of flags
217d0b31295SJon Chesterfield struct KernelTy {
218b75a7481SJon Chesterfield   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
219d0b31295SJon Chesterfield   int16_t ConstWGSize;
220d27d0a67SJoseph Huber   int32_t DeviceId;
221ae9d96a6SJon Chesterfield   void *CallStackAddr = nullptr;
222d0b31295SJon Chesterfield   const char *Name;
223d0b31295SJon Chesterfield 
KernelTyKernelTy224d27d0a67SJoseph Huber   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
225d27d0a67SJoseph Huber            int32_t DeviceId, void *CallStackAddr, const char *Name,
226d27d0a67SJoseph Huber            uint32_t KernargSegmentSize,
227*046d5b91SJoseph Huber            hsa_amd_memory_pool_t &KernArgMemoryPool)
228d27d0a67SJoseph Huber       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
229d27d0a67SJoseph Huber         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
230d0b31295SJon Chesterfield     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
231d0b31295SJon Chesterfield 
232d27d0a67SJoseph Huber     std::string N(Name);
233d0b31295SJon Chesterfield     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
234d0b31295SJon Chesterfield       KernelArgPoolMap.insert(
235f5f329a3SPushpinder Singh           std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
236d27d0a67SJoseph Huber                                 KernargSegmentSize, KernArgMemoryPool))));
237d0b31295SJon Chesterfield     }
238d0b31295SJon Chesterfield   }
239d0b31295SJon Chesterfield };
240d0b31295SJon Chesterfield 
241*046d5b91SJoseph Huber /// List that contains all the kernels.
242*046d5b91SJoseph Huber /// FIXME: we may need this to be per device and per library.
243*046d5b91SJoseph Huber std::list<KernelTy> KernelsList;
244*046d5b91SJoseph Huber 
findAgents(Callback CB)245d27d0a67SJoseph Huber template <typename Callback> static hsa_status_t findAgents(Callback CB) {
246d0b31295SJon Chesterfield 
247d27d0a67SJoseph Huber   hsa_status_t Err =
248d27d0a67SJoseph Huber       hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
249d27d0a67SJoseph Huber         hsa_device_type_t DeviceType;
250d0b31295SJon Chesterfield         // get_info fails iff HSA runtime not yet initialized
251d27d0a67SJoseph Huber         hsa_status_t Err =
252d27d0a67SJoseph Huber             hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
2539b2c6c07SJon Chesterfield 
254d27d0a67SJoseph Huber         if (Err != HSA_STATUS_SUCCESS) {
2559b2c6c07SJon Chesterfield           if (print_kernel_trace > 0)
256d27d0a67SJoseph Huber             DP("rtl.cpp: err %s\n", get_error_string(Err));
2579b2c6c07SJon Chesterfield 
258d27d0a67SJoseph Huber           return Err;
2599b2c6c07SJon Chesterfield         }
260d0b31295SJon Chesterfield 
261d27d0a67SJoseph Huber         CB(DeviceType, Agent);
262d0b31295SJon Chesterfield         return HSA_STATUS_SUCCESS;
263cadcaf3fSPushpinder Singh       });
264d0b31295SJon Chesterfield 
265d0b31295SJon Chesterfield   // iterate_agents fails iff HSA runtime not yet initialized
266d27d0a67SJoseph Huber   if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
267d27d0a67SJoseph Huber     DP("rtl.cpp: err %s\n", get_error_string(Err));
268cadcaf3fSPushpinder Singh   }
269cadcaf3fSPushpinder Singh 
270d27d0a67SJoseph Huber   return Err;
271d0b31295SJon Chesterfield }
272d0b31295SJon Chesterfield 
callbackQueue(hsa_status_t Status,hsa_queue_t * Source,void * Data)273d27d0a67SJoseph Huber static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
274d27d0a67SJoseph Huber                           void *Data) {
275d27d0a67SJoseph Huber   if (Status != HSA_STATUS_SUCCESS) {
276d27d0a67SJoseph Huber     const char *StatusString;
277d27d0a67SJoseph Huber     if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
278d27d0a67SJoseph Huber       StatusString = "unavailable";
279d0b31295SJon Chesterfield     }
280d27d0a67SJoseph Huber     DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
281d27d0a67SJoseph Huber        Status, StatusString);
282d0b31295SJon Chesterfield     abort();
283d0b31295SJon Chesterfield   }
284d0b31295SJon Chesterfield }
285d0b31295SJon Chesterfield 
286d0b31295SJon Chesterfield namespace core {
2877e9351b9SJon Chesterfield namespace {
28815ed5c0aSJose Manuel Monsalve Diaz 
checkResult(hsa_status_t Err,const char * ErrMsg)28915ed5c0aSJose Manuel Monsalve Diaz bool checkResult(hsa_status_t Err, const char *ErrMsg) {
29015ed5c0aSJose Manuel Monsalve Diaz   if (Err == HSA_STATUS_SUCCESS)
29115ed5c0aSJose Manuel Monsalve Diaz     return true;
29215ed5c0aSJose Manuel Monsalve Diaz 
29315ed5c0aSJose Manuel Monsalve Diaz   REPORT("%s", ErrMsg);
29415ed5c0aSJose Manuel Monsalve Diaz   REPORT("%s", get_error_string(Err));
29515ed5c0aSJose Manuel Monsalve Diaz   return false;
29615ed5c0aSJose Manuel Monsalve Diaz }
29715ed5c0aSJose Manuel Monsalve Diaz 
packetStoreRelease(uint32_t * Packet,uint16_t Header,uint16_t Rest)298d27d0a67SJoseph Huber void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
299d27d0a67SJoseph Huber   __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
300d0b31295SJon Chesterfield }
301d0b31295SJon Chesterfield 
createHeader()302d27d0a67SJoseph Huber uint16_t createHeader() {
303d27d0a67SJoseph Huber   uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
304d27d0a67SJoseph Huber   Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
305d27d0a67SJoseph Huber   Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
306d27d0a67SJoseph Huber   return Header;
307d0b31295SJon Chesterfield }
308f5f329a3SPushpinder Singh 
isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool)309b1695c2eSPushpinder Singh hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
3109d0eb440SPushpinder Singh   bool AllocAllowed = false;
3119d0eb440SPushpinder Singh   hsa_status_t Err = hsa_amd_memory_pool_get_info(
3129d0eb440SPushpinder Singh       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
3139d0eb440SPushpinder Singh       &AllocAllowed);
3149d0eb440SPushpinder Singh   if (Err != HSA_STATUS_SUCCESS) {
3159d0eb440SPushpinder Singh     DP("Alloc allowed in memory pool check failed: %s\n",
3169d0eb440SPushpinder Singh        get_error_string(Err));
317b1695c2eSPushpinder Singh     return Err;
3189d0eb440SPushpinder Singh   }
3199d0eb440SPushpinder Singh 
320b1695c2eSPushpinder Singh   size_t Size = 0;
321b1695c2eSPushpinder Singh   Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
322b1695c2eSPushpinder Singh                                      &Size);
323b1695c2eSPushpinder Singh   if (Err != HSA_STATUS_SUCCESS) {
324b1695c2eSPushpinder Singh     DP("Get memory pool size failed: %s\n", get_error_string(Err));
325b1695c2eSPushpinder Singh     return Err;
326b1695c2eSPushpinder Singh   }
327b1695c2eSPushpinder Singh 
328b1695c2eSPushpinder Singh   return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
3299d0eb440SPushpinder Singh }
3309d0eb440SPushpinder Singh 
addMemoryPool(hsa_amd_memory_pool_t MemoryPool,void * Data)33105ba9ff6SJon Chesterfield hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
332f5f329a3SPushpinder Singh   std::vector<hsa_amd_memory_pool_t> *Result =
333f5f329a3SPushpinder Singh       static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
334f5f329a3SPushpinder Singh 
335d27d0a67SJoseph Huber   hsa_status_t Err;
336d27d0a67SJoseph Huber   if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
337d27d0a67SJoseph Huber     return Err;
338f5f329a3SPushpinder Singh   }
339f5f329a3SPushpinder Singh 
340f5f329a3SPushpinder Singh   Result->push_back(MemoryPool);
341f5f329a3SPushpinder Singh   return HSA_STATUS_SUCCESS;
342f5f329a3SPushpinder Singh }
343f5f329a3SPushpinder Singh 
3447e9351b9SJon Chesterfield } // namespace
345d0b31295SJon Chesterfield } // namespace core
346d0b31295SJon Chesterfield 
347ddfb074aSJon Chesterfield struct EnvironmentVariables {
348ddfb074aSJon Chesterfield   int NumTeams;
349ddfb074aSJon Chesterfield   int TeamLimit;
350ddfb074aSJon Chesterfield   int TeamThreadLimit;
351ddfb074aSJon Chesterfield   int MaxTeamsDefault;
352f4f23de1SJoseph Huber   int DynamicMemSize;
353ddfb074aSJon Chesterfield };
354ddfb074aSJon Chesterfield 
35578f92c38SJon Chesterfield template <uint32_t wavesize>
getGridValue()356ba0af885SJon Chesterfield static constexpr const llvm::omp::GV &getGridValue() {
35778f92c38SJon Chesterfield   return llvm::omp::getAMDGPUGridValues<wavesize>();
358ba0af885SJon Chesterfield }
359ba0af885SJon Chesterfield 
3606760234eSJon Chesterfield struct HSALifetime {
3616760234eSJon Chesterfield   // Wrapper around HSA used to ensure it is constructed before other types
3626760234eSJon Chesterfield   // and destructed after, which means said other types can use raii for
3636760234eSJon Chesterfield   // cleanup without risking running outside of the lifetime of HSA
3646760234eSJon Chesterfield   const hsa_status_t S;
3656760234eSJon Chesterfield 
HSAInitSuccessHSALifetime366d83dc4c6SCarlo Bertolli   bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
HSALifetimeHSALifetime3676760234eSJon Chesterfield   HSALifetime() : S(hsa_init()) {}
3686760234eSJon Chesterfield 
~HSALifetimeHSALifetime3696760234eSJon Chesterfield   ~HSALifetime() {
3706760234eSJon Chesterfield     if (S == HSA_STATUS_SUCCESS) {
3716760234eSJon Chesterfield       hsa_status_t Err = hsa_shut_down();
3726760234eSJon Chesterfield       if (Err != HSA_STATUS_SUCCESS) {
3736760234eSJon Chesterfield         // Can't call into HSA to get a string from the integer
3746760234eSJon Chesterfield         DP("Shutting down HSA failed: %d\n", Err);
3756760234eSJon Chesterfield       }
3766760234eSJon Chesterfield     }
3776760234eSJon Chesterfield   }
3786760234eSJon Chesterfield };
3796760234eSJon Chesterfield 
380d83dc4c6SCarlo Bertolli // Handle scheduling of multiple hsa_queue's per device to
381d83dc4c6SCarlo Bertolli // multiple threads (one scheduler per device)
382d83dc4c6SCarlo Bertolli class HSAQueueScheduler {
383d83dc4c6SCarlo Bertolli public:
HSAQueueScheduler()384d27d0a67SJoseph Huber   HSAQueueScheduler() : Current(0) {}
385d83dc4c6SCarlo Bertolli 
386d83dc4c6SCarlo Bertolli   HSAQueueScheduler(const HSAQueueScheduler &) = delete;
387d83dc4c6SCarlo Bertolli 
HSAQueueScheduler(HSAQueueScheduler && Q)388d27d0a67SJoseph Huber   HSAQueueScheduler(HSAQueueScheduler &&Q) {
389d27d0a67SJoseph Huber     Current = Q.Current.load();
390d27d0a67SJoseph Huber     for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
391d27d0a67SJoseph Huber       HSAQueues[I] = Q.HSAQueues[I];
392d27d0a67SJoseph Huber       Q.HSAQueues[I] = nullptr;
393d83dc4c6SCarlo Bertolli     }
394d83dc4c6SCarlo Bertolli   }
395d83dc4c6SCarlo Bertolli 
396d83dc4c6SCarlo Bertolli   // \return false if any HSA queue creation fails
createQueues(hsa_agent_t HSAAgent,uint32_t QueueSize)397d27d0a67SJoseph Huber   bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
398d27d0a67SJoseph Huber     for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
399d83dc4c6SCarlo Bertolli       hsa_queue_t *Q = nullptr;
400d27d0a67SJoseph Huber       hsa_status_t Rc =
401d27d0a67SJoseph Huber           hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
402d83dc4c6SCarlo Bertolli                            callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
403d27d0a67SJoseph Huber       if (Rc != HSA_STATUS_SUCCESS) {
404d27d0a67SJoseph Huber         DP("Failed to create HSA queue %d\n", I);
405d83dc4c6SCarlo Bertolli         return false;
406d83dc4c6SCarlo Bertolli       }
407d27d0a67SJoseph Huber       HSAQueues[I] = Q;
408d83dc4c6SCarlo Bertolli     }
409d83dc4c6SCarlo Bertolli     return true;
410d83dc4c6SCarlo Bertolli   }
411d83dc4c6SCarlo Bertolli 
~HSAQueueScheduler()412d83dc4c6SCarlo Bertolli   ~HSAQueueScheduler() {
413d27d0a67SJoseph Huber     for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
414d27d0a67SJoseph Huber       if (HSAQueues[I]) {
415d27d0a67SJoseph Huber         hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
416d27d0a67SJoseph Huber         if (Err != HSA_STATUS_SUCCESS)
417d83dc4c6SCarlo Bertolli           DP("Error destroying HSA queue");
418d83dc4c6SCarlo Bertolli       }
419d83dc4c6SCarlo Bertolli     }
420d83dc4c6SCarlo Bertolli   }
421d83dc4c6SCarlo Bertolli 
422d83dc4c6SCarlo Bertolli   // \return next queue to use for device
next()423d27d0a67SJoseph Huber   hsa_queue_t *next() {
424d27d0a67SJoseph Huber     return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
425d83dc4c6SCarlo Bertolli                      NUM_QUEUES_PER_DEVICE];
426d83dc4c6SCarlo Bertolli   }
427d83dc4c6SCarlo Bertolli 
428d83dc4c6SCarlo Bertolli private:
429d83dc4c6SCarlo Bertolli   // Number of queues per device
430d83dc4c6SCarlo Bertolli   enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
431d83dc4c6SCarlo Bertolli   hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
432d27d0a67SJoseph Huber   std::atomic<uint8_t> Current;
433d83dc4c6SCarlo Bertolli };
434d83dc4c6SCarlo Bertolli 
435d0b31295SJon Chesterfield /// Class containing all the device information
436d83dc4c6SCarlo Bertolli class RTLDeviceInfoTy : HSALifetime {
437d0b31295SJon Chesterfield   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
438d0b31295SJon Chesterfield 
4398cf93a35SJon Chesterfield   struct QueueDeleter {
operator ()RTLDeviceInfoTy::QueueDeleter4408cf93a35SJon Chesterfield     void operator()(hsa_queue_t *Q) {
4418cf93a35SJon Chesterfield       if (Q) {
4428cf93a35SJon Chesterfield         hsa_status_t Err = hsa_queue_destroy(Q);
4438cf93a35SJon Chesterfield         if (Err != HSA_STATUS_SUCCESS) {
4448cf93a35SJon Chesterfield           DP("Error destroying hsa queue: %s\n", get_error_string(Err));
4458cf93a35SJon Chesterfield         }
4468cf93a35SJon Chesterfield       }
4478cf93a35SJon Chesterfield     }
4488cf93a35SJon Chesterfield   };
4498cf93a35SJon Chesterfield 
450d0b31295SJon Chesterfield public:
451738734f6SJon Chesterfield   bool ConstructionSucceeded = false;
452738734f6SJon Chesterfield 
4535d989fb3SJonChesterfield   // load binary populates symbol tables and mutates various global state
4545d989fb3SJonChesterfield   // run uses those symbol tables
455d27d0a67SJoseph Huber   std::shared_timed_mutex LoadRunLock;
4565d989fb3SJonChesterfield 
457dd0b463dSJon Chesterfield   int NumberOfDevices = 0;
458d0b31295SJon Chesterfield 
459d0b31295SJon Chesterfield   // GPU devices
460d0b31295SJon Chesterfield   std::vector<hsa_agent_t> HSAAgents;
461d83dc4c6SCarlo Bertolli   std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
462d0b31295SJon Chesterfield 
463cadcaf3fSPushpinder Singh   // CPUs
464cadcaf3fSPushpinder Singh   std::vector<hsa_agent_t> CPUAgents;
465cadcaf3fSPushpinder Singh 
466d0b31295SJon Chesterfield   // Device properties
467d0b31295SJon Chesterfield   std::vector<int> ComputeUnits;
468d0b31295SJon Chesterfield   std::vector<int> GroupsPerDevice;
469d0b31295SJon Chesterfield   std::vector<int> ThreadsPerGroup;
470d0b31295SJon Chesterfield   std::vector<int> WarpSize;
471cab9f692SJon Chesterfield   std::vector<std::string> GPUName;
4724075a811SSaiyedul Islam   std::vector<std::string> TargetID;
473d0b31295SJon Chesterfield 
474d0b31295SJon Chesterfield   // OpenMP properties
475d0b31295SJon Chesterfield   std::vector<int> NumTeams;
476d0b31295SJon Chesterfield   std::vector<int> NumThreads;
477d0b31295SJon Chesterfield 
478d0b31295SJon Chesterfield   // OpenMP Environment properties
479ddfb074aSJon Chesterfield   EnvironmentVariables Env;
480d0b31295SJon Chesterfield 
481d0b31295SJon Chesterfield   // OpenMP Requires Flags
482d0b31295SJon Chesterfield   int64_t RequiresFlags;
483d0b31295SJon Chesterfield 
484d0b31295SJon Chesterfield   // Resource pools
485d0b31295SJon Chesterfield   SignalPoolT FreeSignalPool;
486d0b31295SJon Chesterfield 
487d27d0a67SJoseph Huber   bool HostcallRequired = false;
488df005fa3SJon Chesterfield 
489d7503c3bSPushpinder Singh   std::vector<hsa_executable_t> HSAExecutables;
490d7503c3bSPushpinder Singh 
4917648b697SPushpinder Singh   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
4927648b697SPushpinder Singh   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
4937648b697SPushpinder Singh 
494f5f329a3SPushpinder Singh   hsa_amd_memory_pool_t KernArgPool;
495f5f329a3SPushpinder Singh 
49620df2c70SPushpinder Singh   // fine grained memory pool for host allocations
49720df2c70SPushpinder Singh   hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
49820df2c70SPushpinder Singh 
49920df2c70SPushpinder Singh   // fine and coarse-grained memory pools per offloading device
50020df2c70SPushpinder Singh   std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
50120df2c70SPushpinder Singh   std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
50220df2c70SPushpinder Singh 
503d27d0a67SJoseph Huber   struct ImplFreePtrDeletor {
operator ()RTLDeviceInfoTy::ImplFreePtrDeletor504d27d0a67SJoseph Huber     void operator()(void *P) {
505d27d0a67SJoseph Huber       core::Runtime::Memfree(P); // ignore failure to free
5065d989fb3SJonChesterfield     }
5075d989fb3SJonChesterfield   };
5085d989fb3SJonChesterfield 
5095d989fb3SJonChesterfield   // device_State shared across loaded binaries, error if inconsistent size
510d27d0a67SJoseph Huber   std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
511d27d0a67SJoseph Huber       DeviceStateStore;
5125d989fb3SJonChesterfield 
513ae9d96a6SJon Chesterfield   static const unsigned HardTeamLimit =
514ae9d96a6SJon Chesterfield       (1 << 16) - 1; // 64K needed to fit in uint16
515d0b31295SJon Chesterfield   static const int DefaultNumTeams = 128;
51678f92c38SJon Chesterfield 
51778f92c38SJon Chesterfield   // These need to be per-device since different devices can have different
51878f92c38SJon Chesterfield   // wave sizes, but are currently the same number for each so that refactor
51978f92c38SJon Chesterfield   // can be postponed.
52078f92c38SJon Chesterfield   static_assert(getGridValue<32>().GV_Max_Teams ==
52178f92c38SJon Chesterfield                     getGridValue<64>().GV_Max_Teams,
52278f92c38SJon Chesterfield                 "");
523d27d0a67SJoseph Huber   static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
52478f92c38SJon Chesterfield 
52578f92c38SJon Chesterfield   static_assert(getGridValue<32>().GV_Max_WG_Size ==
52678f92c38SJon Chesterfield                     getGridValue<64>().GV_Max_WG_Size,
52778f92c38SJon Chesterfield                 "");
528d27d0a67SJoseph Huber   static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
52978f92c38SJon Chesterfield 
53078f92c38SJon Chesterfield   static_assert(getGridValue<32>().GV_Default_WG_Size ==
53178f92c38SJon Chesterfield                     getGridValue<64>().GV_Default_WG_Size,
53278f92c38SJon Chesterfield                 "");
533d27d0a67SJoseph Huber   static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
534d0b31295SJon Chesterfield 
535d27d0a67SJoseph Huber   using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
536cc8dc5e2SCarlo Bertolli                                       hsa_agent_t, hsa_amd_memory_pool_t);
freesignalpoolMemcpy(void * Dest,void * Src,size_t Size,MemcpyFunc Func,int32_t DeviceId)537d27d0a67SJoseph Huber   hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
538d27d0a67SJoseph Huber                                     MemcpyFunc Func, int32_t DeviceId) {
539d27d0a67SJoseph Huber     hsa_agent_t Agent = HSAAgents[DeviceId];
540d27d0a67SJoseph Huber     hsa_signal_t S = FreeSignalPool.pop();
541d27d0a67SJoseph Huber     if (S.handle == 0) {
54207f59baaSJon Chesterfield       return HSA_STATUS_ERROR;
5435d989fb3SJonChesterfield     }
544d27d0a67SJoseph Huber     hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
545d27d0a67SJoseph Huber     FreeSignalPool.push(S);
546d27d0a67SJoseph Huber     return R;
5475d989fb3SJonChesterfield   }
5485d989fb3SJonChesterfield 
freesignalpoolMemcpyD2H(void * Dest,void * Src,size_t Size,int32_t DeviceId)549d27d0a67SJoseph Huber   hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
550d27d0a67SJoseph Huber                                        int32_t DeviceId) {
551d27d0a67SJoseph Huber     return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
552aa616efbSPushpinder Singh   }
553aa616efbSPushpinder Singh 
freesignalpoolMemcpyH2D(void * Dest,void * Src,size_t Size,int32_t DeviceId)554d27d0a67SJoseph Huber   hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
555d27d0a67SJoseph Huber                                        int32_t DeviceId) {
556d27d0a67SJoseph Huber     return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
557aa616efbSPushpinder Singh   }
558aa616efbSPushpinder Singh 
printDeviceInfo(int32_t DeviceId,hsa_agent_t Agent)559d27d0a67SJoseph Huber   static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
56015ed5c0aSJose Manuel Monsalve Diaz     char TmpChar[1000];
561d27d0a67SJoseph Huber     uint16_t Major, Minor;
56215ed5c0aSJose Manuel Monsalve Diaz     uint32_t TmpUInt;
56315ed5c0aSJose Manuel Monsalve Diaz     uint32_t TmpUInt2;
56415ed5c0aSJose Manuel Monsalve Diaz     uint32_t CacheSize[4];
56515ed5c0aSJose Manuel Monsalve Diaz     bool TmpBool;
566d27d0a67SJoseph Huber     uint16_t WorkgroupMaxDim[3];
567d27d0a67SJoseph Huber     hsa_dim3_t GridMaxDim;
56815ed5c0aSJose Manuel Monsalve Diaz 
56915ed5c0aSJose Manuel Monsalve Diaz     // Getting basic information about HSA and Device
57015ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
571d27d0a67SJoseph Huber         hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
57215ed5c0aSJose Manuel Monsalve Diaz         "Error from hsa_system_get_info when obtaining "
57315ed5c0aSJose Manuel Monsalve Diaz         "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
57415ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
575d27d0a67SJoseph Huber         hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
57615ed5c0aSJose Manuel Monsalve Diaz         "Error from hsa_system_get_info when obtaining "
57715ed5c0aSJose Manuel Monsalve Diaz         "HSA_SYSTEM_INFO_VERSION_MINOR\n");
578d27d0a67SJoseph Huber     printf("    HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
579d27d0a67SJoseph Huber     printf("    HSA OpenMP Device Number: \t\t%d \n", DeviceId);
58015ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
58115ed5c0aSJose Manuel Monsalve Diaz         hsa_agent_get_info(
582d27d0a67SJoseph Huber             Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
58315ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
58415ed5c0aSJose Manuel Monsalve Diaz         "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
58515ed5c0aSJose Manuel Monsalve Diaz     printf("    Product Name: \t\t\t%s \n", TmpChar);
586d27d0a67SJoseph Huber     core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
58715ed5c0aSJose Manuel Monsalve Diaz                       "Error returned from hsa_agent_get_info when obtaining "
58815ed5c0aSJose Manuel Monsalve Diaz                       "HSA_AGENT_INFO_NAME\n");
58915ed5c0aSJose Manuel Monsalve Diaz     printf("    Device Name: \t\t\t%s \n", TmpChar);
59015ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
591d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
59215ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
59315ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_NAME\n");
59415ed5c0aSJose Manuel Monsalve Diaz     printf("    Vendor Name: \t\t\t%s \n", TmpChar);
595d27d0a67SJoseph Huber     hsa_device_type_t DevType;
59615ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
597d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
59815ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
59915ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_DEVICE\n");
60015ed5c0aSJose Manuel Monsalve Diaz     printf("    Device Type: \t\t\t%s \n",
601d27d0a67SJoseph Huber            DevType == HSA_DEVICE_TYPE_CPU
60215ed5c0aSJose Manuel Monsalve Diaz                ? "CPU"
603d27d0a67SJoseph Huber                : (DevType == HSA_DEVICE_TYPE_GPU
60415ed5c0aSJose Manuel Monsalve Diaz                       ? "GPU"
605d27d0a67SJoseph Huber                       : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
60615ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
607d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
60815ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
60915ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_QUEUES_MAX\n");
61015ed5c0aSJose Manuel Monsalve Diaz     printf("    Max Queues: \t\t\t%u \n", TmpUInt);
61115ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
612d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
61315ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
61415ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
61515ed5c0aSJose Manuel Monsalve Diaz     printf("    Queue Min Size: \t\t\t%u \n", TmpUInt);
61615ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
617d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
61815ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
61915ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
62015ed5c0aSJose Manuel Monsalve Diaz     printf("    Queue Max Size: \t\t\t%u \n", TmpUInt);
62115ed5c0aSJose Manuel Monsalve Diaz 
62215ed5c0aSJose Manuel Monsalve Diaz     // Getting cache information
62315ed5c0aSJose Manuel Monsalve Diaz     printf("    Cache:\n");
62415ed5c0aSJose Manuel Monsalve Diaz 
62515ed5c0aSJose Manuel Monsalve Diaz     // FIXME: This is deprecated according to HSA documentation. But using
62615ed5c0aSJose Manuel Monsalve Diaz     // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
62715ed5c0aSJose Manuel Monsalve Diaz     // runtime.
62815ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
629d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
63015ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
63115ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_CACHE_SIZE\n");
63215ed5c0aSJose Manuel Monsalve Diaz 
633d27d0a67SJoseph Huber     for (int I = 0; I < 4; I++) {
634d27d0a67SJoseph Huber       if (CacheSize[I]) {
635d27d0a67SJoseph Huber         printf("      L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
63615ed5c0aSJose Manuel Monsalve Diaz       }
63715ed5c0aSJose Manuel Monsalve Diaz     }
63815ed5c0aSJose Manuel Monsalve Diaz 
63915ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
640d27d0a67SJoseph Huber         hsa_agent_get_info(Agent,
64115ed5c0aSJose Manuel Monsalve Diaz                            (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
64215ed5c0aSJose Manuel Monsalve Diaz                            &TmpUInt),
64315ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
64415ed5c0aSJose Manuel Monsalve Diaz         "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
64515ed5c0aSJose Manuel Monsalve Diaz     printf("    Cacheline Size: \t\t\t%u \n", TmpUInt);
64615ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
64715ed5c0aSJose Manuel Monsalve Diaz         hsa_agent_get_info(
648d27d0a67SJoseph Huber             Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
64915ed5c0aSJose Manuel Monsalve Diaz             &TmpUInt),
65015ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
65115ed5c0aSJose Manuel Monsalve Diaz         "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
65215ed5c0aSJose Manuel Monsalve Diaz     printf("    Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
65315ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
65415ed5c0aSJose Manuel Monsalve Diaz         hsa_agent_get_info(
655d27d0a67SJoseph Huber             Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
65615ed5c0aSJose Manuel Monsalve Diaz             &TmpUInt),
65715ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
65815ed5c0aSJose Manuel Monsalve Diaz         "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
65915ed5c0aSJose Manuel Monsalve Diaz     printf("    Compute Units: \t\t\t%u \n", TmpUInt);
66015ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(hsa_agent_get_info(
661d27d0a67SJoseph Huber                           Agent,
66215ed5c0aSJose Manuel Monsalve Diaz                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
66315ed5c0aSJose Manuel Monsalve Diaz                           &TmpUInt),
66415ed5c0aSJose Manuel Monsalve Diaz                       "Error returned from hsa_agent_get_info when obtaining "
66515ed5c0aSJose Manuel Monsalve Diaz                       "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
66615ed5c0aSJose Manuel Monsalve Diaz     printf("    SIMD per CU: \t\t\t%u \n", TmpUInt);
66715ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
668d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
66915ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
67015ed5c0aSJose Manuel Monsalve Diaz         "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
67115ed5c0aSJose Manuel Monsalve Diaz     printf("    Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
67215ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
673d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
67415ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
67515ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
67615ed5c0aSJose Manuel Monsalve Diaz     printf("    Wavefront Size: \t\t\t%u \n", TmpUInt2);
67715ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
678d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
67915ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
68015ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
68115ed5c0aSJose Manuel Monsalve Diaz     printf("    Workgroup Max Size: \t\t%u \n", TmpUInt);
682d27d0a67SJoseph Huber     core::checkResult(hsa_agent_get_info(Agent,
68315ed5c0aSJose Manuel Monsalve Diaz                                          HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
684d27d0a67SJoseph Huber                                          WorkgroupMaxDim),
68515ed5c0aSJose Manuel Monsalve Diaz                       "Error returned from hsa_agent_get_info when obtaining "
68615ed5c0aSJose Manuel Monsalve Diaz                       "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
68715ed5c0aSJose Manuel Monsalve Diaz     printf("    Workgroup Max Size per Dimension:\n");
688d27d0a67SJoseph Huber     printf("      x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
689d27d0a67SJoseph Huber     printf("      y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
690d27d0a67SJoseph Huber     printf("      z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
69115ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(hsa_agent_get_info(
692d27d0a67SJoseph Huber                           Agent,
69315ed5c0aSJose Manuel Monsalve Diaz                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
69415ed5c0aSJose Manuel Monsalve Diaz                           &TmpUInt),
69515ed5c0aSJose Manuel Monsalve Diaz                       "Error returned from hsa_agent_get_info when obtaining "
69615ed5c0aSJose Manuel Monsalve Diaz                       "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
69715ed5c0aSJose Manuel Monsalve Diaz     printf("    Max Waves Per CU: \t\t\t%u \n", TmpUInt);
69815ed5c0aSJose Manuel Monsalve Diaz     printf("    Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
69915ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
700d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
70115ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
70215ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
70315ed5c0aSJose Manuel Monsalve Diaz     printf("    Grid Max Size: \t\t\t%u \n", TmpUInt);
70415ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
705d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
70615ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
70715ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_GRID_MAX_DIM\n");
70815ed5c0aSJose Manuel Monsalve Diaz     printf("    Grid Max Size per Dimension: \t\t\n");
709d27d0a67SJoseph Huber     printf("      x: \t\t\t\t%u\n", GridMaxDim.x);
710d27d0a67SJoseph Huber     printf("      y: \t\t\t\t%u\n", GridMaxDim.y);
711d27d0a67SJoseph Huber     printf("      z: \t\t\t\t%u\n", GridMaxDim.z);
71215ed5c0aSJose Manuel Monsalve Diaz     core::checkResult(
713d27d0a67SJoseph Huber         hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
71415ed5c0aSJose Manuel Monsalve Diaz         "Error returned from hsa_agent_get_info when obtaining "
71515ed5c0aSJose Manuel Monsalve Diaz         "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
71615ed5c0aSJose Manuel Monsalve Diaz     printf("    Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
71715ed5c0aSJose Manuel Monsalve Diaz 
71815ed5c0aSJose Manuel Monsalve Diaz     printf("    Memory Pools:\n");
719d27d0a67SJoseph Huber     auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
72015ed5c0aSJose Manuel Monsalve Diaz       std::string TmpStr;
721d27d0a67SJoseph Huber       size_t Size;
722d27d0a67SJoseph Huber       bool Alloc, Access;
723d27d0a67SJoseph Huber       hsa_amd_segment_t Segment;
724d27d0a67SJoseph Huber       hsa_amd_memory_pool_global_flag_t GlobalFlags;
72515ed5c0aSJose Manuel Monsalve Diaz       core::checkResult(
72615ed5c0aSJose Manuel Monsalve Diaz           hsa_amd_memory_pool_get_info(
727d27d0a67SJoseph Huber               Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
72815ed5c0aSJose Manuel Monsalve Diaz           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
72915ed5c0aSJose Manuel Monsalve Diaz           "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
73015ed5c0aSJose Manuel Monsalve Diaz       core::checkResult(hsa_amd_memory_pool_get_info(
731d27d0a67SJoseph Huber                             Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
73215ed5c0aSJose Manuel Monsalve Diaz                         "Error returned from hsa_amd_memory_pool_get_info when "
73315ed5c0aSJose Manuel Monsalve Diaz                         "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
73415ed5c0aSJose Manuel Monsalve Diaz 
735d27d0a67SJoseph Huber       switch (Segment) {
73615ed5c0aSJose Manuel Monsalve Diaz       case HSA_AMD_SEGMENT_GLOBAL:
73715ed5c0aSJose Manuel Monsalve Diaz         TmpStr = "GLOBAL; FLAGS: ";
738d27d0a67SJoseph Huber         if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
73915ed5c0aSJose Manuel Monsalve Diaz           TmpStr += "KERNARG, ";
740d27d0a67SJoseph Huber         if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
74115ed5c0aSJose Manuel Monsalve Diaz           TmpStr += "FINE GRAINED, ";
742d27d0a67SJoseph Huber         if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
74315ed5c0aSJose Manuel Monsalve Diaz           TmpStr += "COARSE GRAINED, ";
74415ed5c0aSJose Manuel Monsalve Diaz         break;
74515ed5c0aSJose Manuel Monsalve Diaz       case HSA_AMD_SEGMENT_READONLY:
74615ed5c0aSJose Manuel Monsalve Diaz         TmpStr = "READONLY";
74715ed5c0aSJose Manuel Monsalve Diaz         break;
74815ed5c0aSJose Manuel Monsalve Diaz       case HSA_AMD_SEGMENT_PRIVATE:
74915ed5c0aSJose Manuel Monsalve Diaz         TmpStr = "PRIVATE";
75015ed5c0aSJose Manuel Monsalve Diaz         break;
75115ed5c0aSJose Manuel Monsalve Diaz       case HSA_AMD_SEGMENT_GROUP:
75215ed5c0aSJose Manuel Monsalve Diaz         TmpStr = "GROUP";
75315ed5c0aSJose Manuel Monsalve Diaz         break;
75415ed5c0aSJose Manuel Monsalve Diaz       }
75515ed5c0aSJose Manuel Monsalve Diaz       printf("      Pool %s: \n", TmpStr.c_str());
75615ed5c0aSJose Manuel Monsalve Diaz 
75715ed5c0aSJose Manuel Monsalve Diaz       core::checkResult(hsa_amd_memory_pool_get_info(
758d27d0a67SJoseph Huber                             Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
75915ed5c0aSJose Manuel Monsalve Diaz                         "Error returned from hsa_amd_memory_pool_get_info when "
76015ed5c0aSJose Manuel Monsalve Diaz                         "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
761d27d0a67SJoseph Huber       printf("        Size: \t\t\t\t %zu bytes\n", Size);
76215ed5c0aSJose Manuel Monsalve Diaz       core::checkResult(
76315ed5c0aSJose Manuel Monsalve Diaz           hsa_amd_memory_pool_get_info(
764d27d0a67SJoseph Huber               Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
76515ed5c0aSJose Manuel Monsalve Diaz           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
76615ed5c0aSJose Manuel Monsalve Diaz           "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
767d27d0a67SJoseph Huber       printf("        Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
76815ed5c0aSJose Manuel Monsalve Diaz       core::checkResult(
76915ed5c0aSJose Manuel Monsalve Diaz           hsa_amd_memory_pool_get_info(
770d27d0a67SJoseph Huber               Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
77115ed5c0aSJose Manuel Monsalve Diaz           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
77215ed5c0aSJose Manuel Monsalve Diaz           "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
773d27d0a67SJoseph Huber       printf("        Runtime Alloc Granule: \t\t %zu bytes\n", Size);
77415ed5c0aSJose Manuel Monsalve Diaz       core::checkResult(
77515ed5c0aSJose Manuel Monsalve Diaz           hsa_amd_memory_pool_get_info(
776d27d0a67SJoseph Huber               Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
77715ed5c0aSJose Manuel Monsalve Diaz           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
77815ed5c0aSJose Manuel Monsalve Diaz           "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
779d27d0a67SJoseph Huber       printf("        Runtime Alloc alignment: \t %zu bytes\n", Size);
78015ed5c0aSJose Manuel Monsalve Diaz       core::checkResult(
78115ed5c0aSJose Manuel Monsalve Diaz           hsa_amd_memory_pool_get_info(
782d27d0a67SJoseph Huber               Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
78315ed5c0aSJose Manuel Monsalve Diaz           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
78415ed5c0aSJose Manuel Monsalve Diaz           "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
78515ed5c0aSJose Manuel Monsalve Diaz       printf("        Accessable by all: \t\t %s\n",
786d27d0a67SJoseph Huber              (Access ? "TRUE" : "FALSE"));
78715ed5c0aSJose Manuel Monsalve Diaz 
78815ed5c0aSJose Manuel Monsalve Diaz       return HSA_STATUS_SUCCESS;
78915ed5c0aSJose Manuel Monsalve Diaz     };
79015ed5c0aSJose Manuel Monsalve Diaz     // Iterate over all the memory regions for this agent. Get the memory region
79115ed5c0aSJose Manuel Monsalve Diaz     // type and size
792d27d0a67SJoseph Huber     hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
79315ed5c0aSJose Manuel Monsalve Diaz 
79415ed5c0aSJose Manuel Monsalve Diaz     printf("    ISAs:\n");
795d27d0a67SJoseph Huber     auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
79615ed5c0aSJose Manuel Monsalve Diaz       char TmpChar[1000];
797d27d0a67SJoseph Huber       core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
79815ed5c0aSJose Manuel Monsalve Diaz                         "Error returned from hsa_isa_get_info_alt when "
79915ed5c0aSJose Manuel Monsalve Diaz                         "obtaining HSA_ISA_INFO_NAME\n");
80015ed5c0aSJose Manuel Monsalve Diaz       printf("        Name: \t\t\t\t %s\n", TmpChar);
80115ed5c0aSJose Manuel Monsalve Diaz 
80215ed5c0aSJose Manuel Monsalve Diaz       return HSA_STATUS_SUCCESS;
80315ed5c0aSJose Manuel Monsalve Diaz     };
80415ed5c0aSJose Manuel Monsalve Diaz     // Iterate over all the memory regions for this agent. Get the memory region
80515ed5c0aSJose Manuel Monsalve Diaz     // type and size
806d27d0a67SJoseph Huber     hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
80715ed5c0aSJose Manuel Monsalve Diaz   }
80815ed5c0aSJose Manuel Monsalve Diaz 
809d0b31295SJon Chesterfield   // Record entry point associated with device
addOffloadEntry(int32_t DeviceId,__tgt_offload_entry Entry)810d27d0a67SJoseph Huber   void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
811d27d0a67SJoseph Huber     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
812d0b31295SJon Chesterfield            "Unexpected device id!");
813d27d0a67SJoseph Huber     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
814d0b31295SJon Chesterfield 
815d27d0a67SJoseph Huber     E.Entries.push_back(Entry);
816d0b31295SJon Chesterfield   }
817d0b31295SJon Chesterfield 
818d0b31295SJon Chesterfield   // Return true if the entry is associated with device
findOffloadEntry(int32_t DeviceId,void * Addr)819d27d0a67SJoseph Huber   bool findOffloadEntry(int32_t DeviceId, void *Addr) {
820d27d0a67SJoseph Huber     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
821d0b31295SJon Chesterfield            "Unexpected device id!");
822d27d0a67SJoseph Huber     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
823d0b31295SJon Chesterfield 
824d27d0a67SJoseph Huber     for (auto &It : E.Entries) {
825d27d0a67SJoseph Huber       if (It.addr == Addr)
826d0b31295SJon Chesterfield         return true;
827d0b31295SJon Chesterfield     }
828d0b31295SJon Chesterfield 
829d0b31295SJon Chesterfield     return false;
830d0b31295SJon Chesterfield   }
831d0b31295SJon Chesterfield 
832d0b31295SJon Chesterfield   // Return the pointer to the target entries table
getOffloadEntriesTable(int32_t DeviceId)833d27d0a67SJoseph Huber   __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
834d27d0a67SJoseph Huber     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
835d0b31295SJon Chesterfield            "Unexpected device id!");
836d27d0a67SJoseph Huber     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
837d0b31295SJon Chesterfield 
838d27d0a67SJoseph Huber     int32_t Size = E.Entries.size();
839d0b31295SJon Chesterfield 
840d0b31295SJon Chesterfield     // Table is empty
841d27d0a67SJoseph Huber     if (!Size)
842d0b31295SJon Chesterfield       return 0;
843d0b31295SJon Chesterfield 
844d27d0a67SJoseph Huber     __tgt_offload_entry *Begin = &E.Entries[0];
845d27d0a67SJoseph Huber     __tgt_offload_entry *End = &E.Entries[Size - 1];
846d0b31295SJon Chesterfield 
847d0b31295SJon Chesterfield     // Update table info according to the entries and return the pointer
848d27d0a67SJoseph Huber     E.Table.EntriesBegin = Begin;
849d27d0a67SJoseph Huber     E.Table.EntriesEnd = ++End;
850d0b31295SJon Chesterfield 
851d0b31295SJon Chesterfield     return &E.Table;
852d0b31295SJon Chesterfield   }
853d0b31295SJon Chesterfield 
854d0b31295SJon Chesterfield   // Clear entries table for a device
clearOffloadEntriesTable(int DeviceId)855d27d0a67SJoseph Huber   void clearOffloadEntriesTable(int DeviceId) {
856d27d0a67SJoseph Huber     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
857d0b31295SJon Chesterfield            "Unexpected device id!");
858d27d0a67SJoseph Huber     FuncGblEntries[DeviceId].emplace_back();
859d27d0a67SJoseph Huber     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
860*046d5b91SJoseph Huber     // KernelArgPoolMap.clear();
861d0b31295SJon Chesterfield     E.Entries.clear();
862d0b31295SJon Chesterfield     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
863d0b31295SJon Chesterfield   }
864d0b31295SJon Chesterfield 
addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,unsigned int DeviceId)86520df2c70SPushpinder Singh   hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
8661f940b69SJoseph Huber                                    unsigned int DeviceId) {
86720df2c70SPushpinder Singh     assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
86820df2c70SPushpinder Singh     uint32_t GlobalFlags = 0;
86920df2c70SPushpinder Singh     hsa_status_t Err = hsa_amd_memory_pool_get_info(
87020df2c70SPushpinder Singh         MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
87120df2c70SPushpinder Singh 
87220df2c70SPushpinder Singh     if (Err != HSA_STATUS_SUCCESS) {
87320df2c70SPushpinder Singh       return Err;
87420df2c70SPushpinder Singh     }
87520df2c70SPushpinder Singh 
87620df2c70SPushpinder Singh     if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
87720df2c70SPushpinder Singh       DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
87820df2c70SPushpinder Singh     } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
87920df2c70SPushpinder Singh       DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
88020df2c70SPushpinder Singh     }
88120df2c70SPushpinder Singh 
88220df2c70SPushpinder Singh     return HSA_STATUS_SUCCESS;
88320df2c70SPushpinder Singh   }
88420df2c70SPushpinder Singh 
setupDevicePools(const std::vector<hsa_agent_t> & Agents)88505ba9ff6SJon Chesterfield   hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
8861f940b69SJoseph Huber     for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
88705ba9ff6SJon Chesterfield       hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
88805ba9ff6SJon Chesterfield           Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
88905ba9ff6SJon Chesterfield             hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
89005ba9ff6SJon Chesterfield             if (ValidStatus != HSA_STATUS_SUCCESS) {
89105ba9ff6SJon Chesterfield               DP("Alloc allowed in memory pool check failed: %s\n",
89205ba9ff6SJon Chesterfield                  get_error_string(ValidStatus));
89305ba9ff6SJon Chesterfield               return HSA_STATUS_SUCCESS;
89405ba9ff6SJon Chesterfield             }
89505ba9ff6SJon Chesterfield             return addDeviceMemoryPool(MemoryPool, DeviceId);
89605ba9ff6SJon Chesterfield           });
89705ba9ff6SJon Chesterfield 
89805ba9ff6SJon Chesterfield       if (Err != HSA_STATUS_SUCCESS) {
89905ba9ff6SJon Chesterfield         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
90005ba9ff6SJon Chesterfield            "Iterate all memory pools", get_error_string(Err));
90105ba9ff6SJon Chesterfield         return Err;
90205ba9ff6SJon Chesterfield       }
90305ba9ff6SJon Chesterfield     }
90405ba9ff6SJon Chesterfield     return HSA_STATUS_SUCCESS;
90505ba9ff6SJon Chesterfield   }
90605ba9ff6SJon Chesterfield 
setupHostMemoryPools(std::vector<hsa_agent_t> & Agents)90705ba9ff6SJon Chesterfield   hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
90805ba9ff6SJon Chesterfield     std::vector<hsa_amd_memory_pool_t> HostPools;
90905ba9ff6SJon Chesterfield 
91005ba9ff6SJon Chesterfield     // collect all the "valid" pools for all the given agents.
91105ba9ff6SJon Chesterfield     for (const auto &Agent : Agents) {
91205ba9ff6SJon Chesterfield       hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
91305ba9ff6SJon Chesterfield           Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
91405ba9ff6SJon Chesterfield       if (Err != HSA_STATUS_SUCCESS) {
9151bc3a6e4SJon Chesterfield         DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
91605ba9ff6SJon Chesterfield       }
91705ba9ff6SJon Chesterfield     }
91805ba9ff6SJon Chesterfield 
91905ba9ff6SJon Chesterfield     // We need two fine-grained pools.
92005ba9ff6SJon Chesterfield     //  1. One with kernarg flag set for storing kernel arguments
92105ba9ff6SJon Chesterfield     //  2. Second for host allocations
92205ba9ff6SJon Chesterfield     bool FineGrainedMemoryPoolSet = false;
92305ba9ff6SJon Chesterfield     bool KernArgPoolSet = false;
92405ba9ff6SJon Chesterfield     for (const auto &MemoryPool : HostPools) {
92505ba9ff6SJon Chesterfield       hsa_status_t Err = HSA_STATUS_SUCCESS;
92620df2c70SPushpinder Singh       uint32_t GlobalFlags = 0;
92705ba9ff6SJon Chesterfield       Err = hsa_amd_memory_pool_get_info(
92820df2c70SPushpinder Singh           MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
92920df2c70SPushpinder Singh       if (Err != HSA_STATUS_SUCCESS) {
93005ba9ff6SJon Chesterfield         DP("Get memory pool info failed: %s\n", get_error_string(Err));
93120df2c70SPushpinder Singh         return Err;
93220df2c70SPushpinder Singh       }
93320df2c70SPushpinder Singh 
93405ba9ff6SJon Chesterfield       if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
93505ba9ff6SJon Chesterfield         if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
93605ba9ff6SJon Chesterfield           KernArgPool = MemoryPool;
93705ba9ff6SJon Chesterfield           KernArgPoolSet = true;
93820df2c70SPushpinder Singh         }
93920df2c70SPushpinder Singh         HostFineGrainedMemoryPool = MemoryPool;
94005ba9ff6SJon Chesterfield         FineGrainedMemoryPoolSet = true;
94105ba9ff6SJon Chesterfield       }
94220df2c70SPushpinder Singh     }
94320df2c70SPushpinder Singh 
94405ba9ff6SJon Chesterfield     if (FineGrainedMemoryPoolSet && KernArgPoolSet)
94520df2c70SPushpinder Singh       return HSA_STATUS_SUCCESS;
94620df2c70SPushpinder Singh 
94705ba9ff6SJon Chesterfield     return HSA_STATUS_ERROR;
94820df2c70SPushpinder Singh   }
94920df2c70SPushpinder Singh 
getDeviceMemoryPool(unsigned int DeviceId)9501f940b69SJoseph Huber   hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
95120df2c70SPushpinder Singh     assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
95220df2c70SPushpinder Singh            "Invalid device Id");
95320df2c70SPushpinder Singh     return DeviceCoarseGrainedMemoryPools[DeviceId];
95420df2c70SPushpinder Singh   }
95520df2c70SPushpinder Singh 
getHostMemoryPool()95620df2c70SPushpinder Singh   hsa_amd_memory_pool_t getHostMemoryPool() {
95720df2c70SPushpinder Singh     return HostFineGrainedMemoryPool;
95820df2c70SPushpinder Singh   }
95920df2c70SPushpinder Singh 
readEnv(const char * Env,int Default=-1)960f4f23de1SJoseph Huber   static int readEnv(const char *Env, int Default = -1) {
961d27d0a67SJoseph Huber     const char *EnvStr = getenv(Env);
962d27d0a67SJoseph Huber     int Res = Default;
963d27d0a67SJoseph Huber     if (EnvStr) {
964d27d0a67SJoseph Huber       Res = std::stoi(EnvStr);
965d27d0a67SJoseph Huber       DP("Parsed %s=%d\n", Env, Res);
966ddfb074aSJon Chesterfield     }
967d27d0a67SJoseph Huber     return Res;
968ddfb074aSJon Chesterfield   }
969ddfb074aSJon Chesterfield 
RTLDeviceInfoTy()970d0b31295SJon Chesterfield   RTLDeviceInfoTy() {
9716760234eSJon Chesterfield     DP("Start initializing " GETNAME(TARGET_NAME) "\n");
9726760234eSJon Chesterfield 
973d0b31295SJon Chesterfield     // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
974d0b31295SJon Chesterfield     // anytime. You do not need a debug library build.
975d0b31295SJon Chesterfield     //  0 => no tracing
976d0b31295SJon Chesterfield     //  1 => tracing dispatch only
977d0b31295SJon Chesterfield     // >1 => verbosity increase
9786760234eSJon Chesterfield 
979d83dc4c6SCarlo Bertolli     if (!HSAInitSuccess()) {
9806760234eSJon Chesterfield       DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
9816760234eSJon Chesterfield       return;
9826760234eSJon Chesterfield     }
9836760234eSJon Chesterfield 
984d27d0a67SJoseph Huber     if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
985d27d0a67SJoseph Huber       print_kernel_trace = atoi(EnvStr);
986d0b31295SJon Chesterfield     else
987d0b31295SJon Chesterfield       print_kernel_trace = 0;
988d0b31295SJon Chesterfield 
989d27d0a67SJoseph Huber     hsa_status_t Err = core::atl_init_gpu_context();
990d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
991a90da62aSJon Chesterfield       DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
992d0b31295SJon Chesterfield       return;
993d0b31295SJon Chesterfield     }
994a2d6ef58SPushpinder Singh 
9953153bdd5SJon Chesterfield     // Init hostcall soon after initializing hsa
99671f46930SJon Chesterfield     hostrpc_init();
997d0b31295SJon Chesterfield 
998d27d0a67SJoseph Huber     Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
999cadcaf3fSPushpinder Singh       if (DeviceType == HSA_DEVICE_TYPE_CPU) {
1000cadcaf3fSPushpinder Singh         CPUAgents.push_back(Agent);
1001cadcaf3fSPushpinder Singh       } else {
1002cadcaf3fSPushpinder Singh         HSAAgents.push_back(Agent);
1003cadcaf3fSPushpinder Singh       }
1004cadcaf3fSPushpinder Singh     });
1005d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS)
1006cadcaf3fSPushpinder Singh       return;
1007cadcaf3fSPushpinder Singh 
1008d0b31295SJon Chesterfield     NumberOfDevices = (int)HSAAgents.size();
1009d0b31295SJon Chesterfield 
1010d0b31295SJon Chesterfield     if (NumberOfDevices == 0) {
1011d0b31295SJon Chesterfield       DP("There are no devices supporting HSA.\n");
1012d0b31295SJon Chesterfield       return;
1013d0b31295SJon Chesterfield     }
1014d27d0a67SJoseph Huber     DP("There are %d devices supporting HSA.\n", NumberOfDevices);
1015f5f329a3SPushpinder Singh 
1016d0b31295SJon Chesterfield     // Init the device info
1017d83dc4c6SCarlo Bertolli     HSAQueueSchedulers.reserve(NumberOfDevices);
1018d0b31295SJon Chesterfield     FuncGblEntries.resize(NumberOfDevices);
1019d0b31295SJon Chesterfield     ThreadsPerGroup.resize(NumberOfDevices);
1020d0b31295SJon Chesterfield     ComputeUnits.resize(NumberOfDevices);
1021cab9f692SJon Chesterfield     GPUName.resize(NumberOfDevices);
1022d0b31295SJon Chesterfield     GroupsPerDevice.resize(NumberOfDevices);
1023d0b31295SJon Chesterfield     WarpSize.resize(NumberOfDevices);
1024d0b31295SJon Chesterfield     NumTeams.resize(NumberOfDevices);
1025d0b31295SJon Chesterfield     NumThreads.resize(NumberOfDevices);
1026d27d0a67SJoseph Huber     DeviceStateStore.resize(NumberOfDevices);
10277648b697SPushpinder Singh     KernelInfoTable.resize(NumberOfDevices);
10287648b697SPushpinder Singh     SymbolInfoTable.resize(NumberOfDevices);
102920df2c70SPushpinder Singh     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
103020df2c70SPushpinder Singh     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
103120df2c70SPushpinder Singh 
1032d27d0a67SJoseph Huber     Err = setupDevicePools(HSAAgents);
1033d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
103405ba9ff6SJon Chesterfield       DP("Setup for Device Memory Pools failed\n");
103505ba9ff6SJon Chesterfield       return;
103605ba9ff6SJon Chesterfield     }
103705ba9ff6SJon Chesterfield 
1038d27d0a67SJoseph Huber     Err = setupHostMemoryPools(CPUAgents);
1039d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
104005ba9ff6SJon Chesterfield       DP("Setup for Host Memory Pools failed\n");
104120df2c70SPushpinder Singh       return;
104220df2c70SPushpinder Singh     }
1043d0b31295SJon Chesterfield 
1044d27d0a67SJoseph Huber     for (int I = 0; I < NumberOfDevices; I++) {
1045d27d0a67SJoseph Huber       uint32_t QueueSize = 0;
1046d0b31295SJon Chesterfield       {
1047d27d0a67SJoseph Huber         hsa_status_t Err = hsa_agent_get_info(
1048d27d0a67SJoseph Huber             HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
1049d27d0a67SJoseph Huber         if (Err != HSA_STATUS_SUCCESS) {
1050d27d0a67SJoseph Huber           DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
1051d18fb09cSJon Chesterfield           return;
10529934571eSJon Chesterfield         }
10533153bdd5SJon Chesterfield         enum { MaxQueueSize = 4096 };
1054d27d0a67SJoseph Huber         if (QueueSize > MaxQueueSize) {
1055d27d0a67SJoseph Huber           QueueSize = MaxQueueSize;
1056d0b31295SJon Chesterfield         }
1057d0b31295SJon Chesterfield       }
1058d0b31295SJon Chesterfield 
10598cf93a35SJon Chesterfield       {
1060d83dc4c6SCarlo Bertolli         HSAQueueScheduler QSched;
1061d27d0a67SJoseph Huber         if (!QSched.createQueues(HSAAgents[I], QueueSize))
1062d0b31295SJon Chesterfield           return;
1063d83dc4c6SCarlo Bertolli         HSAQueueSchedulers.emplace_back(std::move(QSched));
10648cf93a35SJon Chesterfield       }
10655d989fb3SJonChesterfield 
1066d27d0a67SJoseph Huber       DeviceStateStore[I] = {nullptr, 0};
1067d0b31295SJon Chesterfield     }
1068d0b31295SJon Chesterfield 
1069d27d0a67SJoseph Huber     for (int I = 0; I < NumberOfDevices; I++) {
1070d27d0a67SJoseph Huber       ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
1071d27d0a67SJoseph Huber       GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
1072d27d0a67SJoseph Huber       ComputeUnits[I] = 1;
1073d27d0a67SJoseph Huber       DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
1074d27d0a67SJoseph Huber          GroupsPerDevice[I], ThreadsPerGroup[I]);
1075d0b31295SJon Chesterfield     }
1076d0b31295SJon Chesterfield 
1077d0b31295SJon Chesterfield     // Get environment variables regarding teams
1078f4f23de1SJoseph Huber     Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
1079f4f23de1SJoseph Huber     Env.NumTeams = readEnv("OMP_NUM_TEAMS");
1080f4f23de1SJoseph Huber     Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
1081f4f23de1SJoseph Huber     Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
1082f4f23de1SJoseph Huber     Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
1083d0b31295SJon Chesterfield 
1084d0b31295SJon Chesterfield     // Default state.
1085d0b31295SJon Chesterfield     RequiresFlags = OMP_REQ_UNDEFINED;
1086738734f6SJon Chesterfield 
1087738734f6SJon Chesterfield     ConstructionSucceeded = true;
1088d0b31295SJon Chesterfield   }
1089d0b31295SJon Chesterfield 
~RTLDeviceInfoTy()1090d0b31295SJon Chesterfield   ~RTLDeviceInfoTy() {
1091a90da62aSJon Chesterfield     DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
1092d83dc4c6SCarlo Bertolli     if (!HSAInitSuccess()) {
1093dd0b463dSJon Chesterfield       // Then none of these can have been set up and they can't be torn down
1094dd0b463dSJon Chesterfield       return;
1095dd0b463dSJon Chesterfield     }
10965d989fb3SJonChesterfield     // Run destructors on types that use HSA before
10973d853429SJon Chesterfield     // impl_finalize removes access to it
1098d27d0a67SJoseph Huber     DeviceStateStore.clear();
10995d989fb3SJonChesterfield     KernelArgPoolMap.clear();
11003153bdd5SJon Chesterfield     // Terminate hostrpc before finalizing hsa
110171f46930SJon Chesterfield     hostrpc_terminate();
1102d7503c3bSPushpinder Singh 
1103a2d6ef58SPushpinder Singh     hsa_status_t Err;
1104d18fb09cSJon Chesterfield     for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
1105a2d6ef58SPushpinder Singh       Err = hsa_executable_destroy(HSAExecutables[I]);
1106d18fb09cSJon Chesterfield       if (Err != HSA_STATUS_SUCCESS) {
1107d18fb09cSJon Chesterfield         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1108d18fb09cSJon Chesterfield            "Destroying executable", get_error_string(Err));
1109d18fb09cSJon Chesterfield       }
1110d18fb09cSJon Chesterfield     }
1111d0b31295SJon Chesterfield   }
1112d0b31295SJon Chesterfield };
1113d0b31295SJon Chesterfield 
1114d0b31295SJon Chesterfield pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
1115d0b31295SJon Chesterfield 
1116*046d5b91SJoseph Huber // Putting accesses to DeviceInfo global behind a function call prior
1117*046d5b91SJoseph Huber // to changing to use init_plugin/deinit_plugin calls
1118*046d5b91SJoseph Huber static RTLDeviceInfoTy DeviceInfoState;
DeviceInfo()1119*046d5b91SJoseph Huber static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
1120cb24013bSJon Chesterfield 
1121d0b31295SJon Chesterfield namespace {
1122d0b31295SJon Chesterfield 
dataRetrieve(int32_t DeviceId,void * HstPtr,void * TgtPtr,int64_t Size,__tgt_async_info * AsyncInfo)1123d0b31295SJon Chesterfield int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
11245449fbb5SJohannes Doerfert                      __tgt_async_info *AsyncInfo) {
11255449fbb5SJohannes Doerfert   assert(AsyncInfo && "AsyncInfo is nullptr");
1126b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1127d0b31295SJon Chesterfield   // Return success if we are not copying back to host from target.
1128d0b31295SJon Chesterfield   if (!HstPtr)
1129d0b31295SJon Chesterfield     return OFFLOAD_SUCCESS;
1130d27d0a67SJoseph Huber   hsa_status_t Err;
1131d0b31295SJon Chesterfield   DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1132d0b31295SJon Chesterfield      (long long unsigned)(Elf64_Addr)TgtPtr,
1133d0b31295SJon Chesterfield      (long long unsigned)(Elf64_Addr)HstPtr);
11345d989fb3SJonChesterfield 
1135b5151c32SJon Chesterfield   Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
113655dc1235SJonChesterfield                                            DeviceId);
11375d989fb3SJonChesterfield 
1138d27d0a67SJoseph Huber   if (Err != HSA_STATUS_SUCCESS) {
1139d0b31295SJon Chesterfield     DP("Error when copying data from device to host. Pointers: "
1140d0b31295SJon Chesterfield        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1141d0b31295SJon Chesterfield        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1142d0b31295SJon Chesterfield     return OFFLOAD_FAIL;
1143d0b31295SJon Chesterfield   }
1144d0b31295SJon Chesterfield   DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1145d0b31295SJon Chesterfield      (long long unsigned)(Elf64_Addr)TgtPtr,
1146d0b31295SJon Chesterfield      (long long unsigned)(Elf64_Addr)HstPtr);
1147d0b31295SJon Chesterfield   return OFFLOAD_SUCCESS;
1148d0b31295SJon Chesterfield }
1149d0b31295SJon Chesterfield 
dataSubmit(int32_t DeviceId,void * TgtPtr,void * HstPtr,int64_t Size,__tgt_async_info * AsyncInfo)1150d0b31295SJon Chesterfield int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
11515449fbb5SJohannes Doerfert                    __tgt_async_info *AsyncInfo) {
11525449fbb5SJohannes Doerfert   assert(AsyncInfo && "AsyncInfo is nullptr");
1153d27d0a67SJoseph Huber   hsa_status_t Err;
1154b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1155d0b31295SJon Chesterfield   // Return success if we are not doing host to target.
1156d0b31295SJon Chesterfield   if (!HstPtr)
1157d0b31295SJon Chesterfield     return OFFLOAD_SUCCESS;
1158d0b31295SJon Chesterfield 
1159d0b31295SJon Chesterfield   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
1160d0b31295SJon Chesterfield      (long long unsigned)(Elf64_Addr)HstPtr,
1161d0b31295SJon Chesterfield      (long long unsigned)(Elf64_Addr)TgtPtr);
1162b5151c32SJon Chesterfield   Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
116355dc1235SJonChesterfield                                              DeviceId);
1164d27d0a67SJoseph Huber   if (Err != HSA_STATUS_SUCCESS) {
1165d0b31295SJon Chesterfield     DP("Error when copying data from host to device. Pointers: "
1166d0b31295SJon Chesterfield        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1167d0b31295SJon Chesterfield        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1168d0b31295SJon Chesterfield     return OFFLOAD_FAIL;
1169d0b31295SJon Chesterfield   }
1170d0b31295SJon Chesterfield   return OFFLOAD_SUCCESS;
1171d0b31295SJon Chesterfield }
1172d0b31295SJon Chesterfield 
1173d0b31295SJon Chesterfield // Async.
1174d0b31295SJon Chesterfield // The implementation was written with cuda streams in mind. The semantics of
1175d0b31295SJon Chesterfield // that are to execute kernels on a queue in order of insertion. A synchronise
1176d0b31295SJon Chesterfield // call then makes writes visible between host and device. This means a series
1177d0b31295SJon Chesterfield // of N data_submit_async calls are expected to execute serially. HSA offers
1178d0b31295SJon Chesterfield // various options to run the data copies concurrently. This may require changes
1179d0b31295SJon Chesterfield // to libomptarget.
1180d0b31295SJon Chesterfield 
1181d0b31295SJon Chesterfield // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
1182d0b31295SJon Chesterfield // there are no outstanding kernels that need to be synchronized. Any async call
1183d0b31295SJon Chesterfield // may be passed a Queue==0, at which point the cuda implementation will set it
1184d0b31295SJon Chesterfield // to non-null (see getStream). The cuda streams are per-device. Upstream may
11855449fbb5SJohannes Doerfert // change this interface to explicitly initialize the AsyncInfo_pointer, but
1186d0b31295SJon Chesterfield // until then hsa lazily initializes it as well.
1187d0b31295SJon Chesterfield 
initAsyncInfo(__tgt_async_info * AsyncInfo)11885449fbb5SJohannes Doerfert void initAsyncInfo(__tgt_async_info *AsyncInfo) {
1189d0b31295SJon Chesterfield   // set non-null while using async calls, return to null to indicate completion
11905449fbb5SJohannes Doerfert   assert(AsyncInfo);
11915449fbb5SJohannes Doerfert   if (!AsyncInfo->Queue) {
11925449fbb5SJohannes Doerfert     AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
1193d0b31295SJon Chesterfield   }
1194d0b31295SJon Chesterfield }
finiAsyncInfo(__tgt_async_info * AsyncInfo)11955449fbb5SJohannes Doerfert void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
11965449fbb5SJohannes Doerfert   assert(AsyncInfo);
11975449fbb5SJohannes Doerfert   assert(AsyncInfo->Queue);
11985449fbb5SJohannes Doerfert   AsyncInfo->Queue = 0;
1199d0b31295SJon Chesterfield }
1200cab9f692SJon Chesterfield 
1201d9b1d827SCarlo Bertolli // Determine launch values for kernel.
1202d27d0a67SJoseph Huber struct LaunchVals {
1203d9b1d827SCarlo Bertolli   int WorkgroupSize;
1204d9b1d827SCarlo Bertolli   int GridSize;
1205d9b1d827SCarlo Bertolli };
getLaunchVals(int WarpSize,EnvironmentVariables Env,int ConstWGSize,llvm::omp::OMPTgtExecModeFlags ExecutionMode,int NumTeams,int ThreadLimit,uint64_t LoopTripcount,int DeviceNumTeams)1206d27d0a67SJoseph Huber LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1207d9b1d827SCarlo Bertolli                          int ConstWGSize,
1208d9b1d827SCarlo Bertolli                          llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1209d27d0a67SJoseph Huber                          int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
1210d27d0a67SJoseph Huber                          int DeviceNumTeams) {
1211d9b1d827SCarlo Bertolli 
1212d27d0a67SJoseph Huber   int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
1213d27d0a67SJoseph Huber   int NumGroups = 0;
1214d9b1d827SCarlo Bertolli 
1215d27d0a67SJoseph Huber   int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
12161f940b69SJoseph Huber   if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
1217d27d0a67SJoseph Huber     MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
1218d9b1d827SCarlo Bertolli 
1219d9b1d827SCarlo Bertolli   if (print_kernel_trace & STARTUP_DETAILS) {
1220d27d0a67SJoseph Huber     DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
1221d27d0a67SJoseph Huber     DP("Max_Teams: %d\n", MaxTeams);
1222d9b1d827SCarlo Bertolli     DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
1223d27d0a67SJoseph Huber     DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
1224d9b1d827SCarlo Bertolli     DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
1225d27d0a67SJoseph Huber        RTLDeviceInfoTy::DefaultWgSize);
1226d27d0a67SJoseph Huber     DP("thread_limit: %d\n", ThreadLimit);
1227d27d0a67SJoseph Huber     DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1228d9b1d827SCarlo Bertolli     DP("ConstWGSize: %d\n", ConstWGSize);
1229d9b1d827SCarlo Bertolli   }
1230d9b1d827SCarlo Bertolli   // check for thread_limit() clause
1231d27d0a67SJoseph Huber   if (ThreadLimit > 0) {
1232d27d0a67SJoseph Huber     ThreadsPerGroup = ThreadLimit;
1233d27d0a67SJoseph Huber     DP("Setting threads per block to requested %d\n", ThreadLimit);
1234d9b1d827SCarlo Bertolli     // Add master warp for GENERIC
1235d9b1d827SCarlo Bertolli     if (ExecutionMode ==
1236d9b1d827SCarlo Bertolli         llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1237d27d0a67SJoseph Huber       ThreadsPerGroup += WarpSize;
1238d9b1d827SCarlo Bertolli       DP("Adding master wavefront: +%d threads\n", WarpSize);
1239d9b1d827SCarlo Bertolli     }
1240d27d0a67SJoseph Huber     if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
1241d27d0a67SJoseph Huber       ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
1242d27d0a67SJoseph Huber       DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
1243d9b1d827SCarlo Bertolli     }
1244d9b1d827SCarlo Bertolli   }
1245d9b1d827SCarlo Bertolli   // check flat_max_work_group_size attr here
1246d27d0a67SJoseph Huber   if (ThreadsPerGroup > ConstWGSize) {
1247d27d0a67SJoseph Huber     ThreadsPerGroup = ConstWGSize;
1248d9b1d827SCarlo Bertolli     DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1249d27d0a67SJoseph Huber        ThreadsPerGroup);
1250d9b1d827SCarlo Bertolli   }
1251d9b1d827SCarlo Bertolli   if (print_kernel_trace & STARTUP_DETAILS)
1252d27d0a67SJoseph Huber     DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1253d27d0a67SJoseph Huber   DP("Preparing %d threads\n", ThreadsPerGroup);
1254d9b1d827SCarlo Bertolli 
1255d9b1d827SCarlo Bertolli   // Set default num_groups (teams)
1256d9b1d827SCarlo Bertolli   if (Env.TeamLimit > 0)
1257d27d0a67SJoseph Huber     NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
1258d9b1d827SCarlo Bertolli   else
1259d27d0a67SJoseph Huber     NumGroups = MaxTeams;
1260d27d0a67SJoseph Huber   DP("Set default num of groups %d\n", NumGroups);
1261d9b1d827SCarlo Bertolli 
1262d9b1d827SCarlo Bertolli   if (print_kernel_trace & STARTUP_DETAILS) {
1263d27d0a67SJoseph Huber     DP("num_groups: %d\n", NumGroups);
1264d27d0a67SJoseph Huber     DP("num_teams: %d\n", NumTeams);
1265d9b1d827SCarlo Bertolli   }
1266d9b1d827SCarlo Bertolli 
1267d9b1d827SCarlo Bertolli   // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
1268d9b1d827SCarlo Bertolli   // This reduction is typical for default case (no thread_limit clause).
1269d9b1d827SCarlo Bertolli   // or when user goes crazy with num_teams clause.
1270d9b1d827SCarlo Bertolli   // FIXME: We cant distinguish between a constant or variable thread limit.
1271d9b1d827SCarlo Bertolli   // So we only handle constant thread_limits.
1272d27d0a67SJoseph Huber   if (ThreadsPerGroup >
1273d27d0a67SJoseph Huber       RTLDeviceInfoTy::DefaultWgSize) //  256 < threadsPerGroup <= 1024
1274d9b1d827SCarlo Bertolli     // Should we round threadsPerGroup up to nearest WarpSize
1275d9b1d827SCarlo Bertolli     // here?
1276d27d0a67SJoseph Huber     NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
1277d9b1d827SCarlo Bertolli 
1278d9b1d827SCarlo Bertolli   // check for num_teams() clause
1279d27d0a67SJoseph Huber   if (NumTeams > 0) {
1280d27d0a67SJoseph Huber     NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
1281d9b1d827SCarlo Bertolli   }
1282d9b1d827SCarlo Bertolli   if (print_kernel_trace & STARTUP_DETAILS) {
1283d27d0a67SJoseph Huber     DP("num_groups: %d\n", NumGroups);
1284d9b1d827SCarlo Bertolli     DP("Env.NumTeams %d\n", Env.NumTeams);
1285d9b1d827SCarlo Bertolli     DP("Env.TeamLimit %d\n", Env.TeamLimit);
1286d9b1d827SCarlo Bertolli   }
1287d9b1d827SCarlo Bertolli 
1288d9b1d827SCarlo Bertolli   if (Env.NumTeams > 0) {
1289d27d0a67SJoseph Huber     NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
1290d9b1d827SCarlo Bertolli     DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
1291d9b1d827SCarlo Bertolli   } else if (Env.TeamLimit > 0) {
1292d27d0a67SJoseph Huber     NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
1293d9b1d827SCarlo Bertolli     DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit);
1294d9b1d827SCarlo Bertolli   } else {
1295d27d0a67SJoseph Huber     if (NumTeams <= 0) {
1296d27d0a67SJoseph Huber       if (LoopTripcount > 0) {
1297d9b1d827SCarlo Bertolli         if (ExecutionMode ==
1298d9b1d827SCarlo Bertolli             llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1299d9b1d827SCarlo Bertolli           // round up to the nearest integer
1300d27d0a67SJoseph Huber           NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
1301d9b1d827SCarlo Bertolli         } else if (ExecutionMode ==
1302d9b1d827SCarlo Bertolli                    llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1303d27d0a67SJoseph Huber           NumGroups = LoopTripcount;
1304d9b1d827SCarlo Bertolli         } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1305d9b1d827SCarlo Bertolli           // This is a generic kernel that was transformed to use SPMD-mode
1306d9b1d827SCarlo Bertolli           // execution but uses Generic-mode semantics for scheduling.
1307d27d0a67SJoseph Huber           NumGroups = LoopTripcount;
1308d9b1d827SCarlo Bertolli         }
1309d9b1d827SCarlo Bertolli         DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
1310d9b1d827SCarlo Bertolli            "threads per block %d\n",
1311d27d0a67SJoseph Huber            NumGroups, LoopTripcount, ThreadsPerGroup);
1312d9b1d827SCarlo Bertolli       }
1313d9b1d827SCarlo Bertolli     } else {
1314d27d0a67SJoseph Huber       NumGroups = NumTeams;
1315d9b1d827SCarlo Bertolli     }
1316d27d0a67SJoseph Huber     if (NumGroups > MaxTeams) {
1317d27d0a67SJoseph Huber       NumGroups = MaxTeams;
1318d9b1d827SCarlo Bertolli       if (print_kernel_trace & STARTUP_DETAILS)
1319d27d0a67SJoseph Huber         DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
1320d9b1d827SCarlo Bertolli     }
1321d27d0a67SJoseph Huber     if (NumGroups > NumTeams && NumTeams > 0) {
1322d27d0a67SJoseph Huber       NumGroups = NumTeams;
1323d9b1d827SCarlo Bertolli       if (print_kernel_trace & STARTUP_DETAILS)
1324d27d0a67SJoseph Huber         DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
1325d27d0a67SJoseph Huber            NumTeams);
1326d9b1d827SCarlo Bertolli     }
1327d9b1d827SCarlo Bertolli   }
1328d9b1d827SCarlo Bertolli 
1329d9b1d827SCarlo Bertolli   // num_teams clause always honored, no matter what, unless DEFAULT is active.
1330d27d0a67SJoseph Huber   if (NumTeams > 0) {
1331d27d0a67SJoseph Huber     NumGroups = NumTeams;
1332d9b1d827SCarlo Bertolli     // Cap num_groups to EnvMaxTeamsDefault if set.
1333d27d0a67SJoseph Huber     if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
1334d27d0a67SJoseph Huber       NumGroups = Env.MaxTeamsDefault;
1335d9b1d827SCarlo Bertolli   }
1336d9b1d827SCarlo Bertolli   if (print_kernel_trace & STARTUP_DETAILS) {
1337d27d0a67SJoseph Huber     DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1338d27d0a67SJoseph Huber     DP("num_groups: %d\n", NumGroups);
1339d27d0a67SJoseph Huber     DP("loop_tripcount: %ld\n", LoopTripcount);
1340d9b1d827SCarlo Bertolli   }
1341d27d0a67SJoseph Huber   DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
1342d27d0a67SJoseph Huber      ThreadsPerGroup);
1343d9b1d827SCarlo Bertolli 
1344d27d0a67SJoseph Huber   LaunchVals Res;
1345d27d0a67SJoseph Huber   Res.WorkgroupSize = ThreadsPerGroup;
1346d27d0a67SJoseph Huber   Res.GridSize = ThreadsPerGroup * NumGroups;
1347d27d0a67SJoseph Huber   return Res;
1348d9b1d827SCarlo Bertolli }
1349d9b1d827SCarlo Bertolli 
acquireAvailablePacketId(hsa_queue_t * Queue)1350d27d0a67SJoseph Huber static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
1351d27d0a67SJoseph Huber   uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
1352d27d0a67SJoseph Huber   bool Full = true;
1353d27d0a67SJoseph Huber   while (Full) {
1354d27d0a67SJoseph Huber     Full =
1355d27d0a67SJoseph Huber         PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
1356d9b1d827SCarlo Bertolli   }
1357d27d0a67SJoseph Huber   return PacketId;
1358d9b1d827SCarlo Bertolli }
1359d9b1d827SCarlo Bertolli 
runRegionLocked(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum,int32_t NumTeams,int32_t ThreadLimit,uint64_t LoopTripcount)1360d27d0a67SJoseph Huber int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1361d27d0a67SJoseph Huber                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
1362d27d0a67SJoseph Huber                         int32_t ThreadLimit, uint64_t LoopTripcount) {
1363d9b1d827SCarlo Bertolli   // Set the context we are using
1364d9b1d827SCarlo Bertolli   // update thread limit content in gpu memory if un-initialized or specified
1365d9b1d827SCarlo Bertolli   // from host
1366d9b1d827SCarlo Bertolli 
1367d27d0a67SJoseph Huber   DP("Run target team region thread_limit %d\n", ThreadLimit);
1368d9b1d827SCarlo Bertolli 
1369d9b1d827SCarlo Bertolli   // All args are references.
1370d27d0a67SJoseph Huber   std::vector<void *> Args(ArgNum);
1371d27d0a67SJoseph Huber   std::vector<void *> Ptrs(ArgNum);
1372d9b1d827SCarlo Bertolli 
1373d27d0a67SJoseph Huber   DP("Arg_num: %d\n", ArgNum);
1374d27d0a67SJoseph Huber   for (int32_t I = 0; I < ArgNum; ++I) {
1375d27d0a67SJoseph Huber     Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1376d27d0a67SJoseph Huber     Args[I] = &Ptrs[I];
1377d27d0a67SJoseph Huber     DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
1378d9b1d827SCarlo Bertolli   }
1379d9b1d827SCarlo Bertolli 
1380d27d0a67SJoseph Huber   KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1381d9b1d827SCarlo Bertolli 
1382d27d0a67SJoseph Huber   std::string KernelName = std::string(KernelInfo->Name);
1383b5151c32SJon Chesterfield   auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
1384d27d0a67SJoseph Huber   if (KernelInfoTable[DeviceId].find(KernelName) ==
1385d27d0a67SJoseph Huber       KernelInfoTable[DeviceId].end()) {
1386d27d0a67SJoseph Huber     DP("Kernel %s not found\n", KernelName.c_str());
1387d9b1d827SCarlo Bertolli     return OFFLOAD_FAIL;
1388d9b1d827SCarlo Bertolli   }
1389d9b1d827SCarlo Bertolli 
1390d9b1d827SCarlo Bertolli   const atl_kernel_info_t KernelInfoEntry =
1391d27d0a67SJoseph Huber       KernelInfoTable[DeviceId][KernelName];
1392d27d0a67SJoseph Huber   const uint32_t GroupSegmentSize =
1393b5151c32SJon Chesterfield       KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
1394d27d0a67SJoseph Huber   const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
1395d27d0a67SJoseph Huber   const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
1396d27d0a67SJoseph Huber   const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
1397d27d0a67SJoseph Huber   const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
1398d9b1d827SCarlo Bertolli 
1399d27d0a67SJoseph Huber   assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
1400d9b1d827SCarlo Bertolli 
1401d9b1d827SCarlo Bertolli   /*
1402d9b1d827SCarlo Bertolli    * Set limit based on ThreadsPerGroup and GroupsPerDevice
1403d9b1d827SCarlo Bertolli    */
1404d27d0a67SJoseph Huber   LaunchVals LV =
1405b5151c32SJon Chesterfield       getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
1406d9b1d827SCarlo Bertolli                     KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
1407d27d0a67SJoseph Huber                     NumTeams,      // From run_region arg
1408d27d0a67SJoseph Huber                     ThreadLimit,   // From run_region arg
1409d27d0a67SJoseph Huber                     LoopTripcount, // From run_region arg
1410b5151c32SJon Chesterfield                     DeviceInfo().NumTeams[KernelInfo->DeviceId]);
1411d9b1d827SCarlo Bertolli   const int GridSize = LV.GridSize;
1412d9b1d827SCarlo Bertolli   const int WorkgroupSize = LV.WorkgroupSize;
1413d9b1d827SCarlo Bertolli 
1414d9b1d827SCarlo Bertolli   if (print_kernel_trace >= LAUNCH) {
1415d27d0a67SJoseph Huber     int NumGroups = GridSize / WorkgroupSize;
1416d9b1d827SCarlo Bertolli     // enum modes are SPMD, GENERIC, NONE 0,1,2
1417d9b1d827SCarlo Bertolli     // if doing rtl timing, print to stderr, unless stdout requested.
1418d27d0a67SJoseph Huber     bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
1419d27d0a67SJoseph Huber     fprintf(TraceToStdout ? stdout : stderr,
1420d9b1d827SCarlo Bertolli             "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1421d9b1d827SCarlo Bertolli             "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
1422d9b1d827SCarlo Bertolli             "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
1423d27d0a67SJoseph Huber             DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1424d27d0a67SJoseph Huber             ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
1425d27d0a67SJoseph Huber             GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
1426d27d0a67SJoseph Huber             VgprSpillCount, LoopTripcount, KernelInfo->Name);
1427d9b1d827SCarlo Bertolli   }
1428d9b1d827SCarlo Bertolli 
1429d9b1d827SCarlo Bertolli   // Run on the device.
1430d9b1d827SCarlo Bertolli   {
1431b5151c32SJon Chesterfield     hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
1432d27d0a67SJoseph Huber     if (!Queue) {
1433d9b1d827SCarlo Bertolli       return OFFLOAD_FAIL;
1434d9b1d827SCarlo Bertolli     }
1435d27d0a67SJoseph Huber     uint64_t PacketId = acquireAvailablePacketId(Queue);
1436d9b1d827SCarlo Bertolli 
1437d27d0a67SJoseph Huber     const uint32_t Mask = Queue->size - 1; // size is a power of 2
1438d27d0a67SJoseph Huber     hsa_kernel_dispatch_packet_t *Packet =
1439d27d0a67SJoseph Huber         (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
1440d9b1d827SCarlo Bertolli 
1441d9b1d827SCarlo Bertolli     // packet->header is written last
1442d27d0a67SJoseph Huber     Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1443d27d0a67SJoseph Huber     Packet->workgroup_size_x = WorkgroupSize;
1444d27d0a67SJoseph Huber     Packet->workgroup_size_y = 1;
1445d27d0a67SJoseph Huber     Packet->workgroup_size_z = 1;
1446d27d0a67SJoseph Huber     Packet->reserved0 = 0;
1447d27d0a67SJoseph Huber     Packet->grid_size_x = GridSize;
1448d27d0a67SJoseph Huber     Packet->grid_size_y = 1;
1449d27d0a67SJoseph Huber     Packet->grid_size_z = 1;
1450d27d0a67SJoseph Huber     Packet->private_segment_size = KernelInfoEntry.private_segment_size;
1451d27d0a67SJoseph Huber     Packet->group_segment_size = GroupSegmentSize;
1452d27d0a67SJoseph Huber     Packet->kernel_object = KernelInfoEntry.kernel_object;
1453d27d0a67SJoseph Huber     Packet->kernarg_address = 0;     // use the block allocator
1454d27d0a67SJoseph Huber     Packet->reserved2 = 0;           // impl writes id_ here
1455d27d0a67SJoseph Huber     Packet->completion_signal = {0}; // may want a pool of signals
1456d9b1d827SCarlo Bertolli 
1457d9b1d827SCarlo Bertolli     KernelArgPool *ArgPool = nullptr;
1458d27d0a67SJoseph Huber     void *KernArg = nullptr;
1459d9b1d827SCarlo Bertolli     {
1460*046d5b91SJoseph Huber       auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1461*046d5b91SJoseph Huber       if (It != KernelArgPoolMap.end()) {
1462d27d0a67SJoseph Huber         ArgPool = (It->second).get();
1463d9b1d827SCarlo Bertolli       }
1464d9b1d827SCarlo Bertolli     }
1465d9b1d827SCarlo Bertolli     if (!ArgPool) {
1466d9b1d827SCarlo Bertolli       DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
1467d27d0a67SJoseph Huber          DeviceId);
1468d9b1d827SCarlo Bertolli     }
1469d9b1d827SCarlo Bertolli     {
1470d9b1d827SCarlo Bertolli       if (ArgPool) {
1471d27d0a67SJoseph Huber         assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
1472d27d0a67SJoseph Huber         KernArg = ArgPool->allocate(ArgNum);
1473d9b1d827SCarlo Bertolli       }
1474d27d0a67SJoseph Huber       if (!KernArg) {
1475d9b1d827SCarlo Bertolli         DP("Allocate kernarg failed\n");
1476d9b1d827SCarlo Bertolli         return OFFLOAD_FAIL;
1477d9b1d827SCarlo Bertolli       }
1478d9b1d827SCarlo Bertolli 
1479d9b1d827SCarlo Bertolli       // Copy explicit arguments
1480d27d0a67SJoseph Huber       for (int I = 0; I < ArgNum; I++) {
1481d27d0a67SJoseph Huber         memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
1482d9b1d827SCarlo Bertolli       }
1483d9b1d827SCarlo Bertolli 
1484d9b1d827SCarlo Bertolli       // Initialize implicit arguments. TODO: Which of these can be dropped
1485d27d0a67SJoseph Huber       impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
1486d27d0a67SJoseph Huber           static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
1487d27d0a67SJoseph Huber       memset(ImplArgs, 0,
1488d9b1d827SCarlo Bertolli              sizeof(impl_implicit_args_t)); // may not be necessary
1489d27d0a67SJoseph Huber       ImplArgs->offset_x = 0;
1490d27d0a67SJoseph Huber       ImplArgs->offset_y = 0;
1491d27d0a67SJoseph Huber       ImplArgs->offset_z = 0;
1492d9b1d827SCarlo Bertolli 
1493d9b1d827SCarlo Bertolli       // assign a hostcall buffer for the selected Q
1494b5151c32SJon Chesterfield       if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
1495d9b1d827SCarlo Bertolli         // hostrpc_assign_buffer is not thread safe, and this function is
1496d9b1d827SCarlo Bertolli         // under a multiple reader lock, not a writer lock.
1497d27d0a67SJoseph Huber         static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
1498d27d0a67SJoseph Huber         pthread_mutex_lock(&HostcallInitLock);
1499b5151c32SJon Chesterfield         uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId],
1500d27d0a67SJoseph Huber                                                 Queue, DeviceId);
1501d27d0a67SJoseph Huber         pthread_mutex_unlock(&HostcallInitLock);
1502d27d0a67SJoseph Huber         if (!Buffer) {
1503d9b1d827SCarlo Bertolli           DP("hostrpc_assign_buffer failed, gpu would dereference null and "
1504d9b1d827SCarlo Bertolli              "error\n");
1505d9b1d827SCarlo Bertolli           return OFFLOAD_FAIL;
1506d9b1d827SCarlo Bertolli         }
1507d9b1d827SCarlo Bertolli 
15087086a1dbSDhruva Chakrabarti         DP("Implicit argument count: %d\n",
15097086a1dbSDhruva Chakrabarti            KernelInfoEntry.implicit_argument_count);
1510d9b1d827SCarlo Bertolli         if (KernelInfoEntry.implicit_argument_count >= 4) {
1511d9b1d827SCarlo Bertolli           // Initialise pointer for implicit_argument_count != 0 ABI
1512d9b1d827SCarlo Bertolli           // Guess that the right implicit argument is at offset 24 after
1513d9b1d827SCarlo Bertolli           // the explicit arguments. In the future, should be able to read
1514d9b1d827SCarlo Bertolli           // the offset from msgpack. Clang is not annotating it at present.
1515d9b1d827SCarlo Bertolli           uint64_t Offset =
1516d9b1d827SCarlo Bertolli               sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
1517d27d0a67SJoseph Huber           if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
15187086a1dbSDhruva Chakrabarti             DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
15197086a1dbSDhruva Chakrabarti                "args: %d\n",
1520d27d0a67SJoseph Huber                Offset + 8, ArgPool->kernargSizeIncludingImplicit());
1521d9b1d827SCarlo Bertolli           } else {
1522d27d0a67SJoseph Huber             memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
1523d9b1d827SCarlo Bertolli           }
1524d9b1d827SCarlo Bertolli         }
1525d9b1d827SCarlo Bertolli 
1526d9b1d827SCarlo Bertolli         // initialise pointer for implicit_argument_count == 0 ABI
1527d27d0a67SJoseph Huber         ImplArgs->hostcall_ptr = Buffer;
1528d9b1d827SCarlo Bertolli       }
1529d9b1d827SCarlo Bertolli 
1530d27d0a67SJoseph Huber       Packet->kernarg_address = KernArg;
1531d9b1d827SCarlo Bertolli     }
1532d9b1d827SCarlo Bertolli 
1533b5151c32SJon Chesterfield     hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
1534d27d0a67SJoseph Huber     if (S.handle == 0) {
1535d9b1d827SCarlo Bertolli       DP("Failed to get signal instance\n");
1536d9b1d827SCarlo Bertolli       return OFFLOAD_FAIL;
1537d9b1d827SCarlo Bertolli     }
1538d27d0a67SJoseph Huber     Packet->completion_signal = S;
1539d27d0a67SJoseph Huber     hsa_signal_store_relaxed(Packet->completion_signal, 1);
1540d9b1d827SCarlo Bertolli 
1541d9b1d827SCarlo Bertolli     // Publish the packet indicating it is ready to be processed
1542d27d0a67SJoseph Huber     core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
1543d27d0a67SJoseph Huber                              core::createHeader(), Packet->setup);
1544d9b1d827SCarlo Bertolli 
1545d9b1d827SCarlo Bertolli     // Since the packet is already published, its contents must not be
1546d9b1d827SCarlo Bertolli     // accessed any more
1547d27d0a67SJoseph Huber     hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
1548d9b1d827SCarlo Bertolli 
1549d27d0a67SJoseph Huber     while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1550d9b1d827SCarlo Bertolli                                      HSA_WAIT_STATE_BLOCKED) != 0)
1551d9b1d827SCarlo Bertolli       ;
1552d9b1d827SCarlo Bertolli 
1553d9b1d827SCarlo Bertolli     assert(ArgPool);
1554d27d0a67SJoseph Huber     ArgPool->deallocate(KernArg);
1555b5151c32SJon Chesterfield     DeviceInfo().FreeSignalPool.push(S);
1556d9b1d827SCarlo Bertolli   }
1557d9b1d827SCarlo Bertolli 
1558d9b1d827SCarlo Bertolli   DP("Kernel completed\n");
1559d9b1d827SCarlo Bertolli   return OFFLOAD_SUCCESS;
1560d9b1d827SCarlo Bertolli }
1561d9b1d827SCarlo Bertolli 
elfMachineIdIsAmdgcn(__tgt_device_image * Image)1562d27d0a67SJoseph Huber bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
1563d27d0a67SJoseph Huber   const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
1564d27d0a67SJoseph Huber   int32_t R = elf_check_machine(Image, AmdgcnMachineID);
1565d27d0a67SJoseph Huber   if (!R) {
1566cab9f692SJon Chesterfield     DP("Supported machine ID not found\n");
1567cab9f692SJon Chesterfield   }
1568d27d0a67SJoseph Huber   return R;
1569cab9f692SJon Chesterfield }
1570cab9f692SJon Chesterfield 
elfEFlags(__tgt_device_image * Image)1571d27d0a67SJoseph Huber uint32_t elfEFlags(__tgt_device_image *Image) {
1572d27d0a67SJoseph Huber   char *ImgBegin = (char *)Image->ImageStart;
1573d27d0a67SJoseph Huber   size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
1574cab9f692SJon Chesterfield 
1575d27d0a67SJoseph Huber   Elf *E = elf_memory(ImgBegin, ImgSize);
1576d27d0a67SJoseph Huber   if (!E) {
1577cab9f692SJon Chesterfield     DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
1578cab9f692SJon Chesterfield     return 0;
1579cab9f692SJon Chesterfield   }
1580cab9f692SJon Chesterfield 
1581d27d0a67SJoseph Huber   Elf64_Ehdr *Eh64 = elf64_getehdr(E);
1582cab9f692SJon Chesterfield 
1583d27d0a67SJoseph Huber   if (!Eh64) {
1584cab9f692SJon Chesterfield     DP("Unable to get machine ID from ELF file!\n");
1585d27d0a67SJoseph Huber     elf_end(E);
1586cab9f692SJon Chesterfield     return 0;
1587cab9f692SJon Chesterfield   }
1588cab9f692SJon Chesterfield 
1589d27d0a67SJoseph Huber   uint32_t Flags = Eh64->e_flags;
1590cab9f692SJon Chesterfield 
1591d27d0a67SJoseph Huber   elf_end(E);
1592cab9f692SJon Chesterfield   DP("ELF Flags: 0x%x\n", Flags);
1593cab9f692SJon Chesterfield   return Flags;
1594cab9f692SJon Chesterfield }
159591dfb32fSJon Chesterfield 
enforceUpperBound(T * Value,T Upper)1596d27d0a67SJoseph Huber template <typename T> bool enforceUpperBound(T *Value, T Upper) {
1597d27d0a67SJoseph Huber   bool Changed = *Value > Upper;
1598d27d0a67SJoseph Huber   if (Changed) {
1599d27d0a67SJoseph Huber     *Value = Upper;
160091dfb32fSJon Chesterfield   }
1601d27d0a67SJoseph Huber   return Changed;
160291dfb32fSJon Chesterfield }
160391dfb32fSJon Chesterfield 
findOnlyShtHash(Elf * Elf)1604d27d0a67SJoseph Huber Elf64_Shdr *findOnlyShtHash(Elf *Elf) {
160591dfb32fSJon Chesterfield   size_t N;
1606d27d0a67SJoseph Huber   int Rc = elf_getshdrnum(Elf, &N);
1607d27d0a67SJoseph Huber   if (Rc != 0) {
160891dfb32fSJon Chesterfield     return nullptr;
160991dfb32fSJon Chesterfield   }
161091dfb32fSJon Chesterfield 
1611d27d0a67SJoseph Huber   Elf64_Shdr *Result = nullptr;
1612d27d0a67SJoseph Huber   for (size_t I = 0; I < N; I++) {
1613d27d0a67SJoseph Huber     Elf_Scn *Scn = elf_getscn(Elf, I);
1614d27d0a67SJoseph Huber     if (Scn) {
1615d27d0a67SJoseph Huber       Elf64_Shdr *Shdr = elf64_getshdr(Scn);
1616d27d0a67SJoseph Huber       if (Shdr) {
1617d27d0a67SJoseph Huber         if (Shdr->sh_type == SHT_HASH) {
1618d27d0a67SJoseph Huber           if (Result == nullptr) {
1619d27d0a67SJoseph Huber             Result = Shdr;
162091dfb32fSJon Chesterfield           } else {
162191dfb32fSJon Chesterfield             // multiple SHT_HASH sections not handled
162291dfb32fSJon Chesterfield             return nullptr;
162391dfb32fSJon Chesterfield           }
162491dfb32fSJon Chesterfield         }
162591dfb32fSJon Chesterfield       }
162691dfb32fSJon Chesterfield     }
162791dfb32fSJon Chesterfield   }
1628d27d0a67SJoseph Huber   return Result;
162991dfb32fSJon Chesterfield }
163091dfb32fSJon Chesterfield 
elfLookup(Elf * Elf,char * Base,Elf64_Shdr * SectionHash,const char * Symname)1631d27d0a67SJoseph Huber const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash,
1632d27d0a67SJoseph Huber                            const char *Symname) {
163391dfb32fSJon Chesterfield 
1634d27d0a67SJoseph Huber   assert(SectionHash);
1635d27d0a67SJoseph Huber   size_t SectionSymtabIndex = SectionHash->sh_link;
1636d27d0a67SJoseph Huber   Elf64_Shdr *SectionSymtab =
1637d27d0a67SJoseph Huber       elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex));
1638d27d0a67SJoseph Huber   size_t SectionStrtabIndex = SectionSymtab->sh_link;
163991dfb32fSJon Chesterfield 
1640d27d0a67SJoseph Huber   const Elf64_Sym *Symtab =
1641d27d0a67SJoseph Huber       reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset);
164291dfb32fSJon Chesterfield 
1643d27d0a67SJoseph Huber   const uint32_t *Hashtab =
1644d27d0a67SJoseph Huber       reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset);
164591dfb32fSJon Chesterfield 
164691dfb32fSJon Chesterfield   // Layout:
164791dfb32fSJon Chesterfield   // nbucket
164891dfb32fSJon Chesterfield   // nchain
164991dfb32fSJon Chesterfield   // bucket[nbucket]
165091dfb32fSJon Chesterfield   // chain[nchain]
1651d27d0a67SJoseph Huber   uint32_t Nbucket = Hashtab[0];
1652d27d0a67SJoseph Huber   const uint32_t *Bucket = &Hashtab[2];
1653d27d0a67SJoseph Huber   const uint32_t *Chain = &Hashtab[Nbucket + 2];
165491dfb32fSJon Chesterfield 
1655d27d0a67SJoseph Huber   const size_t Max = strlen(Symname) + 1;
1656d27d0a67SJoseph Huber   const uint32_t Hash = elf_hash(Symname);
1657d27d0a67SJoseph Huber   for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) {
1658d27d0a67SJoseph Huber     char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name);
1659d27d0a67SJoseph Huber     if (strncmp(Symname, N, Max) == 0) {
1660d27d0a67SJoseph Huber       return &Symtab[I];
166191dfb32fSJon Chesterfield     }
166291dfb32fSJon Chesterfield   }
166391dfb32fSJon Chesterfield 
166491dfb32fSJon Chesterfield   return nullptr;
166591dfb32fSJon Chesterfield }
166691dfb32fSJon Chesterfield 
1667d27d0a67SJoseph Huber struct SymbolInfo {
1668d27d0a67SJoseph Huber   void *Addr = nullptr;
1669d27d0a67SJoseph Huber   uint32_t Size = UINT32_MAX;
1670d27d0a67SJoseph Huber   uint32_t ShType = SHT_NULL;
167191dfb32fSJon Chesterfield };
167291dfb32fSJon Chesterfield 
getSymbolInfoWithoutLoading(Elf * Elf,char * Base,const char * Symname,SymbolInfo * Res)1673d27d0a67SJoseph Huber int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname,
1674d27d0a67SJoseph Huber                                 SymbolInfo *Res) {
1675d27d0a67SJoseph Huber   if (elf_kind(Elf) != ELF_K_ELF) {
167691dfb32fSJon Chesterfield     return 1;
167791dfb32fSJon Chesterfield   }
167891dfb32fSJon Chesterfield 
1679d27d0a67SJoseph Huber   Elf64_Shdr *SectionHash = findOnlyShtHash(Elf);
1680d27d0a67SJoseph Huber   if (!SectionHash) {
168191dfb32fSJon Chesterfield     return 1;
168291dfb32fSJon Chesterfield   }
168391dfb32fSJon Chesterfield 
1684d27d0a67SJoseph Huber   const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname);
1685d27d0a67SJoseph Huber   if (!Sym) {
168691dfb32fSJon Chesterfield     return 1;
168791dfb32fSJon Chesterfield   }
168891dfb32fSJon Chesterfield 
1689d27d0a67SJoseph Huber   if (Sym->st_size > UINT32_MAX) {
169091dfb32fSJon Chesterfield     return 1;
169191dfb32fSJon Chesterfield   }
169291dfb32fSJon Chesterfield 
1693d27d0a67SJoseph Huber   if (Sym->st_shndx == SHN_UNDEF) {
169491dfb32fSJon Chesterfield     return 1;
169591dfb32fSJon Chesterfield   }
169691dfb32fSJon Chesterfield 
1697d27d0a67SJoseph Huber   Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx);
1698d27d0a67SJoseph Huber   if (!Section) {
169991dfb32fSJon Chesterfield     return 1;
170091dfb32fSJon Chesterfield   }
170191dfb32fSJon Chesterfield 
1702d27d0a67SJoseph Huber   Elf64_Shdr *Header = elf64_getshdr(Section);
1703d27d0a67SJoseph Huber   if (!Header) {
170491dfb32fSJon Chesterfield     return 1;
170591dfb32fSJon Chesterfield   }
170691dfb32fSJon Chesterfield 
1707d27d0a67SJoseph Huber   Res->Addr = Sym->st_value + Base;
1708d27d0a67SJoseph Huber   Res->Size = static_cast<uint32_t>(Sym->st_size);
1709d27d0a67SJoseph Huber   Res->ShType = Header->sh_type;
171091dfb32fSJon Chesterfield   return 0;
171191dfb32fSJon Chesterfield }
171291dfb32fSJon Chesterfield 
getSymbolInfoWithoutLoading(char * Base,size_t ImgSize,const char * Symname,SymbolInfo * Res)1713d27d0a67SJoseph Huber int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname,
1714d27d0a67SJoseph Huber                                 SymbolInfo *Res) {
1715d27d0a67SJoseph Huber   Elf *Elf = elf_memory(Base, ImgSize);
1716d27d0a67SJoseph Huber   if (Elf) {
1717d27d0a67SJoseph Huber     int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res);
1718d27d0a67SJoseph Huber     elf_end(Elf);
1719d27d0a67SJoseph Huber     return Rc;
172091dfb32fSJon Chesterfield   }
172191dfb32fSJon Chesterfield   return 1;
172291dfb32fSJon Chesterfield }
172391dfb32fSJon Chesterfield 
interopGetSymbolInfo(char * Base,size_t ImgSize,const char * SymName,void ** VarAddr,uint32_t * VarSize)1724d27d0a67SJoseph Huber hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
1725d27d0a67SJoseph Huber                                   const char *SymName, void **VarAddr,
1726d27d0a67SJoseph Huber                                   uint32_t *VarSize) {
1727d27d0a67SJoseph Huber   SymbolInfo SI;
1728d27d0a67SJoseph Huber   int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
1729d27d0a67SJoseph Huber   if (Rc == 0) {
1730d27d0a67SJoseph Huber     *VarAddr = SI.Addr;
1731d27d0a67SJoseph Huber     *VarSize = SI.Size;
173291dfb32fSJon Chesterfield     return HSA_STATUS_SUCCESS;
173391dfb32fSJon Chesterfield   }
1734d27d0a67SJoseph Huber   return HSA_STATUS_ERROR;
173591dfb32fSJon Chesterfield }
173691dfb32fSJon Chesterfield 
173791dfb32fSJon Chesterfield template <typename C>
moduleRegisterFromMemoryToPlace(std::map<std::string,atl_kernel_info_t> & KernelInfoTable,std::map<std::string,atl_symbol_info_t> & SymbolInfoTable,void * ModuleBytes,size_t ModuleSize,int DeviceId,C Cb,std::vector<hsa_executable_t> & HSAExecutables)1738d27d0a67SJoseph Huber hsa_status_t moduleRegisterFromMemoryToPlace(
173991dfb32fSJon Chesterfield     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
174091dfb32fSJon Chesterfield     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1741d27d0a67SJoseph Huber     void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
174291dfb32fSJon Chesterfield     std::vector<hsa_executable_t> &HSAExecutables) {
1743d27d0a67SJoseph Huber   auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1744d27d0a67SJoseph Huber     C *Unwrapped = static_cast<C *>(CbState);
1745d27d0a67SJoseph Huber     return (*Unwrapped)(Data, Size);
174691dfb32fSJon Chesterfield   };
174791dfb32fSJon Chesterfield   return core::RegisterModuleFromMemory(
1748d27d0a67SJoseph Huber       KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1749b5151c32SJon Chesterfield       DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
175091dfb32fSJon Chesterfield       HSAExecutables);
175191dfb32fSJon Chesterfield }
175291dfb32fSJon Chesterfield 
getDeviceStateBytes(char * ImageStart,size_t ImgSize)1753d27d0a67SJoseph Huber uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1754d27d0a67SJoseph Huber   uint64_t DeviceStateBytes = 0;
175591dfb32fSJon Chesterfield   {
175691dfb32fSJon Chesterfield     // If this is the deviceRTL, get the state variable size
1757d27d0a67SJoseph Huber     SymbolInfo SizeSi;
1758d27d0a67SJoseph Huber     int Rc = getSymbolInfoWithoutLoading(
1759d27d0a67SJoseph Huber         ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
176091dfb32fSJon Chesterfield 
1761d27d0a67SJoseph Huber     if (Rc == 0) {
1762d27d0a67SJoseph Huber       if (SizeSi.Size != sizeof(uint64_t)) {
176391dfb32fSJon Chesterfield         DP("Found device_State_size variable with wrong size\n");
176491dfb32fSJon Chesterfield         return 0;
176591dfb32fSJon Chesterfield       }
176691dfb32fSJon Chesterfield 
176791dfb32fSJon Chesterfield       // Read number of bytes directly from the elf
1768d27d0a67SJoseph Huber       memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
176991dfb32fSJon Chesterfield     }
177091dfb32fSJon Chesterfield   }
1771d27d0a67SJoseph Huber   return DeviceStateBytes;
177291dfb32fSJon Chesterfield }
177391dfb32fSJon Chesterfield 
1774d27d0a67SJoseph Huber struct DeviceEnvironment {
177591dfb32fSJon Chesterfield   // initialise an DeviceEnvironmentTy in the deviceRTL
177691dfb32fSJon Chesterfield   // patches around differences in the deviceRTL between trunk, aomp,
177791dfb32fSJon Chesterfield   // rocmcc. Over time these differences will tend to zero and this class
177891dfb32fSJon Chesterfield   // simplified.
177991dfb32fSJon Chesterfield   // Symbol may be in .data or .bss, and may be missing fields, todo:
178091dfb32fSJon Chesterfield   // review aomp/trunk/rocm and simplify the following
178191dfb32fSJon Chesterfield 
178291dfb32fSJon Chesterfield   // The symbol may also have been deadstripped because the device side
178391dfb32fSJon Chesterfield   // accessors were unused.
178491dfb32fSJon Chesterfield 
178591dfb32fSJon Chesterfield   // If the symbol is in .data (aomp, rocm) it can be written directly.
178691dfb32fSJon Chesterfield   // If it is in .bss, we must wait for it to be allocated space on the
178791dfb32fSJon Chesterfield   // gpu (trunk) and initialize after loading.
sym__anone797b96e0b11::DeviceEnvironment178891dfb32fSJon Chesterfield   const char *sym() { return "omptarget_device_environment"; }
178991dfb32fSJon Chesterfield 
1790d27d0a67SJoseph Huber   DeviceEnvironmentTy HostDeviceEnv;
1791d27d0a67SJoseph Huber   SymbolInfo SI;
1792d27d0a67SJoseph Huber   bool Valid = false;
179391dfb32fSJon Chesterfield 
1794d27d0a67SJoseph Huber   __tgt_device_image *Image;
1795d27d0a67SJoseph Huber   const size_t ImgSize;
179691dfb32fSJon Chesterfield 
DeviceEnvironment__anone797b96e0b11::DeviceEnvironment1797d27d0a67SJoseph Huber   DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
1798d27d0a67SJoseph Huber                     __tgt_device_image *Image, const size_t ImgSize)
1799d27d0a67SJoseph Huber       : Image(Image), ImgSize(ImgSize) {
180091dfb32fSJon Chesterfield 
1801d27d0a67SJoseph Huber     HostDeviceEnv.NumDevices = NumberDevices;
1802d27d0a67SJoseph Huber     HostDeviceEnv.DeviceNum = DeviceId;
1803d27d0a67SJoseph Huber     HostDeviceEnv.DebugKind = 0;
1804d27d0a67SJoseph Huber     HostDeviceEnv.DynamicMemSize = DynamicMemSize;
1805d27d0a67SJoseph Huber     if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
1806d27d0a67SJoseph Huber       HostDeviceEnv.DebugKind = std::stoi(EnvStr);
180791dfb32fSJon Chesterfield 
1808d27d0a67SJoseph Huber     int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
1809d27d0a67SJoseph Huber                                          sym(), &SI);
1810d27d0a67SJoseph Huber     if (Rc != 0) {
181191dfb32fSJon Chesterfield       DP("Finding global device environment '%s' - symbol missing.\n", sym());
181291dfb32fSJon Chesterfield       return;
181391dfb32fSJon Chesterfield     }
181491dfb32fSJon Chesterfield 
1815d27d0a67SJoseph Huber     if (SI.Size > sizeof(HostDeviceEnv)) {
1816d27d0a67SJoseph Huber       DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,
1817d27d0a67SJoseph Huber          sizeof(HostDeviceEnv));
181891dfb32fSJon Chesterfield       return;
181991dfb32fSJon Chesterfield     }
182091dfb32fSJon Chesterfield 
1821d27d0a67SJoseph Huber     Valid = true;
182291dfb32fSJon Chesterfield   }
182391dfb32fSJon Chesterfield 
inImage__anone797b96e0b11::DeviceEnvironment1824d27d0a67SJoseph Huber   bool inImage() { return SI.ShType != SHT_NOBITS; }
182591dfb32fSJon Chesterfield 
beforeLoading__anone797b96e0b11::DeviceEnvironment1826d27d0a67SJoseph Huber   hsa_status_t beforeLoading(void *Data, size_t Size) {
1827d27d0a67SJoseph Huber     if (Valid) {
1828d27d0a67SJoseph Huber       if (inImage()) {
182991dfb32fSJon Chesterfield         DP("Setting global device environment before load (%u bytes)\n",
1830d27d0a67SJoseph Huber            SI.Size);
1831d27d0a67SJoseph Huber         uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart;
1832d27d0a67SJoseph Huber         void *Pos = (char *)Data + Offset;
1833d27d0a67SJoseph Huber         memcpy(Pos, &HostDeviceEnv, SI.Size);
183491dfb32fSJon Chesterfield       }
183591dfb32fSJon Chesterfield     }
183691dfb32fSJon Chesterfield     return HSA_STATUS_SUCCESS;
183791dfb32fSJon Chesterfield   }
183891dfb32fSJon Chesterfield 
afterLoading__anone797b96e0b11::DeviceEnvironment1839d27d0a67SJoseph Huber   hsa_status_t afterLoading() {
1840d27d0a67SJoseph Huber     if (Valid) {
1841d27d0a67SJoseph Huber       if (!inImage()) {
184291dfb32fSJon Chesterfield         DP("Setting global device environment after load (%u bytes)\n",
1843d27d0a67SJoseph Huber            SI.Size);
1844d27d0a67SJoseph Huber         int DeviceId = HostDeviceEnv.DeviceNum;
1845b5151c32SJon Chesterfield         auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
1846d27d0a67SJoseph Huber         void *StatePtr;
1847d27d0a67SJoseph Huber         uint32_t StatePtrSize;
1848d27d0a67SJoseph Huber         hsa_status_t Err = interop_hsa_get_symbol_info(
1849d27d0a67SJoseph Huber             SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
1850d27d0a67SJoseph Huber         if (Err != HSA_STATUS_SUCCESS) {
185191dfb32fSJon Chesterfield           DP("failed to find %s in loaded image\n", sym());
1852d27d0a67SJoseph Huber           return Err;
185391dfb32fSJon Chesterfield         }
185491dfb32fSJon Chesterfield 
1855d27d0a67SJoseph Huber         if (StatePtrSize != SI.Size) {
1856d27d0a67SJoseph Huber           DP("Symbol had size %u before loading, %u after\n", StatePtrSize,
1857d27d0a67SJoseph Huber              SI.Size);
185891dfb32fSJon Chesterfield           return HSA_STATUS_ERROR;
185991dfb32fSJon Chesterfield         }
186091dfb32fSJon Chesterfield 
1861b5151c32SJon Chesterfield         return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
1862d27d0a67SJoseph Huber                                                   StatePtrSize, DeviceId);
186391dfb32fSJon Chesterfield       }
186491dfb32fSJon Chesterfield     }
186591dfb32fSJon Chesterfield     return HSA_STATUS_SUCCESS;
186691dfb32fSJon Chesterfield   }
186791dfb32fSJon Chesterfield };
186891dfb32fSJon Chesterfield 
implCalloc(void ** RetPtr,size_t Size,int DeviceId)1869d27d0a67SJoseph Huber hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
1870d27d0a67SJoseph Huber   uint64_t Rounded = 4 * ((Size + 3) / 4);
1871d27d0a67SJoseph Huber   void *Ptr;
1872b5151c32SJon Chesterfield   hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
1873d27d0a67SJoseph Huber   hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
1874d27d0a67SJoseph Huber   if (Err != HSA_STATUS_SUCCESS) {
1875d27d0a67SJoseph Huber     return Err;
187691dfb32fSJon Chesterfield   }
187791dfb32fSJon Chesterfield 
1878d27d0a67SJoseph Huber   hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
1879d27d0a67SJoseph Huber   if (Rc != HSA_STATUS_SUCCESS) {
1880d27d0a67SJoseph Huber     DP("zero fill device_state failed with %u\n", Rc);
1881d27d0a67SJoseph Huber     core::Runtime::Memfree(Ptr);
188291dfb32fSJon Chesterfield     return HSA_STATUS_ERROR;
188391dfb32fSJon Chesterfield   }
188491dfb32fSJon Chesterfield 
1885d27d0a67SJoseph Huber   *RetPtr = Ptr;
188691dfb32fSJon Chesterfield   return HSA_STATUS_SUCCESS;
188791dfb32fSJon Chesterfield }
188891dfb32fSJon Chesterfield 
imageContainsSymbol(void * Data,size_t Size,const char * Sym)1889d27d0a67SJoseph Huber bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
1890d27d0a67SJoseph Huber   SymbolInfo SI;
1891d27d0a67SJoseph Huber   int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
1892d27d0a67SJoseph Huber   return (Rc == 0) && (SI.Addr != nullptr);
189391dfb32fSJon Chesterfield }
189491dfb32fSJon Chesterfield 
1895d0b31295SJon Chesterfield } // namespace
1896d0b31295SJon Chesterfield 
189791dfb32fSJon Chesterfield namespace core {
allow_access_to_all_gpu_agents(void * Ptr)1898d27d0a67SJoseph Huber hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
1899b5151c32SJon Chesterfield   return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
1900b5151c32SJon Chesterfield                                      &DeviceInfo().HSAAgents[0], NULL, Ptr);
190191dfb32fSJon Chesterfield }
190291dfb32fSJon Chesterfield } // namespace core
190391dfb32fSJon Chesterfield 
GetIsaInfo(hsa_isa_t isa,void * data)19044075a811SSaiyedul Islam static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
19054075a811SSaiyedul Islam   hsa_status_t err;
19064075a811SSaiyedul Islam   uint32_t name_len;
19074075a811SSaiyedul Islam   err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
19084075a811SSaiyedul Islam   if (err != HSA_STATUS_SUCCESS) {
19094075a811SSaiyedul Islam     DP("Error getting ISA info length\n");
19104075a811SSaiyedul Islam     return err;
19114075a811SSaiyedul Islam   }
19124075a811SSaiyedul Islam 
19134075a811SSaiyedul Islam   char TargetID[name_len];
19144075a811SSaiyedul Islam   err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
19154075a811SSaiyedul Islam   if (err != HSA_STATUS_SUCCESS) {
19164075a811SSaiyedul Islam     DP("Error getting ISA info name\n");
19174075a811SSaiyedul Islam     return err;
19184075a811SSaiyedul Islam   }
19194075a811SSaiyedul Islam 
19204075a811SSaiyedul Islam   auto TripleTargetID = llvm::StringRef(TargetID);
19214075a811SSaiyedul Islam   if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
1922b5151c32SJon Chesterfield     DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
19234075a811SSaiyedul Islam   }
19244075a811SSaiyedul Islam   return HSA_STATUS_SUCCESS;
19254075a811SSaiyedul Islam }
19264075a811SSaiyedul Islam 
19274075a811SSaiyedul Islam /// Parse a TargetID to get processor arch and feature map.
19284075a811SSaiyedul Islam /// Returns processor subarch.
19294075a811SSaiyedul Islam /// Returns TargetID features in \p FeatureMap argument.
19304075a811SSaiyedul Islam /// If the \p TargetID contains feature+, FeatureMap it to true.
19314075a811SSaiyedul Islam /// If the \p TargetID contains feature-, FeatureMap it to false.
19324075a811SSaiyedul Islam /// If the \p TargetID does not contain a feature (default), do not map it.
parseTargetID(StringRef TargetID,StringMap<bool> & FeatureMap)19334075a811SSaiyedul Islam StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
19344075a811SSaiyedul Islam   if (TargetID.empty())
19354075a811SSaiyedul Islam     return llvm::StringRef();
19364075a811SSaiyedul Islam 
19374075a811SSaiyedul Islam   auto ArchFeature = TargetID.split(":");
19384075a811SSaiyedul Islam   auto Arch = ArchFeature.first;
19394075a811SSaiyedul Islam   auto Features = ArchFeature.second;
19404075a811SSaiyedul Islam   if (Features.empty())
19414075a811SSaiyedul Islam     return Arch;
19424075a811SSaiyedul Islam 
19434075a811SSaiyedul Islam   if (Features.contains("sramecc+")) {
19444075a811SSaiyedul Islam     FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
19454075a811SSaiyedul Islam   } else if (Features.contains("sramecc-")) {
19464075a811SSaiyedul Islam     FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
19474075a811SSaiyedul Islam   }
19484075a811SSaiyedul Islam   if (Features.contains("xnack+")) {
19494075a811SSaiyedul Islam     FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
19504075a811SSaiyedul Islam   } else if (Features.contains("xnack-")) {
19514075a811SSaiyedul Islam     FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
19524075a811SSaiyedul Islam   }
19534075a811SSaiyedul Islam 
19544075a811SSaiyedul Islam   return Arch;
19554075a811SSaiyedul Islam }
19564075a811SSaiyedul Islam 
19574075a811SSaiyedul Islam /// Checks if an image \p ImgInfo is compatible with current
19584075a811SSaiyedul Islam /// system's environment \p EnvInfo
IsImageCompatibleWithEnv(const char * ImgInfo,std::string EnvInfo)19594075a811SSaiyedul Islam bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
19604075a811SSaiyedul Islam   llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
19614075a811SSaiyedul Islam 
19624075a811SSaiyedul Islam   // Compatible in case of exact match
19634075a811SSaiyedul Islam   if (ImgTID == EnvTID) {
19644075a811SSaiyedul Islam     DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",
19654075a811SSaiyedul Islam        ImgTID.data(), EnvTID.data());
19664075a811SSaiyedul Islam     return true;
19674075a811SSaiyedul Islam   }
19684075a811SSaiyedul Islam 
19694075a811SSaiyedul Islam   // Incompatible if Archs mismatch.
19704075a811SSaiyedul Islam   StringMap<bool> ImgMap, EnvMap;
19714075a811SSaiyedul Islam   StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
19724075a811SSaiyedul Islam   StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
19734075a811SSaiyedul Islam 
19744075a811SSaiyedul Islam   // Both EnvArch and ImgArch can't be empty here.
19754075a811SSaiyedul Islam   if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
19764075a811SSaiyedul Islam     DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",
19774075a811SSaiyedul Islam        ImgTID.data(), EnvTID.data());
19784075a811SSaiyedul Islam     return false;
19794075a811SSaiyedul Islam   }
19804075a811SSaiyedul Islam 
19814075a811SSaiyedul Islam   // Incompatible if image has more features than the environment, irrespective
19824075a811SSaiyedul Islam   // of type or sign of features.
19834075a811SSaiyedul Islam   if (ImgMap.size() > EnvMap.size()) {
19844075a811SSaiyedul Islam     DP("Incompatible: Image has more features than the environment \t[Image: "
19854075a811SSaiyedul Islam        "%s]\t:\t[Environment: %s]\n",
19864075a811SSaiyedul Islam        ImgTID.data(), EnvTID.data());
19874075a811SSaiyedul Islam     return false;
19884075a811SSaiyedul Islam   }
19894075a811SSaiyedul Islam 
19904075a811SSaiyedul Islam   // Compatible if each target feature specified by the environment is
19914075a811SSaiyedul Islam   // compatible with target feature of the image. The target feature is
19924075a811SSaiyedul Islam   // compatible if the iamge does not specify it (meaning Any), or if it
19934075a811SSaiyedul Islam   // specifies it with the same value (meaning On or Off).
19944075a811SSaiyedul Islam   for (const auto &ImgFeature : ImgMap) {
19954075a811SSaiyedul Islam     auto EnvFeature = EnvMap.find(ImgFeature.first());
19964075a811SSaiyedul Islam     if (EnvFeature == EnvMap.end()) {
19974075a811SSaiyedul Islam       DP("Incompatible: Value of Image's non-ANY feature is not matching with "
19984075a811SSaiyedul Islam          "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "
19994075a811SSaiyedul Islam          "%s]\n",
20004075a811SSaiyedul Islam          ImgTID.data(), EnvTID.data());
20014075a811SSaiyedul Islam       return false;
20024075a811SSaiyedul Islam     } else if (EnvFeature->first() == ImgFeature.first() &&
20034075a811SSaiyedul Islam                EnvFeature->second != ImgFeature.second) {
20044075a811SSaiyedul Islam       DP("Incompatible: Value of Image's non-ANY feature is not matching with "
20054075a811SSaiyedul Islam          "the Environment feature's non-ANY value \t[Image: "
20064075a811SSaiyedul Islam          "%s]\t:\t[Environment: %s]\n",
20074075a811SSaiyedul Islam          ImgTID.data(), EnvTID.data());
20084075a811SSaiyedul Islam       return false;
20094075a811SSaiyedul Islam     }
20104075a811SSaiyedul Islam   }
20114075a811SSaiyedul Islam 
20124075a811SSaiyedul Islam   // Image is compatible if all features of Environment are:
20134075a811SSaiyedul Islam   //   - either, present in the Image's features map with the same sign,
20144075a811SSaiyedul Islam   //   - or, the feature is missing from Image's features map i.e. it is
20154075a811SSaiyedul Islam   //   set to ANY
20164075a811SSaiyedul Islam   DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "
20174075a811SSaiyedul Islam      "%s]\n",
20184075a811SSaiyedul Islam      ImgTID.data(), EnvTID.data());
20194075a811SSaiyedul Islam   return true;
20204075a811SSaiyedul Islam }
20214075a811SSaiyedul Islam 
202291dfb32fSJon Chesterfield extern "C" {
__tgt_rtl_is_valid_binary(__tgt_device_image * Image)2023d27d0a67SJoseph Huber int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
2024d27d0a67SJoseph Huber   return elfMachineIdIsAmdgcn(Image);
2025d0b31295SJon Chesterfield }
2026d0b31295SJon Chesterfield 
__tgt_rtl_is_valid_binary_info(__tgt_device_image * image,__tgt_image_info * info)20274075a811SSaiyedul Islam int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
20284075a811SSaiyedul Islam                                        __tgt_image_info *info) {
20294075a811SSaiyedul Islam   if (!__tgt_rtl_is_valid_binary(image))
20304075a811SSaiyedul Islam     return false;
20314075a811SSaiyedul Islam 
20324075a811SSaiyedul Islam   // A subarchitecture was not specified. Assume it is compatible.
20334075a811SSaiyedul Islam   if (!info->Arch)
20344075a811SSaiyedul Islam     return true;
20354075a811SSaiyedul Islam 
20364075a811SSaiyedul Islam   int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
20374075a811SSaiyedul Islam 
20384075a811SSaiyedul Islam   for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
20394075a811SSaiyedul Islam     __tgt_rtl_init_device(DeviceId);
2040b5151c32SJon Chesterfield     hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
20414075a811SSaiyedul Islam     hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
20424075a811SSaiyedul Islam     if (err != HSA_STATUS_SUCCESS) {
20434075a811SSaiyedul Islam       DP("Error iterating ISAs\n");
20444075a811SSaiyedul Islam       return false;
20454075a811SSaiyedul Islam     }
2046b5151c32SJon Chesterfield     if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
20474075a811SSaiyedul Islam       return false;
20484075a811SSaiyedul Islam   }
20494075a811SSaiyedul Islam   DP("Image has Target ID compatible with the current environment: %s\n",
20504075a811SSaiyedul Islam      info->Arch);
20514075a811SSaiyedul Islam   return true;
20524075a811SSaiyedul Islam }
20534075a811SSaiyedul Islam 
__tgt_rtl_init_plugin()2054*046d5b91SJoseph Huber int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; }
__tgt_rtl_deinit_plugin()2055*046d5b91SJoseph Huber int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
2056*046d5b91SJoseph Huber 
__tgt_rtl_number_of_devices()2057738734f6SJon Chesterfield int __tgt_rtl_number_of_devices() {
2058738734f6SJon Chesterfield   // If the construction failed, no methods are safe to call
2059b5151c32SJon Chesterfield   if (DeviceInfo().ConstructionSucceeded) {
2060b5151c32SJon Chesterfield     return DeviceInfo().NumberOfDevices;
2061d27d0a67SJoseph Huber   }
2062738734f6SJon Chesterfield   DP("AMDGPU plugin construction failed. Zero devices available\n");
2063738734f6SJon Chesterfield   return 0;
2064738734f6SJon Chesterfield }
2065d0b31295SJon Chesterfield 
__tgt_rtl_init_requires(int64_t RequiresFlags)2066d0b31295SJon Chesterfield int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
2067d0b31295SJon Chesterfield   DP("Init requires flags to %ld\n", RequiresFlags);
2068b5151c32SJon Chesterfield   DeviceInfo().RequiresFlags = RequiresFlags;
2069d0b31295SJon Chesterfield   return RequiresFlags;
2070d0b31295SJon Chesterfield }
2071d0b31295SJon Chesterfield 
__tgt_rtl_init_device(int DeviceId)2072d27d0a67SJoseph Huber int32_t __tgt_rtl_init_device(int DeviceId) {
20734075a811SSaiyedul Islam   hsa_status_t Err = hsa_init();
20744075a811SSaiyedul Islam   if (Err != HSA_STATUS_SUCCESS) {
20754075a811SSaiyedul Islam     DP("HSA Initialization Failed.\n");
20764075a811SSaiyedul Islam     return HSA_STATUS_ERROR;
20774075a811SSaiyedul Islam   }
2078d0b31295SJon Chesterfield   // this is per device id init
2079d27d0a67SJoseph Huber   DP("Initialize the device id: %d\n", DeviceId);
2080d0b31295SJon Chesterfield 
2081b5151c32SJon Chesterfield   hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2082d0b31295SJon Chesterfield 
2083d0b31295SJon Chesterfield   // Get number of Compute Unit
2084d27d0a67SJoseph Huber   uint32_t ComputeUnits = 0;
2085d27d0a67SJoseph Huber   Err = hsa_agent_get_info(
2086d27d0a67SJoseph Huber       Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2087d27d0a67SJoseph Huber       &ComputeUnits);
2088d27d0a67SJoseph Huber   if (Err != HSA_STATUS_SUCCESS) {
2089b5151c32SJon Chesterfield     DeviceInfo().ComputeUnits[DeviceId] = 1;
2090d0b31295SJon Chesterfield     DP("Error getting compute units : settiing to 1\n");
2091d0b31295SJon Chesterfield   } else {
2092b5151c32SJon Chesterfield     DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
2093b5151c32SJon Chesterfield     DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
2094d0b31295SJon Chesterfield   }
2095cab9f692SJon Chesterfield 
2096cab9f692SJon Chesterfield   char GetInfoName[64]; // 64 max size returned by get info
2097d27d0a67SJoseph Huber   Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2098cab9f692SJon Chesterfield                            (void *)GetInfoName);
2099d27d0a67SJoseph Huber   if (Err)
2100b5151c32SJon Chesterfield     DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
2101cab9f692SJon Chesterfield   else {
2102b5151c32SJon Chesterfield     DeviceInfo().GPUName[DeviceId] = GetInfoName;
2103cab9f692SJon Chesterfield   }
2104cab9f692SJon Chesterfield 
210530c0d5b4SRon Lieberman   if (print_kernel_trace & STARTUP_DETAILS)
2106d27d0a67SJoseph Huber     DP("Device#%-2d CU's: %2d %s\n", DeviceId,
2107b5151c32SJon Chesterfield        DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str());
2108d0b31295SJon Chesterfield 
2109d0b31295SJon Chesterfield   // Query attributes to determine number of threads/block and blocks/grid.
2110d27d0a67SJoseph Huber   uint16_t WorkgroupMaxDim[3];
2111d27d0a67SJoseph Huber   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
2112d27d0a67SJoseph Huber                            &WorkgroupMaxDim);
2113d27d0a67SJoseph Huber   if (Err != HSA_STATUS_SUCCESS) {
2114b5151c32SJon Chesterfield     DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
2115d0b31295SJon Chesterfield     DP("Error getting grid dims: num groups : %d\n",
2116d0b31295SJon Chesterfield        RTLDeviceInfoTy::DefaultNumTeams);
2117d27d0a67SJoseph Huber   } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
2118b5151c32SJon Chesterfield     DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
2119b5151c32SJon Chesterfield     DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]);
2120d0b31295SJon Chesterfield   } else {
2121b5151c32SJon Chesterfield     DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
2122d0b31295SJon Chesterfield     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
2123d0b31295SJon Chesterfield        "at the hard limit\n",
2124d27d0a67SJoseph Huber        WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
2125d0b31295SJon Chesterfield   }
2126d0b31295SJon Chesterfield 
2127d0b31295SJon Chesterfield   // Get thread limit
2128d27d0a67SJoseph Huber   hsa_dim3_t GridMaxDim;
2129d27d0a67SJoseph Huber   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
2130d27d0a67SJoseph Huber   if (Err == HSA_STATUS_SUCCESS) {
2131b5151c32SJon Chesterfield     DeviceInfo().ThreadsPerGroup[DeviceId] =
2132d27d0a67SJoseph Huber         reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
2133b5151c32SJon Chesterfield         DeviceInfo().GroupsPerDevice[DeviceId];
21342fdf8bbdSJon Chesterfield 
2135b5151c32SJon Chesterfield     if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
2136b5151c32SJon Chesterfield       DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2137d27d0a67SJoseph Huber       DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2138b5151c32SJon Chesterfield     } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
2139d27d0a67SJoseph Huber                                  RTLDeviceInfoTy::MaxWgSize)) {
2140d27d0a67SJoseph Huber       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2141d0b31295SJon Chesterfield     } else {
2142d0b31295SJon Chesterfield       DP("Using ROCm Queried thread limit: %d\n",
2143b5151c32SJon Chesterfield          DeviceInfo().ThreadsPerGroup[DeviceId]);
2144d0b31295SJon Chesterfield     }
2145d0b31295SJon Chesterfield   } else {
2146b5151c32SJon Chesterfield     DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2147d0b31295SJon Chesterfield     DP("Error getting max block dimension, use default:%d \n",
2148d27d0a67SJoseph Huber        RTLDeviceInfoTy::MaxWgSize);
2149d0b31295SJon Chesterfield   }
2150d0b31295SJon Chesterfield 
2151d0b31295SJon Chesterfield   // Get wavefront size
2152d27d0a67SJoseph Huber   uint32_t WavefrontSize = 0;
2153d27d0a67SJoseph Huber   Err =
2154d27d0a67SJoseph Huber       hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
2155d27d0a67SJoseph Huber   if (Err == HSA_STATUS_SUCCESS) {
2156d27d0a67SJoseph Huber     DP("Queried wavefront size: %d\n", WavefrontSize);
2157b5151c32SJon Chesterfield     DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
2158d0b31295SJon Chesterfield   } else {
215978f92c38SJon Chesterfield     // TODO: Burn the wavefront size into the code object
216078f92c38SJon Chesterfield     DP("Warning: Unknown wavefront size, assuming 64\n");
2161b5151c32SJon Chesterfield     DeviceInfo().WarpSize[DeviceId] = 64;
2162d0b31295SJon Chesterfield   }
2163d0b31295SJon Chesterfield 
2164d0b31295SJon Chesterfield   // Adjust teams to the env variables
21652fdf8bbdSJon Chesterfield 
2166b5151c32SJon Chesterfield   if (DeviceInfo().Env.TeamLimit > 0 &&
2167b5151c32SJon Chesterfield       (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
2168b5151c32SJon Chesterfield                          DeviceInfo().Env.TeamLimit))) {
2169d0b31295SJon Chesterfield     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
2170b5151c32SJon Chesterfield        DeviceInfo().Env.TeamLimit);
2171d0b31295SJon Chesterfield   }
2172d0b31295SJon Chesterfield 
2173d0b31295SJon Chesterfield   // Set default number of teams
2174b5151c32SJon Chesterfield   if (DeviceInfo().Env.NumTeams > 0) {
2175b5151c32SJon Chesterfield     DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
2176d0b31295SJon Chesterfield     DP("Default number of teams set according to environment %d\n",
2177b5151c32SJon Chesterfield        DeviceInfo().Env.NumTeams);
2178d0b31295SJon Chesterfield   } else {
2179c9bc4148SJon Chesterfield     char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
2180e0b713a0SDhruva Chakrabarti     int TeamsPerCU = DefaultTeamsPerCU;
2181c9bc4148SJon Chesterfield     if (TeamsPerCUEnvStr) {
2182c9bc4148SJon Chesterfield       TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
2183c9bc4148SJon Chesterfield     }
2184c9bc4148SJon Chesterfield 
2185b5151c32SJon Chesterfield     DeviceInfo().NumTeams[DeviceId] =
2186b5151c32SJon Chesterfield         TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
2187c9bc4148SJon Chesterfield     DP("Default number of teams = %d * number of compute units %d\n",
2188b5151c32SJon Chesterfield        TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]);
2189d0b31295SJon Chesterfield   }
2190d0b31295SJon Chesterfield 
2191b5151c32SJon Chesterfield   if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
2192b5151c32SJon Chesterfield                         DeviceInfo().GroupsPerDevice[DeviceId])) {
2193d0b31295SJon Chesterfield     DP("Default number of teams exceeds device limit, capping at %d\n",
2194b5151c32SJon Chesterfield        DeviceInfo().GroupsPerDevice[DeviceId]);
2195d0b31295SJon Chesterfield   }
2196d0b31295SJon Chesterfield 
2197422adaa8SJoseph Huber   // Adjust threads to the env variables
2198b5151c32SJon Chesterfield   if (DeviceInfo().Env.TeamThreadLimit > 0 &&
2199b5151c32SJon Chesterfield       (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2200b5151c32SJon Chesterfield                          DeviceInfo().Env.TeamThreadLimit))) {
2201422adaa8SJoseph Huber     DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
2202b5151c32SJon Chesterfield        DeviceInfo().Env.TeamThreadLimit);
2203422adaa8SJoseph Huber   }
2204422adaa8SJoseph Huber 
2205d0b31295SJon Chesterfield   // Set default number of threads
2206b5151c32SJon Chesterfield   DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
2207d0b31295SJon Chesterfield   DP("Default number of threads set according to library's default %d\n",
2208d27d0a67SJoseph Huber      RTLDeviceInfoTy::DefaultWgSize);
2209b5151c32SJon Chesterfield   if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2210b5151c32SJon Chesterfield                         DeviceInfo().ThreadsPerGroup[DeviceId])) {
2211d0b31295SJon Chesterfield     DP("Default number of threads exceeds device limit, capping at %d\n",
2212b5151c32SJon Chesterfield        DeviceInfo().ThreadsPerGroup[DeviceId]);
2213d0b31295SJon Chesterfield   }
2214d0b31295SJon Chesterfield 
2215d0b31295SJon Chesterfield   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
2216b5151c32SJon Chesterfield      DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],
2217b5151c32SJon Chesterfield      DeviceInfo().ThreadsPerGroup[DeviceId]);
2218d0b31295SJon Chesterfield 
2219d27d0a67SJoseph Huber   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
2220b5151c32SJon Chesterfield      DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],
2221b5151c32SJon Chesterfield      DeviceInfo().GroupsPerDevice[DeviceId],
2222b5151c32SJon Chesterfield      DeviceInfo().GroupsPerDevice[DeviceId] *
2223b5151c32SJon Chesterfield          DeviceInfo().ThreadsPerGroup[DeviceId]);
2224d0b31295SJon Chesterfield 
2225d0b31295SJon Chesterfield   return OFFLOAD_SUCCESS;
2226d0b31295SJon Chesterfield }
2227d0b31295SJon Chesterfield 
2228d0b31295SJon Chesterfield static __tgt_target_table *
2229d27d0a67SJoseph Huber __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
2230d0b31295SJon Chesterfield 
__tgt_rtl_load_binary(int32_t DeviceId,__tgt_device_image * Image)2231d27d0a67SJoseph Huber __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
2232d27d0a67SJoseph Huber                                           __tgt_device_image *Image) {
2233b5151c32SJon Chesterfield   DeviceInfo().LoadRunLock.lock();
2234d27d0a67SJoseph Huber   __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
2235b5151c32SJon Chesterfield   DeviceInfo().LoadRunLock.unlock();
2236d27d0a67SJoseph Huber   return Res;
2237d0b31295SJon Chesterfield }
2238d0b31295SJon Chesterfield 
__tgt_rtl_load_binary_locked(int32_t DeviceId,__tgt_device_image * Image)2239d27d0a67SJoseph Huber __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
2240d27d0a67SJoseph Huber                                                  __tgt_device_image *Image) {
2241d27d0a67SJoseph Huber   // This function loads the device image onto gpu[DeviceId] and does other
2242ae9d96a6SJon Chesterfield   // per-image initialization work. Specifically:
2243ae9d96a6SJon Chesterfield   //
22440c554a47SJon Chesterfield   // - Initialize an DeviceEnvironmentTy instance embedded in the
2245ae9d96a6SJon Chesterfield   //   image at the symbol "omptarget_device_environment"
22460c554a47SJon Chesterfield   //   Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
2247ae9d96a6SJon Chesterfield   //
2248ae9d96a6SJon Chesterfield   // - Allocate a large array per-gpu (could be moved to init_device)
2249ae9d96a6SJon Chesterfield   //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
2250ae9d96a6SJon Chesterfield   //   - Allocate at least that many bytes of gpu memory
2251ae9d96a6SJon Chesterfield   //   - Zero initialize it
2252ae9d96a6SJon Chesterfield   //   - Write the pointer to the symbol omptarget_nvptx_device_State
2253ae9d96a6SJon Chesterfield   //
2254ae9d96a6SJon Chesterfield   // - Pulls some per-kernel information together from various sources and
2255ae9d96a6SJon Chesterfield   //   records it in the KernelsList for quicker access later
2256ae9d96a6SJon Chesterfield   //
2257ae9d96a6SJon Chesterfield   // The initialization can be done before or after loading the image onto the
2258ae9d96a6SJon Chesterfield   // gpu. This function presently does a mixture. Using the hsa api to get/set
2259ae9d96a6SJon Chesterfield   // the information is simpler to implement, in exchange for more complicated
2260ae9d96a6SJon Chesterfield   // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
2261ae9d96a6SJon Chesterfield   // back from the gpu vs a hashtable lookup on the host.
22625d989fb3SJonChesterfield 
2263d27d0a67SJoseph Huber   const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
2264d0b31295SJon Chesterfield 
2265b5151c32SJon Chesterfield   DeviceInfo().clearOffloadEntriesTable(DeviceId);
2266d0b31295SJon Chesterfield 
2267d0b31295SJon Chesterfield   // We do not need to set the ELF version because the caller of this function
2268d0b31295SJon Chesterfield   // had to do that to decide the right runtime to use
2269d0b31295SJon Chesterfield 
2270d27d0a67SJoseph Huber   if (!elfMachineIdIsAmdgcn(Image))
2271d0b31295SJon Chesterfield     return NULL;
2272d0b31295SJon Chesterfield 
2273d0b31295SJon Chesterfield   {
2274b5151c32SJon Chesterfield     auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
2275b5151c32SJon Chesterfield                                  DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
2276e191d311SJon Chesterfield 
2277b5151c32SJon Chesterfield     auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
2278b5151c32SJon Chesterfield     auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
2279d27d0a67SJoseph Huber     hsa_status_t Err = moduleRegisterFromMemoryToPlace(
2280d27d0a67SJoseph Huber         KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
2281d27d0a67SJoseph Huber         [&](void *Data, size_t Size) {
2282d27d0a67SJoseph Huber           if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
2283b5151c32SJon Chesterfield             __atomic_store_n(&DeviceInfo().HostcallRequired, true,
2284df005fa3SJon Chesterfield                              __ATOMIC_RELEASE);
2285df005fa3SJon Chesterfield           }
2286d27d0a67SJoseph Huber           return Env.beforeLoading(Data, Size);
2287df005fa3SJon Chesterfield         },
2288b5151c32SJon Chesterfield         DeviceInfo().HSAExecutables);
2289d0b31295SJon Chesterfield 
2290d27d0a67SJoseph Huber     check("Module registering", Err);
2291d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
2292b5151c32SJon Chesterfield       const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
2293d27d0a67SJoseph Huber       const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
22949b2c6c07SJon Chesterfield 
22959b2c6c07SJon Chesterfield       if (strcmp(DeviceName, ElfName) != 0) {
22969b2c6c07SJon Chesterfield         DP("Possible gpu arch mismatch: device:%s, image:%s please check"
2297cab9f692SJon Chesterfield            " compiler flag: -march=<gpu>\n",
22989b2c6c07SJon Chesterfield            DeviceName, ElfName);
22999b2c6c07SJon Chesterfield       } else {
2300d27d0a67SJoseph Huber         DP("Error loading image onto GPU: %s\n", get_error_string(Err));
23019b2c6c07SJon Chesterfield       }
23029b2c6c07SJon Chesterfield 
2303d0b31295SJon Chesterfield       return NULL;
2304d0b31295SJon Chesterfield     }
2305e191d311SJon Chesterfield 
2306d27d0a67SJoseph Huber     Err = Env.afterLoading();
2307d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
2308e191d311SJon Chesterfield       return NULL;
2309e191d311SJon Chesterfield     }
2310d0b31295SJon Chesterfield   }
2311d0b31295SJon Chesterfield 
23123153bdd5SJon Chesterfield   DP("AMDGPU module successfully loaded!\n");
2313d0b31295SJon Chesterfield 
2314e1b8e8a1SJon Chesterfield   {
2315e1b8e8a1SJon Chesterfield     // the device_State array is either large value in bss or a void* that
2316e1b8e8a1SJon Chesterfield     // needs to be assigned to a pointer to an array of size device_state_bytes
231756c446a8SJon Chesterfield     // If absent, it has been deadstripped and needs no setup.
2318e1b8e8a1SJon Chesterfield 
2319d27d0a67SJoseph Huber     void *StatePtr;
2320d27d0a67SJoseph Huber     uint32_t StatePtrSize;
2321b5151c32SJon Chesterfield     auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2322d27d0a67SJoseph Huber     hsa_status_t Err = interop_hsa_get_symbol_info(
2323d27d0a67SJoseph Huber         SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2324d27d0a67SJoseph Huber         &StatePtrSize);
2325e1b8e8a1SJon Chesterfield 
2326d27d0a67SJoseph Huber     if (Err != HSA_STATUS_SUCCESS) {
232756c446a8SJon Chesterfield       DP("No device_state symbol found, skipping initialization\n");
232856c446a8SJon Chesterfield     } else {
2329d27d0a67SJoseph Huber       if (StatePtrSize < sizeof(void *)) {
2330d27d0a67SJoseph Huber         DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
233156c446a8SJon Chesterfield            sizeof(void *));
2332e1b8e8a1SJon Chesterfield         return NULL;
2333e1b8e8a1SJon Chesterfield       }
2334e1b8e8a1SJon Chesterfield 
2335e1b8e8a1SJon Chesterfield       // if it's larger than a void*, assume it's a bss array and no further
2336e1b8e8a1SJon Chesterfield       // initialization is required. Only try to set up a pointer for
2337e1b8e8a1SJon Chesterfield       // sizeof(void*)
2338d27d0a67SJoseph Huber       if (StatePtrSize == sizeof(void *)) {
2339d27d0a67SJoseph Huber         uint64_t DeviceStateBytes =
2340d27d0a67SJoseph Huber             getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2341d27d0a67SJoseph Huber         if (DeviceStateBytes == 0) {
234256c446a8SJon Chesterfield           DP("Can't initialize device_State, missing size information\n");
2343e1b8e8a1SJon Chesterfield           return NULL;
2344e1b8e8a1SJon Chesterfield         }
23455d989fb3SJonChesterfield 
2346b5151c32SJon Chesterfield         auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2347d27d0a67SJoseph Huber         if (DSS.first.get() == nullptr) {
2348d27d0a67SJoseph Huber           assert(DSS.second == 0);
2349d27d0a67SJoseph Huber           void *Ptr = NULL;
2350d27d0a67SJoseph Huber           hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2351d27d0a67SJoseph Huber           if (Err != HSA_STATUS_SUCCESS) {
235256c446a8SJon Chesterfield             DP("Failed to allocate device_state array\n");
23535d989fb3SJonChesterfield             return NULL;
23545d989fb3SJonChesterfield           }
2355d27d0a67SJoseph Huber           DSS = {
2356d27d0a67SJoseph Huber               std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2357d27d0a67SJoseph Huber               DeviceStateBytes,
235856c446a8SJon Chesterfield           };
23595d989fb3SJonChesterfield         }
23605d989fb3SJonChesterfield 
2361d27d0a67SJoseph Huber         void *Ptr = DSS.first.get();
2362d27d0a67SJoseph Huber         if (DeviceStateBytes != DSS.second) {
236356c446a8SJon Chesterfield           DP("Inconsistent sizes of device_State unsupported\n");
236456c446a8SJon Chesterfield           return NULL;
23655d989fb3SJonChesterfield         }
23665d989fb3SJonChesterfield 
23675d989fb3SJonChesterfield         // write ptr to device memory so it can be used by later kernels
2368207f96e8SJoseph Huber         Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2369207f96e8SJoseph Huber                                                    sizeof(void *), DeviceId);
2370d27d0a67SJoseph Huber         if (Err != HSA_STATUS_SUCCESS) {
237156c446a8SJon Chesterfield           DP("memcpy install of state_ptr failed\n");
23725d989fb3SJonChesterfield           return NULL;
23735d989fb3SJonChesterfield         }
23745d989fb3SJonChesterfield       }
2375e1b8e8a1SJon Chesterfield     }
237656c446a8SJon Chesterfield   }
23775d989fb3SJonChesterfield 
2378d0b31295SJon Chesterfield   // Here, we take advantage of the data that is appended after img_end to get
2379d0b31295SJon Chesterfield   // the symbols' name we need to load. This data consist of the host entries
2380d0b31295SJon Chesterfield   // begin and end as well as the target name (see the offloading linker script
2381d0b31295SJon Chesterfield   // creation in clang compiler).
2382d0b31295SJon Chesterfield 
2383d0b31295SJon Chesterfield   // Find the symbols in the module by name. The name can be obtain by
2384d0b31295SJon Chesterfield   // concatenating the host entry name with the target name
2385d0b31295SJon Chesterfield 
2386d27d0a67SJoseph Huber   __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2387d27d0a67SJoseph Huber   __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2388d0b31295SJon Chesterfield 
2389d27d0a67SJoseph Huber   for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2390d0b31295SJon Chesterfield 
2391d27d0a67SJoseph Huber     if (!E->addr) {
2392d0b31295SJon Chesterfield       // The host should have always something in the address to
2393d0b31295SJon Chesterfield       // uniquely identify the target region.
23949b2c6c07SJon Chesterfield       DP("Analyzing host entry '<null>' (size = %lld)...\n",
2395d27d0a67SJoseph Huber          (unsigned long long)E->size);
2396d0b31295SJon Chesterfield       return NULL;
2397d0b31295SJon Chesterfield     }
2398d0b31295SJon Chesterfield 
2399d27d0a67SJoseph Huber     if (E->size) {
2400d27d0a67SJoseph Huber       __tgt_offload_entry Entry = *E;
2401d0b31295SJon Chesterfield 
2402d27d0a67SJoseph Huber       void *Varptr;
2403d27d0a67SJoseph Huber       uint32_t Varsize;
2404d0b31295SJon Chesterfield 
2405b5151c32SJon Chesterfield       auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2406d27d0a67SJoseph Huber       hsa_status_t Err = interop_hsa_get_symbol_info(
2407d27d0a67SJoseph Huber           SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2408d0b31295SJon Chesterfield 
2409d27d0a67SJoseph Huber       if (Err != HSA_STATUS_SUCCESS) {
2410d0b31295SJon Chesterfield         // Inform the user what symbol prevented offloading
2411d27d0a67SJoseph Huber         DP("Loading global '%s' (Failed)\n", E->name);
2412d0b31295SJon Chesterfield         return NULL;
2413d0b31295SJon Chesterfield       }
2414d0b31295SJon Chesterfield 
2415d27d0a67SJoseph Huber       if (Varsize != E->size) {
2416d27d0a67SJoseph Huber         DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
2417d27d0a67SJoseph Huber            Varsize, E->size);
2418d0b31295SJon Chesterfield         return NULL;
2419d0b31295SJon Chesterfield       }
2420d0b31295SJon Chesterfield 
2421d0b31295SJon Chesterfield       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
2422d27d0a67SJoseph Huber          DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
2423d27d0a67SJoseph Huber       Entry.addr = (void *)Varptr;
2424d0b31295SJon Chesterfield 
2425b5151c32SJon Chesterfield       DeviceInfo().addOffloadEntry(DeviceId, Entry);
2426d0b31295SJon Chesterfield 
2427b5151c32SJon Chesterfield       if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2428d27d0a67SJoseph Huber           E->flags & OMP_DECLARE_TARGET_LINK) {
2429d0b31295SJon Chesterfield         // If unified memory is present any target link variables
2430d0b31295SJon Chesterfield         // can access host addresses directly. There is no longer a
2431d0b31295SJon Chesterfield         // need for device copies.
2432207f96e8SJoseph Huber         Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *),
2433207f96e8SJoseph Huber                                                  DeviceId);
2434d27d0a67SJoseph Huber         if (Err != HSA_STATUS_SUCCESS)
2435d0b31295SJon Chesterfield           DP("Error when copying USM\n");
2436d0b31295SJon Chesterfield         DP("Copy linked variable host address (" DPxMOD ")"
2437d0b31295SJon Chesterfield            "to device address (" DPxMOD ")\n",
2438d27d0a67SJoseph Huber            DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
2439d0b31295SJon Chesterfield       }
2440d0b31295SJon Chesterfield 
2441d0b31295SJon Chesterfield       continue;
2442d0b31295SJon Chesterfield     }
2443d0b31295SJon Chesterfield 
2444d27d0a67SJoseph Huber     DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name));
2445d0b31295SJon Chesterfield 
24469cdaf0b0SJon Chesterfield     // errors in kernarg_segment_size previously treated as = 0 (or as undef)
2447d27d0a67SJoseph Huber     uint32_t KernargSegmentSize = 0;
2448b5151c32SJon Chesterfield     auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
2449d27d0a67SJoseph Huber     hsa_status_t Err = HSA_STATUS_SUCCESS;
2450d27d0a67SJoseph Huber     if (!E->name) {
2451d27d0a67SJoseph Huber       Err = HSA_STATUS_ERROR;
24529cdaf0b0SJon Chesterfield     } else {
2453d27d0a67SJoseph Huber       std::string KernelStr = std::string(E->name);
2454d27d0a67SJoseph Huber       auto It = KernelInfoMap.find(KernelStr);
24559cdaf0b0SJon Chesterfield       if (It != KernelInfoMap.end()) {
2456d27d0a67SJoseph Huber         atl_kernel_info_t Info = It->second;
2457d27d0a67SJoseph Huber         KernargSegmentSize = Info.kernel_segment_size;
24589cdaf0b0SJon Chesterfield       } else {
2459d27d0a67SJoseph Huber         Err = HSA_STATUS_ERROR;
24609cdaf0b0SJon Chesterfield       }
24619cdaf0b0SJon Chesterfield     }
2462d0b31295SJon Chesterfield 
2463d0b31295SJon Chesterfield     // default value GENERIC (in case symbol is missing from cubin file)
2464b75a7481SJon Chesterfield     llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2465b75a7481SJon Chesterfield         llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2466d0b31295SJon Chesterfield 
2467d0b31295SJon Chesterfield     // get flat group size if present, else Default_WG_Size
2468d27d0a67SJoseph Huber     int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2469d0b31295SJon Chesterfield 
2470d0b31295SJon Chesterfield     // get Kernel Descriptor if present.
2471d0b31295SJon Chesterfield     // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2472d0b31295SJon Chesterfield     struct KernDescValType {
2473d0b31295SJon Chesterfield       uint16_t Version;
2474d0b31295SJon Chesterfield       uint16_t TSize;
2475d27d0a67SJoseph Huber       uint16_t WGSize;
2476d0b31295SJon Chesterfield     };
2477d0b31295SJon Chesterfield     struct KernDescValType KernDescVal;
2478d27d0a67SJoseph Huber     std::string KernDescNameStr(E->name);
2479d0b31295SJon Chesterfield     KernDescNameStr += "_kern_desc";
2480d0b31295SJon Chesterfield     const char *KernDescName = KernDescNameStr.c_str();
2481d0b31295SJon Chesterfield 
2482d0b31295SJon Chesterfield     void *KernDescPtr;
2483d0b31295SJon Chesterfield     uint32_t KernDescSize;
2484ae9d96a6SJon Chesterfield     void *CallStackAddr = nullptr;
2485d27d0a67SJoseph Huber     Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2486d27d0a67SJoseph Huber                                &KernDescPtr, &KernDescSize);
2487d0b31295SJon Chesterfield 
2488d27d0a67SJoseph Huber     if (Err == HSA_STATUS_SUCCESS) {
2489d0b31295SJon Chesterfield       if ((size_t)KernDescSize != sizeof(KernDescVal))
2490d0b31295SJon Chesterfield         DP("Loading global computation properties '%s' - size mismatch (%u != "
2491d0b31295SJon Chesterfield            "%lu)\n",
2492d0b31295SJon Chesterfield            KernDescName, KernDescSize, sizeof(KernDescVal));
2493d0b31295SJon Chesterfield 
2494d0b31295SJon Chesterfield       memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2495d0b31295SJon Chesterfield 
2496d0b31295SJon Chesterfield       // Check structure size against recorded size.
2497d0b31295SJon Chesterfield       if ((size_t)KernDescSize != KernDescVal.TSize)
2498d0b31295SJon Chesterfield         DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
2499d0b31295SJon Chesterfield            sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
2500d0b31295SJon Chesterfield 
2501d0b31295SJon Chesterfield       DP("After loading global for %s KernDesc \n", KernDescName);
2502d0b31295SJon Chesterfield       DP("KernDesc: Version: %d\n", KernDescVal.Version);
2503d0b31295SJon Chesterfield       DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
2504d27d0a67SJoseph Huber       DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);
2505d0b31295SJon Chesterfield 
2506d27d0a67SJoseph Huber       if (KernDescVal.WGSize == 0) {
2507d27d0a67SJoseph Huber         KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
2508d27d0a67SJoseph Huber         DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize);
250998c36f00SDhruva Chakrabarti       }
2510d27d0a67SJoseph Huber       WGSizeVal = KernDescVal.WGSize;
2511d0b31295SJon Chesterfield       DP("WGSizeVal %d\n", WGSizeVal);
2512d27d0a67SJoseph Huber       check("Loading KernDesc computation property", Err);
2513d0b31295SJon Chesterfield     } else {
2514d0b31295SJon Chesterfield       DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
2515d0b31295SJon Chesterfield 
2516d0b31295SJon Chesterfield       // Flat group size
2517d27d0a67SJoseph Huber       std::string WGSizeNameStr(E->name);
2518d0b31295SJon Chesterfield       WGSizeNameStr += "_wg_size";
2519d0b31295SJon Chesterfield       const char *WGSizeName = WGSizeNameStr.c_str();
2520d0b31295SJon Chesterfield 
2521d0b31295SJon Chesterfield       void *WGSizePtr;
2522d0b31295SJon Chesterfield       uint32_t WGSize;
2523d27d0a67SJoseph Huber       Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
2524d27d0a67SJoseph Huber                                  &WGSizePtr, &WGSize);
2525d0b31295SJon Chesterfield 
2526d27d0a67SJoseph Huber       if (Err == HSA_STATUS_SUCCESS) {
2527d0b31295SJon Chesterfield         if ((size_t)WGSize != sizeof(int16_t)) {
2528d0b31295SJon Chesterfield           DP("Loading global computation properties '%s' - size mismatch (%u "
2529d0b31295SJon Chesterfield              "!= "
2530d0b31295SJon Chesterfield              "%lu)\n",
2531d0b31295SJon Chesterfield              WGSizeName, WGSize, sizeof(int16_t));
2532d0b31295SJon Chesterfield           return NULL;
2533d0b31295SJon Chesterfield         }
2534d0b31295SJon Chesterfield 
2535d0b31295SJon Chesterfield         memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
2536d0b31295SJon Chesterfield 
2537d0b31295SJon Chesterfield         DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
2538d0b31295SJon Chesterfield 
2539d27d0a67SJoseph Huber         if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
2540d27d0a67SJoseph Huber             WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
2541d0b31295SJon Chesterfield           DP("Error wrong WGSize value specified in HSA code object file: "
2542d0b31295SJon Chesterfield              "%d\n",
2543d0b31295SJon Chesterfield              WGSizeVal);
2544d27d0a67SJoseph Huber           WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2545d0b31295SJon Chesterfield         }
2546d0b31295SJon Chesterfield       } else {
2547d0b31295SJon Chesterfield         DP("Warning: Loading WGSize '%s' - symbol not found, "
2548d0b31295SJon Chesterfield            "using default value %d\n",
2549d0b31295SJon Chesterfield            WGSizeName, WGSizeVal);
2550d0b31295SJon Chesterfield       }
2551d0b31295SJon Chesterfield 
2552d27d0a67SJoseph Huber       check("Loading WGSize computation property", Err);
2553d0b31295SJon Chesterfield     }
2554d0b31295SJon Chesterfield 
2555b75a7481SJon Chesterfield     // Read execution mode from global in binary
2556d27d0a67SJoseph Huber     std::string ExecModeNameStr(E->name);
2557b75a7481SJon Chesterfield     ExecModeNameStr += "_exec_mode";
2558b75a7481SJon Chesterfield     const char *ExecModeName = ExecModeNameStr.c_str();
2559b75a7481SJon Chesterfield 
2560b75a7481SJon Chesterfield     void *ExecModePtr;
2561d27d0a67SJoseph Huber     uint32_t VarSize;
2562d27d0a67SJoseph Huber     Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
2563d27d0a67SJoseph Huber                                &ExecModePtr, &VarSize);
2564b75a7481SJon Chesterfield 
2565d27d0a67SJoseph Huber     if (Err == HSA_STATUS_SUCCESS) {
2566d27d0a67SJoseph Huber       if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
2567b75a7481SJon Chesterfield         DP("Loading global computation properties '%s' - size mismatch(%u != "
2568b75a7481SJon Chesterfield            "%lu)\n",
2569d27d0a67SJoseph Huber            ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
2570b75a7481SJon Chesterfield         return NULL;
2571b75a7481SJon Chesterfield       }
2572b75a7481SJon Chesterfield 
2573d27d0a67SJoseph Huber       memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
2574b75a7481SJon Chesterfield 
2575b75a7481SJon Chesterfield       DP("After loading global for %s ExecMode = %d\n", ExecModeName,
2576b75a7481SJon Chesterfield          ExecModeVal);
2577b75a7481SJon Chesterfield 
2578b75a7481SJon Chesterfield       if (ExecModeVal < 0 ||
2579b75a7481SJon Chesterfield           ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
2580b75a7481SJon Chesterfield         DP("Error wrong exec_mode value specified in HSA code object file: "
2581b75a7481SJon Chesterfield            "%d\n",
2582b75a7481SJon Chesterfield            ExecModeVal);
2583b75a7481SJon Chesterfield         return NULL;
2584b75a7481SJon Chesterfield       }
2585b75a7481SJon Chesterfield     } else {
2586b75a7481SJon Chesterfield       DP("Loading global exec_mode '%s' - symbol missing, using default "
2587b75a7481SJon Chesterfield          "value "
2588b75a7481SJon Chesterfield          "GENERIC (1)\n",
2589b75a7481SJon Chesterfield          ExecModeName);
2590b75a7481SJon Chesterfield     }
2591d27d0a67SJoseph Huber     check("Loading computation property", Err);
2592b75a7481SJon Chesterfield 
2593*046d5b91SJoseph Huber     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2594*046d5b91SJoseph Huber                                    CallStackAddr, E->name, KernargSegmentSize,
2595*046d5b91SJoseph Huber                                    DeviceInfo().KernArgPool));
2596d27d0a67SJoseph Huber     __tgt_offload_entry Entry = *E;
2597*046d5b91SJoseph Huber     Entry.addr = (void *)&KernelsList.back();
2598b5151c32SJon Chesterfield     DeviceInfo().addOffloadEntry(DeviceId, Entry);
2599d27d0a67SJoseph Huber     DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
2600d0b31295SJon Chesterfield   }
2601d0b31295SJon Chesterfield 
2602b5151c32SJon Chesterfield   return DeviceInfo().getOffloadEntriesTable(DeviceId);
2603d0b31295SJon Chesterfield }
2604d0b31295SJon Chesterfield 
__tgt_rtl_data_alloc(int DeviceId,int64_t Size,void *,int32_t Kind)2605d27d0a67SJoseph Huber void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
2606d27d0a67SJoseph Huber   void *Ptr = NULL;
2607b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
26082468fdd9SGeorge Rokos 
2609d27d0a67SJoseph Huber   if (Kind != TARGET_ALLOC_DEFAULT) {
26102468fdd9SGeorge Rokos     REPORT("Invalid target data allocation kind or requested allocator not "
26112468fdd9SGeorge Rokos            "implemented yet\n");
26122468fdd9SGeorge Rokos     return NULL;
26132468fdd9SGeorge Rokos   }
26142468fdd9SGeorge Rokos 
2615b5151c32SJon Chesterfield   hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
2616d27d0a67SJoseph Huber   hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
2617d27d0a67SJoseph Huber   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
2618d27d0a67SJoseph Huber      (long long unsigned)(Elf64_Addr)Ptr);
2619d27d0a67SJoseph Huber   Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
2620d27d0a67SJoseph Huber   return Ptr;
2621d0b31295SJon Chesterfield }
2622d0b31295SJon Chesterfield 
__tgt_rtl_data_submit(int DeviceId,void * TgtPtr,void * HstPtr,int64_t Size)2623d27d0a67SJoseph Huber int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
2624d27d0a67SJoseph Huber                               int64_t Size) {
2625b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
26265449fbb5SJohannes Doerfert   __tgt_async_info AsyncInfo;
2627d27d0a67SJoseph Huber   int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
2628d27d0a67SJoseph Huber   if (Rc != OFFLOAD_SUCCESS)
2629d0b31295SJon Chesterfield     return OFFLOAD_FAIL;
2630d0b31295SJon Chesterfield 
2631d27d0a67SJoseph Huber   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2632d0b31295SJon Chesterfield }
2633d0b31295SJon Chesterfield 
__tgt_rtl_data_submit_async(int DeviceId,void * TgtPtr,void * HstPtr,int64_t Size,__tgt_async_info * AsyncInfo)2634d27d0a67SJoseph Huber int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
2635d27d0a67SJoseph Huber                                     int64_t Size, __tgt_async_info *AsyncInfo) {
2636b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
26375449fbb5SJohannes Doerfert   if (AsyncInfo) {
26385449fbb5SJohannes Doerfert     initAsyncInfo(AsyncInfo);
2639d27d0a67SJoseph Huber     return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
2640d0b31295SJon Chesterfield   }
2641d27d0a67SJoseph Huber   return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
2642d0b31295SJon Chesterfield }
2643d0b31295SJon Chesterfield 
__tgt_rtl_data_retrieve(int DeviceId,void * HstPtr,void * TgtPtr,int64_t Size)2644d27d0a67SJoseph Huber int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
2645d27d0a67SJoseph Huber                                 int64_t Size) {
2646b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
26475449fbb5SJohannes Doerfert   __tgt_async_info AsyncInfo;
2648d27d0a67SJoseph Huber   int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
2649d27d0a67SJoseph Huber   if (Rc != OFFLOAD_SUCCESS)
2650d0b31295SJon Chesterfield     return OFFLOAD_FAIL;
2651d0b31295SJon Chesterfield 
2652d27d0a67SJoseph Huber   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2653d0b31295SJon Chesterfield }
2654d0b31295SJon Chesterfield 
__tgt_rtl_data_retrieve_async(int DeviceId,void * HstPtr,void * TgtPtr,int64_t Size,__tgt_async_info * AsyncInfo)2655d27d0a67SJoseph Huber int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
2656d27d0a67SJoseph Huber                                       int64_t Size,
26575449fbb5SJohannes Doerfert                                       __tgt_async_info *AsyncInfo) {
26585449fbb5SJohannes Doerfert   assert(AsyncInfo && "AsyncInfo is nullptr");
2659b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
26605449fbb5SJohannes Doerfert   initAsyncInfo(AsyncInfo);
2661d27d0a67SJoseph Huber   return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
2662d0b31295SJon Chesterfield }
2663d0b31295SJon Chesterfield 
__tgt_rtl_data_delete(int DeviceId,void * TgtPtr)2664d27d0a67SJoseph Huber int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
2665b5151c32SJon Chesterfield   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2666d27d0a67SJoseph Huber   hsa_status_t Err;
2667d27d0a67SJoseph Huber   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
2668d27d0a67SJoseph Huber   Err = core::Runtime::Memfree(TgtPtr);
2669d27d0a67SJoseph Huber   if (Err != HSA_STATUS_SUCCESS) {
2670d0b31295SJon Chesterfield     DP("Error when freeing CUDA memory\n");
2671d0b31295SJon Chesterfield     return OFFLOAD_FAIL;
2672d0b31295SJon Chesterfield   }
2673d0b31295SJon Chesterfield   return OFFLOAD_SUCCESS;
2674d0b31295SJon Chesterfield }
2675d0b31295SJon Chesterfield 
__tgt_rtl_run_target_team_region(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum,int32_t NumTeams,int32_t ThreadLimit,uint64_t LoopTripcount)2676d27d0a67SJoseph Huber int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
2677d27d0a67SJoseph Huber                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
2678d27d0a67SJoseph Huber                                          int32_t ArgNum, int32_t NumTeams,
2679d27d0a67SJoseph Huber                                          int32_t ThreadLimit,
2680d27d0a67SJoseph Huber                                          uint64_t LoopTripcount) {
26815d989fb3SJonChesterfield 
2682b5151c32SJon Chesterfield   DeviceInfo().LoadRunLock.lock_shared();
2683d27d0a67SJoseph Huber   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2684d27d0a67SJoseph Huber                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
26855d989fb3SJonChesterfield 
2686b5151c32SJon Chesterfield   DeviceInfo().LoadRunLock.unlock_shared();
2687d27d0a67SJoseph Huber   return Res;
26885d989fb3SJonChesterfield }
26895d989fb3SJonChesterfield 
__tgt_rtl_run_target_region(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum)2690d27d0a67SJoseph Huber int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
2691d27d0a67SJoseph Huber                                     void **TgtArgs, ptrdiff_t *TgtOffsets,
2692d27d0a67SJoseph Huber                                     int32_t ArgNum) {
2693d0b31295SJon Chesterfield   // use one team and one thread
2694d0b31295SJon Chesterfield   // fix thread num
2695d27d0a67SJoseph Huber   int32_t TeamNum = 1;
2696d27d0a67SJoseph Huber   int32_t ThreadLimit = 0; // use default
2697d27d0a67SJoseph Huber   return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
2698d27d0a67SJoseph Huber                                           TgtOffsets, ArgNum, TeamNum,
2699d27d0a67SJoseph Huber                                           ThreadLimit, 0);
2700d0b31295SJon Chesterfield }
2701d0b31295SJon Chesterfield 
__tgt_rtl_run_target_team_region_async(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum,int32_t NumTeams,int32_t ThreadLimit,uint64_t LoopTripcount,__tgt_async_info * AsyncInfo)270228309c54SCarlo Bertolli int32_t __tgt_rtl_run_target_team_region_async(
2703d27d0a67SJoseph Huber     int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
2704d27d0a67SJoseph Huber     int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
2705d27d0a67SJoseph Huber     uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
2706d3abb04eSCarlo Bertolli   assert(AsyncInfo && "AsyncInfo is nullptr");
2707d3abb04eSCarlo Bertolli   initAsyncInfo(AsyncInfo);
270828309c54SCarlo Bertolli 
2709b5151c32SJon Chesterfield   DeviceInfo().LoadRunLock.lock_shared();
2710d27d0a67SJoseph Huber   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2711d27d0a67SJoseph Huber                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
271228309c54SCarlo Bertolli 
2713b5151c32SJon Chesterfield   DeviceInfo().LoadRunLock.unlock_shared();
2714d27d0a67SJoseph Huber   return Res;
271528309c54SCarlo Bertolli }
271628309c54SCarlo Bertolli 
__tgt_rtl_run_target_region_async(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum,__tgt_async_info * AsyncInfo)2717d27d0a67SJoseph Huber int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
2718d27d0a67SJoseph Huber                                           void **TgtArgs, ptrdiff_t *TgtOffsets,
2719d27d0a67SJoseph Huber                                           int32_t ArgNum,
27205449fbb5SJohannes Doerfert                                           __tgt_async_info *AsyncInfo) {
2721d0b31295SJon Chesterfield   // use one team and one thread
2722d0b31295SJon Chesterfield   // fix thread num
2723d27d0a67SJoseph Huber   int32_t TeamNum = 1;
2724d27d0a67SJoseph Huber   int32_t ThreadLimit = 0; // use default
2725d27d0a67SJoseph Huber   return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
2726d27d0a67SJoseph Huber                                                 TgtOffsets, ArgNum, TeamNum,
2727d27d0a67SJoseph Huber                                                 ThreadLimit, 0, AsyncInfo);
2728d0b31295SJon Chesterfield }
2729d0b31295SJon Chesterfield 
__tgt_rtl_synchronize(int32_t DeviceId,__tgt_async_info * AsyncInfo)2730d27d0a67SJoseph Huber int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
27315449fbb5SJohannes Doerfert   assert(AsyncInfo && "AsyncInfo is nullptr");
2732d0b31295SJon Chesterfield 
27335449fbb5SJohannes Doerfert   // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant
2734d0b31295SJon Chesterfield   // is not ensured by devices.cpp for amdgcn
27355449fbb5SJohannes Doerfert   // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
27365449fbb5SJohannes Doerfert   if (AsyncInfo->Queue) {
27375449fbb5SJohannes Doerfert     finiAsyncInfo(AsyncInfo);
2738d0b31295SJon Chesterfield   }
2739d0b31295SJon Chesterfield   return OFFLOAD_SUCCESS;
2740d0b31295SJon Chesterfield }
274115ed5c0aSJose Manuel Monsalve Diaz 
__tgt_rtl_print_device_info(int32_t DeviceId)2742d27d0a67SJoseph Huber void __tgt_rtl_print_device_info(int32_t DeviceId) {
2743d27d0a67SJoseph Huber   // TODO: Assertion to see if DeviceId is correct
274415ed5c0aSJose Manuel Monsalve Diaz   // NOTE: We don't need to set context for print device info.
274515ed5c0aSJose Manuel Monsalve Diaz 
2746b5151c32SJon Chesterfield   DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
274715ed5c0aSJose Manuel Monsalve Diaz }
274815ed5c0aSJose Manuel Monsalve Diaz 
274991dfb32fSJon Chesterfield } // extern "C"
2750