1 //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for AMD HSA machines
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <assert.h>
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 #include <functional>
19 #include <libelf.h>
20 #include <list>
21 #include <memory>
22 #include <mutex>
23 #include <shared_mutex>
24 #include <unordered_map>
25 #include <vector>
26 
27 #include "impl_runtime.h"
28 #include "interop_hsa.h"
29 
30 #include "internal.h"
31 #include "rt.h"
32 
33 #include "DeviceEnvironment.h"
34 #include "get_elf_mach_gfx_name.h"
35 #include "omptargetplugin.h"
36 #include "print_tracing.h"
37 
38 #include "llvm/ADT/StringMap.h"
39 #include "llvm/ADT/StringRef.h"
40 #include "llvm/Frontend/OpenMP/OMPConstants.h"
41 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
42 
43 using namespace llvm;
44 
45 // hostrpc interface. FIXME: consider moving to its own include. These are
46 // statically linked into amdgpu/plugin if present from hostrpc_services.a,
47 // linked as --whole-archive to override the weak symbols that are used to
48 // implement a fallback for toolchains that do not yet have a hostrpc library.
49 extern "C" {
50 uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
51                                uint32_t DeviceId);
52 hsa_status_t hostrpc_init();
53 hsa_status_t hostrpc_terminate();
54 
55 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
56 __attribute__((weak)) hsa_status_t hostrpc_terminate() {
57   return HSA_STATUS_SUCCESS;
58 }
59 __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
60                                                      uint32_t DeviceId) {
61   DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
62      "missing\n",
63      DeviceId);
64   return 0;
65 }
66 }
67 
68 // Heuristic parameters used for kernel launch
69 // Number of teams per CU to allow scheduling flexibility
70 static const unsigned DefaultTeamsPerCU = 4;
71 
72 int print_kernel_trace;
73 
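// Debug-only helper used below: with OMPTARGET_DEBUG defined, check() logs
// whether an HSA call succeeded or failed; otherwise it expands to an empty
// block.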
74 #ifdef OMPTARGET_DEBUG
75 #define check(msg, status)                                                     \
76   if (status != HSA_STATUS_SUCCESS) {                                          \
77     DP(#msg " failed\n");                                                      \
78   } else {                                                                     \
79     DP(#msg " succeeded\n");                                                   \
80   }
81 #else
82 #define check(msg, status)                                                     \
83   {}
84 #endif
85 
86 #include "elf_common.h"
87 
88 namespace hsa {
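// Thin adapters that let C++ callables drive the C-style callbacks expected
// by the HSA iteration APIs; the callable is passed through the void *Data
// parameter and invoked from a capture-less lambda.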
89 template <typename C> hsa_status_t iterate_agents(C Cb) {
90   auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
91     C *Unwrapped = static_cast<C *>(Data);
92     return (*Unwrapped)(Agent);
93   };
94   return hsa_iterate_agents(L, static_cast<void *>(&Cb));
95 }
96 
97 template <typename C>
98 hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
99   auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
100     C *Unwrapped = static_cast<C *>(Data);
101     return (*Unwrapped)(MemoryPool);
102   };
103 
104   return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
105 }
106 
107 } // namespace hsa
108 
109 /// Keep entries table per device
110 struct FuncOrGblEntryTy {
111   __tgt_target_table Table;
112   std::vector<__tgt_offload_entry> Entries;
113 };
114 
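// Pool of pre-allocated kernarg segments for a single kernel. Each segment
// holds the explicit kernel arguments followed by the implicit arguments
// (impl_implicit_args_t); free segments are tracked by index and handed out
// under a process-wide mutex.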
115 struct KernelArgPool {
116 private:
117   static pthread_mutex_t Mutex;
118 
119 public:
120   uint32_t KernargSegmentSize;
121   void *KernargRegion = nullptr;
122   std::queue<int> FreeKernargSegments;
123 
124   uint32_t kernargSizeIncludingImplicit() {
125     return KernargSegmentSize + sizeof(impl_implicit_args_t);
126   }
127 
128   ~KernelArgPool() {
129     if (KernargRegion) {
130       auto R = hsa_amd_memory_pool_free(KernargRegion);
131       if (R != HSA_STATUS_SUCCESS) {
132         DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
133       }
134     }
135   }
136 
137   // Can't really copy or move a mutex
138   KernelArgPool() = default;
139   KernelArgPool(const KernelArgPool &) = delete;
140   KernelArgPool(KernelArgPool &&) = delete;
141 
142   KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
143       : KernargSegmentSize(KernargSegmentSize) {
144 
145     // impl uses one pool per kernel for all GPUs, with a fixed upper size;
146     // that exact scheme is preserved here, including the queue<int>.
147 
148     hsa_status_t Err = hsa_amd_memory_pool_allocate(
149         MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
150         &KernargRegion);
151 
152     if (Err != HSA_STATUS_SUCCESS) {
153       DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
154       KernargRegion = nullptr; // paranoid
155       return;
156     }
157 
158     Err = core::allow_access_to_all_gpu_agents(KernargRegion);
159     if (Err != HSA_STATUS_SUCCESS) {
160       DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
161          get_error_string(Err));
162       auto R = hsa_amd_memory_pool_free(KernargRegion);
163       if (R != HSA_STATUS_SUCCESS) {
164         // if free failed, can't do anything more to resolve it
165         DP("hsa memory pool free failed: %s\n", get_error_string(R));
166       }
167       KernargRegion = nullptr;
168       return;
169     }
170 
171     for (int I = 0; I < MAX_NUM_KERNELS; I++) {
172       FreeKernargSegments.push(I);
173     }
174   }
175 
176   void *allocate(uint64_t ArgNum) {
177     assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
178     Lock L(&Mutex);
179     void *Res = nullptr;
180     if (!FreeKernargSegments.empty()) {
181 
182       int FreeIdx = FreeKernargSegments.front();
183       Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
184                                 (FreeIdx * kernargSizeIncludingImplicit()));
185       assert(FreeIdx == pointerToIndex(Res));
186       FreeKernargSegments.pop();
187     }
188     return Res;
189   }
190 
191   void deallocate(void *Ptr) {
192     Lock L(&Mutex);
193     int Idx = pointerToIndex(Ptr);
194     FreeKernargSegments.push(Idx);
195   }
196 
197 private:
198   int pointerToIndex(void *Ptr) {
199     ptrdiff_t Bytes =
200         static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
201     assert(Bytes >= 0);
202     assert(Bytes % kernargSizeIncludingImplicit() == 0);
203     return Bytes / kernargSizeIncludingImplicit();
204   }
205   struct Lock {
206     Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
207     ~Lock() { pthread_mutex_unlock(M); }
208     pthread_mutex_t *M;
209   };
210 };
211 pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
212 
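// One kernarg pool per kernel name, shared by all devices; entries are
// created lazily by the KernelTy constructor below.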
213 std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
214     KernelArgPoolMap;
215 
216 /// Use a single entity to encode a kernel and a set of flags
217 struct KernelTy {
218   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
219   int16_t ConstWGSize;
220   int32_t DeviceId;
221   void *CallStackAddr = nullptr;
222   const char *Name;
223 
224   KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
225            int32_t DeviceId, void *CallStackAddr, const char *Name,
226            uint32_t KernargSegmentSize,
227            hsa_amd_memory_pool_t &KernArgMemoryPool)
228       : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
229         DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
230     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
231 
232     std::string N(Name);
233     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
234       KernelArgPoolMap.insert(
235           std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
236                                 KernargSegmentSize, KernArgMemoryPool))));
237     }
238   }
239 };
240 
241 /// List that contains all the kernels.
242 /// FIXME: we may need this to be per device and per library.
243 std::list<KernelTy> KernelsList;
244 
245 template <typename Callback> static hsa_status_t findAgents(Callback CB) {
246 
247   hsa_status_t Err =
248       hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
249         hsa_device_type_t DeviceType;
250         // get_info fails iff HSA runtime not yet initialized
251         hsa_status_t Err =
252             hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
253 
254         if (Err != HSA_STATUS_SUCCESS) {
255           if (print_kernel_trace > 0)
256             DP("rtl.cpp: err %s\n", get_error_string(Err));
257 
258           return Err;
259         }
260 
261         CB(DeviceType, Agent);
262         return HSA_STATUS_SUCCESS;
263       });
264 
265   // iterate_agents fails iff HSA runtime not yet initialized
266   if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
267     DP("rtl.cpp: err %s\n", get_error_string(Err));
268   }
269 
270   return Err;
271 }
272 
273 static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
274                           void *Data) {
275   if (Status != HSA_STATUS_SUCCESS) {
276     const char *StatusString;
277     if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
278       StatusString = "unavailable";
279     }
280     DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
281        Status, StatusString);
282     abort();
283   }
284 }
285 
286 namespace core {
287 namespace {
288 
289 bool checkResult(hsa_status_t Err, const char *ErrMsg) {
290   if (Err == HSA_STATUS_SUCCESS)
291     return true;
292 
293   REPORT("%s", ErrMsg);
294   REPORT("%s", get_error_string(Err));
295   return false;
296 }
297 
298 void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
299   __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
300 }
301 
302 uint16_t createHeader() {
303   uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
304   Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
305   Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
306   return Header;
307 }
308 
309 hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
310   bool AllocAllowed = false;
311   hsa_status_t Err = hsa_amd_memory_pool_get_info(
312       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
313       &AllocAllowed);
314   if (Err != HSA_STATUS_SUCCESS) {
315     DP("Alloc allowed in memory pool check failed: %s\n",
316        get_error_string(Err));
317     return Err;
318   }
319 
320   size_t Size = 0;
321   Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
322                                      &Size);
323   if (Err != HSA_STATUS_SUCCESS) {
324     DP("Get memory pool size failed: %s\n", get_error_string(Err));
325     return Err;
326   }
327 
328   return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
329 }
330 
331 hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
332   std::vector<hsa_amd_memory_pool_t> *Result =
333       static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
334 
335   hsa_status_t Err;
336   if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
337     return Err;
338   }
339 
340   Result->push_back(MemoryPool);
341   return HSA_STATUS_SUCCESS;
342 }
343 
344 } // namespace
345 } // namespace core
346 
347 struct EnvironmentVariables {
348   int NumTeams;
349   int TeamLimit;
350   int TeamThreadLimit;
351   int MaxTeamsDefault;
352   int DynamicMemSize;
353 };
354 
355 template <uint32_t wavesize>
356 static constexpr const llvm::omp::GV &getGridValue() {
357   return llvm::omp::getAMDGPUGridValues<wavesize>();
358 }
359 
360 struct HSALifetime {
361   // Wrapper around HSA used to ensure it is constructed before other types
362   // and destructed after, which means said other types can use raii for
363   // cleanup without risking running outside of the lifetime of HSA
364   const hsa_status_t S;
365 
366   bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
367   HSALifetime() : S(hsa_init()) {}
368 
369   ~HSALifetime() {
370     if (S == HSA_STATUS_SUCCESS) {
371       hsa_status_t Err = hsa_shut_down();
372       if (Err != HSA_STATUS_SUCCESS) {
373         // Can't call into HSA to get a string from the integer
374         DP("Shutting down HSA failed: %d\n", Err);
375       }
376     }
377   }
378 };
379 
380 // Handle scheduling of multiple hsa_queue's per device to
381 // multiple threads (one scheduler per device)
382 class HSAQueueScheduler {
383 public:
384   HSAQueueScheduler() : Current(0) {}
385 
386   HSAQueueScheduler(const HSAQueueScheduler &) = delete;
387 
388   HSAQueueScheduler(HSAQueueScheduler &&Q) {
389     Current = Q.Current.load();
390     for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
391       HSAQueues[I] = Q.HSAQueues[I];
392       Q.HSAQueues[I] = nullptr;
393     }
394   }
395 
396   // \return false if any HSA queue creation fails
397   bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
398     for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
399       hsa_queue_t *Q = nullptr;
400       hsa_status_t Rc =
401           hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
402                            callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
403       if (Rc != HSA_STATUS_SUCCESS) {
404         DP("Failed to create HSA queue %d\n", I);
405         return false;
406       }
407       HSAQueues[I] = Q;
408     }
409     return true;
410   }
411 
412   ~HSAQueueScheduler() {
413     for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
414       if (HSAQueues[I]) {
415         hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
416         if (Err != HSA_STATUS_SUCCESS)
417           DP("Error destroying HSA queue\n");
418       }
419     }
420   }
421 
422   // \return next queue to use for device
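  // Round-robins over NUM_QUEUES_PER_DEVICE queues using a relaxed atomic
  // counter; wrap-around of the uint8_t counter is harmless because only the
  // value modulo the queue count is used.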
423   hsa_queue_t *next() {
424     return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
425                      NUM_QUEUES_PER_DEVICE];
426   }
427 
428 private:
429   // Number of queues per device
430   enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
431   hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
432   std::atomic<uint8_t> Current;
433 };
434 
435 /// Class containing all the device information
436 class RTLDeviceInfoTy : HSALifetime {
437   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
438 
439   struct QueueDeleter {
440     void operator()(hsa_queue_t *Q) {
441       if (Q) {
442         hsa_status_t Err = hsa_queue_destroy(Q);
443         if (Err != HSA_STATUS_SUCCESS) {
444           DP("Error destroying hsa queue: %s\n", get_error_string(Err));
445         }
446       }
447     }
448   };
449 
450 public:
451   bool ConstructionSucceeded = false;
452 
453   // load binary populates symbol tables and mutates various global state
454   // run uses those symbol tables
455   std::shared_timed_mutex LoadRunLock;
456 
457   int NumberOfDevices = 0;
458 
459   // GPU devices
460   std::vector<hsa_agent_t> HSAAgents;
461   std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
462 
463   // CPUs
464   std::vector<hsa_agent_t> CPUAgents;
465 
466   // Device properties
467   std::vector<int> ComputeUnits;
468   std::vector<int> GroupsPerDevice;
469   std::vector<int> ThreadsPerGroup;
470   std::vector<int> WarpSize;
471   std::vector<std::string> GPUName;
472   std::vector<std::string> TargetID;
473 
474   // OpenMP properties
475   std::vector<int> NumTeams;
476   std::vector<int> NumThreads;
477 
478   // OpenMP Environment properties
479   EnvironmentVariables Env;
480 
481   // OpenMP Requires Flags
482   int64_t RequiresFlags;
483 
484   // Resource pools
485   SignalPoolT FreeSignalPool;
486 
487   bool HostcallRequired = false;
488 
489   std::vector<hsa_executable_t> HSAExecutables;
490 
491   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
492   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
493 
494   hsa_amd_memory_pool_t KernArgPool;
495 
496   // fine grained memory pool for host allocations
497   hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
498 
499   // fine and coarse-grained memory pools per offloading device
500   std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
501   std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
502 
503   struct ImplFreePtrDeletor {
504     void operator()(void *P) {
505       core::Runtime::Memfree(P); // ignore failure to free
506     }
507   };
508 
509   // device_State shared across loaded binaries, error if inconsistent size
510   std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
511       DeviceStateStore;
512 
513   static const unsigned HardTeamLimit =
514       (1 << 16) - 1; // 65535, so it fits in a uint16_t
515   static const int DefaultNumTeams = 128;
516 
517   // These need to be per-device since different devices can have different
518   // wave sizes, but are currently the same number for each so that refactor
519   // can be postponed.
520   static_assert(getGridValue<32>().GV_Max_Teams ==
521                     getGridValue<64>().GV_Max_Teams,
522                 "");
523   static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
524 
525   static_assert(getGridValue<32>().GV_Max_WG_Size ==
526                     getGridValue<64>().GV_Max_WG_Size,
527                 "");
528   static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
529 
530   static_assert(getGridValue<32>().GV_Default_WG_Size ==
531                     getGridValue<64>().GV_Default_WG_Size,
532                 "");
533   static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
534 
535   using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
536                                       hsa_agent_t, hsa_amd_memory_pool_t);
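  // Borrow a signal from the free pool for the duration of a single copy and
  // return it afterwards; fails if no signal is available.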
537   hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
538                                     MemcpyFunc Func, int32_t DeviceId) {
539     hsa_agent_t Agent = HSAAgents[DeviceId];
540     hsa_signal_t S = FreeSignalPool.pop();
541     if (S.handle == 0) {
542       return HSA_STATUS_ERROR;
543     }
544     hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
545     FreeSignalPool.push(S);
546     return R;
547   }
548 
549   hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
550                                        int32_t DeviceId) {
551     return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
552   }
553 
554   hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
555                                        int32_t DeviceId) {
556     return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
557   }
558 
559   static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
560     char TmpChar[1000];
561     uint16_t Major, Minor;
562     uint32_t TmpUInt;
563     uint32_t TmpUInt2;
564     uint32_t CacheSize[4];
565     bool TmpBool;
566     uint16_t WorkgroupMaxDim[3];
567     hsa_dim3_t GridMaxDim;
568 
569     // Getting basic information about HSA and Device
570     core::checkResult(
571         hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
572         "Error from hsa_system_get_info when obtaining "
573         "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
574     core::checkResult(
575         hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
576         "Error from hsa_system_get_info when obtaining "
577         "HSA_SYSTEM_INFO_VERSION_MINOR\n");
578     printf("    HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
579     printf("    HSA OpenMP Device Number: \t\t%d \n", DeviceId);
580     core::checkResult(
581         hsa_agent_get_info(
582             Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
583         "Error returned from hsa_agent_get_info when obtaining "
584         "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
585     printf("    Product Name: \t\t\t%s \n", TmpChar);
586     core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
587                       "Error returned from hsa_agent_get_info when obtaining "
588                       "HSA_AGENT_INFO_NAME\n");
589     printf("    Device Name: \t\t\t%s \n", TmpChar);
590     core::checkResult(
591         hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
592         "Error returned from hsa_agent_get_info when obtaining "
593         "HSA_AGENT_INFO_VENDOR_NAME\n");
594     printf("    Vendor Name: \t\t\t%s \n", TmpChar);
595     hsa_device_type_t DevType;
596     core::checkResult(
597         hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
598         "Error returned from hsa_agent_get_info when obtaining "
599         "HSA_AGENT_INFO_DEVICE\n");
600     printf("    Device Type: \t\t\t%s \n",
601            DevType == HSA_DEVICE_TYPE_CPU
602                ? "CPU"
603                : (DevType == HSA_DEVICE_TYPE_GPU
604                       ? "GPU"
605                       : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
606     core::checkResult(
607         hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
608         "Error returned from hsa_agent_get_info when obtaining "
609         "HSA_AGENT_INFO_QUEUES_MAX\n");
610     printf("    Max Queues: \t\t\t%u \n", TmpUInt);
611     core::checkResult(
612         hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
613         "Error returned from hsa_agent_get_info when obtaining "
614         "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
615     printf("    Queue Min Size: \t\t\t%u \n", TmpUInt);
616     core::checkResult(
617         hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
618         "Error returned from hsa_agent_get_info when obtaining "
619         "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
620     printf("    Queue Max Size: \t\t\t%u \n", TmpUInt);
621 
622     // Getting cache information
623     printf("    Cache:\n");
624 
625     // FIXME: This is deprecated according to HSA documentation. But using
626     // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
627     // runtime.
628     core::checkResult(
629         hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
630         "Error returned from hsa_agent_get_info when obtaining "
631         "HSA_AGENT_INFO_CACHE_SIZE\n");
632 
633     for (int I = 0; I < 4; I++) {
634       if (CacheSize[I]) {
635         printf("      L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
636       }
637     }
638 
639     core::checkResult(
640         hsa_agent_get_info(Agent,
641                            (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
642                            &TmpUInt),
643         "Error returned from hsa_agent_get_info when obtaining "
644         "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
645     printf("    Cacheline Size: \t\t\t%u \n", TmpUInt);
646     core::checkResult(
647         hsa_agent_get_info(
648             Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
649             &TmpUInt),
650         "Error returned from hsa_agent_get_info when obtaining "
651         "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
652     printf("    Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
653     core::checkResult(
654         hsa_agent_get_info(
655             Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
656             &TmpUInt),
657         "Error returned from hsa_agent_get_info when obtaining "
658         "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
659     printf("    Compute Units: \t\t\t%u \n", TmpUInt);
660     core::checkResult(hsa_agent_get_info(
661                           Agent,
662                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
663                           &TmpUInt),
664                       "Error returned from hsa_agent_get_info when obtaining "
665                       "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
666     printf("    SIMD per CU: \t\t\t%u \n", TmpUInt);
667     core::checkResult(
668         hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
669         "Error returned from hsa_agent_get_info when obtaining "
670         "HSA_AGENT_INFO_FAST_F16_OPERATION\n");
671     printf("    Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
672     core::checkResult(
673         hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
674         "Error returned from hsa_agent_get_info when obtaining "
675         "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
676     printf("    Wavefront Size: \t\t\t%u \n", TmpUInt2);
677     core::checkResult(
678         hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
679         "Error returned from hsa_agent_get_info when obtaining "
680         "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
681     printf("    Workgroup Max Size: \t\t%u \n", TmpUInt);
682     core::checkResult(hsa_agent_get_info(Agent,
683                                          HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
684                                          WorkgroupMaxDim),
685                       "Error returned from hsa_agent_get_info when obtaining "
686                       "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
687     printf("    Workgroup Max Size per Dimension:\n");
688     printf("      x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
689     printf("      y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
690     printf("      z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
691     core::checkResult(hsa_agent_get_info(
692                           Agent,
693                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
694                           &TmpUInt),
695                       "Error returned from hsa_agent_get_info when obtaining "
696                       "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
697     printf("    Max Waves Per CU: \t\t\t%u \n", TmpUInt);
698     printf("    Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
699     core::checkResult(
700         hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
701         "Error returned from hsa_agent_get_info when obtaining "
702         "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
703     printf("    Grid Max Size: \t\t\t%u \n", TmpUInt);
704     core::checkResult(
705         hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
706         "Error returned from hsa_agent_get_info when obtaining "
707         "HSA_AGENT_INFO_GRID_MAX_DIM\n");
708     printf("    Grid Max Size per Dimension: \t\t\n");
709     printf("      x: \t\t\t\t%u\n", GridMaxDim.x);
710     printf("      y: \t\t\t\t%u\n", GridMaxDim.y);
711     printf("      z: \t\t\t\t%u\n", GridMaxDim.z);
712     core::checkResult(
713         hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
714         "Error returned from hsa_agent_get_info when obtaining "
715         "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
716     printf("    Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
717 
718     printf("    Memory Pools:\n");
719     auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
720       std::string TmpStr;
721       size_t Size;
722       bool Alloc, Access;
723       hsa_amd_segment_t Segment;
724       hsa_amd_memory_pool_global_flag_t GlobalFlags;
725       core::checkResult(
726           hsa_amd_memory_pool_get_info(
727               Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
728           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
729           "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
730       core::checkResult(hsa_amd_memory_pool_get_info(
731                             Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
732                         "Error returned from hsa_amd_memory_pool_get_info when "
733                         "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
734 
735       switch (Segment) {
736       case HSA_AMD_SEGMENT_GLOBAL:
737         TmpStr = "GLOBAL; FLAGS: ";
738         if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
739           TmpStr += "KERNARG, ";
740         if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
741           TmpStr += "FINE GRAINED, ";
742         if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
743           TmpStr += "COARSE GRAINED, ";
744         break;
745       case HSA_AMD_SEGMENT_READONLY:
746         TmpStr = "READONLY";
747         break;
748       case HSA_AMD_SEGMENT_PRIVATE:
749         TmpStr = "PRIVATE";
750         break;
751       case HSA_AMD_SEGMENT_GROUP:
752         TmpStr = "GROUP";
753         break;
754       }
755       printf("      Pool %s: \n", TmpStr.c_str());
756 
757       core::checkResult(hsa_amd_memory_pool_get_info(
758                             Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
759                         "Error returned from hsa_amd_memory_pool_get_info when "
760                         "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
761       printf("        Size: \t\t\t\t %zu bytes\n", Size);
762       core::checkResult(
763           hsa_amd_memory_pool_get_info(
764               Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
765           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
766           "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
767       printf("        Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
768       core::checkResult(
769           hsa_amd_memory_pool_get_info(
770               Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
771           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
772           "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
773       printf("        Runtime Alloc Granule: \t\t %zu bytes\n", Size);
774       core::checkResult(
775           hsa_amd_memory_pool_get_info(
776               Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
777           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
778           "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
779       printf("        Runtime Alloc alignment: \t %zu bytes\n", Size);
780       core::checkResult(
781           hsa_amd_memory_pool_get_info(
782               Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
783           "Error returned from hsa_amd_memory_pool_get_info when obtaining "
784           "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
785       printf("        Accessible by all: \t\t %s\n",
786              (Access ? "TRUE" : "FALSE"));
787 
788       return HSA_STATUS_SUCCESS;
789     };
790     // Iterate over all the memory regions for this agent. Get the memory region
791     // type and size
792     hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
793 
794     printf("    ISAs:\n");
795     auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
796       char TmpChar[1000];
797       core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
798                         "Error returned from hsa_isa_get_info_alt when "
799                         "obtaining HSA_ISA_INFO_NAME\n");
800       printf("        Name: \t\t\t\t %s\n", TmpChar);
801 
802       return HSA_STATUS_SUCCESS;
803     };
804     // Iterate over all the ISAs supported by this agent and print each
805     // ISA name.
806     hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
807   }
808 
809   // Record entry point associated with device
810   void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
811     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
812            "Unexpected device id!");
813     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
814 
815     E.Entries.push_back(Entry);
816   }
817 
818   // Return true if the entry is associated with device
819   bool findOffloadEntry(int32_t DeviceId, void *Addr) {
820     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
821            "Unexpected device id!");
822     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
823 
824     for (auto &It : E.Entries) {
825       if (It.addr == Addr)
826         return true;
827     }
828 
829     return false;
830   }
831 
832   // Return the pointer to the target entries table
833   __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
834     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
835            "Unexpected device id!");
836     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
837 
838     int32_t Size = E.Entries.size();
839 
840     // Table is empty
841     if (!Size)
842       return 0;
843 
844     __tgt_offload_entry *Begin = &E.Entries[0];
845     __tgt_offload_entry *End = &E.Entries[Size - 1];
846 
847     // Update table info according to the entries and return the pointer
848     E.Table.EntriesBegin = Begin;
849     E.Table.EntriesEnd = ++End;
850 
851     return &E.Table;
852   }
853 
854   // Clear entries table for a device
855   void clearOffloadEntriesTable(int DeviceId) {
856     assert(DeviceId < (int32_t)FuncGblEntries.size() &&
857            "Unexpected device id!");
858     FuncGblEntries[DeviceId].emplace_back();
859     FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
860     // KernelArgPoolMap.clear();
861     E.Entries.clear();
862     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
863   }
864 
865   hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
866                                    unsigned int DeviceId) {
867     assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
868     uint32_t GlobalFlags = 0;
869     hsa_status_t Err = hsa_amd_memory_pool_get_info(
870         MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
871 
872     if (Err != HSA_STATUS_SUCCESS) {
873       return Err;
874     }
875 
876     if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
877       DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
878     } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
879       DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
880     }
881 
882     return HSA_STATUS_SUCCESS;
883   }
884 
885   hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
886     for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
887       hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
888           Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
889             hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
890             if (ValidStatus != HSA_STATUS_SUCCESS) {
891               DP("Alloc allowed in memory pool check failed: %s\n",
892                  get_error_string(ValidStatus));
893               return HSA_STATUS_SUCCESS;
894             }
895             return addDeviceMemoryPool(MemoryPool, DeviceId);
896           });
897 
898       if (Err != HSA_STATUS_SUCCESS) {
899         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
900            "Iterate all memory pools", get_error_string(Err));
901         return Err;
902       }
903     }
904     return HSA_STATUS_SUCCESS;
905   }
906 
907   hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
908     std::vector<hsa_amd_memory_pool_t> HostPools;
909 
910     // collect all the "valid" pools for all the given agents.
911     for (const auto &Agent : Agents) {
912       hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
913           Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
914       if (Err != HSA_STATUS_SUCCESS) {
915         DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
916       }
917     }
918 
919     // We need two fine-grained pools.
920     //  1. One with kernarg flag set for storing kernel arguments
921     //  2. Second for host allocations
922     bool FineGrainedMemoryPoolSet = false;
923     bool KernArgPoolSet = false;
924     for (const auto &MemoryPool : HostPools) {
925       hsa_status_t Err = HSA_STATUS_SUCCESS;
926       uint32_t GlobalFlags = 0;
927       Err = hsa_amd_memory_pool_get_info(
928           MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
929       if (Err != HSA_STATUS_SUCCESS) {
930         DP("Get memory pool info failed: %s\n", get_error_string(Err));
931         return Err;
932       }
933 
934       if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
935         if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
936           KernArgPool = MemoryPool;
937           KernArgPoolSet = true;
938         }
939         HostFineGrainedMemoryPool = MemoryPool;
940         FineGrainedMemoryPoolSet = true;
941       }
942     }
943 
944     if (FineGrainedMemoryPoolSet && KernArgPoolSet)
945       return HSA_STATUS_SUCCESS;
946 
947     return HSA_STATUS_ERROR;
948   }
949 
950   hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
951     assert(DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
952            "Invalid device Id");
953     return DeviceCoarseGrainedMemoryPools[DeviceId];
954   }
955 
956   hsa_amd_memory_pool_t getHostMemoryPool() {
957     return HostFineGrainedMemoryPool;
958   }
959 
960   static int readEnv(const char *Env, int Default = -1) {
961     const char *EnvStr = getenv(Env);
962     int Res = Default;
963     if (EnvStr) {
964       Res = std::stoi(EnvStr);
965       DP("Parsed %s=%d\n", Env, Res);
966     }
967     return Res;
968   }
969 
970   RTLDeviceInfoTy() {
971     DP("Start initializing " GETNAME(TARGET_NAME) "\n");
972 
973     // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
974     // at any time; it does not require a debug library build.
975     //  0 => no tracing
976     //  1 => tracing dispatch only
977     // >1 => verbosity increase
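    //
    // For example, running an offloading application with
    // LIBOMPTARGET_KERNEL_TRACE=1 prints a trace line for each kernel
    // dispatch (see runRegionLocked), even with a release build.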
978 
979     if (!HSAInitSuccess()) {
980       DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
981       return;
982     }
983 
984     if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
985       print_kernel_trace = atoi(EnvStr);
986     else
987       print_kernel_trace = 0;
988 
989     hsa_status_t Err = core::atl_init_gpu_context();
990     if (Err != HSA_STATUS_SUCCESS) {
991       DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
992       return;
993     }
994 
995     // Init hostcall soon after initializing hsa
996     hostrpc_init();
997 
998     Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
999       if (DeviceType == HSA_DEVICE_TYPE_CPU) {
1000         CPUAgents.push_back(Agent);
1001       } else {
1002         HSAAgents.push_back(Agent);
1003       }
1004     });
1005     if (Err != HSA_STATUS_SUCCESS)
1006       return;
1007 
1008     NumberOfDevices = (int)HSAAgents.size();
1009 
1010     if (NumberOfDevices == 0) {
1011       DP("There are no devices supporting HSA.\n");
1012       return;
1013     }
1014     DP("There are %d devices supporting HSA.\n", NumberOfDevices);
1015 
1016     // Init the device info
1017     HSAQueueSchedulers.reserve(NumberOfDevices);
1018     FuncGblEntries.resize(NumberOfDevices);
1019     ThreadsPerGroup.resize(NumberOfDevices);
1020     ComputeUnits.resize(NumberOfDevices);
1021     GPUName.resize(NumberOfDevices);
1022     GroupsPerDevice.resize(NumberOfDevices);
1023     WarpSize.resize(NumberOfDevices);
1024     NumTeams.resize(NumberOfDevices);
1025     NumThreads.resize(NumberOfDevices);
1026     DeviceStateStore.resize(NumberOfDevices);
1027     KernelInfoTable.resize(NumberOfDevices);
1028     SymbolInfoTable.resize(NumberOfDevices);
1029     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
1030     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
1031 
1032     Err = setupDevicePools(HSAAgents);
1033     if (Err != HSA_STATUS_SUCCESS) {
1034       DP("Setup for Device Memory Pools failed\n");
1035       return;
1036     }
1037 
1038     Err = setupHostMemoryPools(CPUAgents);
1039     if (Err != HSA_STATUS_SUCCESS) {
1040       DP("Setup for Host Memory Pools failed\n");
1041       return;
1042     }
1043 
1044     for (int I = 0; I < NumberOfDevices; I++) {
1045       uint32_t QueueSize = 0;
1046       {
1047         hsa_status_t Err = hsa_agent_get_info(
1048             HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
1049         if (Err != HSA_STATUS_SUCCESS) {
1050           DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
1051           return;
1052         }
1053         enum { MaxQueueSize = 4096 };
1054         if (QueueSize > MaxQueueSize) {
1055           QueueSize = MaxQueueSize;
1056         }
1057       }
1058 
1059       {
1060         HSAQueueScheduler QSched;
1061         if (!QSched.createQueues(HSAAgents[I], QueueSize))
1062           return;
1063         HSAQueueSchedulers.emplace_back(std::move(QSched));
1064       }
1065 
1066       DeviceStateStore[I] = {nullptr, 0};
1067     }
1068 
1069     for (int I = 0; I < NumberOfDevices; I++) {
1070       ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
1071       GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
1072       ComputeUnits[I] = 1;
1073       DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
1074          GroupsPerDevice[I], ThreadsPerGroup[I]);
1075     }
1076 
1077     // Get environment variables regarding teams
1078     Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
1079     Env.NumTeams = readEnv("OMP_NUM_TEAMS");
1080     Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
1081     Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
1082     Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
1083 
1084     // Default state.
1085     RequiresFlags = OMP_REQ_UNDEFINED;
1086 
1087     ConstructionSucceeded = true;
1088   }
1089 
1090   ~RTLDeviceInfoTy() {
1091     DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
1092     if (!HSAInitSuccess()) {
1093       // Then none of these can have been set up and they can't be torn down
1094       return;
1095     }
1096     // Run destructors on types that use HSA before
1097     // impl_finalize removes access to it
1098     DeviceStateStore.clear();
1099     KernelArgPoolMap.clear();
1100     // Terminate hostrpc before finalizing hsa
1101     hostrpc_terminate();
1102 
1103     hsa_status_t Err;
1104     for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
1105       Err = hsa_executable_destroy(HSAExecutables[I]);
1106       if (Err != HSA_STATUS_SUCCESS) {
1107         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1108            "Destroying executable", get_error_string(Err));
1109       }
1110     }
1111   }
1112 };
1113 
1114 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
1115 
1116 // Putting accesses to DeviceInfo global behind a function call prior
1117 // to changing to use init_plugin/deinit_plugin calls
1118 static RTLDeviceInfoTy DeviceInfoState;
1119 static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
1120 
1121 namespace {
1122 
1123 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
1124                      __tgt_async_info *AsyncInfo) {
1125   assert(AsyncInfo && "AsyncInfo is nullptr");
1126   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1127   // Return success if we are not copying back to host from target.
1128   if (!HstPtr)
1129     return OFFLOAD_SUCCESS;
1130   hsa_status_t Err;
1131   DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1132      (long long unsigned)(Elf64_Addr)TgtPtr,
1133      (long long unsigned)(Elf64_Addr)HstPtr);
1134 
1135   Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
1136                                            DeviceId);
1137 
1138   if (Err != HSA_STATUS_SUCCESS) {
1139     DP("Error when copying data from device to host. Pointers: "
1140        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1141        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1142     return OFFLOAD_FAIL;
1143   }
1144   DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1145      (long long unsigned)(Elf64_Addr)TgtPtr,
1146      (long long unsigned)(Elf64_Addr)HstPtr);
1147   return OFFLOAD_SUCCESS;
1148 }
1149 
1150 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
1151                    __tgt_async_info *AsyncInfo) {
1152   assert(AsyncInfo && "AsyncInfo is nullptr");
1153   hsa_status_t Err;
1154   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1155   // Return success if we are not doing host to target.
1156   if (!HstPtr)
1157     return OFFLOAD_SUCCESS;
1158 
1159   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
1160      (long long unsigned)(Elf64_Addr)HstPtr,
1161      (long long unsigned)(Elf64_Addr)TgtPtr);
1162   Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
1163                                              DeviceId);
1164   if (Err != HSA_STATUS_SUCCESS) {
1165     DP("Error when copying data from host to device. Pointers: "
1166        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1167        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1168     return OFFLOAD_FAIL;
1169   }
1170   return OFFLOAD_SUCCESS;
1171 }
1172 
1173 // Async.
1174 // The implementation was written with cuda streams in mind. The semantics of
1175 // that are to execute kernels on a queue in order of insertion. A synchronise
1176 // call then makes writes visible between host and device. This means a series
1177 // of N data_submit_async calls are expected to execute serially. HSA offers
1178 // various options to run the data copies concurrently. This may require changes
1179 // to libomptarget.
1180 
1181 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
1182 // there are no outstanding kernels that need to be synchronized. Any async call
1183 // may be passed a Queue==0, at which point the cuda implementation will set it
1184 // to non-null (see getStream). The cuda streams are per-device. Upstream may
1185 // change this interface to explicitly initialize the AsyncInfo_pointer, but
1186 // until then hsa lazily initializes it as well.
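//
// In this plugin the Queue field is only a completion marker: initAsyncInfo
// stores a non-null sentinel and finiAsyncInfo resets it to null once the
// associated work is complete; no real HSA queue is attached to AsyncInfo.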
1187 
1188 void initAsyncInfo(__tgt_async_info *AsyncInfo) {
1189   // set non-null while using async calls, return to null to indicate completion
1190   assert(AsyncInfo);
1191   if (!AsyncInfo->Queue) {
1192     AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
1193   }
1194 }
1195 void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
1196   assert(AsyncInfo);
1197   assert(AsyncInfo->Queue);
1198   AsyncInfo->Queue = 0;
1199 }
1200 
1201 // Determine launch values for kernel.
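// GridSize is the total number of work-items (WorkgroupSize times the number
// of groups), matching the grid_size convention of an HSA dispatch packet.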
1202 struct LaunchVals {
1203   int WorkgroupSize;
1204   int GridSize;
1205 };
1206 LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1207                          int ConstWGSize,
1208                          llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1209                          int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
1210                          int DeviceNumTeams) {
1211 
1212   int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
1213   int NumGroups = 0;
1214 
1215   int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1216   if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
1217     MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
1218 
1219   if (print_kernel_trace & STARTUP_DETAILS) {
1220     DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
1221     DP("Max_Teams: %d\n", MaxTeams);
1222     DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
1223     DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
1224     DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
1225        RTLDeviceInfoTy::DefaultWgSize);
1226     DP("thread_limit: %d\n", ThreadLimit);
1227     DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1228     DP("ConstWGSize: %d\n", ConstWGSize);
1229   }
1230   // check for thread_limit() clause
1231   if (ThreadLimit > 0) {
1232     ThreadsPerGroup = ThreadLimit;
1233     DP("Setting threads per block to requested %d\n", ThreadLimit);
1234     // Add master warp for GENERIC
1235     if (ExecutionMode ==
1236         llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1237       ThreadsPerGroup += WarpSize;
1238       DP("Adding master wavefront: +%d threads\n", WarpSize);
1239     }
1240     if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
1241       ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
1242       DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
1243     }
1244   }
1245   // check flat_max_work_group_size attr here
1246   if (ThreadsPerGroup > ConstWGSize) {
1247     ThreadsPerGroup = ConstWGSize;
1248     DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1249        ThreadsPerGroup);
1250   }
1251   if (print_kernel_trace & STARTUP_DETAILS)
1252     DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1253   DP("Preparing %d threads\n", ThreadsPerGroup);
1254 
1255   // Set default num_groups (teams)
1256   if (Env.TeamLimit > 0)
1257     NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
1258   else
1259     NumGroups = MaxTeams;
1260   DP("Set default num of groups %d\n", NumGroups);
1261 
1262   if (print_kernel_trace & STARTUP_DETAILS) {
1263     DP("num_groups: %d\n", NumGroups);
1264     DP("num_teams: %d\n", NumTeams);
1265   }
1266 
1267   // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
1268   // This reduction is typical for the default case (no thread_limit clause)
1269   // or when the user requests an excessive number of teams via num_teams.
1270   // FIXME: We can't distinguish between a constant or variable thread limit,
1271   // so we only handle constant thread_limits.
1272   if (ThreadsPerGroup >
1273       RTLDeviceInfoTy::DefaultWgSize) //  256 < threadsPerGroup <= 1024
1274     // Should we round threadsPerGroup up to nearest WarpSize
1275     // here?
1276     NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
1277 
1278   // check for num_teams() clause
1279   if (NumTeams > 0) {
1280     NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
1281   }
1282   if (print_kernel_trace & STARTUP_DETAILS) {
1283     DP("num_groups: %d\n", NumGroups);
1284     DP("Env.NumTeams %d\n", Env.NumTeams);
1285     DP("Env.TeamLimit %d\n", Env.TeamLimit);
1286   }
1287 
1288   if (Env.NumTeams > 0) {
1289     NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
1290     DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
1291   } else if (Env.TeamLimit > 0) {
1292     NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
1293     DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
1294   } else {
1295     if (NumTeams <= 0) {
1296       if (LoopTripcount > 0) {
1297         if (ExecutionMode ==
1298             llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1299           // round up to the nearest integer
1300           NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
1301         } else if (ExecutionMode ==
1302                    llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1303           NumGroups = LoopTripcount;
1304         } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1305           // This is a generic kernel that was transformed to use SPMD-mode
1306           // execution but uses Generic-mode semantics for scheduling.
1307           NumGroups = LoopTripcount;
1308         }
1309         DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
1310            "threads per block %d\n",
1311            NumGroups, LoopTripcount, ThreadsPerGroup);
1312       }
1313     } else {
1314       NumGroups = NumTeams;
1315     }
1316     if (NumGroups > MaxTeams) {
1317       NumGroups = MaxTeams;
1318       if (print_kernel_trace & STARTUP_DETAILS)
1319         DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
1320     }
1321     if (NumGroups > NumTeams && NumTeams > 0) {
1322       NumGroups = NumTeams;
1323       if (print_kernel_trace & STARTUP_DETAILS)
1324         DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
1325            NumTeams);
1326     }
1327   }
1328 
1329   // num_teams clause always honored, no matter what, unless DEFAULT is active.
1330   if (NumTeams > 0) {
1331     NumGroups = NumTeams;
1332     // Cap num_groups to EnvMaxTeamsDefault if set.
1333     if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
1334       NumGroups = Env.MaxTeamsDefault;
1335   }
1336   if (print_kernel_trace & STARTUP_DETAILS) {
1337     DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1338     DP("num_groups: %d\n", NumGroups);
1339     DP("loop_tripcount: %" PRIu64 "\n", LoopTripcount);
1340   }
1341   DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
1342      ThreadsPerGroup);
1343 
1344   LaunchVals Res;
1345   Res.WorkgroupSize = ThreadsPerGroup;
1346   Res.GridSize = ThreadsPerGroup * NumGroups;
1347   return Res;
1348 }
1349 
1350 static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
1351   uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
1352   bool Full = true;
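  // Spin until the reserved slot is within Queue->size packets of the read
  // index, i.e. until that packet slot is free to be written.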
1353   while (Full) {
1354     Full =
1355         PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
1356   }
1357   return PacketId;
1358 }
1359 
1360 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1361                         ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
1362                         int32_t ThreadLimit, uint64_t LoopTripcount) {
1363   // Set the context we are using
1364   // update thread limit content in gpu memory if un-initialized or specified
1365   // from host
1366 
1367   DP("Run target team region thread_limit %d\n", ThreadLimit);
1368 
1369   // All args are references.
1370   std::vector<void *> Args(ArgNum);
1371   std::vector<void *> Ptrs(ArgNum);
1372 
1373   DP("Arg_num: %d\n", ArgNum);
1374   for (int32_t I = 0; I < ArgNum; ++I) {
1375     Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1376     Args[I] = &Ptrs[I];
1377     DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
1378   }
1379 
1380   KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1381 
1382   std::string KernelName = std::string(KernelInfo->Name);
1383   auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
1384   if (KernelInfoTable[DeviceId].find(KernelName) ==
1385       KernelInfoTable[DeviceId].end()) {
1386     DP("Kernel %s not found\n", KernelName.c_str());
1387     return OFFLOAD_FAIL;
1388   }
1389 
1390   const atl_kernel_info_t KernelInfoEntry =
1391       KernelInfoTable[DeviceId][KernelName];
1392   const uint32_t GroupSegmentSize =
1393       KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
1394   const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
1395   const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
1396   const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
1397   const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
1398 
1399   assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
1400 
1401   /*
1402    * Set limit based on ThreadsPerGroup and GroupsPerDevice
1403    */
1404   LaunchVals LV =
1405       getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
1406                     KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
1407                     NumTeams,      // From run_region arg
1408                     ThreadLimit,   // From run_region arg
1409                     LoopTripcount, // From run_region arg
1410                     DeviceInfo().NumTeams[KernelInfo->DeviceId]);
1411   const int GridSize = LV.GridSize;
1412   const int WorkgroupSize = LV.WorkgroupSize;
1413 
1414   if (print_kernel_trace >= LAUNCH) {
1415     int NumGroups = GridSize / WorkgroupSize;
1416     // exec modes are GENERIC (1), SPMD (2), GENERIC_SPMD (3)
1417     // if doing rtl timing, print to stderr, unless stdout requested.
1418     bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
1419     fprintf(TraceToStdout ? stdout : stderr,
1420             "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1421             "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
1422             "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
1423             DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1424             ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
1425             GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
1426             VgprSpillCount, LoopTripcount, KernelInfo->Name);
1427   }
1428 
1429   // Run on the device.
1430   {
1431     hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
1432     if (!Queue) {
1433       return OFFLOAD_FAIL;
1434     }
1435     uint64_t PacketId = acquireAvailablePacketId(Queue);
1436 
1437     const uint32_t Mask = Queue->size - 1; // size is a power of 2
1438     hsa_kernel_dispatch_packet_t *Packet =
1439         (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
1440 
1441     // packet->header is written last
1442     Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1443     Packet->workgroup_size_x = WorkgroupSize;
1444     Packet->workgroup_size_y = 1;
1445     Packet->workgroup_size_z = 1;
1446     Packet->reserved0 = 0;
1447     Packet->grid_size_x = GridSize;
1448     Packet->grid_size_y = 1;
1449     Packet->grid_size_z = 1;
1450     Packet->private_segment_size = KernelInfoEntry.private_segment_size;
1451     Packet->group_segment_size = GroupSegmentSize;
1452     Packet->kernel_object = KernelInfoEntry.kernel_object;
1453     Packet->kernarg_address = 0;     // use the block allocator
1454     Packet->reserved2 = 0;           // impl writes id_ here
1455     Packet->completion_signal = {0}; // may want a pool of signals
1456 
1457     KernelArgPool *ArgPool = nullptr;
1458     void *KernArg = nullptr;
1459     {
1460       auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1461       if (It != KernelArgPoolMap.end()) {
1462         ArgPool = (It->second).get();
1463       }
1464     }
1465     if (!ArgPool) {
1466       DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
1467          DeviceId);
1468     }
1469     {
1470       if (ArgPool) {
1471         assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
1472         KernArg = ArgPool->allocate(ArgNum);
1473       }
1474       if (!KernArg) {
1475         DP("Allocate kernarg failed\n");
1476         return OFFLOAD_FAIL;
1477       }
1478 
1479       // Copy explicit arguments
1480       for (int I = 0; I < ArgNum; I++) {
1481         memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
1482       }
1483 
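      // Sketch of the kernarg block filled in here: ArgNum explicit void*
      // slots (written by the loop below), immediately followed by the
      // impl_implicit_args_t block used by the deviceRTL / hostcall support:
      //
      //   [ arg0 | arg1 | ... | argN-1 | impl_implicit_args_t ]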
1484       // Initialize implicit arguments. TODO: Which of these can be dropped
1485       impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
1486           static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
1487       memset(ImplArgs, 0,
1488              sizeof(impl_implicit_args_t)); // may not be necessary
1489       ImplArgs->offset_x = 0;
1490       ImplArgs->offset_y = 0;
1491       ImplArgs->offset_z = 0;
1492 
1493       // assign a hostcall buffer for the selected Q
1494       if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
1495         // hostrpc_assign_buffer is not thread safe, and this function is
1496         // under a multiple reader lock, not a writer lock.
1497         static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
1498         pthread_mutex_lock(&HostcallInitLock);
1499         uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId],
1500                                                 Queue, DeviceId);
1501         pthread_mutex_unlock(&HostcallInitLock);
1502         if (!Buffer) {
1503           DP("hostrpc_assign_buffer failed, gpu would dereference null and "
1504              "error\n");
1505           return OFFLOAD_FAIL;
1506         }
1507 
1508         DP("Implicit argument count: %d\n",
1509            KernelInfoEntry.implicit_argument_count);
1510         if (KernelInfoEntry.implicit_argument_count >= 4) {
1511           // Initialise pointer for implicit_argument_count != 0 ABI
1512           // Guess that the right implicit argument is at offset 24 after
1513           // the explicit arguments. In the future, should be able to read
1514           // the offset from msgpack. Clang is not annotating it at present.
1515           uint64_t Offset =
1516               sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
1517           if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
1518             DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
1519                "args: %d\n",
1520                Offset + 8, ArgPool->kernargSizeIncludingImplicit());
1521           } else {
1522             memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
1523           }
1524         }
1525 
1526         // initialise pointer for implicit_argument_count == 0 ABI
1527         ImplArgs->hostcall_ptr = Buffer;
1528       }
1529 
1530       Packet->kernarg_address = KernArg;
1531     }
1532 
1533     hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
1534     if (S.handle == 0) {
1535       DP("Failed to get signal instance\n");
1536       return OFFLOAD_FAIL;
1537     }
1538     Packet->completion_signal = S;
1539     hsa_signal_store_relaxed(Packet->completion_signal, 1);
1540 
1541     // Publish the packet indicating it is ready to be processed
1542     core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
1543                              core::createHeader(), Packet->setup);
1544 
1545     // Since the packet is already published, its contents must not be
1546     // accessed any more
1547     hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
1548 
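    // Completion handshake: the signal was initialised to 1 above, the packet
    // processor decrements it to 0 when the kernel (and its memory writes)
    // completes, and the scacquire wait below makes those writes visible to
    // the host before any data is read back.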
1549     while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1550                                      HSA_WAIT_STATE_BLOCKED) != 0)
1551       ;
1552 
1553     assert(ArgPool);
1554     ArgPool->deallocate(KernArg);
1555     DeviceInfo().FreeSignalPool.push(S);
1556   }
1557 
1558   DP("Kernel completed\n");
1559   return OFFLOAD_SUCCESS;
1560 }
1561 
elfMachineIdIsAmdgcn(__tgt_device_image * Image)1562 bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
1563   const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
1564   int32_t R = elf_check_machine(Image, AmdgcnMachineID);
1565   if (!R) {
1566     DP("Supported machine ID not found\n");
1567   }
1568   return R;
1569 }
1570 
elfEFlags(__tgt_device_image * Image)1571 uint32_t elfEFlags(__tgt_device_image *Image) {
1572   char *ImgBegin = (char *)Image->ImageStart;
1573   size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
1574 
1575   Elf *E = elf_memory(ImgBegin, ImgSize);
1576   if (!E) {
1577     DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
1578     return 0;
1579   }
1580 
1581   Elf64_Ehdr *Eh64 = elf64_getehdr(E);
1582 
1583   if (!Eh64) {
1584     DP("Unable to get machine ID from ELF file!\n");
1585     elf_end(E);
1586     return 0;
1587   }
1588 
1589   uint32_t Flags = Eh64->e_flags;
1590 
1591   elf_end(E);
1592   DP("ELF Flags: 0x%x\n", Flags);
1593   return Flags;
1594 }
1595 
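// Clamp *Value to Upper and report whether clamping happened, roughly as used
// in __tgt_rtl_init_device below:
//
//   if (enforceUpperBound(&NumThreads, MaxWgSize))
//     DP("Capped thread limit: %d\n", MaxWgSize);
//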
enforceUpperBound(T * Value,T Upper)1596 template <typename T> bool enforceUpperBound(T *Value, T Upper) {
1597   bool Changed = *Value > Upper;
1598   if (Changed) {
1599     *Value = Upper;
1600   }
1601   return Changed;
1602 }
1603 
findOnlyShtHash(Elf * Elf)1604 Elf64_Shdr *findOnlyShtHash(Elf *Elf) {
1605   size_t N;
1606   int Rc = elf_getshdrnum(Elf, &N);
1607   if (Rc != 0) {
1608     return nullptr;
1609   }
1610 
1611   Elf64_Shdr *Result = nullptr;
1612   for (size_t I = 0; I < N; I++) {
1613     Elf_Scn *Scn = elf_getscn(Elf, I);
1614     if (Scn) {
1615       Elf64_Shdr *Shdr = elf64_getshdr(Scn);
1616       if (Shdr) {
1617         if (Shdr->sh_type == SHT_HASH) {
1618           if (Result == nullptr) {
1619             Result = Shdr;
1620           } else {
1621             // multiple SHT_HASH sections not handled
1622             return nullptr;
1623           }
1624         }
1625       }
1626     }
1627   }
1628   return Result;
1629 }
1630 
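// Symbol lookup through the SHT_HASH section, mirroring what a dynamic linker
// does. The hash is the classic SysV elf_hash provided by libelf; for
// reference it is roughly:
//
//   uint32_t h = 0, g;
//   while (*name) {
//     h = (h << 4) + (unsigned char)*name++;
//     if ((g = h & 0xf0000000))
//       h ^= g >> 24;
//     h &= ~g;
//   }
//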
elfLookup(Elf * Elf,char * Base,Elf64_Shdr * SectionHash,const char * Symname)1631 const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash,
1632                            const char *Symname) {
1633 
1634   assert(SectionHash);
1635   size_t SectionSymtabIndex = SectionHash->sh_link;
1636   Elf64_Shdr *SectionSymtab =
1637       elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex));
1638   size_t SectionStrtabIndex = SectionSymtab->sh_link;
1639 
1640   const Elf64_Sym *Symtab =
1641       reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset);
1642 
1643   const uint32_t *Hashtab =
1644       reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset);
1645 
1646   // Layout:
1647   // nbucket
1648   // nchain
1649   // bucket[nbucket]
1650   // chain[nchain]
1651   uint32_t Nbucket = Hashtab[0];
1652   const uint32_t *Bucket = &Hashtab[2];
1653   const uint32_t *Chain = &Hashtab[Nbucket + 2];
1654 
1655   const size_t Max = strlen(Symname) + 1;
1656   const uint32_t Hash = elf_hash(Symname);
1657   for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) {
1658     char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name);
1659     if (strncmp(Symname, N, Max) == 0) {
1660       return &Symtab[I];
1661     }
1662   }
1663 
1664   return nullptr;
1665 }
1666 
1667 struct SymbolInfo {
1668   void *Addr = nullptr;
1669   uint32_t Size = UINT32_MAX;
1670   uint32_t ShType = SHT_NULL;
1671 };
1672 
getSymbolInfoWithoutLoading(Elf * Elf,char * Base,const char * Symname,SymbolInfo * Res)1673 int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname,
1674                                 SymbolInfo *Res) {
1675   if (elf_kind(Elf) != ELF_K_ELF) {
1676     return 1;
1677   }
1678 
1679   Elf64_Shdr *SectionHash = findOnlyShtHash(Elf);
1680   if (!SectionHash) {
1681     return 1;
1682   }
1683 
1684   const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname);
1685   if (!Sym) {
1686     return 1;
1687   }
1688 
1689   if (Sym->st_size > UINT32_MAX) {
1690     return 1;
1691   }
1692 
1693   if (Sym->st_shndx == SHN_UNDEF) {
1694     return 1;
1695   }
1696 
1697   Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx);
1698   if (!Section) {
1699     return 1;
1700   }
1701 
1702   Elf64_Shdr *Header = elf64_getshdr(Section);
1703   if (!Header) {
1704     return 1;
1705   }
1706 
1707   Res->Addr = Sym->st_value + Base;
1708   Res->Size = static_cast<uint32_t>(Sym->st_size);
1709   Res->ShType = Header->sh_type;
1710   return 0;
1711 }
1712 
getSymbolInfoWithoutLoading(char * Base,size_t ImgSize,const char * Symname,SymbolInfo * Res)1713 int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname,
1714                                 SymbolInfo *Res) {
1715   Elf *Elf = elf_memory(Base, ImgSize);
1716   if (Elf) {
1717     int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res);
1718     elf_end(Elf);
1719     return Rc;
1720   }
1721   return 1;
1722 }
1723 
interopGetSymbolInfo(char * Base,size_t ImgSize,const char * SymName,void ** VarAddr,uint32_t * VarSize)1724 hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
1725                                   const char *SymName, void **VarAddr,
1726                                   uint32_t *VarSize) {
1727   SymbolInfo SI;
1728   int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
1729   if (Rc == 0) {
1730     *VarAddr = SI.Addr;
1731     *VarSize = SI.Size;
1732     return HSA_STATUS_SUCCESS;
1733   }
1734   return HSA_STATUS_ERROR;
1735 }
1736 
1737 template <typename C>
moduleRegisterFromMemoryToPlace(std::map<std::string,atl_kernel_info_t> & KernelInfoTable,std::map<std::string,atl_symbol_info_t> & SymbolInfoTable,void * ModuleBytes,size_t ModuleSize,int DeviceId,C Cb,std::vector<hsa_executable_t> & HSAExecutables)1738 hsa_status_t moduleRegisterFromMemoryToPlace(
1739     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1740     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1741     void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
1742     std::vector<hsa_executable_t> &HSAExecutables) {
1743   auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1744     C *Unwrapped = static_cast<C *>(CbState);
1745     return (*Unwrapped)(Data, Size);
1746   };
1747   return core::RegisterModuleFromMemory(
1748       KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1749       DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
1750       HSAExecutables);
1751 }
1752 
getDeviceStateBytes(char * ImageStart,size_t ImgSize)1753 uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1754   uint64_t DeviceStateBytes = 0;
1755   {
1756     // If this is the deviceRTL, get the state variable size
1757     SymbolInfo SizeSi;
1758     int Rc = getSymbolInfoWithoutLoading(
1759         ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
1760 
1761     if (Rc == 0) {
1762       if (SizeSi.Size != sizeof(uint64_t)) {
1763         DP("Found device_State_size variable with wrong size\n");
1764         return 0;
1765       }
1766 
1767       // Read number of bytes directly from the elf
1768       memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
1769     }
1770   }
1771   return DeviceStateBytes;
1772 }
1773 
1774 struct DeviceEnvironment {
1775   // Initialises a DeviceEnvironmentTy in the deviceRTL. Patches around
1776   // differences in the deviceRTL between trunk, aomp and rocmcc. Over time
1777   // these differences will tend to zero and this class can be
1778   // simplified.
1779   // Symbol may be in .data or .bss, and may be missing fields, todo:
1780   // review aomp/trunk/rocm and simplify the following
1781 
1782   // The symbol may also have been deadstripped because the device side
1783   // accessors were unused.
1784 
1785   // If the symbol is in .data (aomp, rocm) it can be written directly.
1786   // If it is in .bss, we must wait for it to be allocated space on the
1787   // gpu (trunk) and initialize after loading.
sym__anone797b96e0b11::DeviceEnvironment1788   const char *sym() { return "omptarget_device_environment"; }
1789 
1790   DeviceEnvironmentTy HostDeviceEnv;
1791   SymbolInfo SI;
1792   bool Valid = false;
1793 
1794   __tgt_device_image *Image;
1795   const size_t ImgSize;
1796 
DeviceEnvironment__anone797b96e0b11::DeviceEnvironment1797   DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
1798                     __tgt_device_image *Image, const size_t ImgSize)
1799       : Image(Image), ImgSize(ImgSize) {
1800 
1801     HostDeviceEnv.NumDevices = NumberDevices;
1802     HostDeviceEnv.DeviceNum = DeviceId;
1803     HostDeviceEnv.DebugKind = 0;
1804     HostDeviceEnv.DynamicMemSize = DynamicMemSize;
1805     if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
1806       HostDeviceEnv.DebugKind = std::stoi(EnvStr);
1807 
1808     int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
1809                                          sym(), &SI);
1810     if (Rc != 0) {
1811       DP("Finding global device environment '%s' - symbol missing.\n", sym());
1812       return;
1813     }
1814 
1815     if (SI.Size > sizeof(HostDeviceEnv)) {
1816       DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,
1817          sizeof(HostDeviceEnv));
1818       return;
1819     }
1820 
1821     Valid = true;
1822   }
1823 
inImage__anone797b96e0b11::DeviceEnvironment1824   bool inImage() { return SI.ShType != SHT_NOBITS; }
1825 
beforeLoading__anone797b96e0b11::DeviceEnvironment1826   hsa_status_t beforeLoading(void *Data, size_t Size) {
1827     if (Valid) {
1828       if (inImage()) {
1829         DP("Setting global device environment before load (%u bytes)\n",
1830            SI.Size);
1831         uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart;
1832         void *Pos = (char *)Data + Offset;
1833         memcpy(Pos, &HostDeviceEnv, SI.Size);
1834       }
1835     }
1836     return HSA_STATUS_SUCCESS;
1837   }
1838 
afterLoading__anone797b96e0b11::DeviceEnvironment1839   hsa_status_t afterLoading() {
1840     if (Valid) {
1841       if (!inImage()) {
1842         DP("Setting global device environment after load (%u bytes)\n",
1843            SI.Size);
1844         int DeviceId = HostDeviceEnv.DeviceNum;
1845         auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
1846         void *StatePtr;
1847         uint32_t StatePtrSize;
1848         hsa_status_t Err = interop_hsa_get_symbol_info(
1849             SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
1850         if (Err != HSA_STATUS_SUCCESS) {
1851           DP("failed to find %s in loaded image\n", sym());
1852           return Err;
1853         }
1854 
1855         if (StatePtrSize != SI.Size) {
1856           DP("Symbol had size %u before loading, %u after\n", StatePtrSize,
1857              SI.Size);
1858           return HSA_STATUS_ERROR;
1859         }
1860 
1861         return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
1862                                                   StatePtrSize, DeviceId);
1863       }
1864     }
1865     return HSA_STATUS_SUCCESS;
1866   }
1867 };
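// DeviceEnvironment is driven from __tgt_rtl_load_binary_locked below: the
// beforeLoading callback patches the symbol directly in the image bytes when
// it lives in .data, while afterLoading copies HostDeviceEnv to the gpu once
// the loader has allocated space for a .bss symbol.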
1868 
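// Device-side calloc. hsa_amd_memory_fill writes 32-bit words, so the size is
// rounded up to a multiple of four and the fill count is Rounded / 4.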
implCalloc(void ** RetPtr,size_t Size,int DeviceId)1869 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
1870   uint64_t Rounded = 4 * ((Size + 3) / 4);
1871   void *Ptr;
1872   hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
1873   hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
1874   if (Err != HSA_STATUS_SUCCESS) {
1875     return Err;
1876   }
1877 
1878   hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
1879   if (Rc != HSA_STATUS_SUCCESS) {
1880     DP("zero fill device_state failed with %u\n", Rc);
1881     core::Runtime::Memfree(Ptr);
1882     return HSA_STATUS_ERROR;
1883   }
1884 
1885   *RetPtr = Ptr;
1886   return HSA_STATUS_SUCCESS;
1887 }
1888 
imageContainsSymbol(void * Data,size_t Size,const char * Sym)1889 bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
1890   SymbolInfo SI;
1891   int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
1892   return (Rc == 0) && (SI.Addr != nullptr);
1893 }
1894 
1895 } // namespace
1896 
1897 namespace core {
allow_access_to_all_gpu_agents(void * Ptr)1898 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
1899   return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
1900                                      &DeviceInfo().HSAAgents[0], NULL, Ptr);
1901 }
1902 } // namespace core
1903 
GetIsaInfo(hsa_isa_t isa,void * data)1904 static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
1905   hsa_status_t err;
1906   uint32_t name_len;
1907   err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
1908   if (err != HSA_STATUS_SUCCESS) {
1909     DP("Error getting ISA info length\n");
1910     return err;
1911   }
1912 
1913   char TargetID[name_len];
1914   err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
1915   if (err != HSA_STATUS_SUCCESS) {
1916     DP("Error getting ISA info name\n");
1917     return err;
1918   }
1919 
1920   auto TripleTargetID = llvm::StringRef(TargetID);
1921   if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
1922     DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
1923   }
1924   return HSA_STATUS_SUCCESS;
1925 }
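// ISA names reported by ROCm are typically of the form
// "amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-"; stripping the triple prefix
// above leaves the TargetID ("gfx906:sramecc+:xnack-") that is later matched
// against the image's TargetID.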
1926 
1927 /// Parse a TargetID to get processor arch and feature map.
1928 /// Returns processor subarch.
1929 /// Returns TargetID features in \p FeatureMap argument.
1930 /// If the \p TargetID contains feature+, map it to true in \p FeatureMap.
1931 /// If the \p TargetID contains feature-, map it to false in \p FeatureMap.
1932 /// If the \p TargetID does not contain a feature (default), do not map it.
parseTargetID(StringRef TargetID,StringMap<bool> & FeatureMap)1933 StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
1934   if (TargetID.empty())
1935     return llvm::StringRef();
1936 
1937   auto ArchFeature = TargetID.split(":");
1938   auto Arch = ArchFeature.first;
1939   auto Features = ArchFeature.second;
1940   if (Features.empty())
1941     return Arch;
1942 
1943   if (Features.contains("sramecc+")) {
1944     FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
1945   } else if (Features.contains("sramecc-")) {
1946     FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
1947   }
1948   if (Features.contains("xnack+")) {
1949     FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
1950   } else if (Features.contains("xnack-")) {
1951     FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
1952   }
1953 
1954   return Arch;
1955 }
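// Example (illustrative): parseTargetID("gfx90a:sramecc+:xnack-", Map) returns
// "gfx90a" and fills Map with {sramecc -> true, xnack -> false}; a bare
// "gfx90a" returns the arch with an empty map, i.e. all features are ANY.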
1956 
1957 /// Checks if an image \p ImgInfo is compatible with current
1958 /// system's environment \p EnvInfo
IsImageCompatibleWithEnv(const char * ImgInfo,std::string EnvInfo)1959 bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
1960   llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
1961 
1962   // Compatible in case of exact match
1963   if (ImgTID == EnvTID) {
1964     DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",
1965        ImgTID.data(), EnvTID.data());
1966     return true;
1967   }
1968 
1969   // Incompatible if Archs mismatch.
1970   StringMap<bool> ImgMap, EnvMap;
1971   StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
1972   StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
1973 
1974   // Neither EnvArch nor ImgArch may be empty here.
1975   if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
1976     DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",
1977        ImgTID.data(), EnvTID.data());
1978     return false;
1979   }
1980 
1981   // Incompatible if image has more features than the environment, irrespective
1982   // of type or sign of features.
1983   if (ImgMap.size() > EnvMap.size()) {
1984     DP("Incompatible: Image has more features than the environment \t[Image: "
1985        "%s]\t:\t[Environment: %s]\n",
1986        ImgTID.data(), EnvTID.data());
1987     return false;
1988   }
1989 
1990   // Compatible if each target feature specified by the environment is
1991   // compatible with target feature of the image. The target feature is
1992   // compatible if the image does not specify it (meaning Any), or if it
1993   // specifies it with the same value (meaning On or Off).
1994   for (const auto &ImgFeature : ImgMap) {
1995     auto EnvFeature = EnvMap.find(ImgFeature.first());
1996     if (EnvFeature == EnvMap.end()) {
1997       DP("Incompatible: Image's non-ANY feature does not match "
1998          "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "
1999          "%s]\n",
2000          ImgTID.data(), EnvTID.data());
2001       return false;
2002     } else if (EnvFeature->first() == ImgFeature.first() &&
2003                EnvFeature->second != ImgFeature.second) {
2004       DP("Incompatible: Image's non-ANY feature does not match "
2005          "the Environment feature's non-ANY value \t[Image: "
2006          "%s]\t:\t[Environment: %s]\n",
2007          ImgTID.data(), EnvTID.data());
2008       return false;
2009     }
2010   }
2011 
2012   // Image is compatible if all features of Environment are:
2013   //   - either, present in the Image's features map with the same sign,
2014   //   - or, the feature is missing from Image's features map i.e. it is
2015   //   set to ANY
2016   DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "
2017      "%s]\n",
2018      ImgTID.data(), EnvTID.data());
2019   return true;
2020 }
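// For illustration: an image built for "gfx90a:xnack-" is compatible with an
// environment reporting "gfx90a:sramecc+:xnack-" (sramecc is ANY in the
// image), whereas "gfx90a:xnack+" is not, because the xnack values differ.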
2021 
2022 extern "C" {
__tgt_rtl_is_valid_binary(__tgt_device_image * Image)2023 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
2024   return elfMachineIdIsAmdgcn(Image);
2025 }
2026 
__tgt_rtl_is_valid_binary_info(__tgt_device_image * image,__tgt_image_info * info)2027 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
2028                                        __tgt_image_info *info) {
2029   if (!__tgt_rtl_is_valid_binary(image))
2030     return false;
2031 
2032   // A subarchitecture was not specified. Assume it is compatible.
2033   if (!info->Arch)
2034     return true;
2035 
2036   int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
2037 
2038   for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
2039     __tgt_rtl_init_device(DeviceId);
2040     hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
2041     hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
2042     if (err != HSA_STATUS_SUCCESS) {
2043       DP("Error iterating ISAs\n");
2044       return false;
2045     }
2046     if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
2047       return false;
2048   }
2049   DP("Image has Target ID compatible with the current environment: %s\n",
2050      info->Arch);
2051   return true;
2052 }
2053 
__tgt_rtl_init_plugin()2054 int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; }
__tgt_rtl_deinit_plugin()2055 int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
2056 
__tgt_rtl_number_of_devices()2057 int __tgt_rtl_number_of_devices() {
2058   // If the construction failed, no methods are safe to call
2059   if (DeviceInfo().ConstructionSucceeded) {
2060     return DeviceInfo().NumberOfDevices;
2061   }
2062   DP("AMDGPU plugin construction failed. Zero devices available\n");
2063   return 0;
2064 }
2065 
__tgt_rtl_init_requires(int64_t RequiresFlags)2066 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
2067   DP("Init requires flags to %ld\n", RequiresFlags);
2068   DeviceInfo().RequiresFlags = RequiresFlags;
2069   return RequiresFlags;
2070 }
2071 
__tgt_rtl_init_device(int DeviceId)2072 int32_t __tgt_rtl_init_device(int DeviceId) {
2073   hsa_status_t Err = hsa_init();
2074   if (Err != HSA_STATUS_SUCCESS) {
2075     DP("HSA Initialization Failed.\n");
2076     return HSA_STATUS_ERROR;
2077   }
2078   // this is per device id init
2079   DP("Initialize the device id: %d\n", DeviceId);
2080 
2081   hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2082 
2083   // Get number of Compute Unit
2084   uint32_t ComputeUnits = 0;
2085   Err = hsa_agent_get_info(
2086       Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2087       &ComputeUnits);
2088   if (Err != HSA_STATUS_SUCCESS) {
2089     DeviceInfo().ComputeUnits[DeviceId] = 1;
2090     DP("Error getting compute units: setting to 1\n");
2091   } else {
2092     DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
2093     DP("Using %d compute units per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
2094   }
2095 
2096   char GetInfoName[64]; // 64 is the max size returned by hsa_agent_get_info
2097   Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2098                            (void *)GetInfoName);
2099   if (Err)
2100     DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
2101   else {
2102     DeviceInfo().GPUName[DeviceId] = GetInfoName;
2103   }
2104 
2105   if (print_kernel_trace & STARTUP_DETAILS)
2106     DP("Device#%-2d CU's: %2d %s\n", DeviceId,
2107        DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str());
2108 
2109   // Query attributes to determine number of threads/block and blocks/grid.
2110   uint16_t WorkgroupMaxDim[3];
2111   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
2112                            &WorkgroupMaxDim);
2113   if (Err != HSA_STATUS_SUCCESS) {
2114     DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
2115     DP("Error getting grid dims: num groups : %d\n",
2116        RTLDeviceInfoTy::DefaultNumTeams);
2117   } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
2118     DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
2119     DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]);
2120   } else {
2121     DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
2122     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
2123        "at the hard limit\n",
2124        WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
2125   }
2126 
2127   // Get thread limit
2128   hsa_dim3_t GridMaxDim;
2129   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
2130   if (Err == HSA_STATUS_SUCCESS) {
2131     DeviceInfo().ThreadsPerGroup[DeviceId] =
2132         reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
2133         DeviceInfo().GroupsPerDevice[DeviceId];
2134 
2135     if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
2136       DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2137       DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2138     } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
2139                                  RTLDeviceInfoTy::MaxWgSize)) {
2140       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2141     } else {
2142       DP("Using ROCm Queried thread limit: %d\n",
2143          DeviceInfo().ThreadsPerGroup[DeviceId]);
2144     }
2145   } else {
2146     DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2147     DP("Error getting max block dimension, using default: %d\n",
2148        RTLDeviceInfoTy::MaxWgSize);
2149   }
2150 
2151   // Get wavefront size
2152   uint32_t WavefrontSize = 0;
2153   Err =
2154       hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
2155   if (Err == HSA_STATUS_SUCCESS) {
2156     DP("Queried wavefront size: %d\n", WavefrontSize);
2157     DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
2158   } else {
2159     // TODO: Burn the wavefront size into the code object
2160     DP("Warning: Unknown wavefront size, assuming 64\n");
2161     DeviceInfo().WarpSize[DeviceId] = 64;
2162   }
2163 
2164   // Adjust teams to the env variables
2165 
2166   if (DeviceInfo().Env.TeamLimit > 0 &&
2167       (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
2168                          DeviceInfo().Env.TeamLimit))) {
2169     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
2170        DeviceInfo().Env.TeamLimit);
2171   }
2172 
2173   // Set default number of teams
2174   if (DeviceInfo().Env.NumTeams > 0) {
2175     DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
2176     DP("Default number of teams set according to environment %d\n",
2177        DeviceInfo().Env.NumTeams);
2178   } else {
2179     char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
2180     int TeamsPerCU = DefaultTeamsPerCU;
2181     if (TeamsPerCUEnvStr) {
2182       TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
2183     }
2184 
2185     DeviceInfo().NumTeams[DeviceId] =
2186         TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
2187     DP("Default number of teams = %d * number of compute units %d\n",
2188        TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]);
2189   }
2190 
2191   if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
2192                         DeviceInfo().GroupsPerDevice[DeviceId])) {
2193     DP("Default number of teams exceeds device limit, capping at %d\n",
2194        DeviceInfo().GroupsPerDevice[DeviceId]);
2195   }
2196 
2197   // Adjust threads to the env variables
2198   if (DeviceInfo().Env.TeamThreadLimit > 0 &&
2199       (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2200                          DeviceInfo().Env.TeamThreadLimit))) {
2201     DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
2202        DeviceInfo().Env.TeamThreadLimit);
2203   }
2204 
2205   // Set default number of threads
2206   DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
2207   DP("Default number of threads set according to library's default %d\n",
2208      RTLDeviceInfoTy::DefaultWgSize);
2209   if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2210                         DeviceInfo().ThreadsPerGroup[DeviceId])) {
2211     DP("Default number of threads exceeds device limit, capping at %d\n",
2212        DeviceInfo().ThreadsPerGroup[DeviceId]);
2213   }
2214 
2215   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
2216      DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],
2217      DeviceInfo().ThreadsPerGroup[DeviceId]);
2218 
2219   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
2220      DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],
2221      DeviceInfo().GroupsPerDevice[DeviceId],
2222      DeviceInfo().GroupsPerDevice[DeviceId] *
2223          DeviceInfo().ThreadsPerGroup[DeviceId]);
2224 
2225   return OFFLOAD_SUCCESS;
2226 }
2227 
2228 static __tgt_target_table *
2229 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
2230 
__tgt_rtl_load_binary(int32_t DeviceId,__tgt_device_image * Image)2231 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
2232                                           __tgt_device_image *Image) {
2233   DeviceInfo().LoadRunLock.lock();
2234   __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
2235   DeviceInfo().LoadRunLock.unlock();
2236   return Res;
2237 }
2238 
__tgt_rtl_load_binary_locked(int32_t DeviceId,__tgt_device_image * Image)2239 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
2240                                                  __tgt_device_image *Image) {
2241   // This function loads the device image onto gpu[DeviceId] and does other
2242   // per-image initialization work. Specifically:
2243   //
2244   // - Initialize a DeviceEnvironmentTy instance embedded in the
2245   //   image at the symbol "omptarget_device_environment"
2246   //   Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
2247   //
2248   // - Allocate a large array per-gpu (could be moved to init_device)
2249   //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
2250   //   - Allocate at least that many bytes of gpu memory
2251   //   - Zero initialize it
2252   //   - Write the pointer to the symbol omptarget_nvptx_device_State
2253   //
2254   // - Pulls some per-kernel information together from various sources and
2255   //   records it in the KernelsList for quicker access later
2256   //
2257   // The initialization can be done before or after loading the image onto the
2258   // gpu. This function presently does a mixture. Using the hsa api to get/set
2259   // the information is simpler to implement, in exchange for more complicated
2260   // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
2261   // back from the gpu vs a hashtable lookup on the host.
2262 
2263   const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
2264 
2265   DeviceInfo().clearOffloadEntriesTable(DeviceId);
2266 
2267   // We do not need to set the ELF version because the caller of this function
2268   // had to do that to decide the right runtime to use
2269 
2270   if (!elfMachineIdIsAmdgcn(Image))
2271     return NULL;
2272 
2273   {
2274     auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
2275                                  DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
2276 
2277     auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
2278     auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
2279     hsa_status_t Err = moduleRegisterFromMemoryToPlace(
2280         KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
2281         [&](void *Data, size_t Size) {
2282           if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
2283             __atomic_store_n(&DeviceInfo().HostcallRequired, true,
2284                              __ATOMIC_RELEASE);
2285           }
2286           return Env.beforeLoading(Data, Size);
2287         },
2288         DeviceInfo().HSAExecutables);
2289 
2290     check("Module registering", Err);
2291     if (Err != HSA_STATUS_SUCCESS) {
2292       const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
2293       const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
2294 
2295       if (strcmp(DeviceName, ElfName) != 0) {
2296         DP("Possible gpu arch mismatch: device:%s, image:%s. Please check"
2297            " compiler flag: -march=<gpu>\n",
2298            DeviceName, ElfName);
2299       } else {
2300         DP("Error loading image onto GPU: %s\n", get_error_string(Err));
2301       }
2302 
2303       return NULL;
2304     }
2305 
2306     Err = Env.afterLoading();
2307     if (Err != HSA_STATUS_SUCCESS) {
2308       return NULL;
2309     }
2310   }
2311 
2312   DP("AMDGPU module successfully loaded!\n");
2313 
2314   {
2315     // the device_State array is either a large array in bss or a void* that
2316     // must be pointed at a freshly allocated array of device_state_bytes.
2317     // If absent, it has been deadstripped and needs no setup.
2318 
2319     void *StatePtr;
2320     uint32_t StatePtrSize;
2321     auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2322     hsa_status_t Err = interop_hsa_get_symbol_info(
2323         SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2324         &StatePtrSize);
2325 
2326     if (Err != HSA_STATUS_SUCCESS) {
2327       DP("No device_state symbol found, skipping initialization\n");
2328     } else {
2329       if (StatePtrSize < sizeof(void *)) {
2330         DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
2331            sizeof(void *));
2332         return NULL;
2333       }
2334 
2335       // if it's larger than a void*, assume it's a bss array and no further
2336       // initialization is required. Only try to set up a pointer for
2337       // sizeof(void*)
2338       if (StatePtrSize == sizeof(void *)) {
2339         uint64_t DeviceStateBytes =
2340             getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2341         if (DeviceStateBytes == 0) {
2342           DP("Can't initialize device_State, missing size information\n");
2343           return NULL;
2344         }
2345 
2346         auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2347         if (DSS.first.get() == nullptr) {
2348           assert(DSS.second == 0);
2349           void *Ptr = NULL;
2350           hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2351           if (Err != HSA_STATUS_SUCCESS) {
2352             DP("Failed to allocate device_state array\n");
2353             return NULL;
2354           }
2355           DSS = {
2356               std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2357               DeviceStateBytes,
2358           };
2359         }
2360 
2361         void *Ptr = DSS.first.get();
2362         if (DeviceStateBytes != DSS.second) {
2363           DP("Inconsistent sizes of device_State unsupported\n");
2364           return NULL;
2365         }
2366 
2367         // write ptr to device memory so it can be used by later kernels
2368         Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2369                                                    sizeof(void *), DeviceId);
2370         if (Err != HSA_STATUS_SUCCESS) {
2371           DP("memcpy install of state_ptr failed\n");
2372           return NULL;
2373         }
2374       }
2375     }
2376   }
2377 
2378   // Here, we take advantage of the data that is appended after img_end to get
2379   // the symbols' name we need to load. This data consists of the host entries
2380   // begin and end as well as the target name (see the offloading linker script
2381   // creation in clang compiler).
2382 
2383   // Find the symbols in the module by name. The name can be obtained by
2384   // concatenating the host entry name with the target name
2385 
2386   __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2387   __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2388 
2389   for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2390 
2391     if (!E->addr) {
2392       // The host should have always something in the address to
2393       // uniquely identify the target region.
2394       DP("Analyzing host entry '<null>' (size = %lld)...\n",
2395          (unsigned long long)E->size);
2396       return NULL;
2397     }
2398 
2399     if (E->size) {
2400       __tgt_offload_entry Entry = *E;
2401 
2402       void *Varptr;
2403       uint32_t Varsize;
2404 
2405       auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2406       hsa_status_t Err = interop_hsa_get_symbol_info(
2407           SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2408 
2409       if (Err != HSA_STATUS_SUCCESS) {
2410         // Inform the user what symbol prevented offloading
2411         DP("Loading global '%s' (Failed)\n", E->name);
2412         return NULL;
2413       }
2414 
2415       if (Varsize != E->size) {
2416         DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
2417            Varsize, E->size);
2418         return NULL;
2419       }
2420 
2421       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
2422          DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
2423       Entry.addr = (void *)Varptr;
2424 
2425       DeviceInfo().addOffloadEntry(DeviceId, Entry);
2426 
2427       if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2428           E->flags & OMP_DECLARE_TARGET_LINK) {
2429         // If unified memory is present any target link variables
2430         // can access host addresses directly. There is no longer a
2431         // need for device copies.
2432         Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *),
2433                                                  DeviceId);
2434         if (Err != HSA_STATUS_SUCCESS)
2435           DP("Error when copying USM\n");
2436         DP("Copy linked variable host address (" DPxMOD ")"
2437            "to device address (" DPxMOD ")\n",
2438            DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
2439       }
2440 
2441       continue;
2442     }
2443 
2444     DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name));
2445 
2446     // errors looking up kernarg_segment_size were previously treated as 0 (or undef)
2447     uint32_t KernargSegmentSize = 0;
2448     auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
2449     hsa_status_t Err = HSA_STATUS_SUCCESS;
2450     if (!E->name) {
2451       Err = HSA_STATUS_ERROR;
2452     } else {
2453       std::string KernelStr = std::string(E->name);
2454       auto It = KernelInfoMap.find(KernelStr);
2455       if (It != KernelInfoMap.end()) {
2456         atl_kernel_info_t Info = It->second;
2457         KernargSegmentSize = Info.kernel_segment_size;
2458       } else {
2459         Err = HSA_STATUS_ERROR;
2460       }
2461     }
2462 
2463     // default value GENERIC (in case the symbol is missing from the code object)
2464     llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2465         llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2466 
2467     // get flat group size if present, else Default_WG_Size
2468     int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2469 
2470     // get Kernel Descriptor if present.
2471     // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2472     struct KernDescValType {
2473       uint16_t Version;
2474       uint16_t TSize;
2475       uint16_t WGSize;
2476     };
2477     struct KernDescValType KernDescVal;
2478     std::string KernDescNameStr(E->name);
2479     KernDescNameStr += "_kern_desc";
2480     const char *KernDescName = KernDescNameStr.c_str();
2481 
2482     void *KernDescPtr;
2483     uint32_t KernDescSize;
2484     void *CallStackAddr = nullptr;
2485     Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2486                                &KernDescPtr, &KernDescSize);
2487 
2488     if (Err == HSA_STATUS_SUCCESS) {
2489       if ((size_t)KernDescSize != sizeof(KernDescVal))
2490         DP("Loading global computation properties '%s' - size mismatch (%u != "
2491            "%lu)\n",
2492            KernDescName, KernDescSize, sizeof(KernDescVal));
2493 
2494       memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2495 
2496       // Check structure size against recorded size.
2497       if ((size_t)KernDescSize != KernDescVal.TSize)
2498         DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
2499            sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
2500 
2501       DP("After loading global for %s KernDesc \n", KernDescName);
2502       DP("KernDesc: Version: %d\n", KernDescVal.Version);
2503       DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
2504       DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);
2505 
2506       if (KernDescVal.WGSize == 0) {
2507         KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
2508         DP("Setting KernDescVal.WGSize to default %d\n", KernDescVal.WGSize);
2509       }
2510       WGSizeVal = KernDescVal.WGSize;
2511       DP("WGSizeVal %d\n", WGSizeVal);
2512       check("Loading KernDesc computation property", Err);
2513     } else {
2514       DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
2515 
2516       // Flat group size
2517       std::string WGSizeNameStr(E->name);
2518       WGSizeNameStr += "_wg_size";
2519       const char *WGSizeName = WGSizeNameStr.c_str();
2520 
2521       void *WGSizePtr;
2522       uint32_t WGSize;
2523       Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
2524                                  &WGSizePtr, &WGSize);
2525 
2526       if (Err == HSA_STATUS_SUCCESS) {
2527         if ((size_t)WGSize != sizeof(int16_t)) {
2528           DP("Loading global computation properties '%s' - size mismatch (%u "
2529              "!= "
2530              "%lu)\n",
2531              WGSizeName, WGSize, sizeof(int16_t));
2532           return NULL;
2533         }
2534 
2535         memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
2536 
2537         DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
2538 
2539         if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
2540             WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
2541           DP("Error wrong WGSize value specified in HSA code object file: "
2542              "%d\n",
2543              WGSizeVal);
2544           WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2545         }
2546       } else {
2547         DP("Warning: Loading WGSize '%s' - symbol not found, "
2548            "using default value %d\n",
2549            WGSizeName, WGSizeVal);
2550       }
2551 
2552       check("Loading WGSize computation property", Err);
2553     }
2554 
2555     // Read execution mode from global in binary
2556     std::string ExecModeNameStr(E->name);
2557     ExecModeNameStr += "_exec_mode";
2558     const char *ExecModeName = ExecModeNameStr.c_str();
2559 
2560     void *ExecModePtr;
2561     uint32_t VarSize;
2562     Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
2563                                &ExecModePtr, &VarSize);
2564 
2565     if (Err == HSA_STATUS_SUCCESS) {
2566       if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
2567         DP("Loading global computation properties '%s' - size mismatch (%u != "
2568            "%lu)\n",
2569            ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
2570         return NULL;
2571       }
2572 
2573       memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
2574 
2575       DP("After loading global for %s ExecMode = %d\n", ExecModeName,
2576          ExecModeVal);
2577 
2578       if (ExecModeVal < 0 ||
2579           ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
2580         DP("Error wrong exec_mode value specified in HSA code object file: "
2581            "%d\n",
2582            ExecModeVal);
2583         return NULL;
2584       }
2585     } else {
2586       DP("Loading global exec_mode '%s' - symbol missing, using default "
2587          "value "
2588          "GENERIC (1)\n",
2589          ExecModeName);
2590     }
2591     check("Loading computation property", Err);
2592 
2593     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2594                                    CallStackAddr, E->name, KernargSegmentSize,
2595                                    DeviceInfo().KernArgPool));
2596     __tgt_offload_entry Entry = *E;
2597     Entry.addr = (void *)&KernelsList.back();
2598     DeviceInfo().addOffloadEntry(DeviceId, Entry);
2599     DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
2600   }
2601 
2602   return DeviceInfo().getOffloadEntriesTable(DeviceId);
2603 }
2604 
__tgt_rtl_data_alloc(int DeviceId,int64_t Size,void *,int32_t Kind)2605 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
2606   void *Ptr = NULL;
2607   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2608 
2609   if (Kind != TARGET_ALLOC_DEFAULT) {
2610     REPORT("Invalid target data allocation kind or requested allocator not "
2611            "implemented yet\n");
2612     return NULL;
2613   }
2614 
2615   hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
2616   hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
2617   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
2618      (long long unsigned)(Elf64_Addr)Ptr);
2619   Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
2620   return Ptr;
2621 }
2622 
__tgt_rtl_data_submit(int DeviceId,void * TgtPtr,void * HstPtr,int64_t Size)2623 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
2624                               int64_t Size) {
2625   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2626   __tgt_async_info AsyncInfo;
2627   int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
2628   if (Rc != OFFLOAD_SUCCESS)
2629     return OFFLOAD_FAIL;
2630 
2631   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2632 }
2633 
__tgt_rtl_data_submit_async(int DeviceId,void * TgtPtr,void * HstPtr,int64_t Size,__tgt_async_info * AsyncInfo)2634 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
2635                                     int64_t Size, __tgt_async_info *AsyncInfo) {
2636   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2637   if (AsyncInfo) {
2638     initAsyncInfo(AsyncInfo);
2639     return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
2640   }
2641   return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
2642 }
2643 
__tgt_rtl_data_retrieve(int DeviceId,void * HstPtr,void * TgtPtr,int64_t Size)2644 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
2645                                 int64_t Size) {
2646   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2647   __tgt_async_info AsyncInfo;
2648   int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
2649   if (Rc != OFFLOAD_SUCCESS)
2650     return OFFLOAD_FAIL;
2651 
2652   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2653 }
2654 
__tgt_rtl_data_retrieve_async(int DeviceId,void * HstPtr,void * TgtPtr,int64_t Size,__tgt_async_info * AsyncInfo)2655 int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
2656                                       int64_t Size,
2657                                       __tgt_async_info *AsyncInfo) {
2658   assert(AsyncInfo && "AsyncInfo is nullptr");
2659   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2660   initAsyncInfo(AsyncInfo);
2661   return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
2662 }
2663 
__tgt_rtl_data_delete(int DeviceId,void * TgtPtr)2664 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
2665   assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2666   hsa_status_t Err;
2667   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
2668   Err = core::Runtime::Memfree(TgtPtr);
2669   if (Err != HSA_STATUS_SUCCESS) {
2670     DP("Error when freeing HSA memory\n");
2671     return OFFLOAD_FAIL;
2672   }
2673   return OFFLOAD_SUCCESS;
2674 }
2675 
__tgt_rtl_run_target_team_region(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum,int32_t NumTeams,int32_t ThreadLimit,uint64_t LoopTripcount)2676 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
2677                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
2678                                          int32_t ArgNum, int32_t NumTeams,
2679                                          int32_t ThreadLimit,
2680                                          uint64_t LoopTripcount) {
2681 
2682   DeviceInfo().LoadRunLock.lock_shared();
2683   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2684                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2685 
2686   DeviceInfo().LoadRunLock.unlock_shared();
2687   return Res;
2688 }
2689 
__tgt_rtl_run_target_region(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum)2690 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
2691                                     void **TgtArgs, ptrdiff_t *TgtOffsets,
2692                                     int32_t ArgNum) {
2693   // use one team and one thread
2694   // fix thread num
2695   int32_t TeamNum = 1;
2696   int32_t ThreadLimit = 0; // use default
2697   return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
2698                                           TgtOffsets, ArgNum, TeamNum,
2699                                           ThreadLimit, 0);
2700 }
2701 
__tgt_rtl_run_target_team_region_async(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum,int32_t NumTeams,int32_t ThreadLimit,uint64_t LoopTripcount,__tgt_async_info * AsyncInfo)2702 int32_t __tgt_rtl_run_target_team_region_async(
2703     int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
2704     int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
2705     uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
2706   assert(AsyncInfo && "AsyncInfo is nullptr");
2707   initAsyncInfo(AsyncInfo);
2708 
2709   DeviceInfo().LoadRunLock.lock_shared();
2710   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2711                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2712 
2713   DeviceInfo().LoadRunLock.unlock_shared();
2714   return Res;
2715 }
2716 
__tgt_rtl_run_target_region_async(int32_t DeviceId,void * TgtEntryPtr,void ** TgtArgs,ptrdiff_t * TgtOffsets,int32_t ArgNum,__tgt_async_info * AsyncInfo)2717 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
2718                                           void **TgtArgs, ptrdiff_t *TgtOffsets,
2719                                           int32_t ArgNum,
2720                                           __tgt_async_info *AsyncInfo) {
2721   // use one team and one thread
2722   // fix thread num
2723   int32_t TeamNum = 1;
2724   int32_t ThreadLimit = 0; // use default
2725   return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
2726                                                 TgtOffsets, ArgNum, TeamNum,
2727                                                 ThreadLimit, 0, AsyncInfo);
2728 }
2729 
__tgt_rtl_synchronize(int32_t DeviceId,__tgt_async_info * AsyncInfo)2730 int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
2731   assert(AsyncInfo && "AsyncInfo is nullptr");
2732 
2733   // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant
2734   // is not ensured by devices.cpp for amdgcn
2735   // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2736   if (AsyncInfo->Queue) {
2737     finiAsyncInfo(AsyncInfo);
2738   }
2739   return OFFLOAD_SUCCESS;
2740 }
2741 
__tgt_rtl_print_device_info(int32_t DeviceId)2742 void __tgt_rtl_print_device_info(int32_t DeviceId) {
2743   // TODO: Assertion to see if DeviceId is correct
2744   // NOTE: We don't need to set context for print device info.
2745 
2746   DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
2747 }
2748 
2749 } // extern "C"
2750