//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for AMD HSA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <assert.h>
#include <atomic>
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <libelf.h>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <pthread.h>
#include <queue>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "impl_runtime.h"
#include "interop_hsa.h"

#include "internal.h"
#include "rt.h"

#include "DeviceEnvironment.h"
#include "get_elf_mach_gfx_name.h"
#include "omptargetplugin.h"
#include "print_tracing.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"

// hostrpc interface. FIXME: consider moving to its own include file. These are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
// linked as --whole-archive to override the weak symbols that are used to
// implement a fallback for toolchains that do not yet have a hostrpc library.
extern "C" {
uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
                               uint32_t DeviceId);
hsa_status_t hostrpc_init();
hsa_status_t hostrpc_terminate();

__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
  return HSA_STATUS_SUCCESS;
}
__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
                                                     uint32_t DeviceId) {
  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
     "missing\n",
     DeviceId);
  return 0;
}
}

// Heuristic parameters used for kernel launch
// Number of teams per CU to allow scheduling flexibility
static const unsigned DefaultTeamsPerCU = 4;

int print_kernel_trace;

#ifdef OMPTARGET_DEBUG
#define check(msg, status)                                                     \
  if (status != HSA_STATUS_SUCCESS) {                                          \
    DP(#msg " failed\n");                                                      \
  } else {                                                                     \
    DP(#msg " succeeded\n");                                                   \
  }
#else
#define check(msg, status)                                                     \
  {}
#endif

#include "elf_common.h"

namespace hsa {
template <typename C> hsa_status_t iterate_agents(C Cb) {
  auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(Agent);
  };
  return hsa_iterate_agents(L, static_cast<void *>(&Cb));
}

template <typename C>
hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
  auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(MemoryPool);
  };

  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
}

} // namespace hsa
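
// Illustrative usage (not part of the plugin): the wrappers above let a
// capturing lambda be passed where the HSA C API expects a function pointer
// plus a void* payload, e.g.
//   std::vector<hsa_agent_t> Agents;
//   hsa::iterate_agents([&](hsa_agent_t A) {
//     Agents.push_back(A);
//     return HSA_STATUS_SUCCESS;
//   });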

/// Keep entries table per device
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

struct KernelArgPool {
private:
  static pthread_mutex_t Mutex;

public:
  uint32_t KernargSegmentSize;
  void *KernargRegion = nullptr;
  std::queue<int> FreeKernargSegments;

  uint32_t kernargSizeIncludingImplicit() {
    return KernargSegmentSize + sizeof(impl_implicit_args_t);
  }

  ~KernelArgPool() {
    if (KernargRegion) {
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
      : KernargSegmentSize(KernargSegmentSize) {

    // impl uses one pool per kernel for all GPUs, with a fixed upper size.
    // Preserving that exact scheme here, including the queue<int>.

    hsa_status_t Err = hsa_amd_memory_pool_allocate(
        MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
        &KernargRegion);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
      KernargRegion = nullptr; // paranoid
      return;
    }

    Err = core::allow_access_to_all_gpu_agents(KernargRegion);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(Err));
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(R));
      }
      KernargRegion = nullptr;
      return;
    }

    for (int I = 0; I < MAX_NUM_KERNELS; I++) {
      FreeKernargSegments.push(I);
    }
  }

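  // Hand out one fixed-size kernarg slot, or nullptr if all MAX_NUM_KERNELS
  // slots are currently in use. Slot I begins at
  // KernargRegion + I * kernargSizeIncludingImplicit().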
  void *allocate(uint64_t ArgNum) {
    assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
    Lock L(&Mutex);
    void *Res = nullptr;
    if (!FreeKernargSegments.empty()) {

      int FreeIdx = FreeKernargSegments.front();
      Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
                                (FreeIdx * kernargSizeIncludingImplicit()));
      assert(FreeIdx == pointerToIndex(Res));
      FreeKernargSegments.pop();
    }
    return Res;
  }

  void deallocate(void *Ptr) {
    Lock L(&Mutex);
    int Idx = pointerToIndex(Ptr);
    FreeKernargSegments.push(Idx);
  }

private:
  int pointerToIndex(void *Ptr) {
    ptrdiff_t Bytes =
        static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
    assert(Bytes >= 0);
    assert(Bytes % kernargSizeIncludingImplicit() == 0);
    return Bytes / kernargSizeIncludingImplicit();
  }
  struct Lock {
    Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
    ~Lock() { pthread_mutex_unlock(M); }
    pthread_mutex_t *M;
  };
};
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
  int16_t ConstWGSize;
  int32_t DeviceId;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
           int32_t DeviceId, void *CallStackAddr, const char *Name,
           uint32_t KernargSegmentSize,
           hsa_amd_memory_pool_t &KernArgMemoryPool)
      : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
        DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
    DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);

    std::string N(Name);
    if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
      KernelArgPoolMap.insert(
          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
                                KernargSegmentSize, KernArgMemoryPool))));
    }
  }
};
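
// Note: KernelArgPoolMap is keyed by kernel name, so the per-device KernelTy
// instances for the same kernel share a single argument pool, matching the
// "one pool per kernel for all GPUs" scheme noted in KernelArgPool.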

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

template <typename Callback> static hsa_status_t findAgents(Callback CB) {

  hsa_status_t Err =
      hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
        hsa_device_type_t DeviceType;
        // get_info fails iff HSA runtime not yet initialized
        hsa_status_t Err =
            hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);

        if (Err != HSA_STATUS_SUCCESS) {
          if (print_kernel_trace > 0)
            DP("rtl.cpp: err %s\n", get_error_string(Err));

          return Err;
        }

        CB(DeviceType, Agent);
        return HSA_STATUS_SUCCESS;
      });

  // iterate_agents fails iff HSA runtime not yet initialized
  if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
    DP("rtl.cpp: err %s\n", get_error_string(Err));
  }

  return Err;
}

static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
                          void *Data) {
  if (Status != HSA_STATUS_SUCCESS) {
    const char *StatusString;
    if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
      StatusString = "unavailable";
    }
    DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
       Status, StatusString);
    abort();
  }
}

namespace core {
namespace {

bool checkResult(hsa_status_t Err, const char *ErrMsg) {
  if (Err == HSA_STATUS_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  REPORT("%s", get_error_string(Err));
  return false;
}

void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
  __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
}
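
// The 16-bit header and 16-bit setup fields are adjacent in the dispatch
// packet, so the single 32-bit atomic release store above publishes both at
// once; release ordering guarantees the rest of the packet is visible to the
// packet processor before the header stops reading as INVALID.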

uint16_t createHeader() {
  uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
  return Header;
}

hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
  bool AllocAllowed = false;
  hsa_status_t Err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &AllocAllowed);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Alloc allowed in memory pool check failed: %s\n",
       get_error_string(Err));
    return Err;
  }

  size_t Size = 0;
  Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                     &Size);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Get memory pool size failed: %s\n", get_error_string(Err));
    return Err;
  }

  return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}

hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
  std::vector<hsa_amd_memory_pool_t> *Result =
      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);

  hsa_status_t Err;
  if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
    return Err;
  }

  Result->push_back(MemoryPool);
  return HSA_STATUS_SUCCESS;
}

} // namespace
} // namespace core

struct EnvironmentVariables {
  int NumTeams;
  int TeamLimit;
  int TeamThreadLimit;
  int MaxTeamsDefault;
  int DynamicMemSize;
};

template <uint32_t wavesize>
static constexpr const llvm::omp::GV &getGridValue() {
  return llvm::omp::getAMDGPUGridValues<wavesize>();
}

struct HSALifetime {
  // Wrapper around HSA used to ensure it is constructed before other types
  // and destructed after, which means those other types can use RAII for
  // cleanup without risking running outside of the lifetime of HSA.
  const hsa_status_t S;

  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
  HSALifetime() : S(hsa_init()) {}

  ~HSALifetime() {
    if (S == HSA_STATUS_SUCCESS) {
      hsa_status_t Err = hsa_shut_down();
      if (Err != HSA_STATUS_SUCCESS) {
        // Can't call into HSA to get a string from the integer
        DP("Shutting down HSA failed: %d\n", Err);
      }
    }
  }
};

// Handle scheduling of multiple hsa_queue's per device to
// multiple threads (one scheduler per device)
class HSAQueueScheduler {
public:
  HSAQueueScheduler() : Current(0) {}

  HSAQueueScheduler(const HSAQueueScheduler &) = delete;

  HSAQueueScheduler(HSAQueueScheduler &&Q) {
    Current = Q.Current.load();
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      HSAQueues[I] = Q.HSAQueues[I];
      Q.HSAQueues[I] = nullptr;
    }
  }

  // \return false if any HSA queue creation fails
  bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      hsa_queue_t *Q = nullptr;
      hsa_status_t Rc =
          hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
      if (Rc != HSA_STATUS_SUCCESS) {
        DP("Failed to create HSA queue %d\n", I);
        return false;
      }
      HSAQueues[I] = Q;
    }
    return true;
  }

  ~HSAQueueScheduler() {
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      if (HSAQueues[I]) {
        hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
        if (Err != HSA_STATUS_SUCCESS)
          DP("Error destroying HSA queue\n");
      }
    }
  }

  // \return next queue to use for device
  hsa_queue_t *next() {
    return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
                     NUM_QUEUES_PER_DEVICE];
  }

private:
  // Number of queues per device
  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
  std::atomic<uint8_t> Current;
};
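
// next() spreads dispatches round-robin over NUM_QUEUES_PER_DEVICE queues via
// a relaxed fetch_add; relaxed suffices because the counter only load-balances
// and imposes no ordering between dispatches on different queues.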

/// Class containing all the device information
class RTLDeviceInfoTy : HSALifetime {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

  struct QueueDeleter {
    void operator()(hsa_queue_t *Q) {
      if (Q) {
        hsa_status_t Err = hsa_queue_destroy(Q);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("Error destroying hsa queue: %s\n", get_error_string(Err));
        }
      }
    }
  };

public:
  bool ConstructionSucceeded = false;

  // Loading a binary populates symbol tables and mutates various global
  // state; running a kernel uses those symbol tables
  std::shared_timed_mutex LoadRunLock;

  int NumberOfDevices = 0;

  // GPU devices
  std::vector<hsa_agent_t> HSAAgents;
  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu

  // CPUs
  std::vector<hsa_agent_t> CPUAgents;

  // Device properties
  std::vector<int> ComputeUnits;
  std::vector<int> GroupsPerDevice;
  std::vector<int> ThreadsPerGroup;
  std::vector<int> WarpSize;
  std::vector<std::string> GPUName;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  EnvironmentVariables Env;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  // Resource pools
  SignalPoolT FreeSignalPool;

  bool HostcallRequired = false;

  std::vector<hsa_executable_t> HSAExecutables;

  std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
  std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

  hsa_amd_memory_pool_t KernArgPool;

  // fine grained memory pool for host allocations
  hsa_amd_memory_pool_t HostFineGrainedMemoryPool;

  // fine and coarse-grained memory pools per offloading device
  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;

  struct ImplFreePtrDeletor {
    void operator()(void *P) {
      core::Runtime::Memfree(P); // ignore failure to free
    }
  };

  // device_State shared across loaded binaries, error if inconsistent size
  std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
      DeviceStateStore;

  static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K - 1, so it fits in a uint16_t
  static const int DefaultNumTeams = 128;

  // These need to be per-device since different devices can have different
  // wave sizes, but are currently the same number for each so that refactor
  // can be postponed.
  static_assert(getGridValue<32>().GV_Max_Teams ==
                    getGridValue<64>().GV_Max_Teams,
                "");
  static const int MaxTeams = getGridValue<64>().GV_Max_Teams;

  static_assert(getGridValue<32>().GV_Max_WG_Size ==
                    getGridValue<64>().GV_Max_WG_Size,
                "");
  static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;

  static_assert(getGridValue<32>().GV_Default_WG_Size ==
                    getGridValue<64>().GV_Default_WG_Size,
                "");
  static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;

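  // The memcpy helpers below borrow a completion signal from FreeSignalPool
  // for the duration of a single copy and return it afterwards, avoiding an
  // HSA signal create/destroy pair per transfer.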
  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
                                      hsa_agent_t, hsa_amd_memory_pool_t);
  hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
                                    MemcpyFunc Func, int32_t DeviceId) {
    hsa_agent_t Agent = HSAAgents[DeviceId];
    hsa_signal_t S = FreeSignalPool.pop();
    if (S.handle == 0) {
      return HSA_STATUS_ERROR;
    }
    hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
    FreeSignalPool.push(S);
    return R;
  }

  hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
                                       int32_t DeviceId) {
    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
  }

  hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
                                       int32_t DeviceId) {
    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
  }

  static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
    char TmpChar[1000];
    uint16_t Major, Minor;
    uint32_t TmpUInt;
    uint32_t TmpUInt2;
    uint32_t CacheSize[4];
    bool TmpBool;
    uint16_t WorkgroupMaxDim[3];
    hsa_dim3_t GridMaxDim;

    // Getting basic information about HSA and Device
    core::checkResult(
        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
        "Error from hsa_system_get_info when obtaining "
        "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
    core::checkResult(
        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
        "Error from hsa_system_get_info when obtaining "
        "HSA_SYSTEM_INFO_VERSION_MINOR\n");
    printf("    HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
    printf("    HSA OpenMP Device Number: \t\t%d \n", DeviceId);
    core::checkResult(
        hsa_agent_get_info(
            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
    printf("    Product Name: \t\t\t%s \n", TmpChar);
    core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AGENT_INFO_NAME\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_VENDOR_NAME\n");
    printf("    Vendor Name: \t\t\t%s \n", TmpChar);
    hsa_device_type_t DevType;
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_DEVICE\n");
    printf("    Device Type: \t\t\t%s \n",
           DevType == HSA_DEVICE_TYPE_CPU
               ? "CPU"
               : (DevType == HSA_DEVICE_TYPE_GPU
                      ? "GPU"
                      : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_QUEUES_MAX\n");
    printf("    Max Queues: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
    printf("    Queue Min Size: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
    printf("    Queue Max Size: \t\t\t%u \n", TmpUInt);

    // Getting cache information
    printf("    Cache:\n");

    // FIXME: This is deprecated according to HSA documentation. But using
    // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
    // runtime.
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_CACHE_SIZE\n");

    for (int I = 0; I < 4; I++) {
      if (CacheSize[I]) {
        printf("      L%d: \t\t\t\t%u bytes\n", I, CacheSize[I]);
      }
    }

    core::checkResult(
        hsa_agent_get_info(Agent,
                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
                           &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
    printf("    Cacheline Size: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(
            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
            &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
    printf("    Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(
            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
            &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
    printf("    Compute Units: \t\t\t%u \n", TmpUInt);
    core::checkResult(hsa_agent_get_info(
                          Agent,
                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
                          &TmpUInt),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
    printf("    SIMD per CU: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_FAST_F16_OPERATION\n");
    printf("    Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
    printf("    Wavefront Size: \t\t\t%u \n", TmpUInt2);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
    printf("    Workgroup Max Size: \t\t%u \n", TmpUInt);
    core::checkResult(hsa_agent_get_info(Agent,
                                         HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                                         WorkgroupMaxDim),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
    printf("    Workgroup Max Size per Dimension:\n");
    printf("      x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
    printf("      y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
    printf("      z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
    core::checkResult(hsa_agent_get_info(
                          Agent,
                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
                          &TmpUInt),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
    printf("    Max Waves Per CU: \t\t\t%u \n", TmpUInt);
    printf("    Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
    printf("    Grid Max Size: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_GRID_MAX_DIM\n");
    printf("    Grid Max Size per Dimension: \t\t\n");
    printf("      x: \t\t\t\t%u\n", GridMaxDim.x);
    printf("      y: \t\t\t\t%u\n", GridMaxDim.y);
    printf("      z: \t\t\t\t%u\n", GridMaxDim.z);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
    printf("    Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);

    printf("    Memory Pools:\n");
    auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
      std::string TmpStr;
      size_t Size;
      bool Alloc, Access;
      hsa_amd_segment_t Segment;
      hsa_amd_memory_pool_global_flag_t GlobalFlags;
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
      core::checkResult(hsa_amd_memory_pool_get_info(
                            Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
                        "Error returned from hsa_amd_memory_pool_get_info when "
                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");

      switch (Segment) {
      case HSA_AMD_SEGMENT_GLOBAL:
        TmpStr = "GLOBAL; FLAGS: ";
        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
          TmpStr += "KERNARG, ";
        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
          TmpStr += "FINE GRAINED, ";
        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
          TmpStr += "COARSE GRAINED, ";
        break;
      case HSA_AMD_SEGMENT_READONLY:
        TmpStr = "READONLY";
        break;
      case HSA_AMD_SEGMENT_PRIVATE:
        TmpStr = "PRIVATE";
        break;
      case HSA_AMD_SEGMENT_GROUP:
        TmpStr = "GROUP";
        break;
      }
      printf("      Pool %s: \n", TmpStr.c_str());

      core::checkResult(hsa_amd_memory_pool_get_info(
                            Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
                        "Error returned from hsa_amd_memory_pool_get_info when "
                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
      printf("        Size: \t\t\t\t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
      printf("        Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
      printf("        Runtime Alloc Granule: \t\t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
      printf("        Runtime Alloc alignment: \t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
      printf("        Accessible by all: \t\t %s\n",
             (Access ? "TRUE" : "FALSE"));

      return HSA_STATUS_SUCCESS;
    };
    // Iterate over all the memory regions for this agent. Get the memory region
    // type and size
    hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);

    printf("    ISAs:\n");
    auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
      char TmpChar[1000];
      core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
                        "Error returned from hsa_isa_get_info_alt when "
                        "obtaining HSA_ISA_INFO_NAME\n");
      printf("        Name: \t\t\t\t %s\n", TmpChar);

      return HSA_STATUS_SUCCESS;
    };
    // Iterate over all the ISAs supported by this agent and print each name
    hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
  }

  // Record entry point associated with device
  void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    E.Entries.push_back(Entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    for (auto &It : E.Entries) {
      if (It.addr == Addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    int32_t Size = E.Entries.size();

    // Table is empty
    if (!Size)
      return 0;

    __tgt_offload_entry *Begin = &E.Entries[0];
    __tgt_offload_entry *End = &E.Entries[Size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = Begin;
    E.Table.EntriesEnd = ++End;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[DeviceId].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
    // KernelArgPoolMap.clear();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                                   unsigned int DeviceId) {
    assert(DeviceId < DeviceFineGrainedMemoryPools.size() &&
           "Device Id out of range");
    uint32_t GlobalFlags = 0;
    hsa_status_t Err = hsa_amd_memory_pool_get_info(
        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

    if (Err != HSA_STATUS_SUCCESS) {
      return Err;
    }

    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
    }

    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
    for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
      hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
          Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
            hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
            if (ValidStatus != HSA_STATUS_SUCCESS) {
              DP("Alloc allowed in memory pool check failed: %s\n",
                 get_error_string(ValidStatus));
              return HSA_STATUS_SUCCESS;
            }
            return addDeviceMemoryPool(MemoryPool, DeviceId);
          });

      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Iterate all memory pools", get_error_string(Err));
        return Err;
      }
    }
    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
    std::vector<hsa_amd_memory_pool_t> HostPools;

    // collect all the "valid" pools for all the given agents.
    for (const auto &Agent : Agents) {
      hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
          Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
      if (Err != HSA_STATUS_SUCCESS) {
        DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
      }
    }

    // We need two fine-grained pools.
    //  1. One with kernarg flag set for storing kernel arguments
    //  2. Second for host allocations
    bool FineGrainedMemoryPoolSet = false;
    bool KernArgPoolSet = false;
    for (const auto &MemoryPool : HostPools) {
      hsa_status_t Err = HSA_STATUS_SUCCESS;
      uint32_t GlobalFlags = 0;
      Err = hsa_amd_memory_pool_get_info(
          MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
      if (Err != HSA_STATUS_SUCCESS) {
        DP("Get memory pool info failed: %s\n", get_error_string(Err));
        return Err;
      }

      if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
        if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
          KernArgPool = MemoryPool;
          KernArgPoolSet = true;
        }
        HostFineGrainedMemoryPool = MemoryPool;
        FineGrainedMemoryPoolSet = true;
      }
    }

    if (FineGrainedMemoryPoolSet && KernArgPoolSet)
      return HSA_STATUS_SUCCESS;

    return HSA_STATUS_ERROR;
  }

  hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
    assert(DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
           "Invalid device Id");
    return DeviceCoarseGrainedMemoryPools[DeviceId];
  }

  hsa_amd_memory_pool_t getHostMemoryPool() {
    return HostFineGrainedMemoryPool;
  }

  static int readEnv(const char *Env, int Default = -1) {
    const char *EnvStr = getenv(Env);
    int Res = Default;
    if (EnvStr) {
      Res = std::stoi(EnvStr);
      DP("Parsed %s=%d\n", Env, Res);
    }
    return Res;
  }

  RTLDeviceInfoTy() {
    DP("Start initializing " GETNAME(TARGET_NAME) "\n");

    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at
    // runtime; it does not require a debug build of the library.
    //  0 => no tracing
    //  1 => trace dispatches only
    // >1 => increasing verbosity

    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }

    if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
      print_kernel_trace = atoi(EnvStr);
    else
      print_kernel_trace = 0;

    hsa_status_t Err = core::atl_init_gpu_context();
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
      return;
    }

    // Init hostcall soon after initializing hsa
    hostrpc_init();

    Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
      if (DeviceType == HSA_DEVICE_TYPE_CPU) {
        CPUAgents.push_back(Agent);
      } else {
        HSAAgents.push_back(Agent);
      }
    });
    if (Err != HSA_STATUS_SUCCESS)
      return;

    NumberOfDevices = (int)HSAAgents.size();

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting HSA.\n");
      return;
    }
    DP("There are %d devices supporting HSA.\n", NumberOfDevices);

    // Init the device info
    HSAQueueSchedulers.reserve(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
    GPUName.resize(NumberOfDevices);
    GroupsPerDevice.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);
    DeviceStateStore.resize(NumberOfDevices);
    KernelInfoTable.resize(NumberOfDevices);
    SymbolInfoTable.resize(NumberOfDevices);
    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);

    Err = setupDevicePools(HSAAgents);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Setup for Device Memory Pools failed\n");
      return;
    }

    Err = setupHostMemoryPools(CPUAgents);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Setup for Host Memory Pools failed\n");
      return;
    }

    for (int I = 0; I < NumberOfDevices; I++) {
      uint32_t QueueSize = 0;
      {
        hsa_status_t Err = hsa_agent_get_info(
            HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
          return;
        }
        enum { MaxQueueSize = 4096 };
        if (QueueSize > MaxQueueSize) {
          QueueSize = MaxQueueSize;
        }
      }

      {
        HSAQueueScheduler QSched;
        if (!QSched.createQueues(HSAAgents[I], QueueSize))
          return;
        HSAQueueSchedulers.emplace_back(std::move(QSched));
      }

      DeviceStateStore[I] = {nullptr, 0};
    }

    for (int I = 0; I < NumberOfDevices; I++) {
      ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
      GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
      ComputeUnits[I] = 1;
      DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
         GroupsPerDevice[I], ThreadsPerGroup[I]);
    }

    // Get environment variables regarding teams
    Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
    Env.NumTeams = readEnv("OMP_NUM_TEAMS");
    Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
    Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
    Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;

    ConstructionSucceeded = true;
  }

  ~RTLDeviceInfoTy() {
    DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
    if (!HSAInitSuccess()) {
      // Then none of these can have been set up and they can't be torn down
      return;
    }
    // Run destructors on types that use HSA before
    // impl_finalize removes access to it
    DeviceStateStore.clear();
    KernelArgPoolMap.clear();
    // Terminate hostrpc before finalizing hsa
    hostrpc_terminate();

    hsa_status_t Err;
    for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
      Err = hsa_executable_destroy(HSAExecutables[I]);
      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Destroying executable", get_error_string(Err));
      }
    }
  }
};

pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;

static RTLDeviceInfoTy DeviceInfo;
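
// Note: DeviceInfo is constructed at plugin load time; because RTLDeviceInfoTy
// derives from HSALifetime, hsa_init has run before any other member is
// constructed and hsa_shut_down runs only after every member is destroyed.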

namespace {

int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                     __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not copying back to host from target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;
  hsa_status_t Err;
  DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);

  Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
                                           DeviceId);

  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);
  return OFFLOAD_SUCCESS;
}

int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
                   __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  hsa_status_t Err;
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not doing host to target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;

  DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)HstPtr,
     (long long unsigned)(Elf64_Addr)TgtPtr);
  Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
                                           DeviceId);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Async.
// The implementation was written with CUDA streams in mind. The semantics of
// those are to execute kernels on a queue in order of insertion. A synchronize
// call then makes writes visible between host and device. This means a series
// of N data_submit_async calls are expected to execute serially. HSA offers
// various options to run the data copies concurrently. This may require
// changes to libomptarget.

// __tgt_async_info* contains a void *Queue. Queue = 0 is used to indicate that
// there are no outstanding kernels that need to be synchronized. Any async
// call may be passed a Queue == 0, at which point the CUDA implementation will
// set it to non-null (see getStream). The CUDA streams are per-device.
// Upstream may change this interface to explicitly initialize the AsyncInfo
// pointer, but until then HSA lazily initializes it as well.

void initAsyncInfo(__tgt_async_info *AsyncInfo) {
  // set non-null while using async calls, return to null to indicate completion
  assert(AsyncInfo);
  if (!AsyncInfo->Queue) {
    AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
  }
}
void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
  assert(AsyncInfo);
  assert(AsyncInfo->Queue);
  AsyncInfo->Queue = 0;
}
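
// Note: unlike the CUDA plugin, Queue never points at a real stream here; the
// UINT64_MAX sentinel only marks that work is outstanding until finiAsyncInfo
// resets it to null.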

// Determine launch values for kernel.
struct LaunchVals {
  int WorkgroupSize;
  int GridSize;
};
LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
                         int ConstWGSize,
                         llvm::omp::OMPTgtExecModeFlags ExecutionMode,
                         int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
                         int DeviceNumTeams) {

  int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
  int NumGroups = 0;

  int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
  if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
    MaxTeams = RTLDeviceInfoTy::HardTeamLimit;

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
    DP("Max_Teams: %d\n", MaxTeams);
    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
       RTLDeviceInfoTy::DefaultWgSize);
    DP("thread_limit: %d\n", ThreadLimit);
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
    DP("ConstWGSize: %d\n", ConstWGSize);
  }
  // check for thread_limit() clause
  if (ThreadLimit > 0) {
    ThreadsPerGroup = ThreadLimit;
    DP("Setting threads per block to requested %d\n", ThreadLimit);
    // Add master warp for GENERIC
    if (ExecutionMode ==
        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
      ThreadsPerGroup += WarpSize;
      DP("Adding master wavefront: +%d threads\n", WarpSize);
    }
    if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
      ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
      DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
    }
  }
  // check flat_max_work_group_size attr here
  if (ThreadsPerGroup > ConstWGSize) {
    ThreadsPerGroup = ConstWGSize;
    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
       ThreadsPerGroup);
  }
  if (print_kernel_trace & STARTUP_DETAILS)
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
  DP("Preparing %d threads\n", ThreadsPerGroup);

  // Set default num_groups (teams)
  if (Env.TeamLimit > 0)
    NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
  else
    NumGroups = MaxTeams;
  DP("Set default num of groups %d\n", NumGroups);

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("num_teams: %d\n", NumTeams);
  }

  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
  // This reduction is typical for the default case (no thread_limit clause) or
  // when the user requests an extreme value with the num_teams clause.
  // FIXME: We can't distinguish between a constant and a variable thread
  // limit, so we only handle constant thread_limits.
  if (ThreadsPerGroup >
      RTLDeviceInfoTy::DefaultWgSize) //  256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to nearest WarpSize
    // here?
    NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;

  // check for num_teams() clause
  if (NumTeams > 0) {
    NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("Env.NumTeams %d\n", Env.NumTeams);
    DP("Env.TeamLimit %d\n", Env.TeamLimit);
  }

  if (Env.NumTeams > 0) {
    NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
  } else if (Env.TeamLimit > 0) {
    NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
  } else {
    if (NumTeams <= 0) {
      if (LoopTripcount > 0) {
        if (ExecutionMode ==
            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
          // round up to the nearest integer
          NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
        } else if (ExecutionMode ==
                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
          NumGroups = LoopTripcount;
        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
          // This is a generic kernel that was transformed to use SPMD-mode
          // execution but uses Generic-mode semantics for scheduling.
          NumGroups = LoopTripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           NumGroups, LoopTripcount, ThreadsPerGroup);
      }
    } else {
      NumGroups = NumTeams;
    }
    if (NumGroups > MaxTeams) {
      NumGroups = MaxTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
    }
    if (NumGroups > NumTeams && NumTeams > 0) {
      NumGroups = NumTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
           NumTeams);
    }
  }

  // num_teams clause always honored, no matter what, unless DEFAULT is active.
  if (NumTeams > 0) {
    NumGroups = NumTeams;
    // Cap num_groups to EnvMaxTeamsDefault if set.
    if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
      NumGroups = Env.MaxTeamsDefault;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
    DP("num_groups: %d\n", NumGroups);
    DP("loop_tripcount: %" PRIu64 "\n", LoopTripcount);
  }
  DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
     ThreadsPerGroup);

  LaunchVals Res;
  Res.WorkgroupSize = ThreadsPerGroup;
  Res.GridSize = ThreadsPerGroup * NumGroups;
  return Res;
}
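
// Worked example (illustrative, assuming no relevant environment variables,
// no num_teams clause, and a flat work-group-size attribute of at least 512):
// an SPMD kernel with thread_limit(512) and a loop trip count of 8192 gets
// ThreadsPerGroup = 512 and NumGroups = (8192 - 1) / 512 + 1 = 16, so the
// dispatch uses workgroup_size_x = 512 and grid_size_x = 16 * 512 = 8192.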

static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
  uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
  bool Full = true;
  while (Full) {
    Full =
        PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
  }
  return PacketId;
}
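
// The queue is full while PacketId - read_index >= Queue->size, so the loop
// above busy-waits until the packet processor has retired enough packets to
// free the slot at PacketId.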

int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                        ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                        int32_t ThreadLimit, uint64_t LoopTripcount) {
  // Set the context we are using
  // update thread limit content in gpu memory if un-initialized or specified
  // from host

  DP("Run target team region thread_limit %d\n", ThreadLimit);

  // All args are references.
  std::vector<void *> Args(ArgNum);
  std::vector<void *> Ptrs(ArgNum);

  DP("Arg_num: %d\n", ArgNum);
  for (int32_t I = 0; I < ArgNum; ++I) {
    Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
    Args[I] = &Ptrs[I];
    DP("Offset base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
1370   }
1371 
1372   KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1373 
1374   std::string KernelName = std::string(KernelInfo->Name);
1375   auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
1376   if (KernelInfoTable[DeviceId].find(KernelName) ==
1377       KernelInfoTable[DeviceId].end()) {
1378     DP("Kernel %s not found\n", KernelName.c_str());
1379     return OFFLOAD_FAIL;
1380   }
1381 
1382   const atl_kernel_info_t KernelInfoEntry =
1383       KernelInfoTable[DeviceId][KernelName];
1384   const uint32_t GroupSegmentSize =
1385       KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize;
  const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
  const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
  const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
  const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;

  assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);

  /*
   * Set limit based on ThreadsPerGroup and GroupsPerDevice
   */
  LaunchVals LV =
      getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env,
                    KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
                    NumTeams,      // From run_region arg
                    ThreadLimit,   // From run_region arg
                    LoopTripcount, // From run_region arg
                    DeviceInfo.NumTeams[KernelInfo->DeviceId]);
  const int GridSize = LV.GridSize;
  const int WorkgroupSize = LV.WorkgroupSize;

  if (print_kernel_trace >= LAUNCH) {
    int NumGroups = GridSize / WorkgroupSize;
    // Execution modes: SPMD = 0, GENERIC = 1, NONE = 2
    // if doing rtl timing, print to stderr, unless stdout requested.
    bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
    fprintf(TraceToStdout ? stdout : stderr,
            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
            "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
            "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
            DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
            ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
            GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
            VgprSpillCount, LoopTripcount, KernelInfo->Name);
  }

  // Run on the device.
  {
    hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next();
    if (!Queue) {
      return OFFLOAD_FAIL;
    }
    uint64_t PacketId = acquireAvailablePacketId(Queue);

    const uint32_t Mask = Queue->size - 1; // size is a power of 2
    hsa_kernel_dispatch_packet_t *Packet =
        (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);

    // packet->header is written last
    Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    Packet->workgroup_size_x = WorkgroupSize;
    Packet->workgroup_size_y = 1;
    Packet->workgroup_size_z = 1;
    Packet->reserved0 = 0;
    Packet->grid_size_x = GridSize;
    Packet->grid_size_y = 1;
    Packet->grid_size_z = 1;
    Packet->private_segment_size = KernelInfoEntry.private_segment_size;
    Packet->group_segment_size = GroupSegmentSize;
    Packet->kernel_object = KernelInfoEntry.kernel_object;
    Packet->kernarg_address = 0;     // use the block allocator
    Packet->reserved2 = 0;           // impl writes id_ here
    Packet->completion_signal = {0}; // may want a pool of signals

    KernelArgPool *ArgPool = nullptr;
    void *KernArg = nullptr;
    {
      auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
      if (It != KernelArgPoolMap.end()) {
        ArgPool = (It->second).get();
      }
    }
    if (!ArgPool) {
      DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
         DeviceId);
    }
    {
      if (ArgPool) {
        assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
        KernArg = ArgPool->allocate(ArgNum);
      }
      if (!KernArg) {
        DP("Allocate kernarg failed\n");
        return OFFLOAD_FAIL;
      }

      // Copy explicit arguments
      for (int I = 0; I < ArgNum; I++) {
        memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
      }

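      // The implicit argument block (offsets, hostcall pointer, etc.) lives
      // immediately after the explicit kernargs; the pool's allocation is
      // assumed to include room for it (see kernargSizeIncludingImplicit).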
      // Initialize implicit arguments. TODO: Which of these can be dropped
      impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
          static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
      memset(ImplArgs, 0,
             sizeof(impl_implicit_args_t)); // may not be necessary
      ImplArgs->offset_x = 0;
      ImplArgs->offset_y = 0;
      ImplArgs->offset_z = 0;

      // assign a hostcall buffer for the selected Q
      if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) {
        // hostrpc_assign_buffer is not thread safe, and this function is
        // under a multiple reader lock, not a writer lock.
        static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_lock(&HostcallInitLock);
        uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId],
                                                Queue, DeviceId);
        pthread_mutex_unlock(&HostcallInitLock);
        if (!Buffer) {
          DP("hostrpc_assign_buffer failed, gpu would dereference null and "
             "error\n");
          return OFFLOAD_FAIL;
        }

        DP("Implicit argument count: %d\n",
           KernelInfoEntry.implicit_argument_count);
        if (KernelInfoEntry.implicit_argument_count >= 4) {
          // Initialize pointer for the implicit_argument_count != 0 ABI
          // Guess that the right implicit argument is at offset 24 after
          // the explicit arguments. In the future, should be able to read
          // the offset from msgpack. Clang is not annotating it at present.
          uint64_t Offset =
              sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
          if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
            DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
               "args: %d\n",
               Offset + 8, ArgPool->kernargSizeIncludingImplicit());
          } else {
            memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
          }
        }

        // Initialize pointer for the implicit_argument_count == 0 ABI
        ImplArgs->hostcall_ptr = Buffer;
      }

      Packet->kernarg_address = KernArg;
    }

    hsa_signal_t S = DeviceInfo.FreeSignalPool.pop();
    if (S.handle == 0) {
      DP("Failed to get signal instance\n");
      return OFFLOAD_FAIL;
    }
    Packet->completion_signal = S;
    hsa_signal_store_relaxed(Packet->completion_signal, 1);

    // Publish the packet indicating it is ready to be processed
    core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
                             core::createHeader(), Packet->setup);

    // Since the packet is already published, its contents must not be
    // accessed any more
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);

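    // Block until the kernel completes: the completion signal was set to 1
    // above and is decremented to 0 by the packet processor when the
    // dispatch finishes.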
    while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                     HSA_WAIT_STATE_BLOCKED) != 0)
      ;

    assert(ArgPool);
    ArgPool->deallocate(KernArg);
    DeviceInfo.FreeSignalPool.push(S);
  }

  DP("Kernel completed\n");
  return OFFLOAD_SUCCESS;
}

bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
  const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
  int32_t R = elf_check_machine(Image, AmdgcnMachineID);
  if (!R) {
    DP("Supported machine ID not found\n");
  }
  return R;
}

uint32_t elfEFlags(__tgt_device_image *Image) {
  char *ImgBegin = (char *)Image->ImageStart;
  size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;

  Elf *E = elf_memory(ImgBegin, ImgSize);
  if (!E) {
    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
    return 0;
  }

  Elf64_Ehdr *Eh64 = elf64_getehdr(E);

  if (!Eh64) {
    DP("Unable to get machine ID from ELF file!\n");
    elf_end(E);
    return 0;
  }

  uint32_t Flags = Eh64->e_flags;

  elf_end(E);
  DP("ELF Flags: 0x%x\n", Flags);
  return Flags;
}

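// Clamp *Value to Upper; returns true iff *Value was reduced.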
template <typename T> bool enforceUpperBound(T *Value, T Upper) {
  bool Changed = *Value > Upper;
  if (Changed) {
    *Value = Upper;
  }
  return Changed;
}

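// Find the unique SHT_HASH section, if any. The symbol lookup below relies
// on there being exactly one hash table; images with several are rejected.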
Elf64_Shdr *findOnlyShtHash(Elf *Elf) {
  size_t N;
  int Rc = elf_getshdrnum(Elf, &N);
  if (Rc != 0) {
    return nullptr;
  }

  Elf64_Shdr *Result = nullptr;
  for (size_t I = 0; I < N; I++) {
    Elf_Scn *Scn = elf_getscn(Elf, I);
    if (Scn) {
      Elf64_Shdr *Shdr = elf64_getshdr(Scn);
      if (Shdr) {
        if (Shdr->sh_type == SHT_HASH) {
          if (Result == nullptr) {
            Result = Shdr;
          } else {
            // multiple SHT_HASH sections not handled
            return nullptr;
          }
        }
      }
    }
  }
  return Result;
}

const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash,
                           const char *Symname) {

  assert(SectionHash);
  size_t SectionSymtabIndex = SectionHash->sh_link;
  Elf64_Shdr *SectionSymtab =
      elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex));
  size_t SectionStrtabIndex = SectionSymtab->sh_link;

  const Elf64_Sym *Symtab =
      reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset);

  const uint32_t *Hashtab =
      reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset);

  // Layout:
  // nbucket
  // nchain
  // bucket[nbucket]
  // chain[nchain]
  uint32_t Nbucket = Hashtab[0];
  const uint32_t *Bucket = &Hashtab[2];
  const uint32_t *Chain = &Hashtab[Nbucket + 2];

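  // Classic ELF hash lookup: hash the name to pick a bucket, then walk that
  // bucket's collision chain, comparing names via the linked string table.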
  const size_t Max = strlen(Symname) + 1;
  const uint32_t Hash = elf_hash(Symname);
  for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) {
    char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name);
    if (strncmp(Symname, N, Max) == 0) {
      return &Symtab[I];
    }
  }

  return nullptr;
}

struct SymbolInfo {
  void *Addr = nullptr;
  uint32_t Size = UINT32_MAX;
  uint32_t ShType = SHT_NULL;
};

int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname,
                                SymbolInfo *Res) {
  if (elf_kind(Elf) != ELF_K_ELF) {
    return 1;
  }

  Elf64_Shdr *SectionHash = findOnlyShtHash(Elf);
  if (!SectionHash) {
    return 1;
  }

  const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname);
  if (!Sym) {
    return 1;
  }

  if (Sym->st_size > UINT32_MAX) {
    return 1;
  }

  if (Sym->st_shndx == SHN_UNDEF) {
    return 1;
  }

  Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx);
  if (!Section) {
    return 1;
  }

  Elf64_Shdr *Header = elf64_getshdr(Section);
  if (!Header) {
    return 1;
  }

  Res->Addr = Sym->st_value + Base;
  Res->Size = static_cast<uint32_t>(Sym->st_size);
  Res->ShType = Header->sh_type;
  return 0;
}

int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname,
                                SymbolInfo *Res) {
  Elf *Elf = elf_memory(Base, ImgSize);
  if (Elf) {
    int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res);
    elf_end(Elf);
    return Rc;
  }
  return 1;
}

hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
                                  const char *SymName, void **VarAddr,
                                  uint32_t *VarSize) {
  SymbolInfo SI;
  int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
  if (Rc == 0) {
    *VarAddr = SI.Addr;
    *VarSize = SI.Size;
    return HSA_STATUS_SUCCESS;
  }
  return HSA_STATUS_ERROR;
}

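// Trampoline that adapts an arbitrary C++ callable to the C-style callback
// expected by core::RegisterModuleFromMemory.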
template <typename C>
hsa_status_t moduleRegisterFromMemoryToPlace(
    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
    void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
    std::vector<hsa_executable_t> &HSAExecutables) {
  auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(CbState);
    return (*Unwrapped)(Data, Size);
  };
  return core::RegisterModuleFromMemory(
      KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
      DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
      HSAExecutables);
}

uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
  uint64_t DeviceStateBytes = 0;
  {
    // If this is the deviceRTL, get the state variable size
    SymbolInfo SizeSi;
    int Rc = getSymbolInfoWithoutLoading(
        ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);

    if (Rc == 0) {
      if (SizeSi.Size != sizeof(uint64_t)) {
        DP("Found device_State_size variable with wrong size\n");
        return 0;
      }

      // Read number of bytes directly from the elf
      memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
    }
  }
  return DeviceStateBytes;
}

struct DeviceEnvironment {
  // Initializes a DeviceEnvironmentTy in the deviceRTL. Patches around
  // differences in the deviceRTL between trunk, aomp and rocmcc. Over time
  // these differences will tend to zero and this class can be simplified.
  // The symbol may be in .data or .bss, and may be missing fields. TODO:
  // review aomp/trunk/rocm and simplify the following.

  // The symbol may also have been deadstripped because the device side
  // accessors were unused.

  // If the symbol is in .data (aomp, rocm) it can be written directly.
  // If it is in .bss, we must wait for it to be allocated space on the
  // gpu (trunk) and initialize after loading.
  const char *sym() { return "omptarget_device_environment"; }

  DeviceEnvironmentTy HostDeviceEnv;
  SymbolInfo SI;
  bool Valid = false;

  __tgt_device_image *Image;
  const size_t ImgSize;

  DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
                    __tgt_device_image *Image, const size_t ImgSize)
      : Image(Image), ImgSize(ImgSize) {

    HostDeviceEnv.NumDevices = NumberDevices;
    HostDeviceEnv.DeviceNum = DeviceId;
    HostDeviceEnv.DebugKind = 0;
    HostDeviceEnv.DynamicMemSize = DynamicMemSize;
    if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
      HostDeviceEnv.DebugKind = std::stoi(EnvStr);

    int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
                                         sym(), &SI);
    if (Rc != 0) {
      DP("Finding global device environment '%s' - symbol missing.\n", sym());
      return;
    }

    if (SI.Size > sizeof(HostDeviceEnv)) {
      DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,
         sizeof(HostDeviceEnv));
      return;
    }

    Valid = true;
  }

  bool inImage() { return SI.ShType != SHT_NOBITS; }

  hsa_status_t beforeLoading(void *Data, size_t Size) {
    if (Valid) {
      if (inImage()) {
        DP("Setting global device environment before load (%u bytes)\n",
           SI.Size);
        uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart;
        void *Pos = (char *)Data + Offset;
        memcpy(Pos, &HostDeviceEnv, SI.Size);
      }
    }
    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t afterLoading() {
    if (Valid) {
      if (!inImage()) {
        DP("Setting global device environment after load (%u bytes)\n",
           SI.Size);
        int DeviceId = HostDeviceEnv.DeviceNum;
        auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
        void *StatePtr;
        uint32_t StatePtrSize;
        hsa_status_t Err = interop_hsa_get_symbol_info(
            SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("failed to find %s in loaded image\n", sym());
          return Err;
        }

        if (StatePtrSize != SI.Size) {
          DP("Symbol had size %u before loading, %u after\n", StatePtrSize,
             SI.Size);
          return HSA_STATUS_ERROR;
        }

        return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
                                                  StatePtrSize, DeviceId);
      }
    }
    return HSA_STATUS_SUCCESS;
  }
};

hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
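  // Round the allocation up to a multiple of four bytes because
  // hsa_amd_memory_fill writes 32-bit words (its count argument is in
  // words, hence Rounded / 4 below).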
  uint64_t Rounded = 4 * ((Size + 3) / 4);
  void *Ptr;
  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
  hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
  if (Err != HSA_STATUS_SUCCESS) {
    return Err;
  }

  hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
  if (Rc != HSA_STATUS_SUCCESS) {
    DP("zero fill device_state failed with %u\n", Rc);
    core::Runtime::Memfree(Ptr);
    return HSA_STATUS_ERROR;
  }

  *RetPtr = Ptr;
  return HSA_STATUS_SUCCESS;
}

bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
  SymbolInfo SI;
  int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
  return (Rc == 0) && (SI.Addr != nullptr);
}

} // namespace

namespace core {
hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
  return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
                                     &DeviceInfo.HSAAgents[0], NULL, Ptr);
}
} // namespace core

extern "C" {
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
  return elfMachineIdIsAmdgcn(Image);
}

int __tgt_rtl_number_of_devices() {
  // If the construction failed, no methods are safe to call
  if (DeviceInfo.ConstructionSucceeded) {
    return DeviceInfo.NumberOfDevices;
  }
  DP("AMDGPU plugin construction failed. Zero devices available\n");
  return 0;
}

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int DeviceId) {
  hsa_status_t Err;

  // this is per device id init
  DP("Initialize the device id: %d\n", DeviceId);

  hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId];

  // Get number of Compute Units
  uint32_t ComputeUnits = 0;
  Err = hsa_agent_get_info(
      Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
      &ComputeUnits);
  if (Err != HSA_STATUS_SUCCESS) {
    DeviceInfo.ComputeUnits[DeviceId] = 1;
    DP("Error getting compute units : setting to 1\n");
  } else {
    DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits;
    DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[DeviceId]);
  }

  char GetInfoName[64]; // 64 max size returned by get info
  Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
                           (void *)GetInfoName);
  if (Err)
    DeviceInfo.GPUName[DeviceId] = "--unknown gpu--";
  else {
    DeviceInfo.GPUName[DeviceId] = GetInfoName;
  }

  if (print_kernel_trace & STARTUP_DETAILS)
    DP("Device#%-2d CU's: %2d %s\n", DeviceId,
       DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str());

  // Query attributes to determine number of threads/block and blocks/grid.
  uint16_t WorkgroupMaxDim[3];
  Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                           &WorkgroupMaxDim);
  if (Err != HSA_STATUS_SUCCESS) {
    DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Error getting grid dims: num groups : %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
    DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]);
  } else {
    DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
  }

  // Get thread limit
  hsa_dim3_t GridMaxDim;
  Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
  if (Err == HSA_STATUS_SUCCESS) {
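    // Derive a per-group thread limit from the grid's maximum x dimension
    // divided by the number of groups; hsa_dim3_t begins with its x field,
    // which is what the reinterpret_cast reads.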
    DeviceInfo.ThreadsPerGroup[DeviceId] =
        reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
        DeviceInfo.GroupsPerDevice[DeviceId];

    if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) {
      DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
      DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
    } else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId],
                                 RTLDeviceInfoTy::MaxWgSize)) {
      DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
    } else {
      DP("Using ROCm queried thread limit: %d\n",
         DeviceInfo.ThreadsPerGroup[DeviceId]);
    }
  } else {
    DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
    DP("Error getting max block dimension, using default: %d\n",
       RTLDeviceInfoTy::MaxWgSize);
  }

  // Get wavefront size
  uint32_t WavefrontSize = 0;
  Err =
      hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
  if (Err == HSA_STATUS_SUCCESS) {
    DP("Queried wavefront size: %d\n", WavefrontSize);
    DeviceInfo.WarpSize[DeviceId] = WavefrontSize;
  } else {
    // TODO: Burn the wavefront size into the code object
    DP("Warning: Unknown wavefront size, assuming 64\n");
    DeviceInfo.WarpSize[DeviceId] = 64;
  }

  // Adjust teams to the env variables

  if (DeviceInfo.Env.TeamLimit > 0 &&
      (enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId],
                         DeviceInfo.Env.TeamLimit))) {
    DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.Env.TeamLimit);
  }

  // Set default number of teams
  if (DeviceInfo.Env.NumTeams > 0) {
    DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.Env.NumTeams);
  } else {
    char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
    int TeamsPerCU = DefaultTeamsPerCU;
    if (TeamsPerCUEnvStr) {
      TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
    }

    DeviceInfo.NumTeams[DeviceId] =
        TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId];
    DP("Default number of teams = %d * number of compute units %d\n",
       TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]);
  }

  if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId],
                        DeviceInfo.GroupsPerDevice[DeviceId])) {
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.GroupsPerDevice[DeviceId]);
  }

  // Adjust threads to the env variables
  if (DeviceInfo.Env.TeamThreadLimit > 0 &&
      (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
                         DeviceInfo.Env.TeamThreadLimit))) {
    DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
       DeviceInfo.Env.TeamThreadLimit);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultWgSize);
  if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId],
                        DeviceInfo.ThreadsPerGroup[DeviceId])) {
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerGroup[DeviceId]);
  }

  DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
     DeviceId, DeviceInfo.GroupsPerDevice[DeviceId],
     DeviceInfo.ThreadsPerGroup[DeviceId]);

  DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
     DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId],
     DeviceInfo.GroupsPerDevice[DeviceId],
     DeviceInfo.GroupsPerDevice[DeviceId] *
         DeviceInfo.ThreadsPerGroup[DeviceId]);

  return OFFLOAD_SUCCESS;
}

static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);

__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
                                          __tgt_device_image *Image) {
  DeviceInfo.LoadRunLock.lock();
  __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
  DeviceInfo.LoadRunLock.unlock();
  return Res;
}

__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
                                                 __tgt_device_image *Image) {
  // This function loads the device image onto gpu[DeviceId] and does other
  // per-image initialization work. Specifically:
  //
  // - Initialize a DeviceEnvironmentTy instance embedded in the
  //   image at the symbol "omptarget_device_environment"
  //   Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
  //
  // - Allocate a large array per-gpu (could be moved to init_device)
  //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
  //   - Allocate at least that many bytes of gpu memory
  //   - Zero initialize it
  //   - Write the pointer to the symbol omptarget_nvptx_device_State
  //
  // - Pull some per-kernel information together from various sources and
  //   record it in the KernelsList for quicker access later
  //
  // The initialization can be done before or after loading the image onto the
  // gpu. This function presently does a mixture. Using the HSA API to get/set
  // the information is simpler to implement, in exchange for more complicated
  // runtime behaviour, e.g. launching a kernel or using DMA to read eight
  // bytes back from the GPU versus a hashtable lookup on the host.

  const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;

  DeviceInfo.clearOffloadEntriesTable(DeviceId);

  // We do not need to set the ELF version because the caller of this function
  // had to do that to decide the right runtime to use

  if (!elfMachineIdIsAmdgcn(Image))
    return NULL;

  {
    auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices,
                                 DeviceInfo.Env.DynamicMemSize, Image, ImgSize);

    auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId];
    auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId];
    hsa_status_t Err = moduleRegisterFromMemoryToPlace(
        KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
        [&](void *Data, size_t Size) {
          if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
            __atomic_store_n(&DeviceInfo.HostcallRequired, true,
                             __ATOMIC_RELEASE);
          }
          return Env.beforeLoading(Data, Size);
        },
        DeviceInfo.HSAExecutables);

    check("Module registering", Err);
    if (Err != HSA_STATUS_SUCCESS) {
      const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str();
      const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));

      if (strcmp(DeviceName, ElfName) != 0) {
        DP("Possible gpu arch mismatch: device:%s, image:%s please check"
           " compiler flag: -march=<gpu>\n",
           DeviceName, ElfName);
      } else {
        DP("Error loading image onto GPU: %s\n", get_error_string(Err));
      }

      return NULL;
    }

    Err = Env.afterLoading();
    if (Err != HSA_STATUS_SUCCESS) {
      return NULL;
    }
  }

  DP("AMDGPU module successfully loaded!\n");

  {
    // The device_State array is either a large value in .bss or a void* that
    // needs to be assigned to a pointer to an array of size
    // device_state_bytes. If absent, it has been deadstripped and needs no
    // setup.

    void *StatePtr;
    uint32_t StatePtrSize;
    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
    hsa_status_t Err = interop_hsa_get_symbol_info(
        SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
        &StatePtrSize);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("No device_state symbol found, skipping initialization\n");
    } else {
      if (StatePtrSize < sizeof(void *)) {
        DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
           sizeof(void *));
        return NULL;
      }

      // If it's larger than a void*, assume it's a bss array and no further
      // initialization is required. Only try to set up a pointer for
      // sizeof(void*)
      if (StatePtrSize == sizeof(void *)) {
        uint64_t DeviceStateBytes =
            getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
        if (DeviceStateBytes == 0) {
          DP("Can't initialize device_State, missing size information\n");
          return NULL;
        }

        auto &DSS = DeviceInfo.DeviceStateStore[DeviceId];
        if (DSS.first.get() == nullptr) {
          assert(DSS.second == 0);
          void *Ptr = NULL;
          hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
          if (Err != HSA_STATUS_SUCCESS) {
            DP("Failed to allocate device_state array\n");
            return NULL;
          }
          DSS = {
              std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
              DeviceStateBytes,
          };
        }

        void *Ptr = DSS.first.get();
        if (DeviceStateBytes != DSS.second) {
          DP("Inconsistent sizes of device_State unsupported\n");
          return NULL;
        }

        // Write ptr to device memory so it can be used by later kernels
        Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
                                                 DeviceId);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("memcpy install of state_ptr failed\n");
          return NULL;
        }
      }
    }
  }

  // Here, we take advantage of the data that is appended after img_end to get
  // the symbols' names we need to load. This data consists of the host entries
  // begin and end as well as the target name (see the offloading linker script
  // creation in the clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name.

  __tgt_offload_entry *HostBegin = Image->EntriesBegin;
  __tgt_offload_entry *HostEnd = Image->EntriesEnd;

  for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {

    if (!E->addr) {
      // The host should always have something in the address to
      // uniquely identify the target region.
      DP("Analyzing host entry '<null>' (size = %lld)...\n",
         (unsigned long long)E->size);
      return NULL;
    }

    if (E->size) {
      __tgt_offload_entry Entry = *E;

      void *Varptr;
      uint32_t Varsize;

      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
      hsa_status_t Err = interop_hsa_get_symbol_info(
          SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);

      if (Err != HSA_STATUS_SUCCESS) {
        // Inform the user what symbol prevented offloading
        DP("Loading global '%s' (Failed)\n", E->name);
        return NULL;
      }

      if (Varsize != E->size) {
        DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
           Varsize, E->size);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
      Entry.addr = (void *)Varptr;

      DeviceInfo.addOffloadEntry(DeviceId, Entry);

      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
          E->flags & OMP_DECLARE_TARGET_LINK) {
        // If unified memory is present any target link variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr,
                                                 sizeof(void *), DeviceId);
        if (Err != HSA_STATUS_SUCCESS)
          DP("Error when copying USM\n");
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
      }

      continue;
    }

    DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name));

    // Errors getting kernarg_segment_size are treated as size 0 (previously
    // treated as undefined)
    uint32_t KernargSegmentSize = 0;
    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId];
    hsa_status_t Err = HSA_STATUS_SUCCESS;
    if (!E->name) {
      Err = HSA_STATUS_ERROR;
    } else {
      std::string KernelStr = std::string(E->name);
      auto It = KernelInfoMap.find(KernelStr);
      if (It != KernelInfoMap.end()) {
        atl_kernel_info_t Info = It->second;
        KernargSegmentSize = Info.kernel_segment_size;
      } else {
        Err = HSA_STATUS_ERROR;
      }
    }

    // Default value GENERIC (in case the symbol is missing from the code
    // object)
    llvm::omp::OMPTgtExecModeFlags ExecModeVal =
        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;

    // Get flat group size if present, else DefaultWgSize
    int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;

    // Get the kernel descriptor if present.
    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
    struct KernDescValType {
      uint16_t Version;
      uint16_t TSize;
      uint16_t WGSize;
    };
    struct KernDescValType KernDescVal;
    std::string KernDescNameStr(E->name);
    KernDescNameStr += "_kern_desc";
    const char *KernDescName = KernDescNameStr.c_str();

    void *KernDescPtr;
    uint32_t KernDescSize;
    void *CallStackAddr = nullptr;
    Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
                               &KernDescPtr, &KernDescSize);

    if (Err == HSA_STATUS_SUCCESS) {
      if ((size_t)KernDescSize != sizeof(KernDescVal))
        DP("Loading global computation properties '%s' - size mismatch (%u != "
           "%lu)\n",
           KernDescName, KernDescSize, sizeof(KernDescVal));

      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);

      // Check structure size against recorded size.
      if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);

      DP("After loading global for %s KernDesc\n", KernDescName);
      DP("KernDesc: Version: %d\n", KernDescVal.Version);
      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
      DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);

      if (KernDescVal.WGSize == 0) {
        KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize);
      }
      WGSizeVal = KernDescVal.WGSize;
      DP("WGSizeVal %d\n", WGSizeVal);
      check("Loading KernDesc computation property", Err);
    } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found\n", KernDescName);

      // Flat group size
      std::string WGSizeNameStr(E->name);
      WGSizeNameStr += "_wg_size";
      const char *WGSizeName = WGSizeNameStr.c_str();

      void *WGSizePtr;
      uint32_t WGSize;
      Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
                                 &WGSizePtr, &WGSize);

      if (Err == HSA_STATUS_SUCCESS) {
        if ((size_t)WGSize != sizeof(int16_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
             "!= %lu)\n",
             WGSizeName, WGSize, sizeof(int16_t));
          return NULL;
        }

        memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);

        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);

        if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
            WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
          DP("Error: wrong WGSize value specified in HSA code object file: "
             "%d\n",
             WGSizeVal);
          WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
        }
      } else {
        DP("Warning: Loading WGSize '%s' - symbol not found, "
           "using default value %d\n",
           WGSizeName, WGSizeVal);
      }

      check("Loading WGSize computation property", Err);
    }

    // Read execution mode from global in binary
    std::string ExecModeNameStr(E->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    void *ExecModePtr;
    uint32_t VarSize;
    Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
                               &ExecModePtr, &VarSize);

    if (Err == HSA_STATUS_SUCCESS) {
      if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
        DP("Loading global computation properties '%s' - size mismatch (%u != "
           "%lu)\n",
           ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
        return NULL;
      }

      memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);

      DP("After loading global for %s ExecMode = %d\n", ExecModeName,
         ExecModeVal);

      if (ExecModeVal < 0 ||
          ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
        DP("Error: wrong exec_mode value specified in HSA code object file: "
           "%d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n",
         ExecModeName);
    }
    check("Loading computation property", Err);

    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
                                   CallStackAddr, E->name, KernargSegmentSize,
                                   DeviceInfo.KernArgPool));
    __tgt_offload_entry Entry = *E;
    Entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(DeviceId, Entry);
    DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
  }

  return DeviceInfo.getOffloadEntriesTable(DeviceId);
}

void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
  void *Ptr = NULL;
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");

  if (Kind != TARGET_ALLOC_DEFAULT) {
    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");
    return NULL;
  }

  hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
  hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)Ptr);
  Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
  return Ptr;
}

int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
                              int64_t Size) {
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
                                    int64_t Size, __tgt_async_info *AsyncInfo) {
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  if (AsyncInfo) {
    initAsyncInfo(AsyncInfo);
    return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
  }
  return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
}

int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
                                int64_t Size) {
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
                                      int64_t Size,
                                      __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  initAsyncInfo(AsyncInfo);
  return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  hsa_status_t Err;
  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
  Err = core::Runtime::Memfree(TgtPtr);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when freeing HSA memory\n");
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
                                         int32_t ArgNum, int32_t NumTeams,
                                         int32_t ThreadLimit,
                                         uint64_t LoopTripcount) {

  DeviceInfo.LoadRunLock.lock_shared();
  int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
                                ArgNum, NumTeams, ThreadLimit, LoopTripcount);

  DeviceInfo.LoadRunLock.unlock_shared();
  return Res;
}

int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
                                    int32_t ArgNum) {
  // Use one team and the default number of threads
  int32_t TeamNum = 1;
  int32_t ThreadLimit = 0; // use default
  return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
                                          TgtOffsets, ArgNum, TeamNum,
                                          ThreadLimit, 0);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
    int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
    uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  initAsyncInfo(AsyncInfo);

  DeviceInfo.LoadRunLock.lock_shared();
  int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
                                ArgNum, NumTeams, ThreadLimit, LoopTripcount);

  DeviceInfo.LoadRunLock.unlock_shared();
  return Res;
}

int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
                                          int32_t ArgNum,
                                          __tgt_async_info *AsyncInfo) {
  // Use one team and the default number of threads
  int32_t TeamNum = 1;
  int32_t ThreadLimit = 0; // use default
  return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
                                                TgtOffsets, ArgNum, TeamNum,
                                                ThreadLimit, 0, AsyncInfo);
}

int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");

  // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
  // invariant is not ensured by devices.cpp for amdgcn
  // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
  if (AsyncInfo->Queue) {
    finiAsyncInfo(AsyncInfo);
  }
  return OFFLOAD_SUCCESS;
}

void __tgt_rtl_print_device_info(int32_t DeviceId) {
  // TODO: Assertion to see if DeviceId is correct
  // NOTE: We don't need to set context for print device info.

  DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]);
}

} // extern "C"