1 //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// RTL for AMD HSA machines
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <assert.h>
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 #include <functional>
19 #include <libelf.h>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <shared_mutex>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
26 
27 #include "interop_hsa.h"
28 #include "impl_runtime.h"
29 
30 #include "internal.h"
31 #include "rt.h"
32 
33 #include "get_elf_mach_gfx_name.h"
34 #include "omptargetplugin.h"
35 #include "print_tracing.h"
36 
37 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
38 
39 
// hostrpc interface. FIXME: consider moving to its own include. These symbols
// are statically linked into amdgpu/plugin if present from hostrpc_services.a,
// linked as --whole-archive to override the weak symbols that are used to
// implement a fallback for toolchains that do not yet have a hostrpc library.
44 extern "C" {
45 unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q,
46                                     uint32_t device_id);
47 hsa_status_t hostrpc_init();
48 hsa_status_t hostrpc_terminate();
49 
50 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
51 __attribute__((weak)) hsa_status_t hostrpc_terminate() {
52   return HSA_STATUS_SUCCESS;
53 }
54 __attribute__((weak)) unsigned long
55 hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) {
56   DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
57      "missing\n",
58      device_id);
59   return 0;
60 }
61 }
62 
63 // Heuristic parameters used for kernel launch
64 // Number of teams per CU to allow scheduling flexibility
65 static const unsigned DefaultTeamsPerCU = 4;
66 
67 int print_kernel_trace;
68 
69 #ifdef OMPTARGET_DEBUG
70 #define check(msg, status)                                                     \
71   if (status != HSA_STATUS_SUCCESS) {                                          \
72     DP(#msg " failed\n");                                                      \
73   } else {                                                                     \
74     DP(#msg " succeeded\n");                                                   \
75   }
76 #else
77 #define check(msg, status)                                                     \
78   {}
79 #endif
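// Typical use (see e.g. "Module registering" further down): check(msg, status)
// prints a DP() line reporting whether `status` was HSA_STATUS_SUCCESS. In
// non-debug builds the macro expands to an empty block, so callers must not
// rely on it for error handling.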
80 
81 #include "elf_common.h"
82 
83 namespace hsa {
84 template <typename C> hsa_status_t iterate_agents(C cb) {
85   auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t {
86     C *unwrapped = static_cast<C *>(data);
87     return (*unwrapped)(agent);
88   };
89   return hsa_iterate_agents(L, static_cast<void *>(&cb));
90 }
91 
92 template <typename C>
93 hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
94   auto L = [](hsa_amd_memory_pool_t MemoryPool, void *data) -> hsa_status_t {
95     C *unwrapped = static_cast<C *>(data);
96     return (*unwrapped)(MemoryPool);
97   };
98 
99   return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&cb));
100 }
101 
102 } // namespace hsa
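// The wrappers above adapt C++ callables to the C-style HSA iteration
// callbacks by passing the callable through the void *data parameter.
// Illustrative use (mirrors FindAgents below):
//   std::vector<hsa_agent_t> Found;
//   hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
//     Found.push_back(Agent);
//     return HSA_STATUS_SUCCESS;
//   });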
103 
104 /// Keep entries table per device
105 struct FuncOrGblEntryTy {
106   __tgt_target_table Table;
107   std::vector<__tgt_offload_entry> Entries;
108 };
109 
110 enum ExecutionModeType {
111   SPMD,         // constructors, destructors,
112                 // combined constructs (`teams distribute parallel for [simd]`)
113   GENERIC,      // everything else
114   SPMD_GENERIC, // Generic kernel with SPMD execution
115   NONE
116 };
117 
118 struct KernelArgPool {
119 private:
120   static pthread_mutex_t mutex;
121 
122 public:
123   uint32_t kernarg_segment_size;
124   void *kernarg_region = nullptr;
125   std::queue<int> free_kernarg_segments;
126 
127   uint32_t kernarg_size_including_implicit() {
128     return kernarg_segment_size + sizeof(impl_implicit_args_t);
129   }
130 
131   ~KernelArgPool() {
132     if (kernarg_region) {
133       auto r = hsa_amd_memory_pool_free(kernarg_region);
134       if (r != HSA_STATUS_SUCCESS) {
135         DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(r));
136       }
137     }
138   }
139 
140   // Can't really copy or move a mutex
141   KernelArgPool() = default;
142   KernelArgPool(const KernelArgPool &) = delete;
143   KernelArgPool(KernelArgPool &&) = delete;
144 
145   KernelArgPool(uint32_t kernarg_segment_size,
146                 hsa_amd_memory_pool_t &memory_pool)
147       : kernarg_segment_size(kernarg_segment_size) {
148 
    // impl uses one pool per kernel for all gpus, with a fixed upper size.
    // That exact scheme is preserved here, including the queue<int>.
151 
152     hsa_status_t err = hsa_amd_memory_pool_allocate(
153         memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
154         &kernarg_region);
155 
156     if (err != HSA_STATUS_SUCCESS) {
157       DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(err));
158       kernarg_region = nullptr; // paranoid
159       return;
160     }
161 
162     err = core::allow_access_to_all_gpu_agents(kernarg_region);
163     if (err != HSA_STATUS_SUCCESS) {
164       DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
165          get_error_string(err));
166       auto r = hsa_amd_memory_pool_free(kernarg_region);
167       if (r != HSA_STATUS_SUCCESS) {
168         // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(r));
170       }
171       kernarg_region = nullptr;
172       return;
173     }
174 
175     for (int i = 0; i < MAX_NUM_KERNELS; i++) {
176       free_kernarg_segments.push(i);
177     }
178   }
179 
180   void *allocate(uint64_t arg_num) {
181     assert((arg_num * sizeof(void *)) == kernarg_segment_size);
182     lock l(&mutex);
183     void *res = nullptr;
184     if (!free_kernarg_segments.empty()) {
185 
186       int free_idx = free_kernarg_segments.front();
187       res = static_cast<void *>(static_cast<char *>(kernarg_region) +
188                                 (free_idx * kernarg_size_including_implicit()));
189       assert(free_idx == pointer_to_index(res));
190       free_kernarg_segments.pop();
191     }
192     return res;
193   }
194 
195   void deallocate(void *ptr) {
196     lock l(&mutex);
197     int idx = pointer_to_index(ptr);
198     free_kernarg_segments.push(idx);
199   }
200 
201 private:
202   int pointer_to_index(void *ptr) {
203     ptrdiff_t bytes =
204         static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
205     assert(bytes >= 0);
206     assert(bytes % kernarg_size_including_implicit() == 0);
207     return bytes / kernarg_size_including_implicit();
208   }
209   struct lock {
210     lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
211     ~lock() { pthread_mutex_unlock(m); }
212     pthread_mutex_t *m;
213   };
214 };
215 pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;
216 
217 std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
218     KernelArgPoolMap;
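// Expected usage (sketch): look up the pool for a kernel by name in
// KernelArgPoolMap, call allocate(arg_num) to reserve one kernarg slot before
// building the dispatch packet, and deallocate() the same pointer once the
// kernel has completed. allocate() returns nullptr when all MAX_NUM_KERNELS
// slots are currently in use.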
219 
220 /// Use a single entity to encode a kernel and a set of flags
221 struct KernelTy {
222   // execution mode of kernel
223   // 0 - SPMD mode (without master warp)
224   // 1 - Generic mode (with master warp)
225   // 2 - SPMD mode execution with Generic mode semantics.
226   int8_t ExecutionMode;
227   int16_t ConstWGSize;
228   int32_t device_id;
229   void *CallStackAddr = nullptr;
230   const char *Name;
231 
232   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
233            void *_CallStackAddr, const char *_Name,
234            uint32_t _kernarg_segment_size,
235            hsa_amd_memory_pool_t &KernArgMemoryPool)
236       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
237         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
238     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
239 
240     std::string N(_Name);
241     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
242       KernelArgPoolMap.insert(
243           std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
244                                 _kernarg_segment_size, KernArgMemoryPool))));
245     }
246   }
247 };
248 
249 /// List that contains all the kernels.
250 /// FIXME: we may need this to be per device and per library.
251 std::list<KernelTy> KernelsList;
252 
253 template <typename Callback> static hsa_status_t FindAgents(Callback CB) {
254 
255   hsa_status_t err =
256       hsa::iterate_agents([&](hsa_agent_t agent) -> hsa_status_t {
257         hsa_device_type_t device_type;
258         // get_info fails iff HSA runtime not yet initialized
259         hsa_status_t err =
260             hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
261 
262         if (err != HSA_STATUS_SUCCESS) {
263           if (print_kernel_trace > 0)
264             DP("rtl.cpp: err %s\n", get_error_string(err));
265 
266           return err;
267         }
268 
269         CB(device_type, agent);
270         return HSA_STATUS_SUCCESS;
271       });
272 
273   // iterate_agents fails iff HSA runtime not yet initialized
274   if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) {
275     DP("rtl.cpp: err %s\n", get_error_string(err));
276   }
277 
278   return err;
279 }
280 
281 static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
282                           void *data) {
283   if (status != HSA_STATUS_SUCCESS) {
284     const char *status_string;
285     if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) {
286       status_string = "unavailable";
287     }
288     DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, source,
289        status, status_string);
290     abort();
291   }
292 }
293 
294 namespace core {
295 namespace {
296 void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
297   __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
298 }
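// Note: the 16-bit header and the 16-bit setup/rest field share one 32-bit
// word in the AQL packet. Writing both with a single atomic release store is
// what publishes the packet to the GPU's packet processor, so it must be the
// last store into the packet slot.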
299 
300 uint16_t create_header() {
301   uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
302   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
303   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
304   return header;
305 }
306 
307 hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
308   bool AllocAllowed = false;
309   hsa_status_t Err = hsa_amd_memory_pool_get_info(
310       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
311       &AllocAllowed);
312   if (Err != HSA_STATUS_SUCCESS) {
    DP("Checking whether memory pool allows allocation failed: %s\n",
       get_error_string(Err));
315     return Err;
316   }
317 
318   size_t Size = 0;
319   Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
320                                      &Size);
321   if (Err != HSA_STATUS_SUCCESS) {
322     DP("Get memory pool size failed: %s\n", get_error_string(Err));
323     return Err;
324   }
325 
326   return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
327 }
328 
329 hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
330   std::vector<hsa_amd_memory_pool_t> *Result =
331       static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
332 
333   hsa_status_t err;
334   if ((err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
335     return err;
336   }
337 
338   uint32_t GlobalFlags = 0;
339   err = hsa_amd_memory_pool_get_info(
340       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
341   if (err != HSA_STATUS_SUCCESS) {
342     DP("Get memory pool info failed: %s\n", get_error_string(err));
343     return err;
344   }
345 
346   if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
347       (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
348     Result->push_back(MemoryPool);
349   }
350 
351   return HSA_STATUS_SUCCESS;
352 }
353 
354 template <typename AccumulatorFunc>
355 hsa_status_t collectMemoryPools(const std::vector<hsa_agent_t> &Agents,
356                                 AccumulatorFunc Func) {
357   for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
358     hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
359         Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
360           hsa_status_t Err;
361           if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
362             DP("Skipping memory pool: %s\n", get_error_string(Err));
363           } else
364             Func(MemoryPool, DeviceId);
365           return HSA_STATUS_SUCCESS;
366         });
367 
368     if (Err != HSA_STATUS_SUCCESS) {
369       DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
370          "Iterate all memory pools", get_error_string(Err));
371       return Err;
372     }
373   }
374 
375   return HSA_STATUS_SUCCESS;
376 }
377 
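// Select the kernarg memory pool: gather, from each agent, the pools that
// allow allocation and carry both the FINE_GRAINED and KERNARG_INIT flags,
// then use the first one found. Fails if no agent exposes such a pool.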
378 std::pair<hsa_status_t, hsa_amd_memory_pool_t>
379 FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
380   std::vector<hsa_amd_memory_pool_t> KernArgPools;
381   for (const auto &Agent : HSAAgents) {
382     hsa_status_t err = HSA_STATUS_SUCCESS;
383     err = hsa_amd_agent_iterate_memory_pools(
384         Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
385     if (err != HSA_STATUS_SUCCESS) {
386       DP("addKernArgPool returned %s, continuing\n", get_error_string(err));
387     }
388   }
389 
390   if (KernArgPools.empty()) {
391     DP("Unable to find any valid kernarg pool\n");
392     return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}};
393   }
394 
395   return {HSA_STATUS_SUCCESS, KernArgPools[0]};
396 }
397 
398 } // namespace
399 } // namespace core
400 
401 struct EnvironmentVariables {
402   int NumTeams;
403   int TeamLimit;
404   int TeamThreadLimit;
405   int MaxTeamsDefault;
406 };
407 
408 template <uint32_t wavesize>
409 static constexpr const llvm::omp::GV &getGridValue() {
410   return llvm::omp::getAMDGPUGridValues<wavesize>();
411 }
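// Grid limits (max teams, max/default workgroup size) come from LLVM's
// OMPGridValues tables, parameterized by wavefront size (32 or 64 on AMDGPU).
// The static_asserts in RTLDeviceInfoTy below rely on both wavefront sizes
// reporting the same limits for now.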
412 
413 struct HSALifetime {
  // Wrapper around HSA used to ensure it is constructed before other types
  // and destructed after them, so those other types can use RAII for cleanup
  // without risking running outside of the lifetime of HSA.
417   const hsa_status_t S;
418 
419   bool success() { return S == HSA_STATUS_SUCCESS; }
420   HSALifetime() : S(hsa_init()) {}
421 
422   ~HSALifetime() {
423     if (S == HSA_STATUS_SUCCESS) {
424       hsa_status_t Err = hsa_shut_down();
425       if (Err != HSA_STATUS_SUCCESS) {
426         // Can't call into HSA to get a string from the integer
427         DP("Shutting down HSA failed: %d\n", Err);
428       }
429     }
430   }
431 };
432 
433 /// Class containing all the device information
434 class RTLDeviceInfoTy {
435   HSALifetime HSA; // First field => constructed first and destructed last
436   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
437 
438   struct QueueDeleter {
439     void operator()(hsa_queue_t *Q) {
440       if (Q) {
441         hsa_status_t Err = hsa_queue_destroy(Q);
442         if (Err != HSA_STATUS_SUCCESS) {
443           DP("Error destroying hsa queue: %s\n", get_error_string(Err));
444         }
445       }
446     }
447   };
448 
449 public:
450   bool ConstructionSucceeded = false;
451 
  // load binary populates symbol tables and mutates various global state;
  // run uses those symbol tables
454   std::shared_timed_mutex load_run_lock;
455 
456   int NumberOfDevices = 0;
457 
458   // GPU devices
459   std::vector<hsa_agent_t> HSAAgents;
460   std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
461       HSAQueues; // one per gpu
462 
463   // CPUs
464   std::vector<hsa_agent_t> CPUAgents;
465 
466   // Device properties
467   std::vector<int> ComputeUnits;
468   std::vector<int> GroupsPerDevice;
469   std::vector<int> ThreadsPerGroup;
470   std::vector<int> WarpSize;
471   std::vector<std::string> GPUName;
472 
473   // OpenMP properties
474   std::vector<int> NumTeams;
475   std::vector<int> NumThreads;
476 
477   // OpenMP Environment properties
478   EnvironmentVariables Env;
479 
480   // OpenMP Requires Flags
481   int64_t RequiresFlags;
482 
483   // Resource pools
484   SignalPoolT FreeSignalPool;
485 
486   bool hostcall_required = false;
487 
488   std::vector<hsa_executable_t> HSAExecutables;
489 
490   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
491   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
492 
493   hsa_amd_memory_pool_t KernArgPool;
494 
495   // fine grained memory pool for host allocations
496   hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
497 
498   // fine and coarse-grained memory pools per offloading device
499   std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
500   std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
501 
502   struct implFreePtrDeletor {
503     void operator()(void *p) {
504       core::Runtime::Memfree(p); // ignore failure to free
505     }
506   };
507 
508   // device_State shared across loaded binaries, error if inconsistent size
509   std::vector<std::pair<std::unique_ptr<void, implFreePtrDeletor>, uint64_t>>
510       deviceStateStore;
511 
  static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K - 1; must fit in a uint16_t
514   static const int DefaultNumTeams = 128;
515 
516   // These need to be per-device since different devices can have different
517   // wave sizes, but are currently the same number for each so that refactor
518   // can be postponed.
519   static_assert(getGridValue<32>().GV_Max_Teams ==
520                     getGridValue<64>().GV_Max_Teams,
521                 "");
522   static const int Max_Teams = getGridValue<64>().GV_Max_Teams;
523 
524   static_assert(getGridValue<32>().GV_Max_WG_Size ==
525                     getGridValue<64>().GV_Max_WG_Size,
526                 "");
527   static const int Max_WG_Size = getGridValue<64>().GV_Max_WG_Size;
528 
529   static_assert(getGridValue<32>().GV_Default_WG_Size ==
530                     getGridValue<64>().GV_Default_WG_Size,
531                 "");
532   static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size;
533 
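  // Blocking copy helper: borrow a completion signal from FreeSignalPool, run
  // the given impl_memcpy_* routine with it, then return the signal to the
  // pool. The d2h/h2d wrappers below differ only in the function passed.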
534   using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
535                                       size_t size, hsa_agent_t,
536                                       hsa_amd_memory_pool_t);
537   hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
538                                      MemcpyFunc Func, int32_t deviceId) {
539     hsa_agent_t agent = HSAAgents[deviceId];
540     hsa_signal_t s = FreeSignalPool.pop();
541     if (s.handle == 0) {
542       return HSA_STATUS_ERROR;
543     }
544     hsa_status_t r = Func(s, dest, src, size, agent, HostFineGrainedMemoryPool);
545     FreeSignalPool.push(s);
546     return r;
547   }
548 
549   hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
550                                          size_t size, int32_t deviceId) {
551     return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId);
552   }
553 
554   hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
555                                          size_t size, int32_t deviceId) {
556     return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId);
557   }
558 
559   // Record entry point associated with device
560   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
561     assert(device_id < (int32_t)FuncGblEntries.size() &&
562            "Unexpected device id!");
563     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
564 
565     E.Entries.push_back(entry);
566   }
567 
568   // Return true if the entry is associated with device
569   bool findOffloadEntry(int32_t device_id, void *addr) {
570     assert(device_id < (int32_t)FuncGblEntries.size() &&
571            "Unexpected device id!");
572     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
573 
574     for (auto &it : E.Entries) {
575       if (it.addr == addr)
576         return true;
577     }
578 
579     return false;
580   }
581 
582   // Return the pointer to the target entries table
583   __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
584     assert(device_id < (int32_t)FuncGblEntries.size() &&
585            "Unexpected device id!");
586     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
587 
588     int32_t size = E.Entries.size();
589 
590     // Table is empty
591     if (!size)
592       return 0;
593 
594     __tgt_offload_entry *begin = &E.Entries[0];
595     __tgt_offload_entry *end = &E.Entries[size - 1];
596 
597     // Update table info according to the entries and return the pointer
598     E.Table.EntriesBegin = begin;
599     E.Table.EntriesEnd = ++end;
600 
601     return &E.Table;
602   }
603 
604   // Clear entries table for a device
605   void clearOffloadEntriesTable(int device_id) {
606     assert(device_id < (int32_t)FuncGblEntries.size() &&
607            "Unexpected device id!");
608     FuncGblEntries[device_id].emplace_back();
609     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
610     // KernelArgPoolMap.clear();
611     E.Entries.clear();
612     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
613   }
614 
615   hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
616                                    int DeviceId) {
    assert((size_t)DeviceId < DeviceFineGrainedMemoryPools.size() &&
           "DeviceId out of range");
618     uint32_t GlobalFlags = 0;
619     hsa_status_t Err = hsa_amd_memory_pool_get_info(
620         MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
621 
622     if (Err != HSA_STATUS_SUCCESS) {
623       return Err;
624     }
625 
626     if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
627       DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
628     } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
629       DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
630     }
631 
632     return HSA_STATUS_SUCCESS;
633   }
634 
635   hsa_status_t addHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
636                                  int DeviceId) {
637     uint32_t GlobalFlags = 0;
638     hsa_status_t Err = hsa_amd_memory_pool_get_info(
639         MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
640 
641     if (Err != HSA_STATUS_SUCCESS) {
642       return Err;
643     }
644 
    size_t Size = 0;
646     Err = hsa_amd_memory_pool_get_info(MemoryPool,
647                                        HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size);
648     if (Err != HSA_STATUS_SUCCESS) {
649       return Err;
650     }
651 
652     if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED &&
653         Size > 0) {
654       HostFineGrainedMemoryPool = MemoryPool;
655     }
656 
657     return HSA_STATUS_SUCCESS;
658   }
659 
660   hsa_status_t setupMemoryPools() {
661     using namespace std::placeholders;
662     hsa_status_t Err;
663     Err = core::collectMemoryPools(
664         CPUAgents, std::bind(&RTLDeviceInfoTy::addHostMemoryPool, this, _1, _2));
665     if (Err != HSA_STATUS_SUCCESS) {
666       DP("HSA error in collecting memory pools for CPU: %s\n",
667          get_error_string(Err));
668       return Err;
669     }
670     Err = core::collectMemoryPools(
671         HSAAgents, std::bind(&RTLDeviceInfoTy::addDeviceMemoryPool, this, _1, _2));
672     if (Err != HSA_STATUS_SUCCESS) {
673       DP("HSA error in collecting memory pools for offload devices: %s\n",
674          get_error_string(Err));
675       return Err;
676     }
677     return HSA_STATUS_SUCCESS;
678   }
679 
680   hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
681     assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
682            "Invalid device Id");
683     return DeviceCoarseGrainedMemoryPools[DeviceId];
684   }
685 
686   hsa_amd_memory_pool_t getHostMemoryPool() {
687     return HostFineGrainedMemoryPool;
688   }
689 
690   static int readEnvElseMinusOne(const char *Env) {
691     const char *envStr = getenv(Env);
692     int res = -1;
693     if (envStr) {
694       res = std::stoi(envStr);
695       DP("Parsed %s=%d\n", Env, res);
696     }
697     return res;
698   }
699 
700   RTLDeviceInfoTy() {
701     DP("Start initializing " GETNAME(TARGET_NAME) "\n");
702 
    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at
    // any time; it does not require a debug build of the library.
705     //  0 => no tracing
706     //  1 => tracing dispatch only
707     // >1 => verbosity increase
708 
709     if (!HSA.success()) {
710       DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
711       return;
712     }
713 
714     if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
715       print_kernel_trace = atoi(envStr);
716     else
717       print_kernel_trace = 0;
718 
719     hsa_status_t err = core::atl_init_gpu_context();
720     if (err != HSA_STATUS_SUCCESS) {
721       DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
722       return;
723     }
724 
725     // Init hostcall soon after initializing hsa
726     hostrpc_init();
727 
728     err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
729       if (DeviceType == HSA_DEVICE_TYPE_CPU) {
730         CPUAgents.push_back(Agent);
731       } else {
732         HSAAgents.push_back(Agent);
733       }
734     });
735     if (err != HSA_STATUS_SUCCESS)
736       return;
737 
738     NumberOfDevices = (int)HSAAgents.size();
739 
740     if (NumberOfDevices == 0) {
741       DP("There are no devices supporting HSA.\n");
742       return;
743     } else {
744       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
745     }
746     std::tie(err, KernArgPool) = core::FindKernargPool(CPUAgents);
747     if (err != HSA_STATUS_SUCCESS) {
748       DP("Error when reading memory pools\n");
749       return;
750     }
751 
752     // Init the device info
753     HSAQueues.resize(NumberOfDevices);
754     FuncGblEntries.resize(NumberOfDevices);
755     ThreadsPerGroup.resize(NumberOfDevices);
756     ComputeUnits.resize(NumberOfDevices);
757     GPUName.resize(NumberOfDevices);
758     GroupsPerDevice.resize(NumberOfDevices);
759     WarpSize.resize(NumberOfDevices);
760     NumTeams.resize(NumberOfDevices);
761     NumThreads.resize(NumberOfDevices);
762     deviceStateStore.resize(NumberOfDevices);
763     KernelInfoTable.resize(NumberOfDevices);
764     SymbolInfoTable.resize(NumberOfDevices);
765     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
766     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
767 
768     err = setupMemoryPools();
769     if (err != HSA_STATUS_SUCCESS) {
      DP("Error when setting up memory pools\n");
771       return;
772     }
773 
774     for (int i = 0; i < NumberOfDevices; i++) {
775       uint32_t queue_size = 0;
776       {
777         hsa_status_t err = hsa_agent_get_info(
778             HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
779         if (err != HSA_STATUS_SUCCESS) {
780           DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i);
781           return;
782         }
783         enum { MaxQueueSize = 4096 };
784         if (queue_size > MaxQueueSize) {
785           queue_size = MaxQueueSize;
786         }
787       }
788 
789       {
790         hsa_queue_t *Q = nullptr;
791         hsa_status_t rc =
792             hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
793                              callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
794         if (rc != HSA_STATUS_SUCCESS) {
795           DP("Failed to create HSA queue %d\n", i);
796           return;
797         }
798         HSAQueues[i].reset(Q);
799       }
800 
801       deviceStateStore[i] = {nullptr, 0};
802     }
803 
804     for (int i = 0; i < NumberOfDevices; i++) {
805       ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
806       GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
807       ComputeUnits[i] = 1;
808       DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i,
809          GroupsPerDevice[i], ThreadsPerGroup[i]);
810     }
811 
812     // Get environment variables regarding teams
813     Env.TeamLimit = readEnvElseMinusOne("OMP_TEAM_LIMIT");
814     Env.NumTeams = readEnvElseMinusOne("OMP_NUM_TEAMS");
815     Env.MaxTeamsDefault = readEnvElseMinusOne("OMP_MAX_TEAMS_DEFAULT");
816     Env.TeamThreadLimit = readEnvElseMinusOne("OMP_TEAMS_THREAD_LIMIT");
817 
818     // Default state.
819     RequiresFlags = OMP_REQ_UNDEFINED;
820 
821     ConstructionSucceeded = true;
822   }
823 
824   ~RTLDeviceInfoTy() {
825     DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
826     if (!HSA.success()) {
827       // Then none of these can have been set up and they can't be torn down
828       return;
829     }
830     // Run destructors on types that use HSA before
831     // impl_finalize removes access to it
832     deviceStateStore.clear();
833     KernelArgPoolMap.clear();
834     // Terminate hostrpc before finalizing hsa
835     hostrpc_terminate();
836 
837     hsa_status_t Err;
838     for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
839       Err = hsa_executable_destroy(HSAExecutables[I]);
840       if (Err != HSA_STATUS_SUCCESS) {
841         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
842            "Destroying executable", get_error_string(Err));
843       }
844     }
845   }
846 };
847 
848 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
849 
// TODO: May need to drop the trailing two fields until deviceRTL is updated
851 struct omptarget_device_environmentTy {
852   int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
853                        // only useful for Debug build of deviceRTLs
854   int32_t num_devices; // gets number of active offload devices
855   int32_t device_num;  // gets a value 0 to num_devices-1
856 };
857 
858 static RTLDeviceInfoTy DeviceInfo;
859 
860 namespace {
861 
862 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
863                      __tgt_async_info *AsyncInfo) {
864   assert(AsyncInfo && "AsyncInfo is nullptr");
865   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
866   // Return success if we are not copying back to host from target.
867   if (!HstPtr)
868     return OFFLOAD_SUCCESS;
869   hsa_status_t err;
870   DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
871      (long long unsigned)(Elf64_Addr)TgtPtr,
872      (long long unsigned)(Elf64_Addr)HstPtr);
873 
874   err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
875                                              DeviceId);
876 
877   if (err != HSA_STATUS_SUCCESS) {
878     DP("Error when copying data from device to host. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %llu\n",
880        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
881     return OFFLOAD_FAIL;
882   }
883   DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
884      (long long unsigned)(Elf64_Addr)TgtPtr,
885      (long long unsigned)(Elf64_Addr)HstPtr);
886   return OFFLOAD_SUCCESS;
887 }
888 
889 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
890                    __tgt_async_info *AsyncInfo) {
891   assert(AsyncInfo && "AsyncInfo is nullptr");
892   hsa_status_t err;
893   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
894   // Return success if we are not doing host to target.
895   if (!HstPtr)
896     return OFFLOAD_SUCCESS;
897 
898   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
899      (long long unsigned)(Elf64_Addr)HstPtr,
900      (long long unsigned)(Elf64_Addr)TgtPtr);
901   err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
902                                              DeviceId);
903   if (err != HSA_STATUS_SUCCESS) {
904     DP("Error when copying data from host to device. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %llu\n",
906        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
907     return OFFLOAD_FAIL;
908   }
909   return OFFLOAD_SUCCESS;
910 }
911 
912 // Async.
913 // The implementation was written with cuda streams in mind. The semantics of
914 // that are to execute kernels on a queue in order of insertion. A synchronise
915 // call then makes writes visible between host and device. This means a series
916 // of N data_submit_async calls are expected to execute serially. HSA offers
917 // various options to run the data copies concurrently. This may require changes
918 // to libomptarget.
919 
920 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
921 // there are no outstanding kernels that need to be synchronized. Any async call
922 // may be passed a Queue==0, at which point the cuda implementation will set it
923 // to non-null (see getStream). The cuda streams are per-device. Upstream may
924 // change this interface to explicitly initialize the AsyncInfo_pointer, but
925 // until then hsa lazily initializes it as well.
926 
927 void initAsyncInfo(__tgt_async_info *AsyncInfo) {
928   // set non-null while using async calls, return to null to indicate completion
929   assert(AsyncInfo);
930   if (!AsyncInfo->Queue) {
931     AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
932   }
933 }
934 void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
935   assert(AsyncInfo);
936   assert(AsyncInfo->Queue);
937   AsyncInfo->Queue = 0;
938 }
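// Since the copies above are synchronous, Queue is only a marker: a dummy
// non-null value (UINT64_MAX) records that async work was issued, and
// finiAsyncInfo resets it to null once that work has been synchronized. No
// real HSA queue handle is stored in AsyncInfo here.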
939 
940 bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
941   const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
942   int32_t r = elf_check_machine(image, amdgcnMachineID);
943   if (!r) {
944     DP("Supported machine ID not found\n");
945   }
946   return r;
947 }
948 
949 uint32_t elf_e_flags(__tgt_device_image *image) {
950   char *img_begin = (char *)image->ImageStart;
951   size_t img_size = (char *)image->ImageEnd - img_begin;
952 
953   Elf *e = elf_memory(img_begin, img_size);
954   if (!e) {
955     DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
956     return 0;
957   }
958 
959   Elf64_Ehdr *eh64 = elf64_getehdr(e);
960 
961   if (!eh64) {
962     DP("Unable to get machine ID from ELF file!\n");
963     elf_end(e);
964     return 0;
965   }
966 
967   uint32_t Flags = eh64->e_flags;
968 
969   elf_end(e);
970   DP("ELF Flags: 0x%x\n", Flags);
971   return Flags;
972 }
973 } // namespace
974 
975 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
976   return elf_machine_id_is_amdgcn(image);
977 }
978 
979 int __tgt_rtl_number_of_devices() {
980   // If the construction failed, no methods are safe to call
981   if (DeviceInfo.ConstructionSucceeded) {
982     return DeviceInfo.NumberOfDevices;
983   } else {
984     DP("AMDGPU plugin construction failed. Zero devices available\n");
985     return 0;
986   }
987 }
988 
989 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
990   DP("Init requires flags to %ld\n", RequiresFlags);
991   DeviceInfo.RequiresFlags = RequiresFlags;
992   return RequiresFlags;
993 }
994 
995 namespace {
996 template <typename T> bool enforce_upper_bound(T *value, T upper) {
997   bool changed = *value > upper;
998   if (changed) {
999     *value = upper;
1000   }
1001   return changed;
1002 }
1003 } // namespace
1004 
1005 int32_t __tgt_rtl_init_device(int device_id) {
1006   hsa_status_t err;
1007 
1008   // this is per device id init
1009   DP("Initialize the device id: %d\n", device_id);
1010 
1011   hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
1012 
  // Get number of Compute Units
1014   uint32_t compute_units = 0;
1015   err = hsa_agent_get_info(
1016       agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
1017       &compute_units);
1018   if (err != HSA_STATUS_SUCCESS) {
1019     DeviceInfo.ComputeUnits[device_id] = 1;
    DP("Error getting compute units: setting to 1\n");
1021   } else {
1022     DeviceInfo.ComputeUnits[device_id] = compute_units;
    DP("Using %d compute units per grid\n",
       DeviceInfo.ComputeUnits[device_id]);
1024   }
1025 
  char GetInfoName[64]; // 64 is the maximum size returned by get_info
1027   err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
1028                            (void *)GetInfoName);
1029   if (err)
1030     DeviceInfo.GPUName[device_id] = "--unknown gpu--";
1031   else {
1032     DeviceInfo.GPUName[device_id] = GetInfoName;
1033   }
1034 
1035   if (print_kernel_trace & STARTUP_DETAILS)
1036     DP("Device#%-2d CU's: %2d %s\n", device_id,
1037        DeviceInfo.ComputeUnits[device_id],
1038        DeviceInfo.GPUName[device_id].c_str());
1039 
1040   // Query attributes to determine number of threads/block and blocks/grid.
1041   uint16_t workgroup_max_dim[3];
1042   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
1043                            &workgroup_max_dim);
1044   if (err != HSA_STATUS_SUCCESS) {
1045     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Error getting grid dims: num groups: %d\n",
1047        RTLDeviceInfoTy::DefaultNumTeams);
1048   } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
1049     DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
1050     DP("Using %d ROCm blocks per grid\n",
1051        DeviceInfo.GroupsPerDevice[device_id]);
1052   } else {
1053     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
1054     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
1055        "at the hard limit\n",
1056        workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
1057   }
1058 
1059   // Get thread limit
1060   hsa_dim3_t grid_max_dim;
1061   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
1062   if (err == HSA_STATUS_SUCCESS) {
1063     DeviceInfo.ThreadsPerGroup[device_id] =
1064         reinterpret_cast<uint32_t *>(&grid_max_dim)[0] /
1065         DeviceInfo.GroupsPerDevice[device_id];
1066 
1067     if (DeviceInfo.ThreadsPerGroup[device_id] == 0) {
1068       DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
1069       DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1070     } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id],
1071                                    RTLDeviceInfoTy::Max_WG_Size)) {
1072       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1073     } else {
      DP("Using ROCm queried thread limit: %d\n",
1075          DeviceInfo.ThreadsPerGroup[device_id]);
1076     }
1077   } else {
1078     DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
    DP("Error getting max block dimension, using default: %d\n",
1080        RTLDeviceInfoTy::Max_WG_Size);
1081   }
1082 
1083   // Get wavefront size
1084   uint32_t wavefront_size = 0;
1085   err =
1086       hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
1087   if (err == HSA_STATUS_SUCCESS) {
1088     DP("Queried wavefront size: %d\n", wavefront_size);
1089     DeviceInfo.WarpSize[device_id] = wavefront_size;
1090   } else {
1091     // TODO: Burn the wavefront size into the code object
1092     DP("Warning: Unknown wavefront size, assuming 64\n");
1093     DeviceInfo.WarpSize[device_id] = 64;
1094   }
1095 
1096   // Adjust teams to the env variables
1097 
1098   if (DeviceInfo.Env.TeamLimit > 0 &&
1099       (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id],
1100                            DeviceInfo.Env.TeamLimit))) {
1101     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
1102        DeviceInfo.Env.TeamLimit);
1103   }
1104 
1105   // Set default number of teams
1106   if (DeviceInfo.Env.NumTeams > 0) {
1107     DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams;
1108     DP("Default number of teams set according to environment %d\n",
1109        DeviceInfo.Env.NumTeams);
1110   } else {
1111     char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
1112     int TeamsPerCU = DefaultTeamsPerCU;
1113     if (TeamsPerCUEnvStr) {
1114       TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
1115     }
1116 
1117     DeviceInfo.NumTeams[device_id] =
1118         TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
1119     DP("Default number of teams = %d * number of compute units %d\n",
1120        TeamsPerCU, DeviceInfo.ComputeUnits[device_id]);
1121   }
1122 
1123   if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id],
1124                           DeviceInfo.GroupsPerDevice[device_id])) {
1125     DP("Default number of teams exceeds device limit, capping at %d\n",
1126        DeviceInfo.GroupsPerDevice[device_id]);
1127   }
1128 
1129   // Adjust threads to the env variables
1130   if (DeviceInfo.Env.TeamThreadLimit > 0 &&
1131       (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
1132                            DeviceInfo.Env.TeamThreadLimit))) {
1133     DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
1134        DeviceInfo.Env.TeamThreadLimit);
1135   }
1136 
1137   // Set default number of threads
1138   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
1139   DP("Default number of threads set according to library's default %d\n",
1140      RTLDeviceInfoTy::Default_WG_Size);
1141   if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
1142                           DeviceInfo.ThreadsPerGroup[device_id])) {
1143     DP("Default number of threads exceeds device limit, capping at %d\n",
1144        DeviceInfo.ThreadsPerGroup[device_id]);
1145   }
1146 
1147   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
1148      device_id, DeviceInfo.GroupsPerDevice[device_id],
1149      DeviceInfo.ThreadsPerGroup[device_id]);
1150 
1151   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
1152      DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
1153      DeviceInfo.GroupsPerDevice[device_id],
1154      DeviceInfo.GroupsPerDevice[device_id] *
1155          DeviceInfo.ThreadsPerGroup[device_id]);
1156 
1157   return OFFLOAD_SUCCESS;
1158 }
1159 
1160 namespace {
1161 Elf64_Shdr *find_only_SHT_HASH(Elf *elf) {
1162   size_t N;
1163   int rc = elf_getshdrnum(elf, &N);
1164   if (rc != 0) {
1165     return nullptr;
1166   }
1167 
1168   Elf64_Shdr *result = nullptr;
1169   for (size_t i = 0; i < N; i++) {
1170     Elf_Scn *scn = elf_getscn(elf, i);
1171     if (scn) {
1172       Elf64_Shdr *shdr = elf64_getshdr(scn);
1173       if (shdr) {
1174         if (shdr->sh_type == SHT_HASH) {
1175           if (result == nullptr) {
1176             result = shdr;
1177           } else {
1178             // multiple SHT_HASH sections not handled
1179             return nullptr;
1180           }
1181         }
1182       }
1183     }
1184   }
1185   return result;
1186 }
1187 
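// Resolve a symbol through the ELF SysV .hash section without loading the
// image: hash the name, walk the chain of the matching bucket in the symbol
// table referenced by sh_link, and compare names via the associated strtab.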
1188 const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash,
1189                             const char *symname) {
1190 
1191   assert(section_hash);
1192   size_t section_symtab_index = section_hash->sh_link;
1193   Elf64_Shdr *section_symtab =
1194       elf64_getshdr(elf_getscn(elf, section_symtab_index));
1195   size_t section_strtab_index = section_symtab->sh_link;
1196 
1197   const Elf64_Sym *symtab =
1198       reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset);
1199 
1200   const uint32_t *hashtab =
1201       reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset);
1202 
1203   // Layout:
1204   // nbucket
1205   // nchain
1206   // bucket[nbucket]
1207   // chain[nchain]
1208   uint32_t nbucket = hashtab[0];
1209   const uint32_t *bucket = &hashtab[2];
1210   const uint32_t *chain = &hashtab[nbucket + 2];
1211 
1212   const size_t max = strlen(symname) + 1;
1213   const uint32_t hash = elf_hash(symname);
1214   for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) {
1215     char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name);
1216     if (strncmp(symname, n, max) == 0) {
1217       return &symtab[i];
1218     }
1219   }
1220 
1221   return nullptr;
1222 }
1223 
1224 struct symbol_info {
1225   void *addr = nullptr;
1226   uint32_t size = UINT32_MAX;
1227   uint32_t sh_type = SHT_NULL;
1228 };
1229 
1230 int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
1231                                     symbol_info *res) {
1232   if (elf_kind(elf) != ELF_K_ELF) {
1233     return 1;
1234   }
1235 
1236   Elf64_Shdr *section_hash = find_only_SHT_HASH(elf);
1237   if (!section_hash) {
1238     return 1;
1239   }
1240 
1241   const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname);
1242   if (!sym) {
1243     return 1;
1244   }
1245 
1246   if (sym->st_size > UINT32_MAX) {
1247     return 1;
1248   }
1249 
1250   if (sym->st_shndx == SHN_UNDEF) {
1251     return 1;
1252   }
1253 
1254   Elf_Scn *section = elf_getscn(elf, sym->st_shndx);
1255   if (!section) {
1256     return 1;
1257   }
1258 
1259   Elf64_Shdr *header = elf64_getshdr(section);
1260   if (!header) {
1261     return 1;
1262   }
1263 
1264   res->addr = sym->st_value + base;
1265   res->size = static_cast<uint32_t>(sym->st_size);
1266   res->sh_type = header->sh_type;
1267   return 0;
1268 }
1269 
1270 int get_symbol_info_without_loading(char *base, size_t img_size,
1271                                     const char *symname, symbol_info *res) {
1272   Elf *elf = elf_memory(base, img_size);
1273   if (elf) {
1274     int rc = get_symbol_info_without_loading(elf, base, symname, res);
1275     elf_end(elf);
1276     return rc;
1277   }
1278   return 1;
1279 }
1280 
1281 hsa_status_t interop_get_symbol_info(char *base, size_t img_size,
1282                                      const char *symname, void **var_addr,
1283                                      uint32_t *var_size) {
1284   symbol_info si;
1285   int rc = get_symbol_info_without_loading(base, img_size, symname, &si);
1286   if (rc == 0) {
1287     *var_addr = si.addr;
1288     *var_size = si.size;
1289     return HSA_STATUS_SUCCESS;
1290   } else {
1291     return HSA_STATUS_ERROR;
1292   }
1293 }
1294 
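// Same callable-through-void* shim as the hsa:: wrappers above: lets the
// caller pass a lambda that can patch the raw image bytes (e.g. the device
// environment) immediately before core::RegisterModuleFromMemory loads them.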
1295 template <typename C>
1296 hsa_status_t module_register_from_memory_to_place(
1297     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1298     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1299     void *module_bytes, size_t module_size, int DeviceId, C cb,
1300     std::vector<hsa_executable_t> &HSAExecutables) {
1301   auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t {
1302     C *unwrapped = static_cast<C *>(cb_state);
1303     return (*unwrapped)(data, size);
1304   };
1305   return core::RegisterModuleFromMemory(
1306       KernelInfoTable, SymbolInfoTable, module_bytes, module_size,
1307       DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&cb),
1308       HSAExecutables);
1309 }
1310 } // namespace
1311 
1312 static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) {
1313   uint64_t device_State_bytes = 0;
1314   {
1315     // If this is the deviceRTL, get the state variable size
1316     symbol_info size_si;
1317     int rc = get_symbol_info_without_loading(
1318         ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si);
1319 
1320     if (rc == 0) {
1321       if (size_si.size != sizeof(uint64_t)) {
1322         DP("Found device_State_size variable with wrong size\n");
1323         return 0;
1324       }
1325 
1326       // Read number of bytes directly from the elf
1327       memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t));
1328     }
1329   }
1330   return device_State_bytes;
1331 }
1332 
1333 static __tgt_target_table *
1334 __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
1335 
1339 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1340                                           __tgt_device_image *image) {
1341   DeviceInfo.load_run_lock.lock();
1342   __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
1343   DeviceInfo.load_run_lock.unlock();
1344   return res;
1345 }
1346 
1347 struct device_environment {
  // Initialise an omptarget_device_environmentTy in the deviceRTL. This
  // patches around differences in the deviceRTL between trunk, aomp and
  // rocmcc. Over time these differences will tend to zero and this class can
  // be simplified.
1352   // Symbol may be in .data or .bss, and may be missing fields:
1353   //  - aomp has debug_level, num_devices, device_num
1354   //  - trunk has debug_level
1355   //  - under review in trunk is debug_level, device_num
1356   //  - rocmcc matches aomp, patch to swap num_devices and device_num
1357 
1358   // The symbol may also have been deadstripped because the device side
1359   // accessors were unused.
1360 
1361   // If the symbol is in .data (aomp, rocm) it can be written directly.
1362   // If it is in .bss, we must wait for it to be allocated space on the
1363   // gpu (trunk) and initialize after loading.
1364   const char *sym() { return "omptarget_device_environment"; }
1365 
1366   omptarget_device_environmentTy host_device_env;
1367   symbol_info si;
1368   bool valid = false;
1369 
1370   __tgt_device_image *image;
1371   const size_t img_size;
1372 
1373   device_environment(int device_id, int number_devices,
1374                      __tgt_device_image *image, const size_t img_size)
1375       : image(image), img_size(img_size) {
1376 
1377     host_device_env.num_devices = number_devices;
1378     host_device_env.device_num = device_id;
1379     host_device_env.debug_level = 0;
1380 #ifdef OMPTARGET_DEBUG
1381     if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
1382       host_device_env.debug_level = std::stoi(envStr);
1383     }
1384 #endif
1385 
1386     int rc = get_symbol_info_without_loading((char *)image->ImageStart,
1387                                              img_size, sym(), &si);
1388     if (rc != 0) {
1389       DP("Finding global device environment '%s' - symbol missing.\n", sym());
1390       return;
1391     }
1392 
1393     if (si.size > sizeof(host_device_env)) {
1394       DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size,
1395          sizeof(host_device_env));
1396       return;
1397     }
1398 
1399     valid = true;
1400   }
1401 
1402   bool in_image() { return si.sh_type != SHT_NOBITS; }
1403 
1404   hsa_status_t before_loading(void *data, size_t size) {
1405     if (valid) {
1406       if (in_image()) {
1407         DP("Setting global device environment before load (%u bytes)\n",
1408            si.size);
1409         uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
1410         void *pos = (char *)data + offset;
1411         memcpy(pos, &host_device_env, si.size);
1412       }
1413     }
1414     return HSA_STATUS_SUCCESS;
1415   }
1416 
1417   hsa_status_t after_loading() {
1418     if (valid) {
1419       if (!in_image()) {
1420         DP("Setting global device environment after load (%u bytes)\n",
1421            si.size);
1422         int device_id = host_device_env.device_num;
1423         auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
1424         void *state_ptr;
1425         uint32_t state_ptr_size;
1426         hsa_status_t err = interop_hsa_get_symbol_info(
1427             SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size);
1428         if (err != HSA_STATUS_SUCCESS) {
1429           DP("failed to find %s in loaded image\n", sym());
1430           return err;
1431         }
1432 
1433         if (state_ptr_size != si.size) {
1434           DP("Symbol had size %u before loading, %u after\n", state_ptr_size,
1435              si.size);
1436           return HSA_STATUS_ERROR;
1437         }
1438 
1439         return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env,
1440                                                     state_ptr_size, device_id);
1441       }
1442     }
1443     return HSA_STATUS_SUCCESS;
1444   }
1445 };
1446 
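// Device-side calloc: the size is rounded up to a multiple of 4 because
// hsa_amd_memory_fill zero-fills in 32-bit words (hence count = rounded / 4).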
1447 static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) {
1448   uint64_t rounded = 4 * ((size + 3) / 4);
1449   void *ptr;
1450   hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
1451   hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, rounded, 0, &ptr);
1452   if (err != HSA_STATUS_SUCCESS) {
1453     return err;
1454   }
1455 
1456   hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
1457   if (rc != HSA_STATUS_SUCCESS) {
1458     DP("zero fill device_state failed with %u\n", rc);
1459     core::Runtime::Memfree(ptr);
1460     return HSA_STATUS_ERROR;
1461   }
1462 
1463   *ret_ptr = ptr;
1464   return HSA_STATUS_SUCCESS;
1465 }
1466 
1467 static bool image_contains_symbol(void *data, size_t size, const char *sym) {
1468   symbol_info si;
1469   int rc = get_symbol_info_without_loading((char *)data, size, sym, &si);
1470   return (rc == 0) && (si.addr != nullptr);
1471 }
1472 
1473 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
1474                                                  __tgt_device_image *image) {
1475   // This function loads the device image onto gpu[device_id] and does other
1476   // per-image initialization work. Specifically:
1477   //
1478   // - Initialize an omptarget_device_environmentTy instance embedded in the
1479   //   image at the symbol "omptarget_device_environment"
1480   //   Fields debug_level, device_num, num_devices. Used by the deviceRTL.
1481   //
1482   // - Allocate a large array per-gpu (could be moved to init_device)
1483   //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
1484   //   - Allocate at least that many bytes of gpu memory
1485   //   - Zero initialize it
1486   //   - Write the pointer to the symbol omptarget_nvptx_device_State
1487   //
1488   // - Pulls some per-kernel information together from various sources and
1489   //   records it in the KernelsList for quicker access later
1490   //
1491   // The initialization can be done before or after loading the image onto the
1492   // gpu. This function presently does a mixture. Using the hsa api to get/set
1493   // the information is simpler to implement, in exchange for more complicated
1494   // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
1495   // back from the gpu vs a hashtable lookup on the host.
1496 
1497   const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
1498 
1499   DeviceInfo.clearOffloadEntriesTable(device_id);
1500 
1501   // We do not need to set the ELF version because the caller of this function
1502   // had to do that to decide the right runtime to use
1503 
1504   if (!elf_machine_id_is_amdgcn(image)) {
1505     return NULL;
1506   }
1507 
1508   {
1509     auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image,
1510                                   img_size);
1511 
1512     auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id];
1513     auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
1514     hsa_status_t err = module_register_from_memory_to_place(
1515         KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id,
1516         [&](void *data, size_t size) {
1517           if (image_contains_symbol(data, size, "needs_hostcall_buffer")) {
1518             __atomic_store_n(&DeviceInfo.hostcall_required, true,
1519                              __ATOMIC_RELEASE);
1520           }
1521           return env.before_loading(data, size);
1522         },
1523         DeviceInfo.HSAExecutables);
1524 
1525     check("Module registering", err);
1526     if (err != HSA_STATUS_SUCCESS) {
1527       const char *DeviceName = DeviceInfo.GPUName[device_id].c_str();
1528       const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image));
1529 
1530       if (strcmp(DeviceName, ElfName) != 0) {
        DP("Possible gpu arch mismatch: device:%s, image:%s; please check the"
           " compiler flag: -march=<gpu>\n",
           DeviceName, ElfName);
1534       } else {
1535         DP("Error loading image onto GPU: %s\n", get_error_string(err));
1536       }
1537 
1538       return NULL;
1539     }
1540 
1541     err = env.after_loading();
1542     if (err != HSA_STATUS_SUCCESS) {
1543       return NULL;
1544     }
1545   }
1546 
1547   DP("AMDGPU module successfully loaded!\n");
1548 
1549   {
    // The device_State array is either a large array in .bss or a void* that
    // needs to be pointed at an allocation of device_State_bytes bytes of
    // device memory. If the symbol is absent, it has been dead-stripped and
    // needs no setup.
1553 
1554     void *state_ptr;
1555     uint32_t state_ptr_size;
1556     auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
1557     hsa_status_t err = interop_hsa_get_symbol_info(
1558         SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr,
1559         &state_ptr_size);
1560 
1561     if (err != HSA_STATUS_SUCCESS) {
1562       DP("No device_state symbol found, skipping initialization\n");
1563     } else {
1564       if (state_ptr_size < sizeof(void *)) {
1565         DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size,
1566            sizeof(void *));
1567         return NULL;
1568       }
1569 
      // If the symbol is larger than a void*, assume it is a bss array and no
      // further initialization is required. Only try to set up a pointer when
      // it is exactly sizeof(void*).
1573       if (state_ptr_size == sizeof(void *)) {
1574         uint64_t device_State_bytes =
1575             get_device_State_bytes((char *)image->ImageStart, img_size);
1576         if (device_State_bytes == 0) {
1577           DP("Can't initialize device_State, missing size information\n");
1578           return NULL;
1579         }
1580 
1581         auto &dss = DeviceInfo.deviceStateStore[device_id];
1582         if (dss.first.get() == nullptr) {
1583           assert(dss.second == 0);
1584           void *ptr = NULL;
1585           hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id);
1586           if (err != HSA_STATUS_SUCCESS) {
1587             DP("Failed to allocate device_state array\n");
1588             return NULL;
1589           }
1590           dss = {
1591               std::unique_ptr<void, RTLDeviceInfoTy::implFreePtrDeletor>{ptr},
1592               device_State_bytes,
1593           };
1594         }
1595 
1596         void *ptr = dss.first.get();
1597         if (device_State_bytes != dss.second) {
1598           DP("Inconsistent sizes of device_State unsupported\n");
1599           return NULL;
1600         }
1601 
1602         // write ptr to device memory so it can be used by later kernels
1603         err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
1604                                                    sizeof(void *), device_id);
1605         if (err != HSA_STATUS_SUCCESS) {
1606           DP("memcpy install of state_ptr failed\n");
1607           return NULL;
1608         }
1609       }
1610     }
1611   }
1612 
  // Here, we take advantage of the data that is appended after img_end to get
  // the names of the symbols we need to load. This data consists of the host
  // entries begin and end as well as the target name (see the offloading
  // linker script creation in the clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name.
1620 
1621   __tgt_offload_entry *HostBegin = image->EntriesBegin;
1622   __tgt_offload_entry *HostEnd = image->EntriesEnd;
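  // Entries with a non-zero size describe global variables; entries with size
  // zero are kernels.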
1623 
1624   for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
1625 
1626     if (!e->addr) {
      // The host should always have something in the address field to
      // uniquely identify the target region.
1629       DP("Analyzing host entry '<null>' (size = %lld)...\n",
1630          (unsigned long long)e->size);
1631       return NULL;
1632     }
1633 
1634     if (e->size) {
1635       __tgt_offload_entry entry = *e;
1636 
1637       void *varptr;
1638       uint32_t varsize;
1639 
1640       auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
1641       hsa_status_t err = interop_hsa_get_symbol_info(
1642           SymbolInfoMap, device_id, e->name, &varptr, &varsize);
1643 
1644       if (err != HSA_STATUS_SUCCESS) {
1645         // Inform the user what symbol prevented offloading
1646         DP("Loading global '%s' (Failed)\n", e->name);
1647         return NULL;
1648       }
1649 
1650       if (varsize != e->size) {
1651         DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
1652            varsize, e->size);
1653         return NULL;
1654       }
1655 
1656       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
1657          DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
1658       entry.addr = (void *)varptr;
1659 
1660       DeviceInfo.addOffloadEntry(device_id, entry);
1661 
1662       if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
1663           e->flags & OMP_DECLARE_TARGET_LINK) {
        // If unified memory is present, any target link variables can access
        // host addresses directly; there is no longer a need for device
        // copies.
1667         err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
1668                                                    sizeof(void *), device_id);
1669         if (err != HSA_STATUS_SUCCESS)
1670           DP("Error when copying USM\n");
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
1674       }
1675 
1676       continue;
1677     }
1678 
    DP("Looking up kernel: %s (name length: %zu)\n", e->name, strlen(e->name));
1680 
1681     uint32_t kernarg_segment_size;
1682     auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id];
1683     hsa_status_t err = interop_hsa_get_kernel_info(
1684         KernelInfoMap, device_id, e->name,
1685         HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1686         &kernarg_segment_size);
1687 
    // Each arg is a void * in this OpenMP implementation
    uint32_t arg_num = kernarg_segment_size / sizeof(void *);
    std::vector<size_t> arg_sizes(arg_num, sizeof(void *));
1695 
    // Default value GENERIC (in case the symbol is missing from the code
    // object)
1697     int8_t ExecModeVal = ExecutionModeType::GENERIC;
1698 
1699     // get flat group size if present, else Default_WG_Size
1700     int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1701 
1702     // get Kernel Descriptor if present.
    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
1704     struct KernDescValType {
1705       uint16_t Version;
1706       uint16_t TSize;
1707       uint16_t WG_Size;
1708       uint8_t Mode;
1709     };
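    // The compiler emits a per-kernel <name>_kern_desc global with these
    // fields; if it is absent, the code falls back to the separate _exec_mode
    // and _wg_size symbols below.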
1710     struct KernDescValType KernDescVal;
1711     std::string KernDescNameStr(e->name);
1712     KernDescNameStr += "_kern_desc";
1713     const char *KernDescName = KernDescNameStr.c_str();
1714 
1715     void *KernDescPtr;
1716     uint32_t KernDescSize;
1717     void *CallStackAddr = nullptr;
1718     err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1719                                   KernDescName, &KernDescPtr, &KernDescSize);
1720 
1721     if (err == HSA_STATUS_SUCCESS) {
1722       if ((size_t)KernDescSize != sizeof(KernDescVal))
1723         DP("Loading global computation properties '%s' - size mismatch (%u != "
1724            "%lu)\n",
1725            KernDescName, KernDescSize, sizeof(KernDescVal));
1726 
1727       memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
1728 
1729       // Check structure size against recorded size.
1730       if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);

      DP("After loading global for %s KernDesc\n", KernDescName);
1735       DP("KernDesc: Version: %d\n", KernDescVal.Version);
1736       DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
1737       DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
1738       DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
1739 
1740       // Get ExecMode
1741       ExecModeVal = KernDescVal.Mode;
1742       DP("ExecModeVal %d\n", ExecModeVal);
1743       if (KernDescVal.WG_Size == 0) {
1744         KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
1745         DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
1746       }
1747       WGSizeVal = KernDescVal.WG_Size;
1748       DP("WGSizeVal %d\n", WGSizeVal);
1749       check("Loading KernDesc computation property", err);
1750     } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found\n", KernDescName);

      // Execution mode (default GENERIC)
1754       std::string ExecModeNameStr(e->name);
1755       ExecModeNameStr += "_exec_mode";
1756       const char *ExecModeName = ExecModeNameStr.c_str();
1757 
1758       void *ExecModePtr;
1759       uint32_t varsize;
1760       err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1761                                     ExecModeName, &ExecModePtr, &varsize);
1762 
1763       if (err == HSA_STATUS_SUCCESS) {
1764         if ((size_t)varsize != sizeof(int8_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
             "!= %lu)\n",
             ExecModeName, varsize, sizeof(int8_t));
1768           return NULL;
1769         }
1770 
1771         memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
1772 
1773         DP("After loading global for %s ExecMode = %d\n", ExecModeName,
1774            ExecModeVal);
1775 
1776         if (ExecModeVal < 0 || ExecModeVal > 2) {
          DP("Error: wrong exec_mode value specified in HSA code object file: "
             "%d\n",
             ExecModeVal);
1780           return NULL;
1781         }
1782       } else {
1783         DP("Loading global exec_mode '%s' - symbol missing, using default "
1784            "value "
1785            "GENERIC (1)\n",
1786            ExecModeName);
1787       }
1788       check("Loading computation property", err);
1789 
1790       // Flat group size
1791       std::string WGSizeNameStr(e->name);
1792       WGSizeNameStr += "_wg_size";
1793       const char *WGSizeName = WGSizeNameStr.c_str();
1794 
1795       void *WGSizePtr;
1796       uint32_t WGSize;
1797       err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1798                                     WGSizeName, &WGSizePtr, &WGSize);
1799 
1800       if (err == HSA_STATUS_SUCCESS) {
1801         if ((size_t)WGSize != sizeof(int16_t)) {
1802           DP("Loading global computation properties '%s' - size mismatch (%u "
1803              "!= "
1804              "%lu)\n",
1805              WGSizeName, WGSize, sizeof(int16_t));
1806           return NULL;
1807         }
1808 
1809         memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
1810 
1811         DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
1812 
1813         if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
1814             WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
          DP("Error: wrong WGSize value specified in HSA code object file: "
             "%d\n",
             WGSizeVal);
1818           WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1819         }
1820       } else {
1821         DP("Warning: Loading WGSize '%s' - symbol not found, "
1822            "using default value %d\n",
1823            WGSizeName, WGSizeVal);
1824       }
1825 
1826       check("Loading WGSize computation property", err);
1827     }
1828 
1829     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
1830                                    CallStackAddr, e->name, kernarg_segment_size,
1831                                    DeviceInfo.KernArgPool));
1832     __tgt_offload_entry entry = *e;
1833     entry.addr = (void *)&KernelsList.back();
1834     DeviceInfo.addOffloadEntry(device_id, entry);
1835     DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
1836   }
1837 
1838   return DeviceInfo.getOffloadEntriesTable(device_id);
1839 }
1840 
1841 void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) {
1842   void *ptr = NULL;
1843   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1844 
1845   if (kind != TARGET_ALLOC_DEFAULT) {
1846     REPORT("Invalid target data allocation kind or requested allocator not "
1847            "implemented yet\n");
1848     return NULL;
1849   }
1850 
1851   hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id);
1852   hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr);
1853   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
1854      (long long unsigned)(Elf64_Addr)ptr);
1855   ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL;
1856   return ptr;
1857 }
1858 
1859 int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
1860                               int64_t size) {
1861   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1862   __tgt_async_info AsyncInfo;
1863   int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo);
1864   if (rc != OFFLOAD_SUCCESS)
1865     return OFFLOAD_FAIL;
1866 
1867   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1868 }
1869 
1870 int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
1871                                     int64_t size, __tgt_async_info *AsyncInfo) {
1872   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1873   if (AsyncInfo) {
1874     initAsyncInfo(AsyncInfo);
1875     return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo);
1876   } else {
1877     return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
1878   }
1879 }
1880 
1881 int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
1882                                 int64_t size) {
1883   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1884   __tgt_async_info AsyncInfo;
1885   int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo);
1886   if (rc != OFFLOAD_SUCCESS)
1887     return OFFLOAD_FAIL;
1888 
1889   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1890 }
1891 
1892 int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
1893                                       void *tgt_ptr, int64_t size,
1894                                       __tgt_async_info *AsyncInfo) {
1895   assert(AsyncInfo && "AsyncInfo is nullptr");
1896   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1897   initAsyncInfo(AsyncInfo);
1898   return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo);
1899 }
1900 
1901 int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
1902   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1903   hsa_status_t err;
1904   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
1905   err = core::Runtime::Memfree(tgt_ptr);
1906   if (err != HSA_STATUS_SUCCESS) {
    DP("Error when freeing device memory\n");
1908     return OFFLOAD_FAIL;
1909   }
1910   return OFFLOAD_SUCCESS;
1911 }
1912 
1913 // Determine launch values for kernel.
1914 struct launchVals {
1915   int WorkgroupSize;
1916   int GridSize;
1917 };
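// Illustrative example of the heuristic below (the constants are assumptions
// for the sake of the example): with Default_WG_Size = 256, Max_WG_Size =
// 1024, no relevant environment variables set and ConstWGSize >= 128, an SPMD
// kernel launched with thread_limit(128), no num_teams clause and a loop trip
// count of 10000 gets threadsPerGroup = 128 and
// num_groups = (10000 - 1) / 128 + 1 = 79 (subject to the Max_Teams cap), so
// GridSize = 79 * 128 = 10112 work-items.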
1918 launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1919                          int ConstWGSize, int ExecutionMode, int num_teams,
1920                          int thread_limit, uint64_t loop_tripcount,
1921                          int DeviceNumTeams) {
1922 
1923   int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
1924   int num_groups = 0;
1925 
1926   int Max_Teams =
1927       Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1928   if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
1929     Max_Teams = RTLDeviceInfoTy::HardTeamLimit;
1930 
1931   if (print_kernel_trace & STARTUP_DETAILS) {
1932     DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
1933     DP("Max_Teams: %d\n", Max_Teams);
1934     DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
1935     DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1936     DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
1937        RTLDeviceInfoTy::Default_WG_Size);
1938     DP("thread_limit: %d\n", thread_limit);
1939     DP("threadsPerGroup: %d\n", threadsPerGroup);
1940     DP("ConstWGSize: %d\n", ConstWGSize);
1941   }
1942   // check for thread_limit() clause
1943   if (thread_limit > 0) {
1944     threadsPerGroup = thread_limit;
1945     DP("Setting threads per block to requested %d\n", thread_limit);
1946     if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
1947       threadsPerGroup += WarpSize;
1948       DP("Adding master wavefront: +%d threads\n", WarpSize);
1949     }
1950     if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
1951       threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
1952       DP("Setting threads per block to maximum %d\n", threadsPerGroup);
1953     }
1954   }
1955   // check flat_max_work_group_size attr here
1956   if (threadsPerGroup > ConstWGSize) {
1957     threadsPerGroup = ConstWGSize;
1958     DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1959        threadsPerGroup);
1960   }
1961   if (print_kernel_trace & STARTUP_DETAILS)
1962     DP("threadsPerGroup: %d\n", threadsPerGroup);
1963   DP("Preparing %d threads\n", threadsPerGroup);
1964 
1965   // Set default num_groups (teams)
1966   if (Env.TeamLimit > 0)
1967     num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
1968   else
1969     num_groups = Max_Teams;
1970   DP("Set default num of groups %d\n", num_groups);
1971 
1972   if (print_kernel_trace & STARTUP_DETAILS) {
1973     DP("num_groups: %d\n", num_groups);
1974     DP("num_teams: %d\n", num_teams);
1975   }
1976 
  // Reduce num_groups if threadsPerGroup exceeds
  // RTLDeviceInfoTy::Default_WG_Size, i.e. when a thread_limit clause (plus
  // the master wavefront added for GENERIC) pushed it above the default.
  // FIXME: We can't distinguish between a constant or variable thread limit,
  // so we only handle constant thread_limits.
1982   if (threadsPerGroup >
1983       RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
1984     // Should we round threadsPerGroup up to nearest WarpSize
1985     // here?
1986     num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
1987 
1988   // check for num_teams() clause
1989   if (num_teams > 0) {
1990     num_groups = (num_teams < num_groups) ? num_teams : num_groups;
1991   }
1992   if (print_kernel_trace & STARTUP_DETAILS) {
1993     DP("num_groups: %d\n", num_groups);
1994     DP("Env.NumTeams %d\n", Env.NumTeams);
1995     DP("Env.TeamLimit %d\n", Env.TeamLimit);
1996   }
1997 
1998   if (Env.NumTeams > 0) {
1999     num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups;
2000     DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
2001   } else if (Env.TeamLimit > 0) {
2002     num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
2004   } else {
2005     if (num_teams <= 0) {
2006       if (loop_tripcount > 0) {
2007         if (ExecutionMode == SPMD) {
2008           // round up to the nearest integer
2009           num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
2010         } else if (ExecutionMode == GENERIC) {
2011           num_groups = loop_tripcount;
2012         } else /* ExecutionMode == SPMD_GENERIC */ {
2013           // This is a generic kernel that was transformed to use SPMD-mode
2014           // execution but uses Generic-mode semantics for scheduling.
2015           num_groups = loop_tripcount;
2016         }
2017         DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
2018            "threads per block %d\n",
2019            num_groups, loop_tripcount, threadsPerGroup);
2020       }
2021     } else {
2022       num_groups = num_teams;
2023     }
2024     if (num_groups > Max_Teams) {
2025       num_groups = Max_Teams;
2026       if (print_kernel_trace & STARTUP_DETAILS)
2027         DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams);
2028     }
2029     if (num_groups > num_teams && num_teams > 0) {
2030       num_groups = num_teams;
2031       if (print_kernel_trace & STARTUP_DETAILS)
2032         DP("Limiting num_groups %d to clause num_teams %d \n", num_groups,
2033            num_teams);
2034     }
2035   }
2036 
2037   // num_teams clause always honored, no matter what, unless DEFAULT is active.
2038   if (num_teams > 0) {
2039     num_groups = num_teams;
    // Cap num_groups to Env.MaxTeamsDefault if set.
2041     if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
2042       num_groups = Env.MaxTeamsDefault;
2043   }
2044   if (print_kernel_trace & STARTUP_DETAILS) {
2045     DP("threadsPerGroup: %d\n", threadsPerGroup);
2046     DP("num_groups: %d\n", num_groups);
    DP("loop_tripcount: %" PRIu64 "\n", loop_tripcount);
2048   }
2049   DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
2050      threadsPerGroup);
2051 
2052   launchVals res;
2053   res.WorkgroupSize = threadsPerGroup;
2054   res.GridSize = threadsPerGroup * num_groups;
2055   return res;
2056 }
2057 
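// Reserve a slot in the queue's ring buffer: atomically bump the write index,
// then spin until the packet processor's read index shows the reserved slot
// is free, i.e. packet_id < read_index + queue->size.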
2058 static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
2059   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
2060   bool full = true;
2061   while (full) {
2062     full =
2063         packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
2064   }
2065   return packet_id;
2066 }
2067 
2068 static int32_t __tgt_rtl_run_target_team_region_locked(
2069     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
2070     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
2071     int32_t thread_limit, uint64_t loop_tripcount);
2072 
2073 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
2074                                          void **tgt_args,
2075                                          ptrdiff_t *tgt_offsets,
2076                                          int32_t arg_num, int32_t num_teams,
2077                                          int32_t thread_limit,
2078                                          uint64_t loop_tripcount) {
2079 
2080   DeviceInfo.load_run_lock.lock_shared();
2081   int32_t res = __tgt_rtl_run_target_team_region_locked(
2082       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
2083       thread_limit, loop_tripcount);
2084 
2085   DeviceInfo.load_run_lock.unlock_shared();
2086   return res;
2087 }
2088 
2089 int32_t __tgt_rtl_run_target_team_region_locked(
2090     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
2091     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
2092     int32_t thread_limit, uint64_t loop_tripcount) {
  // Determine launch parameters for this kernel, build the AQL dispatch
  // packet, run it on the requested device and wait for completion.
2096 
2097   DP("Run target team region thread_limit %d\n", thread_limit);
2098 
  // All args are passed by reference: ptrs[i] holds the offset-adjusted device
  // pointer and args[i] points at it so it can be copied into the kernarg
  // segment below.
2100   std::vector<void *> args(arg_num);
2101   std::vector<void *> ptrs(arg_num);
2102 
2103   DP("Arg_num: %d\n", arg_num);
2104   for (int32_t i = 0; i < arg_num; ++i) {
2105     ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
2106     args[i] = &ptrs[i];
    DP("Offset-adjusted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
2108   }
2109 
2110   KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
2111 
2112   std::string kernel_name = std::string(KernelInfo->Name);
2113   auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
2114   if (KernelInfoTable[device_id].find(kernel_name) ==
2115       KernelInfoTable[device_id].end()) {
2116     DP("Kernel %s not found\n", kernel_name.c_str());
2117     return OFFLOAD_FAIL;
2118   }
2119 
2120   const atl_kernel_info_t KernelInfoEntry =
2121       KernelInfoTable[device_id][kernel_name];
2122   const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
2123   const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
2124   const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
2125   const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
2126   const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
2127 
2128   assert(arg_num == (int)KernelInfoEntry.num_args);
2129 
2130   /*
2131    * Set limit based on ThreadsPerGroup and GroupsPerDevice
2132    */
2133   launchVals LV =
2134       getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env,
2135                     KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
2136                     num_teams,      // From run_region arg
2137                     thread_limit,   // From run_region arg
2138                     loop_tripcount, // From run_region arg
2139                     DeviceInfo.NumTeams[KernelInfo->device_id]);
2140   const int GridSize = LV.GridSize;
2141   const int WorkgroupSize = LV.WorkgroupSize;
2142 
2143   if (print_kernel_trace >= LAUNCH) {
2144     int num_groups = GridSize / WorkgroupSize;
2145     // enum modes are SPMD, GENERIC, NONE 0,1,2
2146     // if doing rtl timing, print to stderr, unless stdout requested.
2147     bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
2148     fprintf(traceToStdout ? stdout : stderr,
2149             "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
2150             "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
2151             "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
2152             device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
2153             arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
2154             group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
2155             vgpr_spill_count, loop_tripcount, KernelInfo->Name);
2156   }
2157 
2158   // Run on the device.
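  // The dispatch follows the usual AQL sequence: reserve a packet slot, fill
  // in the kernel dispatch packet and its kernarg block, publish the packet by
  // storing its header with release semantics, ring the doorbell, then block
  // on the completion signal.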
2159   {
2160     hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
2161     if (!queue) {
2162       return OFFLOAD_FAIL;
2163     }
2164     uint64_t packet_id = acquire_available_packet_id(queue);
2165 
2166     const uint32_t mask = queue->size - 1; // size is a power of 2
2167     hsa_kernel_dispatch_packet_t *packet =
2168         (hsa_kernel_dispatch_packet_t *)queue->base_address +
2169         (packet_id & mask);
2170 
2171     // packet->header is written last
2172     packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
2173     packet->workgroup_size_x = WorkgroupSize;
2174     packet->workgroup_size_y = 1;
2175     packet->workgroup_size_z = 1;
2176     packet->reserved0 = 0;
2177     packet->grid_size_x = GridSize;
2178     packet->grid_size_y = 1;
2179     packet->grid_size_z = 1;
2180     packet->private_segment_size = KernelInfoEntry.private_segment_size;
2181     packet->group_segment_size = KernelInfoEntry.group_segment_size;
2182     packet->kernel_object = KernelInfoEntry.kernel_object;
2183     packet->kernarg_address = 0;     // use the block allocator
2184     packet->reserved2 = 0;           // impl writes id_ here
2185     packet->completion_signal = {0}; // may want a pool of signals
2186 
2187     KernelArgPool *ArgPool = nullptr;
2188     void *kernarg = nullptr;
2189     {
2190       auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
2191       if (it != KernelArgPoolMap.end()) {
2192         ArgPool = (it->second).get();
2193       }
2194     }
2195     if (!ArgPool) {
2196       DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
2197          device_id);
2198     }
2199     {
2200       if (ArgPool) {
2201         assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
2202         kernarg = ArgPool->allocate(arg_num);
2203       }
2204       if (!kernarg) {
2205         DP("Allocate kernarg failed\n");
2206         return OFFLOAD_FAIL;
2207       }
2208 
2209       // Copy explicit arguments
2210       for (int i = 0; i < arg_num; i++) {
2211         memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
2212       }
2213 
      // Initialize implicit arguments. TODO: Which of these can be dropped?
2215       impl_implicit_args_t *impl_args =
2216           reinterpret_cast<impl_implicit_args_t *>(
2217               static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
2218       memset(impl_args, 0,
2219              sizeof(impl_implicit_args_t)); // may not be necessary
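      // The implicit argument block sits immediately after the explicit
      // kernarg segment; the KernelArgPool allocation is assumed to reserve
      // space for it.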
2220       impl_args->offset_x = 0;
2221       impl_args->offset_y = 0;
2222       impl_args->offset_z = 0;
2223 
2224       // assign a hostcall buffer for the selected Q
2225       if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) {
2226         // hostrpc_assign_buffer is not thread safe, and this function is
2227         // under a multiple reader lock, not a writer lock.
2228         static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
2229         pthread_mutex_lock(&hostcall_init_lock);
2230         impl_args->hostcall_ptr = hostrpc_assign_buffer(
2231             DeviceInfo.HSAAgents[device_id], queue, device_id);
2232         pthread_mutex_unlock(&hostcall_init_lock);
2233         if (!impl_args->hostcall_ptr) {
          DP("hostrpc_assign_buffer failed; the gpu would dereference a null "
             "pointer and fault\n");
2236           return OFFLOAD_FAIL;
2237         }
2238       }
2239 
2240       packet->kernarg_address = kernarg;
2241     }
2242 
2243     hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
2244     if (s.handle == 0) {
2245       DP("Failed to get signal instance\n");
2246       return OFFLOAD_FAIL;
2247     }
2248     packet->completion_signal = s;
2249     hsa_signal_store_relaxed(packet->completion_signal, 1);
2250 
2251     // Publish the packet indicating it is ready to be processed
2252     core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
2253                                core::create_header(), packet->setup);
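    // packet_store_release stores the header and setup words together with
    // release semantics, so the packet processor never observes a
    // partially-initialized packet.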
2254 
2255     // Since the packet is already published, its contents must not be
2256     // accessed any more
2257     hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
2258 
2259     while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
2260                                      HSA_WAIT_STATE_BLOCKED) != 0)
2261       ;
2262 
2263     assert(ArgPool);
2264     ArgPool->deallocate(kernarg);
2265     DeviceInfo.FreeSignalPool.push(s);
2266   }
2267 
2268   DP("Kernel completed\n");
2269   return OFFLOAD_SUCCESS;
2270 }
2271 
2272 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
2273                                     void **tgt_args, ptrdiff_t *tgt_offsets,
2274                                     int32_t arg_num) {
2275   // use one team and one thread
2276   // fix thread num
2277   int32_t team_num = 1;
2278   int32_t thread_limit = 0; // use default
2279   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
2280                                           tgt_offsets, arg_num, team_num,
2281                                           thread_limit, 0);
2282 }
2283 
2284 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
2285                                           void *tgt_entry_ptr, void **tgt_args,
2286                                           ptrdiff_t *tgt_offsets,
2287                                           int32_t arg_num,
2288                                           __tgt_async_info *AsyncInfo) {
2289   assert(AsyncInfo && "AsyncInfo is nullptr");
2290   initAsyncInfo(AsyncInfo);
2291 
2292   // use one team and one thread
2293   // fix thread num
2294   int32_t team_num = 1;
2295   int32_t thread_limit = 0; // use default
2296   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
2297                                           tgt_offsets, arg_num, team_num,
2298                                           thread_limit, 0);
2299 }
2300 
2301 int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {
2302   assert(AsyncInfo && "AsyncInfo is nullptr");
2303 
  // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
  // invariant is not ensured by devices.cpp for amdgcn.
  // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2307   if (AsyncInfo->Queue) {
2308     finiAsyncInfo(AsyncInfo);
2309   }
2310   return OFFLOAD_SUCCESS;
2311 }
2312 
2313 namespace core {
2314 hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
2315   return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
2316                                      &DeviceInfo.HSAAgents[0], NULL, ptr);
2317 }
2318 
2319 } // namespace core
2320