1 //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for the AMD HSA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <assert.h>
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 #include <functional>
19 #include <libelf.h>
20 #include <list>
21 #include <memory>
22 #include <mutex>
23 #include <shared_mutex>
24 #include <unordered_map>
25 #include <vector>
26 
27 #include "interop_hsa.h"
28 #include "impl_runtime.h"
29 
30 #include "internal.h"
31 #include "rt.h"
32 
33 #include "get_elf_mach_gfx_name.h"
34 #include "omptargetplugin.h"
35 #include "print_tracing.h"
36 
37 #include "llvm/Frontend/OpenMP/OMPConstants.h"
38 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
39 
40 // hostrpc interface. FIXME: consider moving to its own include. These are
41 // statically linked into amdgpu/plugin if present from hostrpc_services.a,
42 // linked as --whole-archive to override the weak symbols that are used to
43 // implement a fallback for toolchains that do not yet have a hostrpc library.
44 extern "C" {
45 unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q,
46                                     uint32_t device_id);
47 hsa_status_t hostrpc_init();
48 hsa_status_t hostrpc_terminate();
49 
50 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
51 __attribute__((weak)) hsa_status_t hostrpc_terminate() {
52   return HSA_STATUS_SUCCESS;
53 }
54 __attribute__((weak)) unsigned long
55 hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) {
56   DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
57      "missing\n",
58      device_id);
59   return 0;
60 }
61 }
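// A note on the fallback above: when hostrpc_services.a is present it is
// linked --whole-archive, so its strong definitions replace these weak ones;
// otherwise the weak hostrpc_assign_buffer only emits the warning and returns
// 0, meaning no service buffer is available for that device.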
62 
63 // Heuristic parameters used for kernel launch
64 // Number of teams per CU to allow scheduling flexibility
65 static const unsigned DefaultTeamsPerCU = 4;
66 
67 int print_kernel_trace;
68 
69 #ifdef OMPTARGET_DEBUG
70 #define check(msg, status)                                                     \
71   if (status != HSA_STATUS_SUCCESS) {                                          \
72     DP(#msg " failed\n");                                                      \
73   } else {                                                                     \
74     DP(#msg " succeeded\n");                                                   \
75   }
76 #else
77 #define check(msg, status)                                                     \
78   {}
79 #endif
80 
81 #include "elf_common.h"
82 
83 namespace hsa {
84 template <typename C> hsa_status_t iterate_agents(C cb) {
85   auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t {
86     C *unwrapped = static_cast<C *>(data);
87     return (*unwrapped)(agent);
88   };
89   return hsa_iterate_agents(L, static_cast<void *>(&cb));
90 }
91 
92 template <typename C>
93 hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
94   auto L = [](hsa_amd_memory_pool_t MemoryPool, void *data) -> hsa_status_t {
95     C *unwrapped = static_cast<C *>(data);
96     return (*unwrapped)(MemoryPool);
97   };
98 
99   return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&cb));
100 }
101 
102 } // namespace hsa
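// Example (sketch only): these wrappers let call sites pass capturing lambdas
// to the C-style HSA iterators, e.g.
//   std::vector<hsa_agent_t> Found; // hypothetical local in the caller
//   hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
//     Found.push_back(Agent);
//     return HSA_STATUS_SUCCESS;
//   });
// FindAgents() below is the real user of hsa::iterate_agents in this file.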
103 
104 /// Keep entries table per device
105 struct FuncOrGblEntryTy {
106   __tgt_target_table Table;
107   std::vector<__tgt_offload_entry> Entries;
108 };
109 
110 struct KernelArgPool {
111 private:
112   static pthread_mutex_t mutex;
113 
114 public:
115   uint32_t kernarg_segment_size;
116   void *kernarg_region = nullptr;
117   std::queue<int> free_kernarg_segments;
118 
119   uint32_t kernarg_size_including_implicit() {
120     return kernarg_segment_size + sizeof(impl_implicit_args_t);
121   }
122 
123   ~KernelArgPool() {
124     if (kernarg_region) {
125       auto r = hsa_amd_memory_pool_free(kernarg_region);
126       if (r != HSA_STATUS_SUCCESS) {
127         DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(r));
128       }
129     }
130   }
131 
132   // Can't really copy or move a mutex
133   KernelArgPool() = default;
134   KernelArgPool(const KernelArgPool &) = delete;
135   KernelArgPool(KernelArgPool &&) = delete;
136 
137   KernelArgPool(uint32_t kernarg_segment_size,
138                 hsa_amd_memory_pool_t &memory_pool)
139       : kernarg_segment_size(kernarg_segment_size) {
140 
141     // impl uses one pool per kernel for all gpus, with a fixed upper size.
142     // That exact scheme is preserved here, including the queue<int>.
143 
144     hsa_status_t err = hsa_amd_memory_pool_allocate(
145         memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
146         &kernarg_region);
147 
148     if (err != HSA_STATUS_SUCCESS) {
149       DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(err));
150       kernarg_region = nullptr; // paranoid
151       return;
152     }
153 
154     err = core::allow_access_to_all_gpu_agents(kernarg_region);
155     if (err != HSA_STATUS_SUCCESS) {
156       DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
157          get_error_string(err));
158       auto r = hsa_amd_memory_pool_free(kernarg_region);
159       if (r != HSA_STATUS_SUCCESS) {
160         // if free failed, can't do anything more to resolve it
161         DP("hsa memory pool free failed: %s\n", get_error_string(r));
162       }
163       kernarg_region = nullptr;
164       return;
165     }
166 
167     for (int i = 0; i < MAX_NUM_KERNELS; i++) {
168       free_kernarg_segments.push(i);
169     }
170   }
171 
172   void *allocate(uint64_t arg_num) {
173     assert((arg_num * sizeof(void *)) == kernarg_segment_size);
174     lock l(&mutex);
175     void *res = nullptr;
176     if (!free_kernarg_segments.empty()) {
177 
178       int free_idx = free_kernarg_segments.front();
179       res = static_cast<void *>(static_cast<char *>(kernarg_region) +
180                                 (free_idx * kernarg_size_including_implicit()));
181       assert(free_idx == pointer_to_index(res));
182       free_kernarg_segments.pop();
183     }
184     return res;
185   }
186 
187   void deallocate(void *ptr) {
188     lock l(&mutex);
189     int idx = pointer_to_index(ptr);
190     free_kernarg_segments.push(idx);
191   }
192 
193 private:
194   int pointer_to_index(void *ptr) {
195     ptrdiff_t bytes =
196         static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
197     assert(bytes >= 0);
198     assert(bytes % kernarg_size_including_implicit() == 0);
199     return bytes / kernarg_size_including_implicit();
200   }
201   struct lock {
202     lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
203     ~lock() { pthread_mutex_unlock(m); }
204     pthread_mutex_t *m;
205   };
206 };
207 pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;
208 
209 std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
210     KernelArgPoolMap;
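// Layout note: each KernelArgPool is one contiguous allocation holding
// MAX_NUM_KERNELS segments, so segment i starts at
//   kernarg_region + i * (kernarg_segment_size + sizeof(impl_implicit_args_t))
// which is the arithmetic used by allocate() and pointer_to_index() above.
// The map is keyed by kernel name, giving one pool per kernel shared across
// devices.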
211 
212 /// Use a single entity to encode a kernel and a set of flags
213 struct KernelTy {
214   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
215   int16_t ConstWGSize;
216   int32_t device_id;
217   void *CallStackAddr = nullptr;
218   const char *Name;
219 
220   KernelTy(llvm::omp::OMPTgtExecModeFlags _ExecutionMode, int16_t _ConstWGSize,
221            int32_t _device_id, void *_CallStackAddr, const char *_Name,
222            uint32_t _kernarg_segment_size,
223            hsa_amd_memory_pool_t &KernArgMemoryPool)
224       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
225         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
226     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
227 
228     std::string N(_Name);
229     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
230       KernelArgPoolMap.insert(
231           std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
232                                 _kernarg_segment_size, KernArgMemoryPool))));
233     }
234   }
235 };
236 
237 /// List that contains all the kernels.
238 /// FIXME: we may need this to be per device and per library.
239 std::list<KernelTy> KernelsList;
240 
241 template <typename Callback> static hsa_status_t FindAgents(Callback CB) {
242 
243   hsa_status_t err =
244       hsa::iterate_agents([&](hsa_agent_t agent) -> hsa_status_t {
245         hsa_device_type_t device_type;
246         // get_info fails iff HSA runtime not yet initialized
247         hsa_status_t err =
248             hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
249 
250         if (err != HSA_STATUS_SUCCESS) {
251           if (print_kernel_trace > 0)
252             DP("rtl.cpp: err %s\n", get_error_string(err));
253 
254           return err;
255         }
256 
257         CB(device_type, agent);
258         return HSA_STATUS_SUCCESS;
259       });
260 
261   // iterate_agents fails iff HSA runtime not yet initialized
262   if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) {
263     DP("rtl.cpp: err %s\n", get_error_string(err));
264   }
265 
266   return err;
267 }
268 
269 static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
270                           void *data) {
271   if (status != HSA_STATUS_SUCCESS) {
272     const char *status_string;
273     if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) {
274       status_string = "unavailable";
275     }
276     DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, source,
277        status, status_string);
278     abort();
279   }
280 }
281 
282 namespace core {
283 namespace {
284 void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
285   __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
286 }
287 
288 uint16_t create_header() {
289   uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
290   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
291   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
292   return header;
293 }
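// create_header() requests system-scope acquire/release fences so a dispatch
// is ordered with host-visible memory operations, and packet_store_release()
// publishes the 16-bit header together with the adjacent 16-bit word passed
// as 'rest' in a single atomic release store, so the packet processor never
// observes a partially initialised packet header.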
294 
295 hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
296   bool AllocAllowed = false;
297   hsa_status_t Err = hsa_amd_memory_pool_get_info(
298       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
299       &AllocAllowed);
300   if (Err != HSA_STATUS_SUCCESS) {
301     DP("Alloc allowed in memory pool check failed: %s\n",
302        get_error_string(Err));
303     return Err;
304   }
305 
306   size_t Size = 0;
307   Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
308                                      &Size);
309   if (Err != HSA_STATUS_SUCCESS) {
310     DP("Get memory pool size failed: %s\n", get_error_string(Err));
311     return Err;
312   }
313 
314   return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
315 }
316 
317 hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
318   std::vector<hsa_amd_memory_pool_t> *Result =
319       static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
320 
321   hsa_status_t err;
322   if ((err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
323     return err;
324   }
325 
326   Result->push_back(MemoryPool);
327   return HSA_STATUS_SUCCESS;
328 }
329 
330 } // namespace
331 } // namespace core
332 
333 struct EnvironmentVariables {
334   int NumTeams;
335   int TeamLimit;
336   int TeamThreadLimit;
337   int MaxTeamsDefault;
338 };
339 
340 template <uint32_t wavesize>
341 static constexpr const llvm::omp::GV &getGridValue() {
342   return llvm::omp::getAMDGPUGridValues<wavesize>();
343 }
344 
345 struct HSALifetime {
346   // Wrapper around HSA used to ensure it is constructed before other types
347   // and destructed after, which means said other types can use raii for
348   // cleanup without risking running outside of the lifetime of HSA
349   const hsa_status_t S;
350 
351   bool success() { return S == HSA_STATUS_SUCCESS; }
352   HSALifetime() : S(hsa_init()) {}
353 
354   ~HSALifetime() {
355     if (S == HSA_STATUS_SUCCESS) {
356       hsa_status_t Err = hsa_shut_down();
357       if (Err != HSA_STATUS_SUCCESS) {
358         // Can't call into HSA to get a string from the integer
359         DP("Shutting down HSA failed: %d\n", Err);
360       }
361     }
362   }
363 };
364 
365 /// Class containing all the device information
366 class RTLDeviceInfoTy {
367   HSALifetime HSA; // First field => constructed first and destructed last
368   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
369 
370   struct QueueDeleter {
371     void operator()(hsa_queue_t *Q) {
372       if (Q) {
373         hsa_status_t Err = hsa_queue_destroy(Q);
374         if (Err != HSA_STATUS_SUCCESS) {
375           DP("Error destroying hsa queue: %s\n", get_error_string(Err));
376         }
377       }
378     }
379   };
380 
381 public:
382   bool ConstructionSucceeded = false;
383 
384   // load binary populates symbol tables and mutates various global state
385   // run uses those symbol tables
386   std::shared_timed_mutex load_run_lock;
387 
388   int NumberOfDevices = 0;
389 
390   // GPU devices
391   std::vector<hsa_agent_t> HSAAgents;
392   std::vector<std::unique_ptr<hsa_queue_t, QueueDeleter>>
393       HSAQueues; // one per gpu
394 
395   // CPUs
396   std::vector<hsa_agent_t> CPUAgents;
397 
398   // Device properties
399   std::vector<int> ComputeUnits;
400   std::vector<int> GroupsPerDevice;
401   std::vector<int> ThreadsPerGroup;
402   std::vector<int> WarpSize;
403   std::vector<std::string> GPUName;
404 
405   // OpenMP properties
406   std::vector<int> NumTeams;
407   std::vector<int> NumThreads;
408 
409   // OpenMP Environment properties
410   EnvironmentVariables Env;
411 
412   // OpenMP Requires Flags
413   int64_t RequiresFlags;
414 
415   // Resource pools
416   SignalPoolT FreeSignalPool;
417 
418   bool hostcall_required = false;
419 
420   std::vector<hsa_executable_t> HSAExecutables;
421 
422   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
423   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
424 
425   hsa_amd_memory_pool_t KernArgPool;
426 
427   // fine grained memory pool for host allocations
428   hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
429 
430   // fine and coarse-grained memory pools per offloading device
431   std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
432   std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
433 
434   struct implFreePtrDeletor {
435     void operator()(void *p) {
436       core::Runtime::Memfree(p); // ignore failure to free
437     }
438   };
439 
440   // device_State shared across loaded binaries, error if inconsistent size
441   std::vector<std::pair<std::unique_ptr<void, implFreePtrDeletor>, uint64_t>>
442       deviceStateStore;
443 
444   static const unsigned HardTeamLimit =
445       (1 << 16) - 1; // 65535, so the value fits in a uint16_t
446   static const int DefaultNumTeams = 128;
447 
448   // These need to be per-device since different devices can have different
449   // wave sizes, but are currently the same number for each so that refactor
450   // can be postponed.
451   static_assert(getGridValue<32>().GV_Max_Teams ==
452                     getGridValue<64>().GV_Max_Teams,
453                 "");
454   static const int Max_Teams = getGridValue<64>().GV_Max_Teams;
455 
456   static_assert(getGridValue<32>().GV_Max_WG_Size ==
457                     getGridValue<64>().GV_Max_WG_Size,
458                 "");
459   static const int Max_WG_Size = getGridValue<64>().GV_Max_WG_Size;
460 
461   static_assert(getGridValue<32>().GV_Default_WG_Size ==
462                     getGridValue<64>().GV_Default_WG_Size,
463                 "");
464   static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size;
465 
466   using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
467                                       size_t size, hsa_agent_t,
468                                       hsa_amd_memory_pool_t);
469   hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
470                                      MemcpyFunc Func, int32_t deviceId) {
471     hsa_agent_t agent = HSAAgents[deviceId];
472     hsa_signal_t s = FreeSignalPool.pop();
473     if (s.handle == 0) {
474       return HSA_STATUS_ERROR;
475     }
476     hsa_status_t r = Func(s, dest, src, size, agent, HostFineGrainedMemoryPool);
477     FreeSignalPool.push(s);
478     return r;
479   }
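  // Note on the helper above: a zero signal handle from FreeSignalPool is
  // treated as an allocation failure; otherwise the signal is always returned
  // to the pool whether or not the copy succeeded, and the copy's status is
  // propagated to the caller.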
480 
481   hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
482                                          size_t size, int32_t deviceId) {
483     return freesignalpool_memcpy(dest, src, size, impl_memcpy_d2h, deviceId);
484   }
485 
486   hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
487                                          size_t size, int32_t deviceId) {
488     return freesignalpool_memcpy(dest, src, size, impl_memcpy_h2d, deviceId);
489   }
490 
491   // Record entry point associated with device
492   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
493     assert(device_id < (int32_t)FuncGblEntries.size() &&
494            "Unexpected device id!");
495     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
496 
497     E.Entries.push_back(entry);
498   }
499 
500   // Return true if the entry is associated with device
501   bool findOffloadEntry(int32_t device_id, void *addr) {
502     assert(device_id < (int32_t)FuncGblEntries.size() &&
503            "Unexpected device id!");
504     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
505 
506     for (auto &it : E.Entries) {
507       if (it.addr == addr)
508         return true;
509     }
510 
511     return false;
512   }
513 
514   // Return the pointer to the target entries table
515   __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
516     assert(device_id < (int32_t)FuncGblEntries.size() &&
517            "Unexpected device id!");
518     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
519 
520     int32_t size = E.Entries.size();
521 
522     // Table is empty
523     if (!size)
524       return 0;
525 
526     __tgt_offload_entry *begin = &E.Entries[0];
527     __tgt_offload_entry *end = &E.Entries[size - 1];
528 
529     // Update table info according to the entries and return the pointer
530     E.Table.EntriesBegin = begin;
531     E.Table.EntriesEnd = ++end;
532 
533     return &E.Table;
534   }
535 
536   // Clear entries table for a device
537   void clearOffloadEntriesTable(int device_id) {
538     assert(device_id < (int32_t)FuncGblEntries.size() &&
539            "Unexpected device id!");
540     FuncGblEntries[device_id].emplace_back();
541     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
542     // KernelArgPoolMap.clear();
543     E.Entries.clear();
544     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
545   }
546 
547   hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
548                                    int DeviceId) {
549     assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Bad DeviceId");
550     uint32_t GlobalFlags = 0;
551     hsa_status_t Err = hsa_amd_memory_pool_get_info(
552         MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
553 
554     if (Err != HSA_STATUS_SUCCESS) {
555       return Err;
556     }
557 
558     if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
559       DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
560     } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
561       DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
562     }
563 
564     return HSA_STATUS_SUCCESS;
565   }
566 
567   hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
568     for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
569       hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
570           Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
571             hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
572             if (ValidStatus != HSA_STATUS_SUCCESS) {
573               DP("Alloc allowed in memory pool check failed: %s\n",
574                  get_error_string(ValidStatus));
575               return HSA_STATUS_SUCCESS;
576             }
577             return addDeviceMemoryPool(MemoryPool, DeviceId);
578           });
579 
580       if (Err != HSA_STATUS_SUCCESS) {
581         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
582            "Iterate all memory pools", get_error_string(Err));
583         return Err;
584       }
585     }
586     return HSA_STATUS_SUCCESS;
587   }
588 
589   hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
590     std::vector<hsa_amd_memory_pool_t> HostPools;
591 
592     // collect all the "valid" pools for all the given agents.
593     for (const auto &Agent : Agents) {
594       hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
595           Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
596       if (Err != HSA_STATUS_SUCCESS) {
597         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
598            "Iterate all memory pools", get_error_string(Err));
599         return Err;
600       }
601     }
602 
603     // We need two fine-grained pools.
604     //  1. One with kernarg flag set for storing kernel arguments
605     //  2. Second for host allocations
606     bool FineGrainedMemoryPoolSet = false;
607     bool KernArgPoolSet = false;
608     for (const auto &MemoryPool : HostPools) {
609       hsa_status_t Err = HSA_STATUS_SUCCESS;
610       uint32_t GlobalFlags = 0;
611       Err = hsa_amd_memory_pool_get_info(
612           MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
613       if (Err != HSA_STATUS_SUCCESS) {
614         DP("Get memory pool info failed: %s\n", get_error_string(Err));
615         return Err;
616       }
617 
618       if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
619         if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
620           KernArgPool = MemoryPool;
621           KernArgPoolSet = true;
622         }
623         HostFineGrainedMemoryPool = MemoryPool;
624         FineGrainedMemoryPoolSet = true;
625       }
626     }
627 
628     if (FineGrainedMemoryPoolSet && KernArgPoolSet)
629       return HSA_STATUS_SUCCESS;
630 
631     return HSA_STATUS_ERROR;
632   }
633 
634   hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
635     assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
636            "Invalid device Id");
637     return DeviceCoarseGrainedMemoryPools[DeviceId];
638   }
639 
640   hsa_amd_memory_pool_t getHostMemoryPool() {
641     return HostFineGrainedMemoryPool;
642   }
643 
644   static int readEnvElseMinusOne(const char *Env) {
645     const char *envStr = getenv(Env);
646     int res = -1;
647     if (envStr) {
648       res = std::stoi(envStr);
649       DP("Parsed %s=%d\n", Env, res);
650     }
651     return res;
652   }
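  // Caveat: std::stoi throws std::invalid_argument or std::out_of_range on
  // malformed values, so a garbage setting for one of these variables
  // terminates the process instead of falling back to -1.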
653 
654   RTLDeviceInfoTy() {
655     DP("Start initializing " GETNAME(TARGET_NAME) "\n");
656 
657     // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at
658     // any time; a debug library build is not required.
659     //  0 => no tracing
660     //  1 => tracing dispatch only
661     // >1 => verbosity increase
662 
663     if (!HSA.success()) {
664       DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
665       return;
666     }
667 
668     if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
669       print_kernel_trace = atoi(envStr);
670     else
671       print_kernel_trace = 0;
672 
673     hsa_status_t err = core::atl_init_gpu_context();
674     if (err != HSA_STATUS_SUCCESS) {
675       DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
676       return;
677     }
678 
679     // Init hostcall soon after initializing hsa
680     hostrpc_init();
681 
682     err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
683       if (DeviceType == HSA_DEVICE_TYPE_CPU) {
684         CPUAgents.push_back(Agent);
685       } else {
686         HSAAgents.push_back(Agent);
687       }
688     });
689     if (err != HSA_STATUS_SUCCESS)
690       return;
691 
692     NumberOfDevices = (int)HSAAgents.size();
693 
694     if (NumberOfDevices == 0) {
695       DP("There are no devices supporting HSA.\n");
696       return;
697     } else {
698       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
699     }
700 
701     // Init the device info
702     HSAQueues.resize(NumberOfDevices);
703     FuncGblEntries.resize(NumberOfDevices);
704     ThreadsPerGroup.resize(NumberOfDevices);
705     ComputeUnits.resize(NumberOfDevices);
706     GPUName.resize(NumberOfDevices);
707     GroupsPerDevice.resize(NumberOfDevices);
708     WarpSize.resize(NumberOfDevices);
709     NumTeams.resize(NumberOfDevices);
710     NumThreads.resize(NumberOfDevices);
711     deviceStateStore.resize(NumberOfDevices);
712     KernelInfoTable.resize(NumberOfDevices);
713     SymbolInfoTable.resize(NumberOfDevices);
714     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
715     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
716 
717     err = setupDevicePools(HSAAgents);
718     if (err != HSA_STATUS_SUCCESS) {
719       DP("Setup for Device Memory Pools failed\n");
720       return;
721     }
722 
723     err = setupHostMemoryPools(CPUAgents);
724     if (err != HSA_STATUS_SUCCESS) {
725       DP("Setup for Host Memory Pools failed\n");
726       return;
727     }
728 
729     for (int i = 0; i < NumberOfDevices; i++) {
730       uint32_t queue_size = 0;
731       {
732         hsa_status_t err = hsa_agent_get_info(
733             HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
734         if (err != HSA_STATUS_SUCCESS) {
735           DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i);
736           return;
737         }
738         enum { MaxQueueSize = 4096 };
739         if (queue_size > MaxQueueSize) {
740           queue_size = MaxQueueSize;
741         }
742       }
743 
744       {
745         hsa_queue_t *Q = nullptr;
746         hsa_status_t rc =
747             hsa_queue_create(HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI,
748                              callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
749         if (rc != HSA_STATUS_SUCCESS) {
750           DP("Failed to create HSA queue %d\n", i);
751           return;
752         }
753         HSAQueues[i].reset(Q);
754       }
755 
756       deviceStateStore[i] = {nullptr, 0};
757     }
758 
759     for (int i = 0; i < NumberOfDevices; i++) {
760       ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
761       GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
762       ComputeUnits[i] = 1;
763       DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i,
764          GroupsPerDevice[i], ThreadsPerGroup[i]);
765     }
766 
767     // Get environment variables regarding teams
768     Env.TeamLimit = readEnvElseMinusOne("OMP_TEAM_LIMIT");
769     Env.NumTeams = readEnvElseMinusOne("OMP_NUM_TEAMS");
770     Env.MaxTeamsDefault = readEnvElseMinusOne("OMP_MAX_TEAMS_DEFAULT");
771     Env.TeamThreadLimit = readEnvElseMinusOne("OMP_TEAMS_THREAD_LIMIT");
772 
773     // Default state.
774     RequiresFlags = OMP_REQ_UNDEFINED;
775 
776     ConstructionSucceeded = true;
777   }
778 
779   ~RTLDeviceInfoTy() {
780     DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
781     if (!HSA.success()) {
782       // Then none of these can have been set up and they can't be torn down
783       return;
784     }
785     // Run destructors on types that use HSA before
786     // impl_finalize removes access to it
787     deviceStateStore.clear();
788     KernelArgPoolMap.clear();
789     // Terminate hostrpc before finalizing hsa
790     hostrpc_terminate();
791 
792     hsa_status_t Err;
793     for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
794       Err = hsa_executable_destroy(HSAExecutables[I]);
795       if (Err != HSA_STATUS_SUCCESS) {
796         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
797            "Destroying executable", get_error_string(Err));
798       }
799     }
800   }
801 };
802 
803 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
804 
805 // TODO: May need to drop the trailing two fields until deviceRTL is updated
806 struct omptarget_device_environmentTy {
807   int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
808                        // only useful for Debug build of deviceRTLs
809   int32_t num_devices; // gets number of active offload devices
810   int32_t device_num;  // gets a value 0 to num_devices-1
811 };
812 
813 static RTLDeviceInfoTy DeviceInfo;
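// DeviceInfo has static storage duration, so its constructor (HSA init, agent
// and memory pool discovery, queue creation) runs when the plugin library is
// loaded and its destructor runs at unload; ConstructionSucceeded records
// whether that early setup completed.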
814 
815 namespace {
816 
817 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
818                      __tgt_async_info *AsyncInfo) {
819   assert(AsyncInfo && "AsyncInfo is nullptr");
820   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
821   // Return success if we are not copying back to host from target.
822   if (!HstPtr)
823     return OFFLOAD_SUCCESS;
824   hsa_status_t err;
825   DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
826      (long long unsigned)(Elf64_Addr)TgtPtr,
827      (long long unsigned)(Elf64_Addr)HstPtr);
828 
829   err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
830                                              DeviceId);
831 
832   if (err != HSA_STATUS_SUCCESS) {
833     DP("Error when copying data from device to host. Pointers: "
834        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
835        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
836     return OFFLOAD_FAIL;
837   }
838   DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
839      (long long unsigned)(Elf64_Addr)TgtPtr,
840      (long long unsigned)(Elf64_Addr)HstPtr);
841   return OFFLOAD_SUCCESS;
842 }
843 
844 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
845                    __tgt_async_info *AsyncInfo) {
846   assert(AsyncInfo && "AsyncInfo is nullptr");
847   hsa_status_t err;
848   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
849   // Return success if we are not doing host to target.
850   if (!HstPtr)
851     return OFFLOAD_SUCCESS;
852 
853   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
854      (long long unsigned)(Elf64_Addr)HstPtr,
855      (long long unsigned)(Elf64_Addr)TgtPtr);
856   err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
857                                              DeviceId);
858   if (err != HSA_STATUS_SUCCESS) {
859     DP("Error when copying data from host to device. Pointers: "
860        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
861        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
862     return OFFLOAD_FAIL;
863   }
864   return OFFLOAD_SUCCESS;
865 }
866 
867 // Async.
868 // The implementation was written with cuda streams in mind. The semantics of
869 // that are to execute kernels on a queue in order of insertion. A synchronise
870 // call then makes writes visible between host and device. This means a series
871 // of N data_submit_async calls are expected to execute serially. HSA offers
872 // various options to run the data copies concurrently. This may require changes
873 // to libomptarget.
874 
875 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
876 // there are no outstanding kernels that need to be synchronized. Any async call
877 // may be passed a Queue==0, at which point the cuda implementation will set it
878 // to non-null (see getStream). The cuda streams are per-device. Upstream may
879 // change this interface to explicitly initialize the AsyncInfo_pointer, but
880 // until then hsa lazily initializes it as well.
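// An illustrative call sequence from libomptarget's side (a sketch following
// the omptargetplugin.h entry points, not code from this file):
//   __tgt_async_info AsyncInfo{};   // Queue starts out null
//   __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
//   __tgt_rtl_run_target_region_async(DeviceId, Entry, Args, Offsets, NumArgs,
//                                     &AsyncInfo);
//   __tgt_rtl_synchronize(DeviceId, &AsyncInfo); // Queue returned to null
// initAsyncInfo/finiAsyncInfo below implement the sentinel handling described
// above.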
881 
882 void initAsyncInfo(__tgt_async_info *AsyncInfo) {
883   // Set non-null while async calls are in flight; null again on completion.
884   assert(AsyncInfo);
885   if (!AsyncInfo->Queue) {
886     AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
887   }
888 }
889 void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
890   assert(AsyncInfo);
891   assert(AsyncInfo->Queue);
892   AsyncInfo->Queue = 0;
893 }
894 
895 bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
896   const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
897   int32_t r = elf_check_machine(image, amdgcnMachineID);
898   if (!r) {
899     DP("Supported machine ID not found\n");
900   }
901   return r;
902 }
903 
904 uint32_t elf_e_flags(__tgt_device_image *image) {
905   char *img_begin = (char *)image->ImageStart;
906   size_t img_size = (char *)image->ImageEnd - img_begin;
907 
908   Elf *e = elf_memory(img_begin, img_size);
909   if (!e) {
910     DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
911     return 0;
912   }
913 
914   Elf64_Ehdr *eh64 = elf64_getehdr(e);
915 
916   if (!eh64) {
917     DP("Unable to get machine ID from ELF file!\n");
918     elf_end(e);
919     return 0;
920   }
921 
922   uint32_t Flags = eh64->e_flags;
923 
924   elf_end(e);
925   DP("ELF Flags: 0x%x\n", Flags);
926   return Flags;
927 }
928 } // namespace
929 
930 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
931   return elf_machine_id_is_amdgcn(image);
932 }
933 
934 int __tgt_rtl_number_of_devices() {
935   // If the construction failed, no methods are safe to call
936   if (DeviceInfo.ConstructionSucceeded) {
937     return DeviceInfo.NumberOfDevices;
938   } else {
939     DP("AMDGPU plugin construction failed. Zero devices available\n");
940     return 0;
941   }
942 }
943 
944 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
945   DP("Init requires flags to %ld\n", RequiresFlags);
946   DeviceInfo.RequiresFlags = RequiresFlags;
947   return RequiresFlags;
948 }
949 
950 namespace {
951 template <typename T> bool enforce_upper_bound(T *value, T upper) {
952   bool changed = *value > upper;
953   if (changed) {
954     *value = upper;
955   }
956   return changed;
957 }
958 } // namespace
959 
960 int32_t __tgt_rtl_init_device(int device_id) {
961   hsa_status_t err;
962 
963   // this is per device id init
964   DP("Initialize the device id: %d\n", device_id);
965 
966   hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
967 
968   // Get number of Compute Unit
969   uint32_t compute_units = 0;
970   err = hsa_agent_get_info(
971       agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
972       &compute_units);
973   if (err != HSA_STATUS_SUCCESS) {
974     DeviceInfo.ComputeUnits[device_id] = 1;
975     DP("Error getting compute units: setting to 1\n");
976   } else {
977     DeviceInfo.ComputeUnits[device_id] = compute_units;
978     DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[device_id]);
979   }
980 
981   char GetInfoName[64]; // 64 max size returned by get info
982   err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
983                            (void *)GetInfoName);
984   if (err)
985     DeviceInfo.GPUName[device_id] = "--unknown gpu--";
986   else {
987     DeviceInfo.GPUName[device_id] = GetInfoName;
988   }
989 
990   if (print_kernel_trace & STARTUP_DETAILS)
991     DP("Device#%-2d CU's: %2d %s\n", device_id,
992        DeviceInfo.ComputeUnits[device_id],
993        DeviceInfo.GPUName[device_id].c_str());
994 
995   // Query attributes to determine number of threads/block and blocks/grid.
996   uint16_t workgroup_max_dim[3];
997   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
998                            &workgroup_max_dim);
999   if (err != HSA_STATUS_SUCCESS) {
1000     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
1001     DP("Error getting grid dims: num groups: %d\n",
1002        RTLDeviceInfoTy::DefaultNumTeams);
1003   } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
1004     DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
1005     DP("Using %d ROCm blocks per grid\n",
1006        DeviceInfo.GroupsPerDevice[device_id]);
1007   } else {
1008     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
1009     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
1010        "at the hard limit\n",
1011        workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
1012   }
1013 
1014   // Get thread limit
1015   hsa_dim3_t grid_max_dim;
1016   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
1017   if (err == HSA_STATUS_SUCCESS) {
1018     DeviceInfo.ThreadsPerGroup[device_id] =
1019         reinterpret_cast<uint32_t *>(&grid_max_dim)[0] /
1020         DeviceInfo.GroupsPerDevice[device_id];
1021 
1022     if (DeviceInfo.ThreadsPerGroup[device_id] == 0) {
1023       DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
1024       DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1025     } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id],
1026                                    RTLDeviceInfoTy::Max_WG_Size)) {
1027       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1028     } else {
1029       DP("Using ROCm Queried thread limit: %d\n",
1030          DeviceInfo.ThreadsPerGroup[device_id]);
1031     }
1032   } else {
1033     DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
1034     DP("Error getting max block dimension, using default: %d\n",
1035        RTLDeviceInfoTy::Max_WG_Size);
1036   }
1037 
1038   // Get wavefront size
1039   uint32_t wavefront_size = 0;
1040   err =
1041       hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
1042   if (err == HSA_STATUS_SUCCESS) {
1043     DP("Queried wavefront size: %d\n", wavefront_size);
1044     DeviceInfo.WarpSize[device_id] = wavefront_size;
1045   } else {
1046     // TODO: Burn the wavefront size into the code object
1047     DP("Warning: Unknown wavefront size, assuming 64\n");
1048     DeviceInfo.WarpSize[device_id] = 64;
1049   }
1050 
1051   // Adjust teams to the env variables
1052 
1053   if (DeviceInfo.Env.TeamLimit > 0 &&
1054       (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id],
1055                            DeviceInfo.Env.TeamLimit))) {
1056     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
1057        DeviceInfo.Env.TeamLimit);
1058   }
1059 
1060   // Set default number of teams
1061   if (DeviceInfo.Env.NumTeams > 0) {
1062     DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams;
1063     DP("Default number of teams set according to environment %d\n",
1064        DeviceInfo.Env.NumTeams);
1065   } else {
1066     char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
1067     int TeamsPerCU = DefaultTeamsPerCU;
1068     if (TeamsPerCUEnvStr) {
1069       TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
1070     }
1071 
1072     DeviceInfo.NumTeams[device_id] =
1073         TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
1074     DP("Default number of teams = %d * number of compute units %d\n",
1075        TeamsPerCU, DeviceInfo.ComputeUnits[device_id]);
1076   }
1077 
1078   if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id],
1079                           DeviceInfo.GroupsPerDevice[device_id])) {
1080     DP("Default number of teams exceeds device limit, capping at %d\n",
1081        DeviceInfo.GroupsPerDevice[device_id]);
1082   }
1083 
1084   // Adjust threads to the env variables
1085   if (DeviceInfo.Env.TeamThreadLimit > 0 &&
1086       (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
1087                            DeviceInfo.Env.TeamThreadLimit))) {
1088     DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
1089        DeviceInfo.Env.TeamThreadLimit);
1090   }
1091 
1092   // Set default number of threads
1093   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
1094   DP("Default number of threads set according to library's default %d\n",
1095      RTLDeviceInfoTy::Default_WG_Size);
1096   if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
1097                           DeviceInfo.ThreadsPerGroup[device_id])) {
1098     DP("Default number of threads exceeds device limit, capping at %d\n",
1099        DeviceInfo.ThreadsPerGroup[device_id]);
1100   }
1101 
1102   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
1103      device_id, DeviceInfo.GroupsPerDevice[device_id],
1104      DeviceInfo.ThreadsPerGroup[device_id]);
1105 
1106   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
1107      DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
1108      DeviceInfo.GroupsPerDevice[device_id],
1109      DeviceInfo.GroupsPerDevice[device_id] *
1110          DeviceInfo.ThreadsPerGroup[device_id]);
1111 
1112   return OFFLOAD_SUCCESS;
1113 }
1114 
1115 namespace {
1116 Elf64_Shdr *find_only_SHT_HASH(Elf *elf) {
1117   size_t N;
1118   int rc = elf_getshdrnum(elf, &N);
1119   if (rc != 0) {
1120     return nullptr;
1121   }
1122 
1123   Elf64_Shdr *result = nullptr;
1124   for (size_t i = 0; i < N; i++) {
1125     Elf_Scn *scn = elf_getscn(elf, i);
1126     if (scn) {
1127       Elf64_Shdr *shdr = elf64_getshdr(scn);
1128       if (shdr) {
1129         if (shdr->sh_type == SHT_HASH) {
1130           if (result == nullptr) {
1131             result = shdr;
1132           } else {
1133             // multiple SHT_HASH sections not handled
1134             return nullptr;
1135           }
1136         }
1137       }
1138     }
1139   }
1140   return result;
1141 }
1142 
1143 const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash,
1144                             const char *symname) {
1145 
1146   assert(section_hash);
1147   size_t section_symtab_index = section_hash->sh_link;
1148   Elf64_Shdr *section_symtab =
1149       elf64_getshdr(elf_getscn(elf, section_symtab_index));
1150   size_t section_strtab_index = section_symtab->sh_link;
1151 
1152   const Elf64_Sym *symtab =
1153       reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset);
1154 
1155   const uint32_t *hashtab =
1156       reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset);
1157 
1158   // Layout:
1159   // nbucket
1160   // nchain
1161   // bucket[nbucket]
1162   // chain[nchain]
1163   uint32_t nbucket = hashtab[0];
1164   const uint32_t *bucket = &hashtab[2];
1165   const uint32_t *chain = &hashtab[nbucket + 2];
1166 
1167   const size_t max = strlen(symname) + 1;
1168   const uint32_t hash = elf_hash(symname);
1169   for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) {
1170     char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name);
1171     if (strncmp(symname, n, max) == 0) {
1172       return &symtab[i];
1173     }
1174   }
1175 
1176   return nullptr;
1177 }
1178 
1179 struct symbol_info {
1180   void *addr = nullptr;
1181   uint32_t size = UINT32_MAX;
1182   uint32_t sh_type = SHT_NULL;
1183 };
1184 
1185 int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
1186                                     symbol_info *res) {
1187   if (elf_kind(elf) != ELF_K_ELF) {
1188     return 1;
1189   }
1190 
1191   Elf64_Shdr *section_hash = find_only_SHT_HASH(elf);
1192   if (!section_hash) {
1193     return 1;
1194   }
1195 
1196   const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname);
1197   if (!sym) {
1198     return 1;
1199   }
1200 
1201   if (sym->st_size > UINT32_MAX) {
1202     return 1;
1203   }
1204 
1205   if (sym->st_shndx == SHN_UNDEF) {
1206     return 1;
1207   }
1208 
1209   Elf_Scn *section = elf_getscn(elf, sym->st_shndx);
1210   if (!section) {
1211     return 1;
1212   }
1213 
1214   Elf64_Shdr *header = elf64_getshdr(section);
1215   if (!header) {
1216     return 1;
1217   }
1218 
1219   res->addr = sym->st_value + base;
1220   res->size = static_cast<uint32_t>(sym->st_size);
1221   res->sh_type = header->sh_type;
1222   return 0;
1223 }
1224 
1225 int get_symbol_info_without_loading(char *base, size_t img_size,
1226                                     const char *symname, symbol_info *res) {
1227   Elf *elf = elf_memory(base, img_size);
1228   if (elf) {
1229     int rc = get_symbol_info_without_loading(elf, base, symname, res);
1230     elf_end(elf);
1231     return rc;
1232   }
1233   return 1;
1234 }
1235 
1236 hsa_status_t interop_get_symbol_info(char *base, size_t img_size,
1237                                      const char *symname, void **var_addr,
1238                                      uint32_t *var_size) {
1239   symbol_info si;
1240   int rc = get_symbol_info_without_loading(base, img_size, symname, &si);
1241   if (rc == 0) {
1242     *var_addr = si.addr;
1243     *var_size = si.size;
1244     return HSA_STATUS_SUCCESS;
1245   } else {
1246     return HSA_STATUS_ERROR;
1247   }
1248 }
1249 
1250 template <typename C>
1251 hsa_status_t module_register_from_memory_to_place(
1252     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1253     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1254     void *module_bytes, size_t module_size, int DeviceId, C cb,
1255     std::vector<hsa_executable_t> &HSAExecutables) {
1256   auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t {
1257     C *unwrapped = static_cast<C *>(cb_state);
1258     return (*unwrapped)(data, size);
1259   };
1260   return core::RegisterModuleFromMemory(
1261       KernelInfoTable, SymbolInfoTable, module_bytes, module_size,
1262       DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&cb),
1263       HSAExecutables);
1264 }
1265 } // namespace
1266 
1267 static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) {
1268   uint64_t device_State_bytes = 0;
1269   {
1270     // If this is the deviceRTL, get the state variable size
1271     symbol_info size_si;
1272     int rc = get_symbol_info_without_loading(
1273         ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si);
1274 
1275     if (rc == 0) {
1276       if (size_si.size != sizeof(uint64_t)) {
1277         DP("Found device_State_size variable with wrong size\n");
1278         return 0;
1279       }
1280 
1281       // Read number of bytes directly from the elf
1282       memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t));
1283     }
1284   }
1285   return device_State_bytes;
1286 }
1287 
1288 static __tgt_target_table *
1289 __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
1290 
1294 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1295                                           __tgt_device_image *image) {
1296   DeviceInfo.load_run_lock.lock();
1297   __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
1298   DeviceInfo.load_run_lock.unlock();
1299   return res;
1300 }
1301 
1302 struct device_environment {
1303   // Initialises an omptarget_device_environmentTy in the deviceRTL and
1304   // patches around differences in the deviceRTL between trunk, aomp and
1305   // rocmcc. Over time these differences will tend to zero and this class
1306   // can be simplified.
1307   // Symbol may be in .data or .bss, and may be missing fields:
1308   //  - aomp has debug_level, num_devices, device_num
1309   //  - trunk has debug_level
1310   //  - under review in trunk is debug_level, device_num
1311   //  - rocmcc matches aomp, patch to swap num_devices and device_num
1312 
1313   // The symbol may also have been deadstripped because the device side
1314   // accessors were unused.
1315 
1316   // If the symbol is in .data (aomp, rocm) it can be written directly.
1317   // If it is in .bss, we must wait for it to be allocated space on the
1318   // gpu (trunk) and initialize after loading.
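  // In code terms: before_loading() patches the image bytes in place when the
  // symbol lives in an allocated section (in_image(), i.e. sh_type is not
  // SHT_NOBITS), while after_loading() falls back to an h2d memcpy into the
  // loaded executable when the symbol is in .bss.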
1319   const char *sym() { return "omptarget_device_environment"; }
1320 
1321   omptarget_device_environmentTy host_device_env;
1322   symbol_info si;
1323   bool valid = false;
1324 
1325   __tgt_device_image *image;
1326   const size_t img_size;
1327 
1328   device_environment(int device_id, int number_devices,
1329                      __tgt_device_image *image, const size_t img_size)
1330       : image(image), img_size(img_size) {
1331 
1332     host_device_env.num_devices = number_devices;
1333     host_device_env.device_num = device_id;
1334     host_device_env.debug_level = 0;
1335 #ifdef OMPTARGET_DEBUG
1336     if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
1337       host_device_env.debug_level = std::stoi(envStr);
1338     }
1339 #endif
1340 
1341     int rc = get_symbol_info_without_loading((char *)image->ImageStart,
1342                                              img_size, sym(), &si);
1343     if (rc != 0) {
1344       DP("Finding global device environment '%s' - symbol missing.\n", sym());
1345       return;
1346     }
1347 
1348     if (si.size > sizeof(host_device_env)) {
1349       DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size,
1350          sizeof(host_device_env));
1351       return;
1352     }
1353 
1354     valid = true;
1355   }
1356 
1357   bool in_image() { return si.sh_type != SHT_NOBITS; }
1358 
1359   hsa_status_t before_loading(void *data, size_t size) {
1360     if (valid) {
1361       if (in_image()) {
1362         DP("Setting global device environment before load (%u bytes)\n",
1363            si.size);
1364         uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
1365         void *pos = (char *)data + offset;
1366         memcpy(pos, &host_device_env, si.size);
1367       }
1368     }
1369     return HSA_STATUS_SUCCESS;
1370   }
1371 
1372   hsa_status_t after_loading() {
1373     if (valid) {
1374       if (!in_image()) {
1375         DP("Setting global device environment after load (%u bytes)\n",
1376            si.size);
1377         int device_id = host_device_env.device_num;
1378         auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
1379         void *state_ptr;
1380         uint32_t state_ptr_size;
1381         hsa_status_t err = interop_hsa_get_symbol_info(
1382             SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size);
1383         if (err != HSA_STATUS_SUCCESS) {
1384           DP("failed to find %s in loaded image\n", sym());
1385           return err;
1386         }
1387 
1388         if (state_ptr_size != si.size) {
1389           DP("Symbol had size %u before loading, %u after\n", state_ptr_size,
1390              si.size);
1391           return HSA_STATUS_ERROR;
1392         }
1393 
1394         return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env,
1395                                                     state_ptr_size, device_id);
1396       }
1397     }
1398     return HSA_STATUS_SUCCESS;
1399   }
1400 };
1401 
1402 static hsa_status_t impl_calloc(void **ret_ptr, size_t size, int DeviceId) {
1403   uint64_t rounded = 4 * ((size + 3) / 4);
1404   void *ptr;
1405   hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId);
1406   hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, rounded, 0, &ptr);
1407   if (err != HSA_STATUS_SUCCESS) {
1408     return err;
1409   }
1410 
1411   hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
1412   if (rc != HSA_STATUS_SUCCESS) {
1413     DP("zero fill device_state failed with %u\n", rc);
1414     core::Runtime::Memfree(ptr);
1415     return HSA_STATUS_ERROR;
1416   }
1417 
1418   *ret_ptr = ptr;
1419   return HSA_STATUS_SUCCESS;
1420 }
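// impl_calloc above rounds the request up to a multiple of four bytes because
// hsa_amd_memory_fill writes 32-bit words; for example a 10-byte request
// becomes rounded = 4 * ((10 + 3) / 4) = 12 bytes, zeroed as rounded / 4 = 3
// words.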
1421 
1422 static bool image_contains_symbol(void *data, size_t size, const char *sym) {
1423   symbol_info si;
1424   int rc = get_symbol_info_without_loading((char *)data, size, sym, &si);
1425   return (rc == 0) && (si.addr != nullptr);
1426 }
1427 
1428 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
1429                                                  __tgt_device_image *image) {
1430   // This function loads the device image onto gpu[device_id] and does other
1431   // per-image initialization work. Specifically:
1432   //
1433   // - Initialize an omptarget_device_environmentTy instance embedded in the
1434   //   image at the symbol "omptarget_device_environment"
1435   //   Fields debug_level, device_num, num_devices. Used by the deviceRTL.
1436   //
1437   // - Allocate a large array per-gpu (could be moved to init_device)
1438   //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
1439   //   - Allocate at least that many bytes of gpu memory
1440   //   - Zero initialize it
1441   //   - Write the pointer to the symbol omptarget_nvptx_device_State
1442   //
1443   // - Pulls some per-kernel information together from various sources and
1444   //   records it in the KernelsList for quicker access later
1445   //
1446   // The initialization can be done before or after loading the image onto the
1447   // gpu. This function presently does a mixture. Using the hsa api to get/set
1448   // the information is simpler to implement, in exchange for more complicated
1449   // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
1450   // back from the gpu vs a hashtable lookup on the host.
1451 
1452   const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
1453 
1454   DeviceInfo.clearOffloadEntriesTable(device_id);
1455 
1456   // We do not need to set the ELF version because the caller of this function
1457   // had to do that to decide the right runtime to use
1458 
1459   if (!elf_machine_id_is_amdgcn(image)) {
1460     return NULL;
1461   }
1462 
1463   {
1464     auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image,
1465                                   img_size);
1466 
1467     auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id];
1468     auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
1469     hsa_status_t err = module_register_from_memory_to_place(
1470         KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id,
1471         [&](void *data, size_t size) {
1472           if (image_contains_symbol(data, size, "needs_hostcall_buffer")) {
1473             __atomic_store_n(&DeviceInfo.hostcall_required, true,
1474                              __ATOMIC_RELEASE);
1475           }
1476           return env.before_loading(data, size);
1477         },
1478         DeviceInfo.HSAExecutables);
1479 
1480     check("Module registering", err);
1481     if (err != HSA_STATUS_SUCCESS) {
1482       const char *DeviceName = DeviceInfo.GPUName[device_id].c_str();
1483       const char *ElfName = get_elf_mach_gfx_name(elf_e_flags(image));
1484 
1485       if (strcmp(DeviceName, ElfName) != 0) {
1486         DP("Possible gpu arch mismatch: device:%s, image:%s please check"
1487            " compiler flag: -march=<gpu>\n",
1488            DeviceName, ElfName);
1489       } else {
1490         DP("Error loading image onto GPU: %s\n", get_error_string(err));
1491       }
1492 
1493       return NULL;
1494     }
1495 
1496     err = env.after_loading();
1497     if (err != HSA_STATUS_SUCCESS) {
1498       return NULL;
1499     }
1500   }
1501 
1502   DP("AMDGPU module successfully loaded!\n");
1503 
1504   {
1505     // The device_State array is either a large value in bss or a void* that
1506     // needs to be assigned a pointer to an array of device_State_bytes bytes.
1507     // If absent, it has been deadstripped and needs no setup.
1508 
1509     void *state_ptr;
1510     uint32_t state_ptr_size;
1511     auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
1512     hsa_status_t err = interop_hsa_get_symbol_info(
1513         SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr,
1514         &state_ptr_size);
1515 
1516     if (err != HSA_STATUS_SUCCESS) {
1517       DP("No device_state symbol found, skipping initialization\n");
1518     } else {
1519       if (state_ptr_size < sizeof(void *)) {
1520         DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size,
1521            sizeof(void *));
1522         return NULL;
1523       }
1524 
      // If it's larger than a void*, assume it's a bss array and no further
      // initialization is required. Only try to set up a pointer when the
      // symbol is exactly sizeof(void*).
1528       if (state_ptr_size == sizeof(void *)) {
1529         uint64_t device_State_bytes =
1530             get_device_State_bytes((char *)image->ImageStart, img_size);
1531         if (device_State_bytes == 0) {
1532           DP("Can't initialize device_State, missing size information\n");
1533           return NULL;
1534         }
1535 
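        // The zero-initialized device_State allocation is cached per device in
        // deviceStateStore so repeated image loads reuse the same buffer; a
        // later load that disagrees on the size is rejected below.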
1536         auto &dss = DeviceInfo.deviceStateStore[device_id];
1537         if (dss.first.get() == nullptr) {
1538           assert(dss.second == 0);
1539           void *ptr = NULL;
1540           hsa_status_t err = impl_calloc(&ptr, device_State_bytes, device_id);
1541           if (err != HSA_STATUS_SUCCESS) {
1542             DP("Failed to allocate device_state array\n");
1543             return NULL;
1544           }
1545           dss = {
1546               std::unique_ptr<void, RTLDeviceInfoTy::implFreePtrDeletor>{ptr},
1547               device_State_bytes,
1548           };
1549         }
1550 
1551         void *ptr = dss.first.get();
1552         if (device_State_bytes != dss.second) {
1553           DP("Inconsistent sizes of device_State unsupported\n");
1554           return NULL;
1555         }
1556 
1557         // write ptr to device memory so it can be used by later kernels
1558         err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
1559                                                    sizeof(void *), device_id);
1560         if (err != HSA_STATUS_SUCCESS) {
1561           DP("memcpy install of state_ptr failed\n");
1562           return NULL;
1563         }
1564       }
1565     }
1566   }
1567 
  // Here, we take advantage of the data that is appended after img_end to get
  // the names of the symbols we need to load. This data consists of the host
  // entries begin and end as well as the target name (see the offloading
  // linker script creation in the clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name.
1575 
1576   __tgt_offload_entry *HostBegin = image->EntriesBegin;
1577   __tgt_offload_entry *HostEnd = image->EntriesEnd;
1578 
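  // Each host entry is either a global variable (e->size != 0), which is
  // resolved against the device image and size-checked, or a kernel
  // (e->size == 0), for which the per-kernel metadata is gathered below.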
1579   for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
1580 
1581     if (!e->addr) {
      // The host should always have something in the address field to
      // uniquely identify the target region.
1584       DP("Analyzing host entry '<null>' (size = %lld)...\n",
1585          (unsigned long long)e->size);
1586       return NULL;
1587     }
1588 
1589     if (e->size) {
1590       __tgt_offload_entry entry = *e;
1591 
1592       void *varptr;
1593       uint32_t varsize;
1594 
1595       auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
1596       hsa_status_t err = interop_hsa_get_symbol_info(
1597           SymbolInfoMap, device_id, e->name, &varptr, &varsize);
1598 
1599       if (err != HSA_STATUS_SUCCESS) {
1600         // Inform the user what symbol prevented offloading
1601         DP("Loading global '%s' (Failed)\n", e->name);
1602         return NULL;
1603       }
1604 
1605       if (varsize != e->size) {
1606         DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
1607            varsize, e->size);
1608         return NULL;
1609       }
1610 
1611       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
1612          DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
1613       entry.addr = (void *)varptr;
1614 
1615       DeviceInfo.addOffloadEntry(device_id, entry);
1616 
1617       if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
1618           e->flags & OMP_DECLARE_TARGET_LINK) {
1619         // If unified memory is present any target link variables
1620         // can access host addresses directly. There is no longer a
1621         // need for device copies.
1622         err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
1623                                                    sizeof(void *), device_id);
1624         if (err != HSA_STATUS_SUCCESS)
1625           DP("Error when copying USM\n");
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
1629       }
1630 
1631       continue;
1632     }
1633 
    DP("Looking up kernel: %s (name length: %lu)\n", e->name, strlen(e->name));
1635 
    uint32_t kernarg_segment_size = 0;
    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id];
    hsa_status_t err = interop_hsa_get_kernel_info(
        KernelInfoMap, device_id, e->name,
        HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
        &kernarg_segment_size);
    check("Loading kernarg segment size", err);

    // Each arg is a void * in this OpenMP implementation.
    uint32_t arg_num = kernarg_segment_size / sizeof(void *);
    std::vector<size_t> arg_sizes(arg_num, sizeof(void *));
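    // Sketch of the kernarg block layout consumed by the dispatch code in
    // __tgt_rtl_run_target_team_region_locked below: the explicit args occupy
    // one pointer-sized slot each, followed by the implicit argument block.
    //   [ void *arg0 | void *arg1 | ... | impl_implicit_args_t ]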
1650 
    // default value GENERIC (in case the symbol is missing from the image)
1652     llvm::omp::OMPTgtExecModeFlags ExecModeVal =
1653         llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
1654 
1655     // get flat group size if present, else Default_WG_Size
1656     int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1657 
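    // The compiler is expected to emit optional per-kernel companion globals
    // named <kernel>_kern_desc, <kernel>_wg_size and <kernel>_exec_mode; each
    // lookup below falls back to a default when its symbol is absent.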
1658     // get Kernel Descriptor if present.
    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
1660     struct KernDescValType {
1661       uint16_t Version;
1662       uint16_t TSize;
1663       uint16_t WG_Size;
1664     };
1665     struct KernDescValType KernDescVal;
1666     std::string KernDescNameStr(e->name);
1667     KernDescNameStr += "_kern_desc";
1668     const char *KernDescName = KernDescNameStr.c_str();
1669 
1670     void *KernDescPtr;
1671     uint32_t KernDescSize;
1672     void *CallStackAddr = nullptr;
1673     err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1674                                   KernDescName, &KernDescPtr, &KernDescSize);
1675 
1676     if (err == HSA_STATUS_SUCCESS) {
1677       if ((size_t)KernDescSize != sizeof(KernDescVal))
1678         DP("Loading global computation properties '%s' - size mismatch (%u != "
1679            "%lu)\n",
1680            KernDescName, KernDescSize, sizeof(KernDescVal));
1681 
1682       memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
1683 
1684       // Check structure size against recorded size.
1685       if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);

      DP("After loading global for %s KernDesc\n", KernDescName);
1690       DP("KernDesc: Version: %d\n", KernDescVal.Version);
1691       DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
1692       DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
1693 
1694       if (KernDescVal.WG_Size == 0) {
1695         KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
1696         DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
1697       }
1698       WGSizeVal = KernDescVal.WG_Size;
1699       DP("WGSizeVal %d\n", WGSizeVal);
1700       check("Loading KernDesc computation property", err);
1701     } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found, "
         "falling back to the separate WGSize lookup\n",
         KernDescName);
1703 
1704       // Flat group size
1705       std::string WGSizeNameStr(e->name);
1706       WGSizeNameStr += "_wg_size";
1707       const char *WGSizeName = WGSizeNameStr.c_str();
1708 
1709       void *WGSizePtr;
1710       uint32_t WGSize;
1711       err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1712                                     WGSizeName, &WGSizePtr, &WGSize);
1713 
1714       if (err == HSA_STATUS_SUCCESS) {
1715         if ((size_t)WGSize != sizeof(int16_t)) {
          DP("Loading global computation properties '%s' - size mismatch "
             "(%u != %lu)\n",
             WGSizeName, WGSize, sizeof(int16_t));
1720           return NULL;
1721         }
1722 
1723         memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
1724 
1725         DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
1726 
1727         if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
1728             WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
1729           DP("Error wrong WGSize value specified in HSA code object file: "
1730              "%d\n",
1731              WGSizeVal);
1732           WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1733         }
1734       } else {
1735         DP("Warning: Loading WGSize '%s' - symbol not found, "
1736            "using default value %d\n",
1737            WGSizeName, WGSizeVal);
1738       }
1739 
1740       check("Loading WGSize computation property", err);
1741     }
1742 
1743     // Read execution mode from global in binary
1744     std::string ExecModeNameStr(e->name);
1745     ExecModeNameStr += "_exec_mode";
1746     const char *ExecModeName = ExecModeNameStr.c_str();
1747 
1748     void *ExecModePtr;
1749     uint32_t varsize;
1750     err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1751                                   ExecModeName, &ExecModePtr, &varsize);
1752 
1753     if (err == HSA_STATUS_SUCCESS) {
1754       if ((size_t)varsize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
        DP("Loading global computation properties '%s' - size mismatch "
           "(%u != %lu)\n",
           ExecModeName, varsize, sizeof(llvm::omp::OMPTgtExecModeFlags));
1758         return NULL;
1759       }
1760 
1761       memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
1762 
1763       DP("After loading global for %s ExecMode = %d\n", ExecModeName,
1764          ExecModeVal);
1765 
1766       if (ExecModeVal < 0 ||
1767           ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
1768         DP("Error wrong exec_mode value specified in HSA code object file: "
1769            "%d\n",
1770            ExecModeVal);
1771         return NULL;
1772       }
1773     } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n",
         ExecModeName);
1778     }
1779     check("Loading computation property", err);
1780 
1781     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
1782                                    CallStackAddr, e->name, kernarg_segment_size,
1783                                    DeviceInfo.KernArgPool));
1784     __tgt_offload_entry entry = *e;
1785     entry.addr = (void *)&KernelsList.back();
1786     DeviceInfo.addOffloadEntry(device_id, entry);
1787     DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
1788   }
1789 
1790   return DeviceInfo.getOffloadEntriesTable(device_id);
1791 }
1792 
1793 void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) {
1794   void *ptr = NULL;
1795   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1796 
1797   if (kind != TARGET_ALLOC_DEFAULT) {
1798     REPORT("Invalid target data allocation kind or requested allocator not "
1799            "implemented yet\n");
1800     return NULL;
1801   }
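  // Only TARGET_ALLOC_DEFAULT is accepted above; the more specific
  // TARGET_ALLOC_DEVICE / TARGET_ALLOC_HOST / TARGET_ALLOC_SHARED kinds are
  // rejected as not implemented.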
1802 
1803   hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(device_id);
1804   hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, &ptr);
1805   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
1806      (long long unsigned)(Elf64_Addr)ptr);
1807   ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL;
1808   return ptr;
1809 }
1810 
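// The synchronous data movement entry points below are implemented by running
// the asynchronous path on a locally constructed __tgt_async_info and then
// synchronizing before returning.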
1811 int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
1812                               int64_t size) {
1813   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1814   __tgt_async_info AsyncInfo;
1815   int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo);
1816   if (rc != OFFLOAD_SUCCESS)
1817     return OFFLOAD_FAIL;
1818 
1819   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1820 }
1821 
1822 int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
1823                                     int64_t size, __tgt_async_info *AsyncInfo) {
1824   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1825   if (AsyncInfo) {
1826     initAsyncInfo(AsyncInfo);
1827     return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo);
1828   } else {
1829     return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
1830   }
1831 }
1832 
1833 int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
1834                                 int64_t size) {
1835   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1836   __tgt_async_info AsyncInfo;
1837   int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo);
1838   if (rc != OFFLOAD_SUCCESS)
1839     return OFFLOAD_FAIL;
1840 
1841   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1842 }
1843 
1844 int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
1845                                       void *tgt_ptr, int64_t size,
1846                                       __tgt_async_info *AsyncInfo) {
1847   assert(AsyncInfo && "AsyncInfo is nullptr");
1848   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1849   initAsyncInfo(AsyncInfo);
1850   return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo);
1851 }
1852 
1853 int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
1854   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1855   hsa_status_t err;
1856   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
1857   err = core::Runtime::Memfree(tgt_ptr);
1858   if (err != HSA_STATUS_SUCCESS) {
    DP("Error when freeing device memory\n");
1860     return OFFLOAD_FAIL;
1861   }
1862   return OFFLOAD_SUCCESS;
1863 }
1864 
1865 // Determine launch values for kernel.
1866 struct launchVals {
1867   int WorkgroupSize;
1868   int GridSize;
1869 };
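// getLaunchVals applies the heuristics below to pick the dispatch geometry.
// WorkgroupSize is the number of work-items per group; GridSize is the total
// number of work-items (WorkgroupSize * number of groups), i.e. the value
// written to the dispatch packet's grid_size_x.
//
// Worked example (all inputs hypothetical): with no environment overrides,
// ConstWGSize = 256, SPMD execution mode, num_teams = 0, thread_limit = 0,
// loop_tripcount = 100000 and DeviceNumTeams >= 391, threadsPerGroup stays at
// Default_WG_Size (taken as 256) and
// num_groups = (100000 - 1) / 256 + 1 = 391, giving
// WorkgroupSize = 256 and GridSize = 391 * 256 = 100096.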
1870 launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1871                          int ConstWGSize,
1872                          llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1873                          int num_teams, int thread_limit,
1874                          uint64_t loop_tripcount, int DeviceNumTeams) {
1875 
1876   int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
1877   int num_groups = 0;
1878 
1879   int Max_Teams =
1880       Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1881   if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
1882     Max_Teams = RTLDeviceInfoTy::HardTeamLimit;
1883 
1884   if (print_kernel_trace & STARTUP_DETAILS) {
1885     DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
1886     DP("Max_Teams: %d\n", Max_Teams);
1887     DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
1888     DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1889     DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
1890        RTLDeviceInfoTy::Default_WG_Size);
1891     DP("thread_limit: %d\n", thread_limit);
1892     DP("threadsPerGroup: %d\n", threadsPerGroup);
1893     DP("ConstWGSize: %d\n", ConstWGSize);
1894   }
1895   // check for thread_limit() clause
1896   if (thread_limit > 0) {
1897     threadsPerGroup = thread_limit;
1898     DP("Setting threads per block to requested %d\n", thread_limit);
1899     // Add master warp for GENERIC
1900     if (ExecutionMode ==
1901         llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1902       threadsPerGroup += WarpSize;
1903       DP("Adding master wavefront: +%d threads\n", WarpSize);
1904     }
1905     if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
1906       threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
1907       DP("Setting threads per block to maximum %d\n", threadsPerGroup);
1908     }
1909   }
1910   // check flat_max_work_group_size attr here
1911   if (threadsPerGroup > ConstWGSize) {
1912     threadsPerGroup = ConstWGSize;
1913     DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1914        threadsPerGroup);
1915   }
1916   if (print_kernel_trace & STARTUP_DETAILS)
1917     DP("threadsPerGroup: %d\n", threadsPerGroup);
1918   DP("Preparing %d threads\n", threadsPerGroup);
1919 
1920   // Set default num_groups (teams)
1921   if (Env.TeamLimit > 0)
1922     num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
1923   else
1924     num_groups = Max_Teams;
1925   DP("Set default num of groups %d\n", num_groups);
1926 
1927   if (print_kernel_trace & STARTUP_DETAILS) {
1928     DP("num_groups: %d\n", num_groups);
1929     DP("num_teams: %d\n", num_teams);
1930   }
1931 
  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
  // This reduction is typical for the default case (no thread_limit clause)
  // or when the user requests a very large num_teams clause.
  // FIXME: We can't distinguish between a constant or variable thread limit,
  // so we only handle constant thread_limits.
1937   if (threadsPerGroup >
1938       RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
1939     // Should we round threadsPerGroup up to nearest WarpSize
1940     // here?
1941     num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
1942 
1943   // check for num_teams() clause
1944   if (num_teams > 0) {
1945     num_groups = (num_teams < num_groups) ? num_teams : num_groups;
1946   }
1947   if (print_kernel_trace & STARTUP_DETAILS) {
1948     DP("num_groups: %d\n", num_groups);
1949     DP("Env.NumTeams %d\n", Env.NumTeams);
1950     DP("Env.TeamLimit %d\n", Env.TeamLimit);
1951   }
1952 
1953   if (Env.NumTeams > 0) {
1954     num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups;
1955     DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
1956   } else if (Env.TeamLimit > 0) {
1957     num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
1959   } else {
1960     if (num_teams <= 0) {
1961       if (loop_tripcount > 0) {
1962         if (ExecutionMode ==
1963             llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1964           // round up to the nearest integer
1965           num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
1966         } else if (ExecutionMode ==
1967                    llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1968           num_groups = loop_tripcount;
1969         } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1970           // This is a generic kernel that was transformed to use SPMD-mode
1971           // execution but uses Generic-mode semantics for scheduling.
1972           num_groups = loop_tripcount;
1973         }
1974         DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
1975            "threads per block %d\n",
1976            num_groups, loop_tripcount, threadsPerGroup);
1977       }
1978     } else {
1979       num_groups = num_teams;
1980     }
1981     if (num_groups > Max_Teams) {
1982       num_groups = Max_Teams;
1983       if (print_kernel_trace & STARTUP_DETAILS)
1984         DP("Limiting num_groups %d to Max_Teams %d \n", num_groups, Max_Teams);
1985     }
1986     if (num_groups > num_teams && num_teams > 0) {
1987       num_groups = num_teams;
1988       if (print_kernel_trace & STARTUP_DETAILS)
1989         DP("Limiting num_groups %d to clause num_teams %d \n", num_groups,
1990            num_teams);
1991     }
1992   }
1993 
1994   // num_teams clause always honored, no matter what, unless DEFAULT is active.
1995   if (num_teams > 0) {
1996     num_groups = num_teams;
1997     // Cap num_groups to EnvMaxTeamsDefault if set.
1998     if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
1999       num_groups = Env.MaxTeamsDefault;
2000   }
2001   if (print_kernel_trace & STARTUP_DETAILS) {
2002     DP("threadsPerGroup: %d\n", threadsPerGroup);
2003     DP("num_groups: %d\n", num_groups);
    DP("loop_tripcount: %" PRIu64 "\n", loop_tripcount);
2005   }
2006   DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
2007      threadsPerGroup);
2008 
2009   launchVals res;
2010   res.WorkgroupSize = threadsPerGroup;
2011   res.GridSize = threadsPerGroup * num_groups;
2012   return res;
2013 }
2014 
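// Reserve a packet slot on the queue: bump the write index, then spin until
// the ring buffer has room for the reserved slot, i.e. until packet_id is
// within queue->size packets of the read index.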
2015 static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
2016   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
2017   bool full = true;
2018   while (full) {
2019     full =
2020         packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
2021   }
2022   return packet_id;
2023 }
2024 
2025 static int32_t __tgt_rtl_run_target_team_region_locked(
2026     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
2027     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
2028     int32_t thread_limit, uint64_t loop_tripcount);
2029 
2030 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
2031                                          void **tgt_args,
2032                                          ptrdiff_t *tgt_offsets,
2033                                          int32_t arg_num, int32_t num_teams,
2034                                          int32_t thread_limit,
2035                                          uint64_t loop_tripcount) {
2036 
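  // Kernel launches take the shared (reader) side of load_run_lock; binary
  // loads are expected to hold the exclusive side, so a launch cannot race
  // with an image being (re)loaded.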
2037   DeviceInfo.load_run_lock.lock_shared();
2038   int32_t res = __tgt_rtl_run_target_team_region_locked(
2039       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
2040       thread_limit, loop_tripcount);
2041 
2042   DeviceInfo.load_run_lock.unlock_shared();
2043   return res;
2044 }
2045 
2046 int32_t __tgt_rtl_run_target_team_region_locked(
2047     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
2048     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
2049     int32_t thread_limit, uint64_t loop_tripcount) {
  // Prepare the kernel arguments, determine the launch geometry, dispatch the
  // kernel on this device's HSA queue and wait for it to complete.
2053 
2054   DP("Run target team region thread_limit %d\n", thread_limit);
2055 
2056   // All args are references.
2057   std::vector<void *> args(arg_num);
2058   std::vector<void *> ptrs(arg_num);
2059 
2060   DP("Arg_num: %d\n", arg_num);
2061   for (int32_t i = 0; i < arg_num; ++i) {
2062     ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
2063     args[i] = &ptrs[i];
    DP("Offset base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
2065   }
2066 
2067   KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
2068 
2069   std::string kernel_name = std::string(KernelInfo->Name);
2070   auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
2071   if (KernelInfoTable[device_id].find(kernel_name) ==
2072       KernelInfoTable[device_id].end()) {
2073     DP("Kernel %s not found\n", kernel_name.c_str());
2074     return OFFLOAD_FAIL;
2075   }
2076 
2077   const atl_kernel_info_t KernelInfoEntry =
2078       KernelInfoTable[device_id][kernel_name];
2079   const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
2080   const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
2081   const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
2082   const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
2083   const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
2084 
2085   assert(arg_num == (int)KernelInfoEntry.num_args);
2086 
2087   /*
2088    * Set limit based on ThreadsPerGroup and GroupsPerDevice
2089    */
2090   launchVals LV =
2091       getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env,
2092                     KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
2093                     num_teams,      // From run_region arg
2094                     thread_limit,   // From run_region arg
2095                     loop_tripcount, // From run_region arg
2096                     DeviceInfo.NumTeams[KernelInfo->device_id]);
2097   const int GridSize = LV.GridSize;
2098   const int WorkgroupSize = LV.WorkgroupSize;
2099 
2100   if (print_kernel_trace >= LAUNCH) {
2101     int num_groups = GridSize / WorkgroupSize;
    // SGN prints KernelInfo->ExecutionMode: GENERIC (1), SPMD (2) or
    // GENERIC_SPMD (3)
2103     // if doing rtl timing, print to stderr, unless stdout requested.
2104     bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
2105     fprintf(traceToStdout ? stdout : stderr,
2106             "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
2107             "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
2108             "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
2109             device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
2110             arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
2111             group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
2112             vgpr_spill_count, loop_tripcount, KernelInfo->Name);
2113   }
2114 
2115   // Run on the device.
2116   {
2117     hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id].get();
2118     if (!queue) {
2119       return OFFLOAD_FAIL;
2120     }
2121     uint64_t packet_id = acquire_available_packet_id(queue);
2122 
2123     const uint32_t mask = queue->size - 1; // size is a power of 2
2124     hsa_kernel_dispatch_packet_t *packet =
2125         (hsa_kernel_dispatch_packet_t *)queue->base_address +
2126         (packet_id & mask);
2127 
2128     // packet->header is written last
2129     packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
2130     packet->workgroup_size_x = WorkgroupSize;
2131     packet->workgroup_size_y = 1;
2132     packet->workgroup_size_z = 1;
2133     packet->reserved0 = 0;
2134     packet->grid_size_x = GridSize;
2135     packet->grid_size_y = 1;
2136     packet->grid_size_z = 1;
2137     packet->private_segment_size = KernelInfoEntry.private_segment_size;
2138     packet->group_segment_size = KernelInfoEntry.group_segment_size;
2139     packet->kernel_object = KernelInfoEntry.kernel_object;
2140     packet->kernarg_address = 0;     // use the block allocator
2141     packet->reserved2 = 0;           // impl writes id_ here
2142     packet->completion_signal = {0}; // may want a pool of signals
2143 
2144     KernelArgPool *ArgPool = nullptr;
2145     void *kernarg = nullptr;
2146     {
2147       auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
2148       if (it != KernelArgPoolMap.end()) {
2149         ArgPool = (it->second).get();
2150       }
2151     }
2152     if (!ArgPool) {
2153       DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
2154          device_id);
2155     }
2156     {
2157       if (ArgPool) {
2158         assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
2159         kernarg = ArgPool->allocate(arg_num);
2160       }
2161       if (!kernarg) {
2162         DP("Allocate kernarg failed\n");
2163         return OFFLOAD_FAIL;
2164       }
2165 
2166       // Copy explicit arguments
2167       for (int i = 0; i < arg_num; i++) {
2168         memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
2169       }
2170 
2171       // Initialize implicit arguments. TODO: Which of these can be dropped
2172       impl_implicit_args_t *impl_args =
2173           reinterpret_cast<impl_implicit_args_t *>(
2174               static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
2175       memset(impl_args, 0,
2176              sizeof(impl_implicit_args_t)); // may not be necessary
2177       impl_args->offset_x = 0;
2178       impl_args->offset_y = 0;
2179       impl_args->offset_z = 0;
2180 
2181       // assign a hostcall buffer for the selected Q
2182       if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) {
2183         // hostrpc_assign_buffer is not thread safe, and this function is
2184         // under a multiple reader lock, not a writer lock.
2185         static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
2186         pthread_mutex_lock(&hostcall_init_lock);
2187         impl_args->hostcall_ptr = hostrpc_assign_buffer(
2188             DeviceInfo.HSAAgents[device_id], queue, device_id);
2189         pthread_mutex_unlock(&hostcall_init_lock);
2190         if (!impl_args->hostcall_ptr) {
          DP("hostrpc_assign_buffer failed, the GPU would dereference a null "
             "hostcall pointer and fault\n");
2193           return OFFLOAD_FAIL;
2194         }
2195       }
2196 
2197       packet->kernarg_address = kernarg;
2198     }
2199 
2200     hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
2201     if (s.handle == 0) {
2202       DP("Failed to get signal instance\n");
2203       return OFFLOAD_FAIL;
2204     }
2205     packet->completion_signal = s;
2206     hsa_signal_store_relaxed(packet->completion_signal, 1);
2207 
2208     // Publish the packet indicating it is ready to be processed
2209     core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
2210                                core::create_header(), packet->setup);
2211 
2212     // Since the packet is already published, its contents must not be
2213     // accessed any more
2214     hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
2215 
2216     while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
2217                                      HSA_WAIT_STATE_BLOCKED) != 0)
2218       ;
2219 
2220     assert(ArgPool);
2221     ArgPool->deallocate(kernarg);
2222     DeviceInfo.FreeSignalPool.push(s);
2223   }
2224 
2225   DP("Kernel completed\n");
2226   return OFFLOAD_SUCCESS;
2227 }
2228 
2229 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
2230                                     void **tgt_args, ptrdiff_t *tgt_offsets,
2231                                     int32_t arg_num) {
  // Use one team and the default number of threads (thread_limit = 0).
2234   int32_t team_num = 1;
2235   int32_t thread_limit = 0; // use default
2236   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
2237                                           tgt_offsets, arg_num, team_num,
2238                                           thread_limit, 0);
2239 }
2240 
2241 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
2242                                           void *tgt_entry_ptr, void **tgt_args,
2243                                           ptrdiff_t *tgt_offsets,
2244                                           int32_t arg_num,
2245                                           __tgt_async_info *AsyncInfo) {
2246   assert(AsyncInfo && "AsyncInfo is nullptr");
2247   initAsyncInfo(AsyncInfo);
2248 
  // Use one team and the default number of threads (thread_limit = 0).
2251   int32_t team_num = 1;
2252   int32_t thread_limit = 0; // use default
2253   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
2254                                           tgt_offsets, arg_num, team_num,
2255                                           thread_limit, 0);
2256 }
2257 
2258 int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {
2259   assert(AsyncInfo && "AsyncInfo is nullptr");
2260 
  // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
  // invariant is not ensured by devices.cpp for amdgcn.
2263   // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2264   if (AsyncInfo->Queue) {
2265     finiAsyncInfo(AsyncInfo);
2266   }
2267   return OFFLOAD_SUCCESS;
2268 }
2269 
2270 namespace core {
2271 hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
2272   return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
2273                                      &DeviceInfo.HSAAgents[0], NULL, ptr);
2274 }
2275 
2276 } // namespace core
2277