1 //===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for hsa machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <assert.h>
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 #include <elf.h>
19 #include <fstream>
20 #include <functional>
21 #include <iostream>
22 #include <libelf.h>
23 #include <list>
24 #include <memory>
25 #include <mutex>
26 #include <shared_mutex>
27 #include <unordered_map>
28 #include <vector>
29 
// Headers from the ATMI interface
31 #include "atmi_interop_hsa.h"
32 #include "atmi_runtime.h"
33 
34 #include "internal.h"
35 
36 #include "Debug.h"
37 #include "get_elf_mach_gfx_name.h"
38 #include "machine.h"
39 #include "omptargetplugin.h"
40 #include "print_tracing.h"
41 
42 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
43 
44 #ifndef TARGET_NAME
45 #define TARGET_NAME AMDHSA
46 #endif
47 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
48 
// hostrpc interface, FIXME: consider moving to its own include. These are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
// linked as --whole-archive to override the weak symbols that are used to
// implement a fallback for toolchains that do not yet have a hostrpc library.
53 extern "C" {
54 unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q,
55                                     uint32_t device_id);
56 hsa_status_t hostrpc_init();
57 hsa_status_t hostrpc_terminate();
58 
59 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
60 __attribute__((weak)) hsa_status_t hostrpc_terminate() {
61   return HSA_STATUS_SUCCESS;
62 }
63 __attribute__((weak)) unsigned long
64 hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) {
65   DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
66      "missing\n",
67      device_id);
68   return 0;
69 }
70 }
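
// Linking sketch (illustrative only, not the verbatim build rule): when a
// hostrpc library is present, its strong definitions are pulled in whole,
// roughly along the lines of
//   -Wl,--whole-archive hostrpc_services.a -Wl,--no-whole-archive
// so that they replace the weak stubs above. Without it, the weak stubs keep
// the plugin linkable and hostrpc effectively disabled.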
71 
72 // Heuristic parameters used for kernel launch
73 // Number of teams per CU to allow scheduling flexibility
74 static const unsigned DefaultTeamsPerCU = 4;
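
// For illustration: a hypothetical device reporting 60 compute units would
// default to 4 * 60 = 240 teams in __tgt_rtl_init_device below, unless
// OMP_NUM_TEAMS, OMP_TARGET_TEAMS_PER_PROC or the device group limit changes
// that calculation.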
75 
76 int print_kernel_trace;
77 
78 #ifdef OMPTARGET_DEBUG
79 #define check(msg, status)                                                     \
80   if (status != HSA_STATUS_SUCCESS) {                                          \
81     DP(#msg " failed\n");                                                      \
82   } else {                                                                     \
83     DP(#msg " succeeded\n");                                                   \
84   }
85 #else
86 #define check(msg, status)                                                     \
87   {}
88 #endif
89 
90 #include "elf_common.h"
91 
92 namespace core {
93 hsa_status_t RegisterModuleFromMemory(
94     std::map<std::string, atl_kernel_info_t> &KernelInfo,
95     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, void *, size_t,
96     hsa_agent_t agent,
97     hsa_status_t (*on_deserialized_data)(void *data, size_t size,
98                                          void *cb_state),
99     void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
100 }
101 
102 namespace hsa {
103 template <typename C> hsa_status_t iterate_agents(C cb) {
104   auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t {
105     C *unwrapped = static_cast<C *>(data);
106     return (*unwrapped)(agent);
107   };
108   return hsa_iterate_agents(L, static_cast<void *>(&cb));
109 }
110 
111 template <typename C>
112 hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
113   auto L = [](hsa_amd_memory_pool_t MemoryPool, void *data) -> hsa_status_t {
114     C *unwrapped = static_cast<C *>(data);
115     return (*unwrapped)(MemoryPool);
116   };
117 
118   return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&cb));
119 }
120 
121 } // namespace hsa
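
// Minimal usage sketch (illustrative only; the helper name is hypothetical):
// the wrappers above forward a C++ callable through the void* user-data
// pointer of the underlying C callback, so agents can be enumerated with a
// capturing lambda. FindAgents below uses this pattern.
#if 0
int countGpuAgents() {
  int GpuCount = 0;
  hsa_status_t Err =
      hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
        hsa_device_type_t Type;
        if (hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &Type) ==
                HSA_STATUS_SUCCESS &&
            Type == HSA_DEVICE_TYPE_GPU)
          GpuCount++;
        return HSA_STATUS_SUCCESS;
      });
  return (Err == HSA_STATUS_SUCCESS) ? GpuCount : -1;
}
#endif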
122 
123 /// Keep entries table per device
124 struct FuncOrGblEntryTy {
125   __tgt_target_table Table;
126   std::vector<__tgt_offload_entry> Entries;
127 };
128 
129 enum ExecutionModeType {
130   SPMD,    // constructors, destructors,
131            // combined constructs (`teams distribute parallel for [simd]`)
132   GENERIC, // everything else
133   NONE
134 };
135 
136 struct KernelArgPool {
137 private:
138   static pthread_mutex_t mutex;
139 
140 public:
141   uint32_t kernarg_segment_size;
142   void *kernarg_region = nullptr;
143   std::queue<int> free_kernarg_segments;
144 
145   uint32_t kernarg_size_including_implicit() {
146     return kernarg_segment_size + sizeof(atmi_implicit_args_t);
147   }
148 
149   ~KernelArgPool() {
150     if (kernarg_region) {
151       auto r = hsa_amd_memory_pool_free(kernarg_region);
152       if (r != HSA_STATUS_SUCCESS) {
153         DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(r));
154       }
155     }
156   }
157 
158   // Can't really copy or move a mutex
159   KernelArgPool() = default;
160   KernelArgPool(const KernelArgPool &) = delete;
161   KernelArgPool(KernelArgPool &&) = delete;
162 
163   KernelArgPool(uint32_t kernarg_segment_size,
164                 hsa_amd_memory_pool_t &memory_pool)
165       : kernarg_segment_size(kernarg_segment_size) {
166 
    // atmi uses one pool per kernel for all gpus, with a fixed upper size.
    // That exact scheme is preserved here, including the queue<int>.
169 
170     hsa_status_t err = hsa_amd_memory_pool_allocate(
171         memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
172         &kernarg_region);
173 
174     if (err != HSA_STATUS_SUCCESS) {
175       DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(err));
176       kernarg_region = nullptr; // paranoid
177       return;
178     }
179 
180     err = core::allow_access_to_all_gpu_agents(kernarg_region);
181     if (err != HSA_STATUS_SUCCESS) {
182       DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
183          get_error_string(err));
184       auto r = hsa_amd_memory_pool_free(kernarg_region);
185       if (r != HSA_STATUS_SUCCESS) {
186         // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(r));
188       }
189       kernarg_region = nullptr;
190       return;
191     }
192 
193     for (int i = 0; i < MAX_NUM_KERNELS; i++) {
194       free_kernarg_segments.push(i);
195     }
196   }
197 
198   void *allocate(uint64_t arg_num) {
199     assert((arg_num * sizeof(void *)) == kernarg_segment_size);
200     lock l(&mutex);
201     void *res = nullptr;
202     if (!free_kernarg_segments.empty()) {
203 
204       int free_idx = free_kernarg_segments.front();
205       res = static_cast<void *>(static_cast<char *>(kernarg_region) +
206                                 (free_idx * kernarg_size_including_implicit()));
207       assert(free_idx == pointer_to_index(res));
208       free_kernarg_segments.pop();
209     }
210     return res;
211   }
212 
213   void deallocate(void *ptr) {
214     lock l(&mutex);
215     int idx = pointer_to_index(ptr);
216     free_kernarg_segments.push(idx);
217   }
218 
219 private:
220   int pointer_to_index(void *ptr) {
221     ptrdiff_t bytes =
222         static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
223     assert(bytes >= 0);
224     assert(bytes % kernarg_size_including_implicit() == 0);
225     return bytes / kernarg_size_including_implicit();
226   }
227   struct lock {
228     lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
229     ~lock() { pthread_mutex_unlock(m); }
230     pthread_mutex_t *m;
231   };
232 };
233 pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;
234 
235 std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
236     KernelArgPoolMap;
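
// Minimal usage sketch (illustrative only; the kernel name, Args and ArgNum
// are hypothetical): a launch path looks the pool up by kernel name, borrows
// one kernarg segment sized for ArgNum pointer arguments, fills it in, and
// returns it once the dispatch has completed.
#if 0
void exampleLaunchPath(void **Args, uint64_t ArgNum) {
  KernelArgPool *ArgPool = KernelArgPoolMap["example_kernel"].get();
  void *KernArg = ArgPool->allocate(ArgNum);
  if (KernArg) {
    memcpy(KernArg, Args, ArgNum * sizeof(void *));
    // ... enqueue the dispatch packet pointing at KernArg ...
    ArgPool->deallocate(KernArg);
  }
}
#endif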
237 
238 /// Use a single entity to encode a kernel and a set of flags
239 struct KernelTy {
240   // execution mode of kernel
241   // 0 - SPMD mode (without master warp)
242   // 1 - Generic mode (with master warp)
243   int8_t ExecutionMode;
244   int16_t ConstWGSize;
245   int32_t device_id;
246   void *CallStackAddr = nullptr;
247   const char *Name;
248 
249   KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
250            void *_CallStackAddr, const char *_Name,
251            uint32_t _kernarg_segment_size,
252            hsa_amd_memory_pool_t &KernArgMemoryPool)
253       : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
254         device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
255     DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
256 
257     std::string N(_Name);
258     if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
259       KernelArgPoolMap.insert(
260           std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
261                                 _kernarg_segment_size, KernArgMemoryPool))));
262     }
263   }
264 };
265 
266 /// List that contains all the kernels.
267 /// FIXME: we may need this to be per device and per library.
268 std::list<KernelTy> KernelsList;
269 
270 template <typename Callback> static hsa_status_t FindAgents(Callback CB) {
271 
272   hsa_status_t err =
273       hsa::iterate_agents([&](hsa_agent_t agent) -> hsa_status_t {
274         hsa_device_type_t device_type;
275         // get_info fails iff HSA runtime not yet initialized
276         hsa_status_t err =
277             hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
278         if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
279           printf("rtl.cpp: err %d\n", err);
280         assert(err == HSA_STATUS_SUCCESS);
281 
282         CB(device_type, agent);
283         return HSA_STATUS_SUCCESS;
284       });
285 
286   // iterate_agents fails iff HSA runtime not yet initialized
287   if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) {
288     printf("rtl.cpp: err %d\n", err);
289   }
290 
291   return err;
292 }
293 
294 static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
295                           void *data) {
296   if (status != HSA_STATUS_SUCCESS) {
297     const char *status_string;
298     if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) {
299       status_string = "unavailable";
300     }
301     fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__,
302             __LINE__, source, status, status_string);
303     abort();
304   }
305 }
306 
307 namespace core {
308 namespace {
309 void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
310   __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
311 }
312 
313 uint16_t create_header() {
314   uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
315   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
316   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
317   return header;
318 }
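
// Minimal dispatch sketch (illustrative only; the helper name and arguments
// are hypothetical): the header produced by create_header is published
// atomically together with the 16-bit setup word so the packet processor
// never observes a partially written header.
#if 0
void publishDispatchPacket(hsa_queue_t *Queue,
                           hsa_kernel_dispatch_packet_t *Packet,
                           uint64_t PacketId) {
  // Workgroup size, grid size, kernel object and kernarg address are assumed
  // to have been filled in already.
  uint16_t Setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet_store_release(reinterpret_cast<uint32_t *>(Packet), create_header(),
                       Setup);
  hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
}
#endif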
319 
320 hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
321   std::vector<hsa_amd_memory_pool_t> *Result =
322       static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
323   bool AllocAllowed = false;
324   hsa_status_t err = hsa_amd_memory_pool_get_info(
325       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
326       &AllocAllowed);
327   if (err != HSA_STATUS_SUCCESS) {
328     fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
329             get_error_string(err));
330     return err;
331   }
332 
333   if (!AllocAllowed) {
334     // nothing needs to be done here.
335     return HSA_STATUS_SUCCESS;
336   }
337 
338   uint32_t GlobalFlags = 0;
339   err = hsa_amd_memory_pool_get_info(
340       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
341   if (err != HSA_STATUS_SUCCESS) {
342     fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err));
343     return err;
344   }
345 
346   size_t size = 0;
347   err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
348                                      &size);
349   if (err != HSA_STATUS_SUCCESS) {
350     fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err));
351     return err;
352   }
353 
354   if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
355       (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) &&
356       size > 0) {
357     Result->push_back(MemoryPool);
358   }
359 
360   return HSA_STATUS_SUCCESS;
361 }
362 
363 std::pair<hsa_status_t, bool>
364 isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
365   bool AllocAllowed = false;
366   hsa_status_t Err = hsa_amd_memory_pool_get_info(
367       MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
368       &AllocAllowed);
369   if (Err != HSA_STATUS_SUCCESS) {
370     fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
371             get_error_string(Err));
372     return {Err, false};
373   }
374 
375   return {HSA_STATUS_SUCCESS, AllocAllowed};
376 }
377 
378 template <typename AccumulatorFunc>
379 hsa_status_t collectMemoryPools(const std::vector<hsa_agent_t> &Agents,
380                                 AccumulatorFunc Func) {
381   for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
382     hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
383         Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
384           hsa_status_t Err;
385           bool Valid = false;
386           std::tie(Err, Valid) = isValidMemoryPool(MemoryPool);
387           if (Err != HSA_STATUS_SUCCESS) {
388             return Err;
389           }
390           if (Valid)
391             Func(MemoryPool, DeviceId);
392           return HSA_STATUS_SUCCESS;
393         });
394 
395     if (Err != HSA_STATUS_SUCCESS) {
396       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
397              "Iterate all memory pools", get_error_string(Err));
398       return Err;
399     }
400   }
401 
402   return HSA_STATUS_SUCCESS;
403 }
404 
405 std::pair<hsa_status_t, hsa_amd_memory_pool_t>
406 FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
407   std::vector<hsa_amd_memory_pool_t> KernArgPools;
408   for (const auto &Agent : HSAAgents) {
409     hsa_status_t err = HSA_STATUS_SUCCESS;
410     err = hsa_amd_agent_iterate_memory_pools(
411         Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
412     if (err != HSA_STATUS_SUCCESS) {
413       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
414              "Iterate all memory pools", get_error_string(err));
415       return {err, hsa_amd_memory_pool_t{}};
416     }
417   }
418 
419   if (KernArgPools.empty()) {
420     fprintf(stderr, "Unable to find any valid kernarg pool\n");
421     return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}};
422   }
423 
424   return {HSA_STATUS_SUCCESS, KernArgPools[0]};
425 }
426 
427 } // namespace
428 } // namespace core
429 
430 struct EnvironmentVariables {
431   int NumTeams;
432   int TeamLimit;
433   int TeamThreadLimit;
434   int MaxTeamsDefault;
435 };
436 
437 /// Class containing all the device information
438 class RTLDeviceInfoTy {
439   std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
440 
441 public:
442   // load binary populates symbol tables and mutates various global state
443   // run uses those symbol tables
444   std::shared_timed_mutex load_run_lock;
445 
446   int NumberOfDevices;
447 
448   // GPU devices
449   std::vector<hsa_agent_t> HSAAgents;
450   std::vector<hsa_queue_t *> HSAQueues; // one per gpu
451 
452   // CPUs
453   std::vector<hsa_agent_t> CPUAgents;
454 
455   // Device properties
456   std::vector<int> ComputeUnits;
457   std::vector<int> GroupsPerDevice;
458   std::vector<int> ThreadsPerGroup;
459   std::vector<int> WarpSize;
460   std::vector<std::string> GPUName;
461 
462   // OpenMP properties
463   std::vector<int> NumTeams;
464   std::vector<int> NumThreads;
465 
466   // OpenMP Environment properties
467   EnvironmentVariables Env;
468 
469   // OpenMP Requires Flags
470   int64_t RequiresFlags;
471 
472   // Resource pools
473   SignalPoolT FreeSignalPool;
474 
475   bool hostcall_required = false;
476 
477   std::vector<hsa_executable_t> HSAExecutables;
478 
479   std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
480   std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
481 
482   hsa_amd_memory_pool_t KernArgPool;
483 
484   // fine grained memory pool for host allocations
485   hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
486 
487   // fine and coarse-grained memory pools per offloading device
488   std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
489   std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
490 
491   struct atmiFreePtrDeletor {
492     void operator()(void *p) {
493       core::Runtime::Memfree(p); // ignore failure to free
494     }
495   };
496 
497   // device_State shared across loaded binaries, error if inconsistent size
498   std::vector<std::pair<std::unique_ptr<void, atmiFreePtrDeletor>, uint64_t>>
499       deviceStateStore;
500 
501   static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K - 1, the largest value that fits in a uint16
503   static const int DefaultNumTeams = 128;
504   static const int Max_Teams =
505       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
506   static const int Warp_Size =
507       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
508   static const int Max_WG_Size =
509       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size];
510   static const int Default_WG_Size =
511       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
512 
513   using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
514                                       size_t size, hsa_agent_t);
515   hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
516                                      MemcpyFunc Func, int32_t deviceId) {
517     hsa_agent_t agent = HSAAgents[deviceId];
518     hsa_signal_t s = FreeSignalPool.pop();
519     if (s.handle == 0) {
520       return HSA_STATUS_ERROR;
521     }
522     hsa_status_t r = Func(s, dest, src, size, agent);
523     FreeSignalPool.push(s);
524     return r;
525   }
526 
527   hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
528                                          size_t size, int32_t deviceId) {
529     return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId);
530   }
531 
532   hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
533                                          size_t size, int32_t deviceId) {
534     return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId);
535   }
536 
537   // Record entry point associated with device
538   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
539     assert(device_id < (int32_t)FuncGblEntries.size() &&
540            "Unexpected device id!");
541     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
542 
543     E.Entries.push_back(entry);
544   }
545 
546   // Return true if the entry is associated with device
547   bool findOffloadEntry(int32_t device_id, void *addr) {
548     assert(device_id < (int32_t)FuncGblEntries.size() &&
549            "Unexpected device id!");
550     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
551 
552     for (auto &it : E.Entries) {
553       if (it.addr == addr)
554         return true;
555     }
556 
557     return false;
558   }
559 
560   // Return the pointer to the target entries table
561   __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
562     assert(device_id < (int32_t)FuncGblEntries.size() &&
563            "Unexpected device id!");
564     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
565 
566     int32_t size = E.Entries.size();
567 
568     // Table is empty
569     if (!size)
570       return 0;
571 
572     __tgt_offload_entry *begin = &E.Entries[0];
573     __tgt_offload_entry *end = &E.Entries[size - 1];
574 
575     // Update table info according to the entries and return the pointer
576     E.Table.EntriesBegin = begin;
577     E.Table.EntriesEnd = ++end;
578 
579     return &E.Table;
580   }
581 
582   // Clear entries table for a device
583   void clearOffloadEntriesTable(int device_id) {
584     assert(device_id < (int32_t)FuncGblEntries.size() &&
585            "Unexpected device id!");
586     FuncGblEntries[device_id].emplace_back();
587     FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
588     // KernelArgPoolMap.clear();
589     E.Entries.clear();
590     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
591   }
592 
593   hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
594                                    int DeviceId) {
    assert(DeviceId < DeviceFineGrainedMemoryPools.size() &&
           "Unexpected device id");
596     uint32_t GlobalFlags = 0;
597     hsa_status_t Err = hsa_amd_memory_pool_get_info(
598         MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
599 
600     if (Err != HSA_STATUS_SUCCESS) {
601       return Err;
602     }
603 
604     if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
605       DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
606     } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
607       DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
608     }
609 
610     return HSA_STATUS_SUCCESS;
611   }
612 
613   hsa_status_t addHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
614                                  int DeviceId) {
615     uint32_t GlobalFlags = 0;
616     hsa_status_t Err = hsa_amd_memory_pool_get_info(
617         MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
618 
619     if (Err != HSA_STATUS_SUCCESS) {
620       return Err;
621     }
622 
623     uint32_t Size;
624     Err = hsa_amd_memory_pool_get_info(MemoryPool,
625                                        HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size);
626     if (Err != HSA_STATUS_SUCCESS) {
627       return Err;
628     }
629 
630     if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED &&
631         Size > 0) {
632       HostFineGrainedMemoryPool = MemoryPool;
633     }
634 
635     return HSA_STATUS_SUCCESS;
636   }
637 
638   hsa_status_t setupMemoryPools() {
639     using namespace std::placeholders;
640     hsa_status_t Err;
641     Err = core::collectMemoryPools(
642         CPUAgents, std::bind(&RTLDeviceInfoTy::addHostMemoryPool, this, _1, _2));
643     if (Err != HSA_STATUS_SUCCESS) {
644       fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n",
645               get_error_string(Err));
646       return Err;
647     }
648     Err = core::collectMemoryPools(
649         HSAAgents, std::bind(&RTLDeviceInfoTy::addDeviceMemoryPool, this, _1, _2));
650     if (Err != HSA_STATUS_SUCCESS) {
651       fprintf(stderr,
652               "HSA error in collecting memory pools for offload devices: %s\n",
653               get_error_string(Err));
654       return Err;
655     }
656     return HSA_STATUS_SUCCESS;
657   }
658 
659   hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
660     assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
661            "Invalid device Id");
662     return DeviceCoarseGrainedMemoryPools[DeviceId];
663   }
664 
665   hsa_amd_memory_pool_t getHostMemoryPool() {
666     return HostFineGrainedMemoryPool;
667   }
668 
669   static int readEnvElseMinusOne(const char *Env) {
670     const char *envStr = getenv(Env);
671     int res = -1;
672     if (envStr) {
673       res = std::stoi(envStr);
674       DP("Parsed %s=%d\n", Env, res);
675     }
676     return res;
677   }
678 
679   RTLDeviceInfoTy() {
    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
    // at runtime; it does not require a debug build of the library.
    //  0 => no tracing
    //  1 => tracing dispatch only
    // >1 => verbosity increase
685     if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
686       print_kernel_trace = atoi(envStr);
687     else
688       print_kernel_trace = 0;
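
    // For example (illustrative): LIBOMPTARGET_KERNEL_TRACE=1 traces each
    // dispatch; the value is also tested as a bit mask against
    // STARTUP_DETAILS further below for extra per-device output.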
689 
690     DP("Start initializing HSA-ATMI\n");
691     hsa_status_t err = core::atl_init_gpu_context();
692     if (err != HSA_STATUS_SUCCESS) {
693       DP("Error when initializing HSA-ATMI\n");
694       return;
695     }
696 
697     // Init hostcall soon after initializing ATMI
698     hostrpc_init();
699 
700     err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
701       if (DeviceType == HSA_DEVICE_TYPE_CPU) {
702         CPUAgents.push_back(Agent);
703       } else {
704         HSAAgents.push_back(Agent);
705       }
706     });
707     if (err != HSA_STATUS_SUCCESS)
708       return;
709 
710     NumberOfDevices = (int)HSAAgents.size();
711 
712     if (NumberOfDevices == 0) {
713       DP("There are no devices supporting HSA.\n");
714       return;
715     } else {
716       DP("There are %d devices supporting HSA.\n", NumberOfDevices);
717     }
718     std::tie(err, KernArgPool) = core::FindKernargPool(CPUAgents);
719     if (err != HSA_STATUS_SUCCESS) {
720       DP("Error when reading memory pools\n");
721       return;
722     }
723 
724     // Init the device info
725     HSAQueues.resize(NumberOfDevices);
726     FuncGblEntries.resize(NumberOfDevices);
727     ThreadsPerGroup.resize(NumberOfDevices);
728     ComputeUnits.resize(NumberOfDevices);
729     GPUName.resize(NumberOfDevices);
730     GroupsPerDevice.resize(NumberOfDevices);
731     WarpSize.resize(NumberOfDevices);
732     NumTeams.resize(NumberOfDevices);
733     NumThreads.resize(NumberOfDevices);
734     deviceStateStore.resize(NumberOfDevices);
735     KernelInfoTable.resize(NumberOfDevices);
736     SymbolInfoTable.resize(NumberOfDevices);
737     DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
738     DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
739 
740     err = setupMemoryPools();
741     if (err != HSA_STATUS_SUCCESS) {
      DP("Error when setting up memory pools\n");
743       return;
744     }
745 
746     for (int i = 0; i < NumberOfDevices; i++) {
747       HSAQueues[i] = nullptr;
748     }
749 
750     for (int i = 0; i < NumberOfDevices; i++) {
751       uint32_t queue_size = 0;
752       {
753         hsa_status_t err = hsa_agent_get_info(
754             HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
755         if (err != HSA_STATUS_SUCCESS) {
756           DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i);
757           return;
758         }
759         if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
760           queue_size = core::Runtime::getInstance().getMaxQueueSize();
761         }
762       }
763 
764       hsa_status_t rc = hsa_queue_create(
765           HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL,
766           UINT32_MAX, UINT32_MAX, &HSAQueues[i]);
767       if (rc != HSA_STATUS_SUCCESS) {
768         DP("Failed to create HSA queue %d\n", i);
769         return;
770       }
771 
772       deviceStateStore[i] = {nullptr, 0};
773     }
774 
775     for (int i = 0; i < NumberOfDevices; i++) {
776       ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
777       GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
778       ComputeUnits[i] = 1;
779       DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i,
780          GroupsPerDevice[i], ThreadsPerGroup[i]);
781     }
782 
783     // Get environment variables regarding teams
784     Env.TeamLimit = readEnvElseMinusOne("OMP_TEAM_LIMIT");
785     Env.NumTeams = readEnvElseMinusOne("OMP_NUM_TEAMS");
786     Env.MaxTeamsDefault = readEnvElseMinusOne("OMP_MAX_TEAMS_DEFAULT");
787     Env.TeamThreadLimit = readEnvElseMinusOne("OMP_TEAMS_THREAD_LIMIT");
788 
789     // Default state.
790     RequiresFlags = OMP_REQ_UNDEFINED;
791   }
792 
793   ~RTLDeviceInfoTy() {
794     DP("Finalizing the HSA-ATMI DeviceInfo.\n");
795     // Run destructors on types that use HSA before
796     // atmi_finalize removes access to it
797     deviceStateStore.clear();
798     KernelArgPoolMap.clear();
799     // Terminate hostrpc before finalizing ATMI
800     hostrpc_terminate();
801 
802     hsa_status_t Err;
803     for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
804       Err = hsa_executable_destroy(HSAExecutables[I]);
805       if (Err != HSA_STATUS_SUCCESS) {
806         DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
807            "Destroying executable", get_error_string(Err));
808       }
809     }
810 
811     Err = hsa_shut_down();
812     if (Err != HSA_STATUS_SUCCESS) {
813       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA",
814              get_error_string(Err));
815     }
816   }
817 };
818 
819 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
820 
// TODO: May need to drop the trailing two fields until deviceRTL is updated
822 struct omptarget_device_environmentTy {
823   int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
824                        // only useful for Debug build of deviceRTLs
825   int32_t num_devices; // gets number of active offload devices
826   int32_t device_num;  // gets a value 0 to num_devices-1
827 };
828 
829 static RTLDeviceInfoTy DeviceInfo;
830 
831 namespace {
832 
833 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
834                      __tgt_async_info *AsyncInfo) {
835   assert(AsyncInfo && "AsyncInfo is nullptr");
836   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
837   // Return success if we are not copying back to host from target.
838   if (!HstPtr)
839     return OFFLOAD_SUCCESS;
840   hsa_status_t err;
841   DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
842      (long long unsigned)(Elf64_Addr)TgtPtr,
843      (long long unsigned)(Elf64_Addr)HstPtr);
844 
845   err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
846                                              DeviceId);
847 
848   if (err != HSA_STATUS_SUCCESS) {
849     DP("Error when copying data from device to host. Pointers: "
850        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
851        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
852     return OFFLOAD_FAIL;
853   }
854   DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
855      (long long unsigned)(Elf64_Addr)TgtPtr,
856      (long long unsigned)(Elf64_Addr)HstPtr);
857   return OFFLOAD_SUCCESS;
858 }
859 
860 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
861                    __tgt_async_info *AsyncInfo) {
862   assert(AsyncInfo && "AsyncInfo is nullptr");
863   hsa_status_t err;
864   assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
865   // Return success if we are not doing host to target.
866   if (!HstPtr)
867     return OFFLOAD_SUCCESS;
868 
869   DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
870      (long long unsigned)(Elf64_Addr)HstPtr,
871      (long long unsigned)(Elf64_Addr)TgtPtr);
872   err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
873                                              DeviceId);
874   if (err != HSA_STATUS_SUCCESS) {
875     DP("Error when copying data from host to device. Pointers: "
876        "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
877        (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
878     return OFFLOAD_FAIL;
879   }
880   return OFFLOAD_SUCCESS;
881 }
882 
883 // Async.
// The implementation was written with cuda streams in mind. The semantics of
// those are to execute kernels on a queue in order of insertion. A synchronize
// call then makes writes visible between host and device, which means a
// series of N data_submit_async calls is expected to execute serially. HSA
// offers various options to run the data copies concurrently. Exploiting them
// may require changes to libomptarget.
890 
891 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
892 // there are no outstanding kernels that need to be synchronized. Any async call
893 // may be passed a Queue==0, at which point the cuda implementation will set it
894 // to non-null (see getStream). The cuda streams are per-device. Upstream may
895 // change this interface to explicitly initialize the AsyncInfo_pointer, but
896 // until then hsa lazily initializes it as well.
897 
898 void initAsyncInfo(__tgt_async_info *AsyncInfo) {
899   // set non-null while using async calls, return to null to indicate completion
900   assert(AsyncInfo);
901   if (!AsyncInfo->Queue) {
902     AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
903   }
904 }
905 void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
906   assert(AsyncInfo);
907   assert(AsyncInfo->Queue);
908   AsyncInfo->Queue = 0;
909 }
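
// Minimal lifecycle sketch (illustrative only; the entry point name is
// hypothetical): an async interface function marks the queue as busy before
// doing the (currently blocking) copy, and the synchronize entry point is
// expected to call finiAsyncInfo once outstanding work has completed.
#if 0
int32_t exampleDataSubmitAsync(int32_t DeviceId, void *TgtPtr, void *HstPtr,
                               int64_t Size, __tgt_async_info *AsyncInfo) {
  initAsyncInfo(AsyncInfo); // Queue becomes non-null while work is in flight
  return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
}
#endif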
910 
911 bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
912   const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
913   int32_t r = elf_check_machine(image, amdgcnMachineID);
914   if (!r) {
915     DP("Supported machine ID not found\n");
916   }
917   return r;
918 }
919 
920 uint32_t elf_e_flags(__tgt_device_image *image) {
921   char *img_begin = (char *)image->ImageStart;
922   size_t img_size = (char *)image->ImageEnd - img_begin;
923 
924   Elf *e = elf_memory(img_begin, img_size);
925   if (!e) {
926     DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
927     return 0;
928   }
929 
930   Elf64_Ehdr *eh64 = elf64_getehdr(e);
931 
932   if (!eh64) {
933     DP("Unable to get machine ID from ELF file!\n");
934     elf_end(e);
935     return 0;
936   }
937 
938   uint32_t Flags = eh64->e_flags;
939 
940   elf_end(e);
941   DP("ELF Flags: 0x%x\n", Flags);
942   return Flags;
943 }
944 } // namespace
945 
946 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
947   return elf_machine_id_is_amdgcn(image);
948 }
949 
950 int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
951 
952 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
953   DP("Init requires flags to %ld\n", RequiresFlags);
954   DeviceInfo.RequiresFlags = RequiresFlags;
955   return RequiresFlags;
956 }
957 
958 namespace {
959 template <typename T> bool enforce_upper_bound(T *value, T upper) {
960   bool changed = *value > upper;
961   if (changed) {
962     *value = upper;
963   }
964   return changed;
965 }
966 } // namespace
967 
968 int32_t __tgt_rtl_init_device(int device_id) {
969   hsa_status_t err;
970 
971   // this is per device id init
972   DP("Initialize the device id: %d\n", device_id);
973 
974   hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];
975 
976   // Get number of Compute Unit
977   uint32_t compute_units = 0;
978   err = hsa_agent_get_info(
979       agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
980       &compute_units);
981   if (err != HSA_STATUS_SUCCESS) {
982     DeviceInfo.ComputeUnits[device_id] = 1;
    DP("Error getting compute units: setting to 1\n");
984   } else {
985     DeviceInfo.ComputeUnits[device_id] = compute_units;
    DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[device_id]);
987   }
988 
989   char GetInfoName[64]; // 64 max size returned by get info
990   err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
991                            (void *)GetInfoName);
  if (err != HSA_STATUS_SUCCESS) {
    DeviceInfo.GPUName[device_id] = "--unknown gpu--";
  } else {
    DeviceInfo.GPUName[device_id] = GetInfoName;
  }
997 
998   if (print_kernel_trace & STARTUP_DETAILS)
999     fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id,
1000             DeviceInfo.ComputeUnits[device_id],
1001             DeviceInfo.GPUName[device_id].c_str());
1002 
1003   // Query attributes to determine number of threads/block and blocks/grid.
1004   uint16_t workgroup_max_dim[3];
1005   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
1006                            &workgroup_max_dim);
1007   if (err != HSA_STATUS_SUCCESS) {
1008     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Error getting grid dims: using default num groups %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
1011   } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
1012     DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
1013     DP("Using %d ROCm blocks per grid\n",
1014        DeviceInfo.GroupsPerDevice[device_id]);
1015   } else {
1016     DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
1017     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
1018        "at the hard limit\n",
1019        workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
1020   }
1021 
1022   // Get thread limit
1023   hsa_dim3_t grid_max_dim;
1024   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
1025   if (err == HSA_STATUS_SUCCESS) {
    DeviceInfo.ThreadsPerGroup[device_id] =
        grid_max_dim.x / DeviceInfo.GroupsPerDevice[device_id];
1029 
1030     if (DeviceInfo.ThreadsPerGroup[device_id] == 0) {
1031       DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
1032       DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1033     } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id],
1034                                    RTLDeviceInfoTy::Max_WG_Size)) {
1035       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
1036     } else {
1037       DP("Using ROCm Queried thread limit: %d\n",
1038          DeviceInfo.ThreadsPerGroup[device_id]);
1039     }
1040   } else {
1041     DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
    DP("Error getting max block dimension, using default: %d\n",
       RTLDeviceInfoTy::Max_WG_Size);
1044   }
1045 
1046   // Get wavefront size
1047   uint32_t wavefront_size = 0;
1048   err =
1049       hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
1050   if (err == HSA_STATUS_SUCCESS) {
1051     DP("Queried wavefront size: %d\n", wavefront_size);
1052     DeviceInfo.WarpSize[device_id] = wavefront_size;
1053   } else {
1054     DP("Default wavefront size: %d\n",
1055        llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]);
1056     DeviceInfo.WarpSize[device_id] =
1057         llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
1058   }
1059 
1060   // Adjust teams to the env variables
1061 
1062   if (DeviceInfo.Env.TeamLimit > 0 &&
1063       (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id],
1064                            DeviceInfo.Env.TeamLimit))) {
1065     DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
1066        DeviceInfo.Env.TeamLimit);
1067   }
1068 
1069   // Set default number of teams
1070   if (DeviceInfo.Env.NumTeams > 0) {
1071     DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams;
1072     DP("Default number of teams set according to environment %d\n",
1073        DeviceInfo.Env.NumTeams);
1074   } else {
1075     char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
1076     int TeamsPerCU = DefaultTeamsPerCU;
1077     if (TeamsPerCUEnvStr) {
1078       TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
1079     }
1080 
1081     DeviceInfo.NumTeams[device_id] =
1082         TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
1083     DP("Default number of teams = %d * number of compute units %d\n",
1084        TeamsPerCU, DeviceInfo.ComputeUnits[device_id]);
1085   }
1086 
1087   if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id],
1088                           DeviceInfo.GroupsPerDevice[device_id])) {
1089     DP("Default number of teams exceeds device limit, capping at %d\n",
1090        DeviceInfo.GroupsPerDevice[device_id]);
1091   }
1092 
1093   // Adjust threads to the env variables
1094   if (DeviceInfo.Env.TeamThreadLimit > 0 &&
1095       (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
1096                            DeviceInfo.Env.TeamThreadLimit))) {
1097     DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
1098        DeviceInfo.Env.TeamThreadLimit);
1099   }
1100 
1101   // Set default number of threads
1102   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
1103   DP("Default number of threads set according to library's default %d\n",
1104      RTLDeviceInfoTy::Default_WG_Size);
1105   if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
1106                           DeviceInfo.ThreadsPerGroup[device_id])) {
1107     DP("Default number of threads exceeds device limit, capping at %d\n",
1108        DeviceInfo.ThreadsPerGroup[device_id]);
1109   }
1110 
1111   DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
1112      device_id, DeviceInfo.GroupsPerDevice[device_id],
1113      DeviceInfo.ThreadsPerGroup[device_id]);
1114 
1115   DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
1116      DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
1117      DeviceInfo.GroupsPerDevice[device_id],
1118      DeviceInfo.GroupsPerDevice[device_id] *
1119          DeviceInfo.ThreadsPerGroup[device_id]);
1120 
1121   return OFFLOAD_SUCCESS;
1122 }
1123 
1124 namespace {
1125 Elf64_Shdr *find_only_SHT_HASH(Elf *elf) {
1126   size_t N;
1127   int rc = elf_getshdrnum(elf, &N);
1128   if (rc != 0) {
1129     return nullptr;
1130   }
1131 
1132   Elf64_Shdr *result = nullptr;
1133   for (size_t i = 0; i < N; i++) {
1134     Elf_Scn *scn = elf_getscn(elf, i);
1135     if (scn) {
1136       Elf64_Shdr *shdr = elf64_getshdr(scn);
1137       if (shdr) {
1138         if (shdr->sh_type == SHT_HASH) {
1139           if (result == nullptr) {
1140             result = shdr;
1141           } else {
1142             // multiple SHT_HASH sections not handled
1143             return nullptr;
1144           }
1145         }
1146       }
1147     }
1148   }
1149   return result;
1150 }
1151 
1152 const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash,
1153                             const char *symname) {
1154 
1155   assert(section_hash);
1156   size_t section_symtab_index = section_hash->sh_link;
1157   Elf64_Shdr *section_symtab =
1158       elf64_getshdr(elf_getscn(elf, section_symtab_index));
1159   size_t section_strtab_index = section_symtab->sh_link;
1160 
1161   const Elf64_Sym *symtab =
1162       reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset);
1163 
1164   const uint32_t *hashtab =
1165       reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset);
1166 
1167   // Layout:
1168   // nbucket
1169   // nchain
1170   // bucket[nbucket]
1171   // chain[nchain]
1172   uint32_t nbucket = hashtab[0];
1173   const uint32_t *bucket = &hashtab[2];
1174   const uint32_t *chain = &hashtab[nbucket + 2];
1175 
1176   const size_t max = strlen(symname) + 1;
1177   const uint32_t hash = elf_hash(symname);
1178   for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) {
1179     char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name);
1180     if (strncmp(symname, n, max) == 0) {
1181       return &symtab[i];
1182     }
1183   }
1184 
1185   return nullptr;
1186 }
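
// Worked example (illustrative): with nbucket == 3 and elf_hash("foo") == 7,
// the walk starts at symtab[bucket[7 % 3]] and follows chain[] until the
// strtab entry matches "foo" or the chain index reaches 0 (STN_UNDEF).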
1187 
1188 struct symbol_info {
1189   void *addr = nullptr;
1190   uint32_t size = UINT32_MAX;
1191   uint32_t sh_type = SHT_NULL;
1192 };
1193 
1194 int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
1195                                     symbol_info *res) {
1196   if (elf_kind(elf) != ELF_K_ELF) {
1197     return 1;
1198   }
1199 
1200   Elf64_Shdr *section_hash = find_only_SHT_HASH(elf);
1201   if (!section_hash) {
1202     return 1;
1203   }
1204 
1205   const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname);
1206   if (!sym) {
1207     return 1;
1208   }
1209 
1210   if (sym->st_size > UINT32_MAX) {
1211     return 1;
1212   }
1213 
1214   if (sym->st_shndx == SHN_UNDEF) {
1215     return 1;
1216   }
1217 
1218   Elf_Scn *section = elf_getscn(elf, sym->st_shndx);
1219   if (!section) {
1220     return 1;
1221   }
1222 
1223   Elf64_Shdr *header = elf64_getshdr(section);
1224   if (!header) {
1225     return 1;
1226   }
1227 
1228   res->addr = sym->st_value + base;
1229   res->size = static_cast<uint32_t>(sym->st_size);
1230   res->sh_type = header->sh_type;
1231   return 0;
1232 }
1233 
1234 int get_symbol_info_without_loading(char *base, size_t img_size,
1235                                     const char *symname, symbol_info *res) {
1236   Elf *elf = elf_memory(base, img_size);
1237   if (elf) {
1238     int rc = get_symbol_info_without_loading(elf, base, symname, res);
1239     elf_end(elf);
1240     return rc;
1241   }
1242   return 1;
1243 }
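
// Minimal usage sketch (illustrative only; image and img_size are assumed to
// describe the __tgt_device_image being loaded): query a symbol of the
// unloaded image directly from host memory, as get_device_State_bytes does
// further below for the deviceRTL size variable.
#if 0
symbol_info SI;
if (get_symbol_info_without_loading((char *)image->ImageStart, img_size,
                                    "omptarget_nvptx_device_State_size",
                                    &SI) == 0)
  DP("symbol: %u bytes at host address %p\n", SI.size, SI.addr);
#endif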
1244 
1245 hsa_status_t interop_get_symbol_info(char *base, size_t img_size,
1246                                      const char *symname, void **var_addr,
1247                                      uint32_t *var_size) {
1248   symbol_info si;
1249   int rc = get_symbol_info_without_loading(base, img_size, symname, &si);
1250   if (rc == 0) {
1251     *var_addr = si.addr;
1252     *var_size = si.size;
1253     return HSA_STATUS_SUCCESS;
1254   } else {
1255     return HSA_STATUS_ERROR;
1256   }
1257 }
1258 
1259 template <typename C>
1260 hsa_status_t module_register_from_memory_to_place(
1261     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1262     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1263     void *module_bytes, size_t module_size, int DeviceId, C cb,
1264     std::vector<hsa_executable_t> &HSAExecutables) {
1265   auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t {
1266     C *unwrapped = static_cast<C *>(cb_state);
1267     return (*unwrapped)(data, size);
1268   };
1269   return core::RegisterModuleFromMemory(
1270       KernelInfoTable, SymbolInfoTable, module_bytes, module_size,
1271       DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&cb),
1272       HSAExecutables);
1273 }
1274 } // namespace
1275 
1276 static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) {
1277   uint64_t device_State_bytes = 0;
1278   {
1279     // If this is the deviceRTL, get the state variable size
1280     symbol_info size_si;
1281     int rc = get_symbol_info_without_loading(
1282         ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si);
1283 
1284     if (rc == 0) {
1285       if (size_si.size != sizeof(uint64_t)) {
1286         DP("Found device_State_size variable with wrong size\n");
1287         return 0;
1288       }
1289 
1290       // Read number of bytes directly from the elf
1291       memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t));
1292     }
1293   }
1294   return device_State_bytes;
1295 }
1296 
1297 static __tgt_target_table *
1298 __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);
1299 
1303 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1304                                           __tgt_device_image *image) {
1305   DeviceInfo.load_run_lock.lock();
1306   __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
1307   DeviceInfo.load_run_lock.unlock();
1308   return res;
1309 }
1310 
1311 struct device_environment {
  // Initialise an omptarget_device_environmentTy in the deviceRTL.
  // This patches around differences in the deviceRTL between trunk, aomp and
  // rocmcc. Over time these differences will tend to zero and this class can
  // be simplified.
1316   // Symbol may be in .data or .bss, and may be missing fields:
1317   //  - aomp has debug_level, num_devices, device_num
1318   //  - trunk has debug_level
1319   //  - under review in trunk is debug_level, device_num
1320   //  - rocmcc matches aomp, patch to swap num_devices and device_num
1321 
1322   // The symbol may also have been deadstripped because the device side
1323   // accessors were unused.
1324 
1325   // If the symbol is in .data (aomp, rocm) it can be written directly.
1326   // If it is in .bss, we must wait for it to be allocated space on the
1327   // gpu (trunk) and initialize after loading.
1328   const char *sym() { return "omptarget_device_environment"; }
1329 
1330   omptarget_device_environmentTy host_device_env;
1331   symbol_info si;
1332   bool valid = false;
1333 
1334   __tgt_device_image *image;
1335   const size_t img_size;
1336 
1337   device_environment(int device_id, int number_devices,
1338                      __tgt_device_image *image, const size_t img_size)
1339       : image(image), img_size(img_size) {
1340 
1341     host_device_env.num_devices = number_devices;
1342     host_device_env.device_num = device_id;
1343     host_device_env.debug_level = 0;
1344 #ifdef OMPTARGET_DEBUG
1345     if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
1346       host_device_env.debug_level = std::stoi(envStr);
1347     }
1348 #endif
1349 
1350     int rc = get_symbol_info_without_loading((char *)image->ImageStart,
1351                                              img_size, sym(), &si);
1352     if (rc != 0) {
1353       DP("Finding global device environment '%s' - symbol missing.\n", sym());
1354       return;
1355     }
1356 
1357     if (si.size > sizeof(host_device_env)) {
1358       DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size,
1359          sizeof(host_device_env));
1360       return;
1361     }
1362 
1363     valid = true;
1364   }
1365 
1366   bool in_image() { return si.sh_type != SHT_NOBITS; }
1367 
1368   hsa_status_t before_loading(void *data, size_t size) {
1369     if (valid) {
1370       if (in_image()) {
1371         DP("Setting global device environment before load (%u bytes)\n",
1372            si.size);
1373         uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
1374         void *pos = (char *)data + offset;
1375         memcpy(pos, &host_device_env, si.size);
1376       }
1377     }
1378     return HSA_STATUS_SUCCESS;
1379   }
1380 
1381   hsa_status_t after_loading() {
1382     if (valid) {
1383       if (!in_image()) {
1384         DP("Setting global device environment after load (%u bytes)\n",
1385            si.size);
1386         int device_id = host_device_env.device_num;
1387         auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
1388         void *state_ptr;
1389         uint32_t state_ptr_size;
1390         hsa_status_t err = atmi_interop_hsa_get_symbol_info(
1391             SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size);
1392         if (err != HSA_STATUS_SUCCESS) {
1393           DP("failed to find %s in loaded image\n", sym());
1394           return err;
1395         }
1396 
1397         if (state_ptr_size != si.size) {
1398           DP("Symbol had size %u before loading, %u after\n", state_ptr_size,
1399              si.size);
1400           return HSA_STATUS_ERROR;
1401         }
1402 
1403         return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env,
1404                                                     state_ptr_size, device_id);
1405       }
1406     }
1407     return HSA_STATUS_SUCCESS;
1408   }
1409 };
1410 
1411 static hsa_status_t atmi_calloc(void **ret_ptr, size_t size, int DeviceId) {
1412   uint64_t rounded = 4 * ((size + 3) / 4);
1413   void *ptr;
1414   hsa_status_t err = core::Runtime::DeviceMalloc(&ptr, rounded, DeviceId);
1415   if (err != HSA_STATUS_SUCCESS) {
1416     return err;
1417   }
1418 
1419   hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
1420   if (rc != HSA_STATUS_SUCCESS) {
1421     fprintf(stderr, "zero fill device_state failed with %u\n", rc);
1422     core::Runtime::Memfree(ptr);
1423     return HSA_STATUS_ERROR;
1424   }
1425 
1426   *ret_ptr = ptr;
1427   return HSA_STATUS_SUCCESS;
1428 }
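
// For illustration: a request of 10 bytes is rounded up to 12, and the zero
// fill is issued as 12 / 4 == 3 four-byte words via hsa_amd_memory_fill.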
1429 
1430 static bool image_contains_symbol(void *data, size_t size, const char *sym) {
1431   symbol_info si;
1432   int rc = get_symbol_info_without_loading((char *)data, size, sym, &si);
1433   return (rc == 0) && (si.addr != nullptr);
1434 }
1435 
1436 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
1437                                                  __tgt_device_image *image) {
1438   // This function loads the device image onto gpu[device_id] and does other
1439   // per-image initialization work. Specifically:
1440   //
1441   // - Initialize an omptarget_device_environmentTy instance embedded in the
1442   //   image at the symbol "omptarget_device_environment"
1443   //   Fields debug_level, device_num, num_devices. Used by the deviceRTL.
1444   //
1445   // - Allocate a large array per-gpu (could be moved to init_device)
1446   //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
1447   //   - Allocate at least that many bytes of gpu memory
1448   //   - Zero initialize it
1449   //   - Write the pointer to the symbol omptarget_nvptx_device_State
1450   //
1451   // - Pulls some per-kernel information together from various sources and
1452   //   records it in the KernelsList for quicker access later
1453   //
1454   // The initialization can be done before or after loading the image onto the
1455   // gpu. This function presently does a mixture. Using the hsa api to get/set
1456   // the information is simpler to implement, in exchange for more complicated
1457   // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
1458   // back from the gpu vs a hashtable lookup on the host.
1459 
1460   const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;
1461 
1462   DeviceInfo.clearOffloadEntriesTable(device_id);
1463 
1464   // We do not need to set the ELF version because the caller of this function
1465   // had to do that to decide the right runtime to use
1466 
1467   if (!elf_machine_id_is_amdgcn(image)) {
1468     return NULL;
1469   }
1470 
1471   {
1472     auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image,
1473                                   img_size);
1474 
1475     auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id];
1476     auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
1477     hsa_status_t err = module_register_from_memory_to_place(
1478         KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id,
1479         [&](void *data, size_t size) {
1480           if (image_contains_symbol(data, size, "needs_hostcall_buffer")) {
1481             __atomic_store_n(&DeviceInfo.hostcall_required, true,
1482                              __ATOMIC_RELEASE);
1483           }
1484           return env.before_loading(data, size);
1485         },
1486         DeviceInfo.HSAExecutables);
1487 
1488     check("Module registering", err);
1489     if (err != HSA_STATUS_SUCCESS) {
      fprintf(stderr,
              "Possible gpu arch mismatch: device:%s, image:%s; please check"
              " compiler flag: -march=<gpu>\n",
1493               DeviceInfo.GPUName[device_id].c_str(),
1494               get_elf_mach_gfx_name(elf_e_flags(image)));
1495       return NULL;
1496     }
1497 
1498     err = env.after_loading();
1499     if (err != HSA_STATUS_SUCCESS) {
1500       return NULL;
1501     }
1502   }
1503 
1504   DP("ATMI module successfully loaded!\n");
1505 
1506   {
    // The device_State array is either a large object in .bss or a void* that
    // needs to be pointed at an allocation of device_State_bytes bytes.
    // If absent, it has been deadstripped and needs no setup.
1510 
1511     void *state_ptr;
1512     uint32_t state_ptr_size;
1513     auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
1514     hsa_status_t err = atmi_interop_hsa_get_symbol_info(
1515         SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr,
1516         &state_ptr_size);
1517 
1518     if (err != HSA_STATUS_SUCCESS) {
1519       DP("No device_state symbol found, skipping initialization\n");
1520     } else {
1521       if (state_ptr_size < sizeof(void *)) {
1522         DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size,
1523            sizeof(void *));
1524         return NULL;
1525       }
1526 
      // If it is larger than a void*, assume it is a .bss array and no further
      // initialization is required. Only set up a pointer when the symbol is
      // exactly sizeof(void*).
1530       if (state_ptr_size == sizeof(void *)) {
1531         uint64_t device_State_bytes =
1532             get_device_State_bytes((char *)image->ImageStart, img_size);
1533         if (device_State_bytes == 0) {
1534           DP("Can't initialize device_State, missing size information\n");
1535           return NULL;
1536         }
1537 
1538         auto &dss = DeviceInfo.deviceStateStore[device_id];
1539         if (dss.first.get() == nullptr) {
1540           assert(dss.second == 0);
1541           void *ptr = NULL;
1542           hsa_status_t err = atmi_calloc(&ptr, device_State_bytes, device_id);
1543           if (err != HSA_STATUS_SUCCESS) {
1544             DP("Failed to allocate device_state array\n");
1545             return NULL;
1546           }
1547           dss = {
1548               std::unique_ptr<void, RTLDeviceInfoTy::atmiFreePtrDeletor>{ptr},
1549               device_State_bytes,
1550           };
1551         }
1552 
1553         void *ptr = dss.first.get();
1554         if (device_State_bytes != dss.second) {
1555           DP("Inconsistent sizes of device_State unsupported\n");
1556           return NULL;
1557         }
1558 
1559         // write ptr to device memory so it can be used by later kernels
1560         err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
1561                                                    sizeof(void *), device_id);
1562         if (err != HSA_STATUS_SUCCESS) {
1563           DP("memcpy install of state_ptr failed\n");
1564           return NULL;
1565         }
1566       }
1567     }
1568   }
1569 
  // Here, we take advantage of the data that is appended after img_end to get
  // the names of the symbols we need to load. This data consists of the host
  // entries begin and end as well as the target name (see the offloading
  // linker script creation in the clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name.
1577 
1578   __tgt_offload_entry *HostBegin = image->EntriesBegin;
1579   __tgt_offload_entry *HostEnd = image->EntriesEnd;
1580 
1581   for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
1582 
1583     if (!e->addr) {
      // The host should always have something in the address field to
      // uniquely identify the target region.
      fprintf(stderr, "Analyzing host entry '<null>' (size = %llu)...\n",
              (unsigned long long)e->size);
1588       return NULL;
1589     }
1590 
1591     if (e->size) {
1592       __tgt_offload_entry entry = *e;
1593 
1594       void *varptr;
1595       uint32_t varsize;
1596 
1597       auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
1598       hsa_status_t err = atmi_interop_hsa_get_symbol_info(
1599           SymbolInfoMap, device_id, e->name, &varptr, &varsize);
1600 
1601       if (err != HSA_STATUS_SUCCESS) {
1602         // Inform the user what symbol prevented offloading
1603         DP("Loading global '%s' (Failed)\n", e->name);
1604         return NULL;
1605       }
1606 
1607       if (varsize != e->size) {
1608         DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
1609            varsize, e->size);
1610         return NULL;
1611       }
1612 
1613       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
1614          DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
1615       entry.addr = (void *)varptr;
1616 
1617       DeviceInfo.addOffloadEntry(device_id, entry);
1618 
1619       if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
1620           e->flags & OMP_DECLARE_TARGET_LINK) {
1621         // If unified memory is present any target link variables
1622         // can access host addresses directly. There is no longer a
1623         // need for device copies.
1624         err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
1625                                                    sizeof(void *), device_id);
1626         if (err != HSA_STATUS_SUCCESS)
1627           DP("Error when copying USM\n");
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
1631       }
1632 
1633       continue;
1634     }
1635 
    DP("Looking up kernel: %s (name length: %lu)\n", e->name, strlen(e->name));
1637 
1638     uint32_t kernarg_segment_size;
1639     auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id];
1640     hsa_status_t err = atmi_interop_hsa_get_kernel_info(
1641         KernelInfoMap, device_id, e->name,
1642         HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1643         &kernarg_segment_size);
1644 
    // Each kernel argument is a void * in this OpenMP implementation
    uint32_t arg_num = kernarg_segment_size / sizeof(void *);
    std::vector<size_t> arg_sizes(arg_num, sizeof(void *));
1652 
    // default value GENERIC (in case the symbol is missing from the code
    // object)
1654     int8_t ExecModeVal = ExecutionModeType::GENERIC;
1655 
1656     // get flat group size if present, else Default_WG_Size
1657     int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1658 
1659     // get Kernel Descriptor if present.
    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
1661     struct KernDescValType {
1662       uint16_t Version;
1663       uint16_t TSize;
1664       uint16_t WG_Size;
1665       uint8_t Mode;
1666     };
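    // A compiler-emitted "<kernel name>_kern_desc" symbol, when present,
    // provides all of these launch properties in one place; the individual
    // _exec_mode and _wg_size symbols handled below are the fallback.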
1667     struct KernDescValType KernDescVal;
1668     std::string KernDescNameStr(e->name);
1669     KernDescNameStr += "_kern_desc";
1670     const char *KernDescName = KernDescNameStr.c_str();
1671 
1672     void *KernDescPtr;
1673     uint32_t KernDescSize;
1674     void *CallStackAddr = nullptr;
1675     err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1676                                   KernDescName, &KernDescPtr, &KernDescSize);
1677 
1678     if (err == HSA_STATUS_SUCCESS) {
1679       if ((size_t)KernDescSize != sizeof(KernDescVal))
1680         DP("Loading global computation properties '%s' - size mismatch (%u != "
1681            "%lu)\n",
1682            KernDescName, KernDescSize, sizeof(KernDescVal));
1683 
      // Copy at most sizeof(KernDescVal) bytes so that a mismatched symbol
      // size cannot overflow the local struct.
      memcpy(&KernDescVal, KernDescPtr,
             std::min(sizeof(KernDescVal), (size_t)KernDescSize));
1685 
1686       // Check structure size against recorded size.
1687       if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
1690 
1691       DP("After loading global for %s KernDesc \n", KernDescName);
1692       DP("KernDesc: Version: %d\n", KernDescVal.Version);
1693       DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
1694       DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
1695       DP("KernDesc: Mode: %d\n", KernDescVal.Mode);
1696 
1697       // Get ExecMode
1698       ExecModeVal = KernDescVal.Mode;
1699       DP("ExecModeVal %d\n", ExecModeVal);
1700       if (KernDescVal.WG_Size == 0) {
1701         KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
1702         DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
1703       }
1704       WGSizeVal = KernDescVal.WG_Size;
1705       DP("WGSizeVal %d\n", WGSizeVal);
1706       check("Loading KernDesc computation property", err);
1707     } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found\n", KernDescName);
1709 
1710       // Generic
1711       std::string ExecModeNameStr(e->name);
1712       ExecModeNameStr += "_exec_mode";
1713       const char *ExecModeName = ExecModeNameStr.c_str();
1714 
1715       void *ExecModePtr;
1716       uint32_t varsize;
1717       err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1718                                     ExecModeName, &ExecModePtr, &varsize);
1719 
1720       if (err == HSA_STATUS_SUCCESS) {
1721         if ((size_t)varsize != sizeof(int8_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
             "!= %lu)\n",
             ExecModeName, varsize, sizeof(int8_t));
1725           return NULL;
1726         }
1727 
1728         memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);
1729 
1730         DP("After loading global for %s ExecMode = %d\n", ExecModeName,
1731            ExecModeVal);
1732 
1733         if (ExecModeVal < 0 || ExecModeVal > 1) {
1734           DP("Error wrong exec_mode value specified in HSA code object file: "
1735              "%d\n",
1736              ExecModeVal);
1737           return NULL;
1738         }
1739       } else {
1740         DP("Loading global exec_mode '%s' - symbol missing, using default "
1741            "value "
1742            "GENERIC (1)\n",
1743            ExecModeName);
1744       }
1745       check("Loading computation property", err);
1746 
1747       // Flat group size
1748       std::string WGSizeNameStr(e->name);
1749       WGSizeNameStr += "_wg_size";
1750       const char *WGSizeName = WGSizeNameStr.c_str();
1751 
1752       void *WGSizePtr;
1753       uint32_t WGSize;
1754       err = interop_get_symbol_info((char *)image->ImageStart, img_size,
1755                                     WGSizeName, &WGSizePtr, &WGSize);
1756 
1757       if (err == HSA_STATUS_SUCCESS) {
1758         if ((size_t)WGSize != sizeof(int16_t)) {
1759           DP("Loading global computation properties '%s' - size mismatch (%u "
1760              "!= "
1761              "%lu)\n",
1762              WGSizeName, WGSize, sizeof(int16_t));
1763           return NULL;
1764         }
1765 
1766         memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
1767 
1768         DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
1769 
1770         if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
1771             WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
1772           DP("Error wrong WGSize value specified in HSA code object file: "
1773              "%d\n",
1774              WGSizeVal);
1775           WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
1776         }
1777       } else {
1778         DP("Warning: Loading WGSize '%s' - symbol not found, "
1779            "using default value %d\n",
1780            WGSizeName, WGSizeVal);
1781       }
1782 
1783       check("Loading WGSize computation property", err);
1784     }
1785 
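    // Record everything needed to launch this kernel later; the offload entry
    // address is pointed at the KernelTy so the run_target_* entry points can
    // recover it from tgt_entry_ptr.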
1786     KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
1787                                    CallStackAddr, e->name, kernarg_segment_size,
1788                                    DeviceInfo.KernArgPool));
1789     __tgt_offload_entry entry = *e;
1790     entry.addr = (void *)&KernelsList.back();
1791     DeviceInfo.addOffloadEntry(device_id, entry);
1792     DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
1793   }
1794 
1795   return DeviceInfo.getOffloadEntriesTable(device_id);
1796 }
1797 
1798 void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) {
1799   void *ptr = NULL;
1800   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1801 
1802   if (kind != TARGET_ALLOC_DEFAULT) {
1803     REPORT("Invalid target data allocation kind or requested allocator not "
1804            "implemented yet\n");
1805     return NULL;
1806   }
1807 
1808   hsa_status_t err = core::Runtime::DeviceMalloc(&ptr, size, device_id);
1809   DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
1810      (long long unsigned)(Elf64_Addr)ptr);
1811   ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL;
1812   return ptr;
1813 }
1814 
1815 int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
1816                               int64_t size) {
1817   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1818   __tgt_async_info AsyncInfo;
1819   int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo);
1820   if (rc != OFFLOAD_SUCCESS)
1821     return OFFLOAD_FAIL;
1822 
1823   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1824 }
1825 
1826 int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
1827                                     int64_t size, __tgt_async_info *AsyncInfo) {
1828   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1829   if (AsyncInfo) {
1830     initAsyncInfo(AsyncInfo);
1831     return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo);
1832   } else {
1833     return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
1834   }
1835 }
1836 
1837 int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
1838                                 int64_t size) {
1839   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1840   __tgt_async_info AsyncInfo;
1841   int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo);
1842   if (rc != OFFLOAD_SUCCESS)
1843     return OFFLOAD_FAIL;
1844 
1845   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1846 }
1847 
1848 int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
1849                                       void *tgt_ptr, int64_t size,
1850                                       __tgt_async_info *AsyncInfo) {
1851   assert(AsyncInfo && "AsyncInfo is nullptr");
1852   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1853   initAsyncInfo(AsyncInfo);
1854   return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo);
1855 }
1856 
1857 int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
1858   assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
1859   hsa_status_t err;
1860   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
1861   err = core::Runtime::Memfree(tgt_ptr);
1862   if (err != HSA_STATUS_SUCCESS) {
    DP("Error when freeing HSA memory\n");
1864     return OFFLOAD_FAIL;
1865   }
1866   return OFFLOAD_SUCCESS;
1867 }
1868 
1869 // Determine launch values for kernel.
1870 struct launchVals {
1871   int WorkgroupSize;
1872   int GridSize;
1873 };
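// Compute the workgroup size and grid size for a kernel launch. The workgroup
// size starts from the thread_limit clause (plus a master wavefront in
// GENERIC mode) and is clamped by the kernel's flat workgroup size attribute
// and Max_WG_Size; the number of groups is derived from the environment
// variables, the num_teams clause and, failing those, the loop trip count,
// then capped by the team limits.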
1874 launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
1875                          int ExecutionMode, int num_teams, int thread_limit,
1876                          uint64_t loop_tripcount, int DeviceNumTeams) {
1877 
1878   int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
1879   int num_groups = 0;
1880 
1881   int Max_Teams =
1882       Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1883   if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
1884     Max_Teams = RTLDeviceInfoTy::HardTeamLimit;
1885 
1886   if (print_kernel_trace & STARTUP_DETAILS) {
1887     fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
1888             RTLDeviceInfoTy::Max_Teams);
1889     fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
1890     fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
1891             RTLDeviceInfoTy::Warp_Size);
1892     fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
1893             RTLDeviceInfoTy::Max_WG_Size);
1894     fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n",
1895             RTLDeviceInfoTy::Default_WG_Size);
1896     fprintf(stderr, "thread_limit: %d\n", thread_limit);
1897     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
1898     fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
1899   }
1900   // check for thread_limit() clause
1901   if (thread_limit > 0) {
1902     threadsPerGroup = thread_limit;
1903     DP("Setting threads per block to requested %d\n", thread_limit);
1904     if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
1905       threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
1906       DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
1907     }
1908     if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
1909       threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
1910       DP("Setting threads per block to maximum %d\n", threadsPerGroup);
1911     }
1912   }
1913   // check flat_max_work_group_size attr here
1914   if (threadsPerGroup > ConstWGSize) {
1915     threadsPerGroup = ConstWGSize;
1916     DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1917        threadsPerGroup);
1918   }
1919   if (print_kernel_trace & STARTUP_DETAILS)
1920     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
1921   DP("Preparing %d threads\n", threadsPerGroup);
1922 
1923   // Set default num_groups (teams)
1924   if (Env.TeamLimit > 0)
1925     num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
1926   else
1927     num_groups = Max_Teams;
1928   DP("Set default num of groups %d\n", num_groups);
1929 
1930   if (print_kernel_trace & STARTUP_DETAILS) {
1931     fprintf(stderr, "num_groups: %d\n", num_groups);
1932     fprintf(stderr, "num_teams: %d\n", num_teams);
1933   }
1934 
  // Reduce num_groups if threadsPerGroup exceeds
  // RTLDeviceInfoTy::Default_WG_Size. This reduction is typical for the
  // default case (no thread_limit clause) or when the user requests an
  // unusually large num_teams value.
  // FIXME: We can't distinguish between a constant and a variable thread
  // limit, so we only handle constant thread_limits.
1940   if (threadsPerGroup >
1941       RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
1942     // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
1943     // here?
1944     num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
1945 
1946   // check for num_teams() clause
1947   if (num_teams > 0) {
1948     num_groups = (num_teams < num_groups) ? num_teams : num_groups;
1949   }
1950   if (print_kernel_trace & STARTUP_DETAILS) {
1951     fprintf(stderr, "num_groups: %d\n", num_groups);
1952     fprintf(stderr, "Env.NumTeams %d\n", Env.NumTeams);
1953     fprintf(stderr, "Env.TeamLimit %d\n", Env.TeamLimit);
1954   }
1955 
1956   if (Env.NumTeams > 0) {
1957     num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups;
1958     DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
1959   } else if (Env.TeamLimit > 0) {
1960     num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
1962   } else {
1963     if (num_teams <= 0) {
1964       if (loop_tripcount > 0) {
1965         if (ExecutionMode == SPMD) {
1966           // round up to the nearest integer
1967           num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
1968         } else {
1969           num_groups = loop_tripcount;
1970         }
1971         DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
1972            "threads per block %d\n",
1973            num_groups, loop_tripcount, threadsPerGroup);
1974       }
1975     } else {
1976       num_groups = num_teams;
1977     }
1978     if (num_groups > Max_Teams) {
1979       num_groups = Max_Teams;
1980       if (print_kernel_trace & STARTUP_DETAILS)
1981         fprintf(stderr, "Limiting num_groups %d to Max_Teams %d \n", num_groups,
1982                 Max_Teams);
1983     }
1984     if (num_groups > num_teams && num_teams > 0) {
1985       num_groups = num_teams;
1986       if (print_kernel_trace & STARTUP_DETAILS)
1987         fprintf(stderr, "Limiting num_groups %d to clause num_teams %d \n",
1988                 num_groups, num_teams);
1989     }
1990   }
1991 
  // The num_teams clause is always honored; it is only capped by
  // Env.MaxTeamsDefault when that is set.
1993   if (num_teams > 0) {
1994     num_groups = num_teams;
1995     // Cap num_groups to EnvMaxTeamsDefault if set.
1996     if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
1997       num_groups = Env.MaxTeamsDefault;
1998   }
1999   if (print_kernel_trace & STARTUP_DETAILS) {
2000     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
2001     fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "loop_tripcount: %" PRIu64 "\n", loop_tripcount);
2003   }
2004   DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
2005      threadsPerGroup);
2006 
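  // An HSA dispatch packet expresses its grid in work-items, so GridSize is
  // the total work-item count (num_groups * threadsPerGroup), not a count of
  // workgroups.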
2007   launchVals res;
2008   res.WorkgroupSize = threadsPerGroup;
2009   res.GridSize = threadsPerGroup * num_groups;
2010   return res;
2011 }
2012 
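// Reserve a packet slot on the queue: atomically bump the write index to
// claim an id, then spin until the packet processor has advanced the read
// index far enough that the claimed slot is free for reuse.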
2013 static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
2014   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
2015   bool full = true;
2016   while (full) {
2017     full =
2018         packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
2019   }
2020   return packet_id;
2021 }
2022 
2023 static int32_t __tgt_rtl_run_target_team_region_locked(
2024     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
2025     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
2026     int32_t thread_limit, uint64_t loop_tripcount);
2027 
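// Kernel launches take the reader side of load_run_lock so that they can run
// concurrently with one another while remaining excluded from binary loading,
// which takes the writer side.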
2028 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
2029                                          void **tgt_args,
2030                                          ptrdiff_t *tgt_offsets,
2031                                          int32_t arg_num, int32_t num_teams,
2032                                          int32_t thread_limit,
2033                                          uint64_t loop_tripcount) {
2034 
2035   DeviceInfo.load_run_lock.lock_shared();
2036   int32_t res = __tgt_rtl_run_target_team_region_locked(
2037       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
2038       thread_limit, loop_tripcount);
2039 
2040   DeviceInfo.load_run_lock.unlock_shared();
2041   return res;
2042 }
2043 
2044 int32_t __tgt_rtl_run_target_team_region_locked(
2045     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
2046     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
2047     int32_t thread_limit, uint64_t loop_tripcount) {
  // Gather the kernel arguments, compute the launch geometry, then build and
  // dispatch an HSA kernel dispatch packet and wait for its completion.
2051 
2052   DP("Run target team region thread_limit %d\n", thread_limit);
2053 
2054   // All args are references.
2055   std::vector<void *> args(arg_num);
2056   std::vector<void *> ptrs(arg_num);
2057 
2058   DP("Arg_num: %d\n", arg_num);
2059   for (int32_t i = 0; i < arg_num; ++i) {
2060     ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
2061     args[i] = &ptrs[i];
    DP("Offset base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
2063   }
2064 
2065   KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
2066 
2067   std::string kernel_name = std::string(KernelInfo->Name);
2068   auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
2069   if (KernelInfoTable[device_id].find(kernel_name) ==
2070       KernelInfoTable[device_id].end()) {
2071     DP("Kernel %s not found\n", kernel_name.c_str());
2072     return OFFLOAD_FAIL;
2073   }
2074 
2075   const atl_kernel_info_t KernelInfoEntry =
2076       KernelInfoTable[device_id][kernel_name];
2077   const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
2078   const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
2079   const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
2080   const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
2081   const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
2082 
2083   assert(arg_num == (int)KernelInfoEntry.num_args);
2084 
2085   /*
2086    * Set limit based on ThreadsPerGroup and GroupsPerDevice
2087    */
2088   launchVals LV = getLaunchVals(DeviceInfo.Env, KernelInfo->ConstWGSize,
2089                                 KernelInfo->ExecutionMode,
2090                                 num_teams,      // From run_region arg
2091                                 thread_limit,   // From run_region arg
2092                                 loop_tripcount, // From run_region arg
2093                                 DeviceInfo.NumTeams[KernelInfo->device_id]);
2094   const int GridSize = LV.GridSize;
2095   const int WorkgroupSize = LV.WorkgroupSize;
2096 
2097   if (print_kernel_trace >= LAUNCH) {
2098     int num_groups = GridSize / WorkgroupSize;
2099     // enum modes are SPMD, GENERIC, NONE 0,1,2
2100     // if doing rtl timing, print to stderr, unless stdout requested.
2101     bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
2102     fprintf(traceToStdout ? stdout : stderr,
2103             "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
2104             "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
2105             "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
2106             device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
2107             arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
2108             group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
2109             vgpr_spill_count, loop_tripcount, KernelInfo->Name);
2110   }
2111 
2112   // Run on the device.
2113   {
2114     hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id];
2115     if (!queue) {
2116       return OFFLOAD_FAIL;
2117     }
2118     uint64_t packet_id = acquire_available_packet_id(queue);
2119 
2120     const uint32_t mask = queue->size - 1; // size is a power of 2
2121     hsa_kernel_dispatch_packet_t *packet =
2122         (hsa_kernel_dispatch_packet_t *)queue->base_address +
2123         (packet_id & mask);
2124 
2125     // packet->header is written last
2126     packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
2127     packet->workgroup_size_x = WorkgroupSize;
2128     packet->workgroup_size_y = 1;
2129     packet->workgroup_size_z = 1;
2130     packet->reserved0 = 0;
2131     packet->grid_size_x = GridSize;
2132     packet->grid_size_y = 1;
2133     packet->grid_size_z = 1;
2134     packet->private_segment_size = KernelInfoEntry.private_segment_size;
2135     packet->group_segment_size = KernelInfoEntry.group_segment_size;
2136     packet->kernel_object = KernelInfoEntry.kernel_object;
2137     packet->kernarg_address = 0;     // use the block allocator
2138     packet->reserved2 = 0;           // atmi writes id_ here
    packet->completion_signal = {0}; // set below from the free signal pool
2140 
2141     KernelArgPool *ArgPool = nullptr;
2142     {
2143       auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
2144       if (it != KernelArgPoolMap.end()) {
2145         ArgPool = (it->second).get();
2146       }
2147     }
2148     if (!ArgPool) {
2149       DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
2150          device_id);
2151     }
2152     {
2153       void *kernarg = nullptr;
2154       if (ArgPool) {
2155         assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
2156         kernarg = ArgPool->allocate(arg_num);
2157       }
2158       if (!kernarg) {
2159         DP("Allocate kernarg failed\n");
2160         return OFFLOAD_FAIL;
2161       }
2162 
2163       // Copy explicit arguments
2164       for (int i = 0; i < arg_num; i++) {
2165         memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
2166       }
2167 
2168       // Initialize implicit arguments. ATMI seems to leave most fields
2169       // uninitialized
2170       atmi_implicit_args_t *impl_args =
2171           reinterpret_cast<atmi_implicit_args_t *>(
2172               static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
2173       memset(impl_args, 0,
2174              sizeof(atmi_implicit_args_t)); // may not be necessary
2175       impl_args->offset_x = 0;
2176       impl_args->offset_y = 0;
2177       impl_args->offset_z = 0;
2178 
2179       // assign a hostcall buffer for the selected Q
2180       if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) {
2181         // hostrpc_assign_buffer is not thread safe, and this function is
2182         // under a multiple reader lock, not a writer lock.
2183         static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
2184         pthread_mutex_lock(&hostcall_init_lock);
2185         impl_args->hostcall_ptr = hostrpc_assign_buffer(
2186             DeviceInfo.HSAAgents[device_id], queue, device_id);
2187         pthread_mutex_unlock(&hostcall_init_lock);
2188         if (!impl_args->hostcall_ptr) {
          DP("hostrpc_assign_buffer failed, the gpu would dereference a null "
             "pointer and fault\n");
2191           return OFFLOAD_FAIL;
2192         }
2193       }
2194 
2195       packet->kernarg_address = kernarg;
2196     }
2197 
2198     {
2199       hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
2200       if (s.handle == 0) {
2201         DP("Failed to get signal instance\n");
2202         return OFFLOAD_FAIL;
2203       }
2204       packet->completion_signal = s;
2205       hsa_signal_store_relaxed(packet->completion_signal, 1);
2206     }
2207 
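    // Publish the packet: write the header and setup words with release
    // semantics so the packet processor never sees a partially initialized
    // packet, then ring the doorbell to start processing.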
2208     core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
2209                                core::create_header(), packet->setup);
2210 
2211     hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
2212 
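    // Block until the packet processor decrements the completion signal to
    // zero, i.e. until the kernel has finished executing.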
2213     while (hsa_signal_wait_scacquire(packet->completion_signal,
2214                                      HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
2215                                      HSA_WAIT_STATE_BLOCKED) != 0)
2216       ;
2217 
2218     assert(ArgPool);
2219     ArgPool->deallocate(packet->kernarg_address);
2220     DeviceInfo.FreeSignalPool.push(packet->completion_signal);
2221   }
2222 
2223   DP("Kernel completed\n");
2224   return OFFLOAD_SUCCESS;
2225 }
2226 
2227 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
2228                                     void **tgt_args, ptrdiff_t *tgt_offsets,
2229                                     int32_t arg_num) {
  // Use a single team and the default thread limit
2232   int32_t team_num = 1;
2233   int32_t thread_limit = 0; // use default
2234   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
2235                                           tgt_offsets, arg_num, team_num,
2236                                           thread_limit, 0);
2237 }
2238 
2239 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
2240                                           void *tgt_entry_ptr, void **tgt_args,
2241                                           ptrdiff_t *tgt_offsets,
2242                                           int32_t arg_num,
2243                                           __tgt_async_info *AsyncInfo) {
2244   assert(AsyncInfo && "AsyncInfo is nullptr");
2245   initAsyncInfo(AsyncInfo);
2246 
  // Use a single team and the default thread limit
2249   int32_t team_num = 1;
2250   int32_t thread_limit = 0; // use default
2251   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
2252                                           tgt_offsets, arg_num, team_num,
2253                                           thread_limit, 0);
2254 }
2255 
2256 int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {
2257   assert(AsyncInfo && "AsyncInfo is nullptr");
2258 
  // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but
  // devices.cpp does not guarantee that invariant for amdgcn.
  // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2262   if (AsyncInfo->Queue) {
2263     finiAsyncInfo(AsyncInfo);
2264   }
2265   return OFFLOAD_SUCCESS;
2266 }
2267 
2268 namespace core {
2269 hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
2270   return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
2271                                      &DeviceInfo.HSAAgents[0], NULL, ptr);
2272 }
2273 
2274 } // namespace core
2275