1 /*===--------------------------------------------------------------------------
2  *              ATMI (Asynchronous Task and Memory Interface)
3  *
4  * This file is distributed under the MIT License. See LICENSE.txt for details.
5  *===------------------------------------------------------------------------*/
6 #include <gelf.h>
7 #include <libelf.h>
8 
9 #include <cassert>
10 #include <cstdarg>
11 #include <fstream>
12 #include <iomanip>
13 #include <iostream>
14 #include <set>
15 #include <string>
16 
17 #include "internal.h"
18 #include "machine.h"
19 #include "rt.h"
20 
21 #include "msgpack.h"
22 
23 namespace hsa {
24 // Wrap HSA iterate API in a shim that allows passing general callables
25 template <typename C>
26 hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
27   auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol,
28               void *data) -> hsa_status_t {
29     C *unwrapped = static_cast<C *>(data);
30     return (*unwrapped)(executable, symbol);
31   };
32   return hsa_executable_iterate_symbols(executable, L,
33                                         static_cast<void *>(&cb));
34 }
35 } // namespace hsa
36 
// Byte pointer used when walking raw binary images.
using address = unsigned char *;

/*
 * Fixed-size header found at the start of every ELF note record.
 */
struct Elf_Note {
  uint32_t n_namesz; /* Length of note's name. */
  uint32_t n_descsz; /* Length of note's value. */
  uint32_t n_type;   /* Type of note. */
  // Followed in the image by:
  //   the name,
  //   optional padding,
  //   the desc, at 4 byte alignment (not 8, despite being elf64)
};
49 
50 // The following include file and following structs/enums
51 // have been replicated on a per-use basis below. For example,
52 // llvm::AMDGPU::HSAMD::Kernel::Metadata has several fields,
53 // but we may care only about kernargSegmentSize_ for now, so
54 // we just include that field in our KernelMD implementation. We
55 // chose this approach to replicate in order to avoid forcing
56 // a dependency on LLVM_INCLUDE_DIR just to compile the runtime.
57 // #include "llvm/Support/AMDGPUMetadata.h"
58 // typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD;
59 // typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
60 // typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
61 // using llvm::AMDGPU::HSAMD::AccessQualifier;
62 // using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
63 // using llvm::AMDGPU::HSAMD::ValueKind;
64 // using llvm::AMDGPU::HSAMD::ValueType;
65 
// Per-argument kernel metadata, reduced to the fields this runtime reads.
class KernelArgMD {
public:
  // Hidden (compiler-inserted) argument kinds known to this runtime.
  // Unknown is the value a freshly constructed KernelArgMD carries.
  enum class ValueKind {
    HiddenGlobalOffsetX,
    HiddenGlobalOffsetY,
    HiddenGlobalOffsetZ,
    HiddenNone,
    HiddenPrintfBuffer,
    HiddenDefaultQueue,
    HiddenCompletionAction,
    HiddenMultiGridSyncArg,
    HiddenHostcallBuffer,
    Unknown
  };

  // Every field starts out from its default member initializer below.
  KernelArgMD() {}

  // fields
  std::string name_{};
  std::string typeName_{};
  uint32_t size_ = 0;
  uint32_t offset_ = 0;
  uint32_t align_ = 0;
  ValueKind valueKind_ = ValueKind::Unknown;
};
93 
// Per-kernel metadata, reduced to the single field this runtime reads.
class KernelMD {
public:
  // The segment size starts at zero via its default member initializer.
  KernelMD() {}

  // fields
  uint64_t kernargSegmentSize_ = 0ull;
};
101 
102 static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
103     //    Including only those fields that are relevant to the runtime.
104     //    {"ByValue", KernelArgMD::ValueKind::ByValue},
105     //    {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer},
106     //    {"DynamicSharedPointer",
107     //    KernelArgMD::ValueKind::DynamicSharedPointer},
108     //    {"Sampler", KernelArgMD::ValueKind::Sampler},
109     //    {"Image", KernelArgMD::ValueKind::Image},
110     //    {"Pipe", KernelArgMD::ValueKind::Pipe},
111     //    {"Queue", KernelArgMD::ValueKind::Queue},
112     {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
113     {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
114     {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
115     {"HiddenNone", KernelArgMD::ValueKind::HiddenNone},
116     {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
117     {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue},
118     {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction},
119     {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
120     {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
121     // v3
122     //    {"by_value", KernelArgMD::ValueKind::ByValue},
123     //    {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
124     //    {"dynamic_shared_pointer",
125     //    KernelArgMD::ValueKind::DynamicSharedPointer},
126     //    {"sampler", KernelArgMD::ValueKind::Sampler},
127     //    {"image", KernelArgMD::ValueKind::Image},
128     //    {"pipe", KernelArgMD::ValueKind::Pipe},
129     //    {"queue", KernelArgMD::ValueKind::Queue},
130     {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
131     {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
132     {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
133     {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
134     {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
135     {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
136     {"hidden_completion_action",
137      KernelArgMD::ValueKind::HiddenCompletionAction},
138     {"hidden_multigrid_sync_arg",
139      KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
140     {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
141 };
142 
// global variables. TODO: Get rid of these
// Machine description handed out by Runtime::GetMachineInfo(); its device
// counts and per-type device pointers are filled in by
// init_compute_and_memory().
atmi_machine_t g_atmi_machine;
// Processor registry: get_agent_info() adds one entry per CPU/GPU agent found
// while iterating the HSA agents.
ATLMachine g_atl_machine;

// Memory pools that get_memory_pool_info() found to be both fine-grained and
// flagged KERNARG_INIT. NOTE(review): that callback runs for the pools of
// every agent it is registered for, not only GPU agents -- confirm the name.
std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;

// Kernel and symbol info keyed by name. init_hsa() resizes both vectors to
// the number of GPU processors (presumably indexed by GPU id -- confirm
// against the users of these tables).
std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

// Set once Runtime::Initialize() succeeds, cleared by Runtime::Finalize().
bool g_atmi_initialized = false;

/*
   atlc is all internal global values; in this file it carries the
   initialization flags struct_initialized, g_hsa_initialized,
   g_gpu_initialized and g_tasks_initialized.
   The structure atl_context_t is defined in atl_internal.h
   Most references will use the global structure prefix atlc.
*/
atl_context_t atlc = {.struct_initialized = false};
160 
161 namespace core {
162 /* Machine Info */
163 atmi_machine_t *Runtime::GetMachineInfo() {
164   if (!atlc.g_hsa_initialized)
165     return NULL;
166   return &g_atmi_machine;
167 }
168 
// Records that the runtime is initialized by setting the global flag
// g_atmi_initialized. Runtime::Initialize() calls this as its last step.
static void atl_set_atmi_initialized() {
  // FIXME: thread safe? locks?
  // The flag is a plain bool and is written here without any locking.
  g_atmi_initialized = true;
}
173 
// Clears the global flag g_atmi_initialized. Runtime::Finalize() calls this
// before shutting HSA down.
static void atl_reset_atmi_initialized() {
  // FIXME: thread safe? locks?
  // The flag is a plain bool and is written here without any locking.
  g_atmi_initialized = false;
}
178 
// Reports the current value of the global flag g_atmi_initialized.
bool atl_is_atmi_initialized() { return g_atmi_initialized; }
180 
181 hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
182   std::vector<ATLGPUProcessor> &gpu_procs =
183       g_atl_machine.processors<ATLGPUProcessor>();
184   std::vector<hsa_agent_t> agents;
185   for (uint32_t i = 0; i < gpu_procs.size(); i++) {
186     agents.push_back(gpu_procs[i].agent());
187   }
188   return hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr);
189 }
190 
191 atmi_status_t Runtime::Initialize() {
192   atmi_devtype_t devtype = ATMI_DEVTYPE_GPU;
193   if (atl_is_atmi_initialized())
194     return ATMI_STATUS_SUCCESS;
195 
196   if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) {
197     atmi_status_t rc = atl_init_gpu_context();
198     if (rc != ATMI_STATUS_SUCCESS) {
199       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "GPU context init",
200              get_atmi_error_string(atl_init_gpu_context()));
201       return rc;
202     }
203   }
204 
205   atl_set_atmi_initialized();
206   return ATMI_STATUS_SUCCESS;
207 }
208 
209 atmi_status_t Runtime::Finalize() {
210   atmi_status_t rc = ATMI_STATUS_SUCCESS;
211   for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) {
212     SymbolInfoTable[i].clear();
213   }
214   SymbolInfoTable.clear();
215   for (uint32_t i = 0; i < KernelInfoTable.size(); i++) {
216     KernelInfoTable[i].clear();
217   }
218   KernelInfoTable.clear();
219 
220   atl_reset_atmi_initialized();
221   hsa_status_t err = hsa_shut_down();
222   if (err != HSA_STATUS_SUCCESS) {
223     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA",
224            get_error_string(err));
225     rc = ATMI_STATUS_ERROR;
226   }
227 
228   return rc;
229 }
230 
231 static void atmi_init_context_structs() {
232   atlc.struct_initialized = true; /* This only gets called one time */
233   atlc.g_hsa_initialized = false;
234   atlc.g_gpu_initialized = false;
235   atlc.g_tasks_initialized = false;
236 }
237 
238 // Implement memory_pool iteration function
239 static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
240                                          void *data) {
241   ATLProcessor *proc = reinterpret_cast<ATLProcessor *>(data);
242   hsa_status_t err = HSA_STATUS_SUCCESS;
243   // Check if the memory_pool is allowed to allocate, i.e. do not return group
244   // memory
245   bool alloc_allowed = false;
246   err = hsa_amd_memory_pool_get_info(
247       memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
248       &alloc_allowed);
249   if (err != HSA_STATUS_SUCCESS) {
250     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
251            "Alloc allowed in memory pool check", get_error_string(err));
252     return err;
253   }
254   if (alloc_allowed) {
255     uint32_t global_flag = 0;
256     err = hsa_amd_memory_pool_get_info(
257         memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
258     if (err != HSA_STATUS_SUCCESS) {
259       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
260              "Get memory pool info", get_error_string(err));
261       return err;
262     }
263     if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
264       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
265       proc->addMemory(new_mem);
266       if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
267         DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
268         atl_gpu_kernarg_pools.push_back(memory_pool);
269       }
270     } else {
271       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
272       proc->addMemory(new_mem);
273     }
274   }
275 
276   return err;
277 }
278 
279 static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) {
280   hsa_status_t err = HSA_STATUS_SUCCESS;
281   hsa_device_type_t device_type;
282   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
283   if (err != HSA_STATUS_SUCCESS) {
284     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
285            "Get device type info", get_error_string(err));
286     return err;
287   }
288   switch (device_type) {
289   case HSA_DEVICE_TYPE_CPU: {
290     ATLCPUProcessor new_proc(agent);
291     err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
292                                              &new_proc);
293     if (err != HSA_STATUS_SUCCESS) {
294       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
295              "Iterate all memory pools", get_error_string(err));
296       return err;
297     }
298     g_atl_machine.addProcessor(new_proc);
299   } break;
300   case HSA_DEVICE_TYPE_GPU: {
301     hsa_profile_t profile;
302     err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile);
303     if (err != HSA_STATUS_SUCCESS) {
304       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
305              "Query the agent profile", get_error_string(err));
306       return err;
307     }
308     atmi_devtype_t gpu_type;
309     gpu_type =
310         (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU;
311     ATLGPUProcessor new_proc(agent, gpu_type);
312     err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
313                                              &new_proc);
314     if (err != HSA_STATUS_SUCCESS) {
315       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
316              "Iterate all memory pools", get_error_string(err));
317       return err;
318     }
319     g_atl_machine.addProcessor(new_proc);
320   } break;
321   case HSA_DEVICE_TYPE_DSP: {
322     err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
323   } break;
324   }
325 
326   return err;
327 }
328 
329 hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) {
330   hsa_region_segment_t segment;
331   hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
332   if (segment != HSA_REGION_SEGMENT_GLOBAL) {
333     return HSA_STATUS_SUCCESS;
334   }
335   hsa_region_global_flag_t flags;
336   hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
337   if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
338     hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
339     *ret = region;
340     return HSA_STATUS_INFO_BREAK;
341   }
342   return HSA_STATUS_SUCCESS;
343 }
344 
345 /* Determines if a memory region can be used for kernarg allocations.  */
346 static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) {
347   hsa_region_segment_t segment;
348   hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
349   if (HSA_REGION_SEGMENT_GLOBAL != segment) {
350     return HSA_STATUS_SUCCESS;
351   }
352 
353   hsa_region_global_flag_t flags;
354   hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
355   if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
356     hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
357     *ret = region;
358     return HSA_STATUS_INFO_BREAK;
359   }
360 
361   return HSA_STATUS_SUCCESS;
362 }
363 
364 static hsa_status_t init_compute_and_memory() {
365   hsa_status_t err;
366 
367   /* Iterate over the agents and pick the gpu agent */
368   err = hsa_iterate_agents(get_agent_info, NULL);
369   if (err == HSA_STATUS_INFO_BREAK) {
370     err = HSA_STATUS_SUCCESS;
371   }
372   if (err != HSA_STATUS_SUCCESS) {
373     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent",
374            get_error_string(err));
375     return err;
376   }
377 
378   /* Init all devices or individual device types? */
379   std::vector<ATLCPUProcessor> &cpu_procs =
380       g_atl_machine.processors<ATLCPUProcessor>();
381   std::vector<ATLGPUProcessor> &gpu_procs =
382       g_atl_machine.processors<ATLGPUProcessor>();
383   /* For CPU memory pools, add other devices that can access them directly
384    * or indirectly */
385   for (auto &cpu_proc : cpu_procs) {
386     for (auto &cpu_mem : cpu_proc.memories()) {
387       hsa_amd_memory_pool_t pool = cpu_mem.memory();
388       for (auto &gpu_proc : gpu_procs) {
389         hsa_agent_t agent = gpu_proc.agent();
390         hsa_amd_memory_pool_access_t access;
391         hsa_amd_agent_memory_pool_get_info(
392             agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
393         if (access != 0) {
394           // this means not NEVER, but could be YES or NO
395           // add this memory pool to the proc
396           gpu_proc.addMemory(cpu_mem);
397         }
398       }
399     }
400   }
401 
402   /* FIXME: are the below combinations of procs and memory pools needed?
403    * all to all compare procs with their memory pools and add those memory
404    * pools that are accessible by the target procs */
405   for (auto &gpu_proc : gpu_procs) {
406     for (auto &gpu_mem : gpu_proc.memories()) {
407       hsa_amd_memory_pool_t pool = gpu_mem.memory();
408       for (auto &cpu_proc : cpu_procs) {
409         hsa_agent_t agent = cpu_proc.agent();
410         hsa_amd_memory_pool_access_t access;
411         hsa_amd_agent_memory_pool_get_info(
412             agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
413         if (access != 0) {
414           // this means not NEVER, but could be YES or NO
415           // add this memory pool to the proc
416           cpu_proc.addMemory(gpu_mem);
417         }
418       }
419     }
420   }
421 
422   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_CPU] = cpu_procs.size();
423   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_GPU] = gpu_procs.size();
424 
425   size_t num_procs = cpu_procs.size() + gpu_procs.size();
426   // g_atmi_machine.devices = (atmi_device_t *)malloc(num_procs *
427   // sizeof(atmi_device_t));
428   atmi_device_t *all_devices = reinterpret_cast<atmi_device_t *>(
429       malloc(num_procs * sizeof(atmi_device_t)));
430   int num_iGPUs = 0;
431   int num_dGPUs = 0;
432   for (uint32_t i = 0; i < gpu_procs.size(); i++) {
433     if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU)
434       num_iGPUs++;
435     else
436       num_dGPUs++;
437   }
438   assert(num_iGPUs + num_dGPUs == gpu_procs.size() &&
439          "Number of dGPUs and iGPUs do not add up");
440   DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size());
441   DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs);
442   DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs);
443   DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size());
444 
445   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_iGPU] = num_iGPUs;
446   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_dGPU] = num_dGPUs;
447 
448   int cpus_begin = 0;
449   int cpus_end = cpu_procs.size();
450   int gpus_begin = cpu_procs.size();
451   int gpus_end = cpu_procs.size() + gpu_procs.size();
452   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_CPU] = &all_devices[cpus_begin];
453   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU] = &all_devices[gpus_begin];
454   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_iGPU] = &all_devices[gpus_begin];
455   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_dGPU] = &all_devices[gpus_begin];
456   int proc_index = 0;
457   for (int i = cpus_begin; i < cpus_end; i++) {
458     all_devices[i].type = cpu_procs[proc_index].type();
459 
460     std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
461     int fine_memories_size = 0;
462     int coarse_memories_size = 0;
463     DEBUG_PRINT("CPU memory types:\t");
464     for (auto &memory : memories) {
465       atmi_memtype_t type = memory.type();
466       if (type == ATMI_MEMTYPE_FINE_GRAINED) {
467         fine_memories_size++;
468         DEBUG_PRINT("Fine\t");
469       } else {
470         coarse_memories_size++;
471         DEBUG_PRINT("Coarse\t");
472       }
473     }
474     DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
475     DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
476     proc_index++;
477   }
478   proc_index = 0;
479   for (int i = gpus_begin; i < gpus_end; i++) {
480     all_devices[i].type = gpu_procs[proc_index].type();
481 
482     std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
483     int fine_memories_size = 0;
484     int coarse_memories_size = 0;
485     DEBUG_PRINT("GPU memory types:\t");
486     for (auto &memory : memories) {
487       atmi_memtype_t type = memory.type();
488       if (type == ATMI_MEMTYPE_FINE_GRAINED) {
489         fine_memories_size++;
490         DEBUG_PRINT("Fine\t");
491       } else {
492         coarse_memories_size++;
493         DEBUG_PRINT("Coarse\t");
494       }
495     }
496     DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
497     DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
498     proc_index++;
499   }
500   proc_index = 0;
501   hsa_region_t atl_cpu_kernarg_region;
502   atl_cpu_kernarg_region.handle = (uint64_t)-1;
503   if (cpu_procs.size() > 0) {
504     err = hsa_agent_iterate_regions(
505         cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region);
506     if (err == HSA_STATUS_INFO_BREAK) {
507       err = HSA_STATUS_SUCCESS;
508     }
509     err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
510                                                           : HSA_STATUS_SUCCESS;
511     if (err != HSA_STATUS_SUCCESS) {
512       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
513              "Finding a CPU kernarg memory region handle",
514              get_error_string(err));
515       return err;
516     }
517   }
518   hsa_region_t atl_gpu_kernarg_region;
519   /* Find a memory region that supports kernel arguments.  */
520   atl_gpu_kernarg_region.handle = (uint64_t)-1;
521   if (gpu_procs.size() > 0) {
522     hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region,
523                               &atl_gpu_kernarg_region);
524     err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
525                                                           : HSA_STATUS_SUCCESS;
526     if (err != HSA_STATUS_SUCCESS) {
527       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
528              "Finding a kernarg memory region", get_error_string(err));
529       return err;
530     }
531   }
532   if (num_procs > 0)
533     return HSA_STATUS_SUCCESS;
534   else
535     return HSA_STATUS_ERROR_NOT_INITIALIZED;
536 }
537 
538 hsa_status_t init_hsa() {
539   if (atlc.g_hsa_initialized == false) {
540     DEBUG_PRINT("Initializing HSA...");
541     hsa_status_t err = hsa_init();
542     if (err != HSA_STATUS_SUCCESS) {
543       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
544              "Initializing the hsa runtime", get_error_string(err));
545       return err;
546     }
547     if (err != HSA_STATUS_SUCCESS)
548       return err;
549 
550     err = init_compute_and_memory();
551     if (err != HSA_STATUS_SUCCESS)
552       return err;
553     if (err != HSA_STATUS_SUCCESS) {
554       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
555              "After initializing compute and memory", get_error_string(err));
556       return err;
557     }
558 
559     int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
560     KernelInfoTable.resize(gpu_count);
561     SymbolInfoTable.resize(gpu_count);
562     for (uint32_t i = 0; i < SymbolInfoTable.size(); i++)
563       SymbolInfoTable[i].clear();
564     for (uint32_t i = 0; i < KernelInfoTable.size(); i++)
565       KernelInfoTable[i].clear();
566     atlc.g_hsa_initialized = true;
567     DEBUG_PRINT("done\n");
568   }
569   return HSA_STATUS_SUCCESS;
570 }
571 
572 void init_tasks() {
573   if (atlc.g_tasks_initialized != false)
574     return;
575   std::vector<hsa_agent_t> gpu_agents;
576   int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
577   for (int gpu = 0; gpu < gpu_count; gpu++) {
578     atmi_place_t place = ATMI_PLACE_GPU(0, gpu);
579     ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
580     gpu_agents.push_back(proc.agent());
581   }
582   atlc.g_tasks_initialized = true;
583 }
584 
585 hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
586 #if (ROCM_VERSION_MAJOR >= 3) ||                                               \
587     (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3)
588   if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
589 #else
590   if (event->event_type == GPU_MEMORY_FAULT_EVENT) {
591 #endif
592     hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
593     // memory_fault.agent
594     // memory_fault.virtual_address
595     // memory_fault.fault_reason_mask
596     // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
597     std::stringstream stream;
598     stream << std::hex << (uintptr_t)memory_fault.virtual_address;
599     std::string addr("0x" + stream.str());
600 
601     std::string err_string = "[GPU Memory Error] Addr: " + addr;
602     err_string += " Reason: ";
603     if (!(memory_fault.fault_reason_mask & 0x00111111)) {
604       err_string += "No Idea! ";
605     } else {
606       if (memory_fault.fault_reason_mask & 0x00000001)
607         err_string += "Page not present or supervisor privilege. ";
608       if (memory_fault.fault_reason_mask & 0x00000010)
609         err_string += "Write access to a read-only page. ";
610       if (memory_fault.fault_reason_mask & 0x00000100)
611         err_string += "Execute access to a page marked NX. ";
612       if (memory_fault.fault_reason_mask & 0x00001000)
613         err_string += "Host access only. ";
614       if (memory_fault.fault_reason_mask & 0x00010000)
615         err_string += "ECC failure (if supported by HW). ";
616       if (memory_fault.fault_reason_mask & 0x00100000)
617         err_string += "Can't determine the exact fault address. ";
618     }
619     fprintf(stderr, "%s\n", err_string.c_str());
620     return HSA_STATUS_ERROR;
621   }
622   return HSA_STATUS_SUCCESS;
623 }
624 
625 atmi_status_t atl_init_gpu_context() {
626   if (atlc.struct_initialized == false)
627     atmi_init_context_structs();
628   if (atlc.g_gpu_initialized != false)
629     return ATMI_STATUS_SUCCESS;
630 
631   hsa_status_t err;
632   err = init_hsa();
633   if (err != HSA_STATUS_SUCCESS)
634     return ATMI_STATUS_ERROR;
635 
636   err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
637   if (err != HSA_STATUS_SUCCESS) {
638     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
639            "Registering the system for memory faults", get_error_string(err));
640     return ATMI_STATUS_ERROR;
641   }
642 
643   init_tasks();
644   atlc.g_gpu_initialized = true;
645   return ATMI_STATUS_SUCCESS;
646 }
647 
648 static bool isImplicit(KernelArgMD::ValueKind value_kind) {
649   switch (value_kind) {
650   case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
651   case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
652   case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
653   case KernelArgMD::ValueKind::HiddenNone:
654   case KernelArgMD::ValueKind::HiddenPrintfBuffer:
655   case KernelArgMD::ValueKind::HiddenDefaultQueue:
656   case KernelArgMD::ValueKind::HiddenCompletionAction:
657   case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
658   case KernelArgMD::ValueKind::HiddenHostcallBuffer:
659     return true;
660   default:
661     return false;
662   }
663 }
664 
665 static std::pair<unsigned char *, unsigned char *>
666 find_metadata(void *binary, size_t binSize) {
667   std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr};
668 
669   Elf *e = elf_memory(static_cast<char *>(binary), binSize);
670   if (elf_kind(e) != ELF_K_ELF) {
671     return failure;
672   }
673 
674   size_t numpHdrs;
675   if (elf_getphdrnum(e, &numpHdrs) != 0) {
676     return failure;
677   }
678 
679   for (size_t i = 0; i < numpHdrs; ++i) {
680     GElf_Phdr pHdr;
681     if (gelf_getphdr(e, i, &pHdr) != &pHdr) {
682       continue;
683     }
684     // Look for the runtime metadata note
685     if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
686       // Iterate over the notes in this segment
687       address ptr = (address)binary + pHdr.p_offset;
688       address segmentEnd = ptr + pHdr.p_filesz;
689 
690       while (ptr < segmentEnd) {
691         Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr);
692         address name = (address)&note[1];
693 
694         if (note->n_type == 7 || note->n_type == 8) {
695           return failure;
696         } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ &&
697                    note->n_namesz == sizeof "AMD" &&
698                    !memcmp(name, "AMD", note->n_namesz)) {
699           // code object v2 uses yaml metadata, no longer supported
700           return failure;
701         } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ &&
702                    note->n_namesz == sizeof "AMDGPU" &&
703                    !memcmp(name, "AMDGPU", note->n_namesz)) {
704 
705           // n_descsz = 485
706           // value is padded to 4 byte alignment, may want to move end up to
707           // match
708           size_t offset = sizeof(uint32_t) * 3 /* fields */
709                           + sizeof("AMDGPU")   /* name */
710                           + 1 /* padding to 4 byte alignment */;
711 
712           // Including the trailing padding means both pointers are 4 bytes
713           // aligned, which may be useful later.
714           unsigned char *metadata_start = (unsigned char *)ptr + offset;
715           unsigned char *metadata_end =
716               metadata_start + core::alignUp(note->n_descsz, 4);
717           return {metadata_start, metadata_end};
718         }
719         ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) +
720                core::alignUp(note->n_descsz, sizeof(int));
721       }
722     }
723   }
724 
725   return failure;
726 }
727 
728 namespace {
729 int map_lookup_array(msgpack::byte_range message, const char *needle,
730                      msgpack::byte_range *res, uint64_t *size) {
731   unsigned count = 0;
732   struct s : msgpack::functors_defaults<s> {
733     s(unsigned &count, uint64_t *size) : count(count), size(size) {}
734     unsigned &count;
735     uint64_t *size;
736     const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
737       count++;
738       *size = N;
739       return bytes.end;
740     }
741   };
742 
743   msgpack::foreach_map(message,
744                        [&](msgpack::byte_range key, msgpack::byte_range value) {
745                          if (msgpack::message_is_string(key, needle)) {
746                            // If the message is an array, record number of
747                            // elements in *size
748                            msgpack::handle_msgpack<s>(value, {count, size});
749                            // return the whole array
750                            *res = value;
751                          }
752                        });
753   // Only claim success if exactly one key/array pair matched
754   return count != 1;
755 }
756 
757 int map_lookup_string(msgpack::byte_range message, const char *needle,
758                       std::string *res) {
759   unsigned count = 0;
760   struct s : public msgpack::functors_defaults<s> {
761     s(unsigned &count, std::string *res) : count(count), res(res) {}
762     unsigned &count;
763     std::string *res;
764     void handle_string(size_t N, const unsigned char *str) {
765       count++;
766       *res = std::string(str, str + N);
767     }
768   };
769   msgpack::foreach_map(message,
770                        [&](msgpack::byte_range key, msgpack::byte_range value) {
771                          if (msgpack::message_is_string(key, needle)) {
772                            msgpack::handle_msgpack<s>(value, {count, res});
773                          }
774                        });
775   return count != 1;
776 }
777 
778 int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
779                         uint64_t *res) {
780   unsigned count = 0;
781   msgpack::foreach_map(message,
782                        [&](msgpack::byte_range key, msgpack::byte_range value) {
783                          if (msgpack::message_is_string(key, needle)) {
784                            msgpack::foronly_unsigned(value, [&](uint64_t x) {
785                              count++;
786                              *res = x;
787                            });
788                          }
789                        });
790   return count != 1;
791 }
792 
793 int array_lookup_element(msgpack::byte_range message, uint64_t elt,
794                          msgpack::byte_range *res) {
795   int rc = 1;
796   uint64_t i = 0;
797   msgpack::foreach_array(message, [&](msgpack::byte_range value) {
798     if (i == elt) {
799       *res = value;
800       rc = 0;
801     }
802     i++;
803   });
804   return rc;
805 }
806 
807 int populate_kernelArgMD(msgpack::byte_range args_element,
808                          KernelArgMD *kernelarg) {
809   using namespace msgpack;
810   int error = 0;
811   foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
812     if (message_is_string(key, ".name")) {
813       foronly_string(value, [&](size_t N, const unsigned char *str) {
814         kernelarg->name_ = std::string(str, str + N);
815       });
816     } else if (message_is_string(key, ".type_name")) {
817       foronly_string(value, [&](size_t N, const unsigned char *str) {
818         kernelarg->typeName_ = std::string(str, str + N);
819       });
820     } else if (message_is_string(key, ".size")) {
821       foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
822     } else if (message_is_string(key, ".offset")) {
823       foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
824     } else if (message_is_string(key, ".value_kind")) {
825       foronly_string(value, [&](size_t N, const unsigned char *str) {
826         std::string s = std::string(str, str + N);
827         auto itValueKind = ArgValueKind.find(s);
828         if (itValueKind != ArgValueKind.end()) {
829           kernelarg->valueKind_ = itValueKind->second;
830         }
831       });
832     }
833   });
834   return error;
835 }
836 } // namespace
837 
838 static hsa_status_t get_code_object_custom_metadata(void *binary,
839                                                     size_t binSize, int gpu) {
840   // parse code object with different keys from v2
841   // also, the kernel name is not the same as the symbol name -- so a
842   // symbol->name map is needed
843 
844   std::pair<unsigned char *, unsigned char *> metadata =
845       find_metadata(binary, binSize);
846   if (!metadata.first) {
847     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
848   }
849 
850   uint64_t kernelsSize = 0;
851   int msgpack_errors = 0;
852   msgpack::byte_range kernel_array;
853   msgpack_errors =
854       map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
855                        &kernel_array, &kernelsSize);
856   if (msgpack_errors != 0) {
857     printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
858            "kernels lookup in program metadata");
859     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
860   }
861 
862   for (size_t i = 0; i < kernelsSize; i++) {
863     assert(msgpack_errors == 0);
864     std::string kernelName;
865     std::string symbolName;
866 
867     msgpack::byte_range element;
868     msgpack_errors += array_lookup_element(kernel_array, i, &element);
869     if (msgpack_errors != 0) {
870       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
871              "element lookup in kernel metadata");
872       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
873     }
874 
875     msgpack_errors += map_lookup_string(element, ".name", &kernelName);
876     msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
877     if (msgpack_errors != 0) {
878       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
879              "strings lookup in kernel metadata");
880       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
881     }
882 
883     // Make sure that kernelName + ".kd" == symbolName
884     if ((kernelName + ".kd") != symbolName) {
885       printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
886              __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
887       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
888     }
889 
890     atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}};
891 
892     uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
893     msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
894     if (msgpack_errors != 0) {
895       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
896              "sgpr count metadata lookup in kernel metadata");
897       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
898     }
899 
900     info.sgpr_count = sgpr_count;
901 
902     msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
903     if (msgpack_errors != 0) {
904       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
905              "vgpr count metadata lookup in kernel metadata");
906       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
907     }
908 
909     info.vgpr_count = vgpr_count;
910 
911     msgpack_errors +=
912         map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
913     if (msgpack_errors != 0) {
914       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
915              "sgpr spill count metadata lookup in kernel metadata");
916       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
917     }
918 
919     info.sgpr_spill_count = sgpr_spill_count;
920 
921     msgpack_errors +=
922         map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
923     if (msgpack_errors != 0) {
924       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
925              "vgpr spill count metadata lookup in kernel metadata");
926       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
927     }
928 
929     info.vgpr_spill_count = vgpr_spill_count;
930 
931     size_t kernel_explicit_args_size = 0;
932     uint64_t kernel_segment_size;
933     msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
934                                           &kernel_segment_size);
935     if (msgpack_errors != 0) {
936       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
937              "kernarg segment size metadata lookup in kernel metadata");
938       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
939     }
940 
941     bool hasHiddenArgs = false;
942     if (kernel_segment_size > 0) {
943       uint64_t argsSize;
944       size_t offset = 0;
945 
946       msgpack::byte_range args_array;
947       msgpack_errors +=
948           map_lookup_array(element, ".args", &args_array, &argsSize);
949       if (msgpack_errors != 0) {
950         printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
951                "kernel args metadata lookup in kernel metadata");
952         return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
953       }
954 
955       info.num_args = argsSize;
956 
957       for (size_t i = 0; i < argsSize; ++i) {
958         KernelArgMD lcArg;
959 
960         msgpack::byte_range args_element;
961         msgpack_errors += array_lookup_element(args_array, i, &args_element);
962         if (msgpack_errors != 0) {
963           printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
964                  "iterate args map in kernel args metadata");
965           return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
966         }
967 
968         msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
969         if (msgpack_errors != 0) {
970           printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
971                  "iterate args map in kernel args metadata");
972           return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
973         }
974         // populate info with sizes and offsets
975         info.arg_sizes.push_back(lcArg.size_);
976         // v3 has offset field and not align field
977         size_t new_offset = lcArg.offset_;
978         size_t padding = new_offset - offset;
979         offset = new_offset;
980         info.arg_offsets.push_back(lcArg.offset_);
981         DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(),
982                     lcArg.size_, lcArg.offset_);
983         offset += lcArg.size_;
984 
985         // check if the arg is a hidden/implicit arg
986         // this logic assumes that all hidden args are 8-byte aligned
987         if (!isImplicit(lcArg.valueKind_)) {
988           kernel_explicit_args_size += lcArg.size_;
989         } else {
990           hasHiddenArgs = true;
991         }
992         kernel_explicit_args_size += padding;
993       }
994     }
995 
996     // add size of implicit args, e.g.: offset x, y and z and pipe pointer, but
997     // in ATMI, do not count the compiler set implicit args, but set your own
998     // implicit args by discounting the compiler set implicit args
999     info.kernel_segment_size =
1000         (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) +
1001         sizeof(atmi_implicit_args_t);
1002     DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
1003                 kernel_segment_size, info.kernel_segment_size);
1004 
1005     // kernel received, now add it to the kernel info table
1006     KernelInfoTable[gpu][kernelName] = info;
1007   }
1008 
1009   return HSA_STATUS_SUCCESS;
1010 }
1011 
1012 static hsa_status_t populate_InfoTables(hsa_executable_symbol_t symbol,
1013                                         int gpu) {
1014   hsa_symbol_kind_t type;
1015 
1016   uint32_t name_length;
1017   hsa_status_t err;
1018   err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
1019                                        &type);
1020   if (err != HSA_STATUS_SUCCESS) {
1021     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1022            "Symbol info extraction", get_error_string(err));
1023     return err;
1024   }
1025   DEBUG_PRINT("Exec Symbol type: %d\n", type);
1026   if (type == HSA_SYMBOL_KIND_KERNEL) {
1027     err = hsa_executable_symbol_get_info(
1028         symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
1029     if (err != HSA_STATUS_SUCCESS) {
1030       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1031              "Symbol info extraction", get_error_string(err));
1032       return err;
1033     }
1034     char *name = reinterpret_cast<char *>(malloc(name_length + 1));
1035     err = hsa_executable_symbol_get_info(symbol,
1036                                          HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
1037     if (err != HSA_STATUS_SUCCESS) {
1038       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1039              "Symbol info extraction", get_error_string(err));
1040       return err;
1041     }
1042     // remove the suffix .kd from symbol name.
1043     name[name_length - 3] = 0;
1044 
1045     atl_kernel_info_t info;
1046     std::string kernelName(name);
1047     // by now, the kernel info table should already have an entry
1048     // because the non-ROCr custom code object parsing is called before
1049     // iterating over the code object symbols using ROCr
1050     if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) {
1051       return HSA_STATUS_ERROR;
1052     }
1053     // found, so assign and update
1054     info = KernelInfoTable[gpu][kernelName];
1055 
1056     /* Extract dispatch information from the symbol */
1057     err = hsa_executable_symbol_get_info(
1058         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
1059         &(info.kernel_object));
1060     if (err != HSA_STATUS_SUCCESS) {
1061       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1062              "Extracting the symbol from the executable",
1063              get_error_string(err));
1064       return err;
1065     }
1066     err = hsa_executable_symbol_get_info(
1067         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1068         &(info.group_segment_size));
1069     if (err != HSA_STATUS_SUCCESS) {
1070       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1071              "Extracting the group segment size from the executable",
1072              get_error_string(err));
1073       return err;
1074     }
1075     err = hsa_executable_symbol_get_info(
1076         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
1077         &(info.private_segment_size));
1078     if (err != HSA_STATUS_SUCCESS) {
1079       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1080              "Extracting the private segment from the executable",
1081              get_error_string(err));
1082       return err;
1083     }
1084 
1085     DEBUG_PRINT(
1086         "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes "
1087         "kernarg\n",
1088         kernelName.c_str(), info.kernel_object, info.group_segment_size,
1089         info.private_segment_size, info.kernel_segment_size);
1090 
1091     // assign it back to the kernel info table
1092     KernelInfoTable[gpu][kernelName] = info;
1093     free(name);
1094   } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
1095     err = hsa_executable_symbol_get_info(
1096         symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
1097     if (err != HSA_STATUS_SUCCESS) {
1098       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1099              "Symbol info extraction", get_error_string(err));
1100       return err;
1101     }
1102     char *name = reinterpret_cast<char *>(malloc(name_length + 1));
1103     err = hsa_executable_symbol_get_info(symbol,
1104                                          HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
1105     if (err != HSA_STATUS_SUCCESS) {
1106       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1107              "Symbol info extraction", get_error_string(err));
1108       return err;
1109     }
1110     name[name_length] = 0;
1111 
1112     atl_symbol_info_t info;
1113 
1114     err = hsa_executable_symbol_get_info(
1115         symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
1116     if (err != HSA_STATUS_SUCCESS) {
1117       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1118              "Symbol info address extraction", get_error_string(err));
1119       return err;
1120     }
1121 
1122     err = hsa_executable_symbol_get_info(
1123         symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
1124     if (err != HSA_STATUS_SUCCESS) {
1125       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1126              "Symbol info size extraction", get_error_string(err));
1127       return err;
1128     }
1129 
1130     atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0);
1131     DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
1132                 info.size);
1133     err = register_allocation(reinterpret_cast<void *>(info.addr),
1134                               (size_t)info.size, place);
1135     if (err != HSA_STATUS_SUCCESS) {
1136       return err;
1137     }
1138     SymbolInfoTable[gpu][std::string(name)] = info;
1139     free(name);
1140   } else {
1141     DEBUG_PRINT("Symbol is an indirect function\n");
1142   }
1143   return HSA_STATUS_SUCCESS;
1144 }
1145 
1146 atmi_status_t Runtime::RegisterModuleFromMemory(
1147     void *module_bytes, size_t module_size, atmi_place_t place,
1148     atmi_status_t (*on_deserialized_data)(void *data, size_t size,
1149                                           void *cb_state),
1150     void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
1151   hsa_status_t err;
1152   int gpu = place.device_id;
1153   assert(gpu >= 0);
1154 
1155   DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu);
1156   ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
1157   hsa_agent_t agent = proc.agent();
1158   hsa_executable_t executable = {0};
1159   hsa_profile_t agent_profile;
1160 
1161   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
1162   if (err != HSA_STATUS_SUCCESS) {
1163     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1164            "Query the agent profile", get_error_string(err));
1165     return ATMI_STATUS_ERROR;
1166   }
1167   // FIXME: Assume that every profile is FULL until we understand how to build
1168   // GCN with base profile
1169   agent_profile = HSA_PROFILE_FULL;
1170   /* Create the empty executable.  */
1171   err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
1172                               &executable);
1173   if (err != HSA_STATUS_SUCCESS) {
1174     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1175            "Create the executable", get_error_string(err));
1176     return ATMI_STATUS_ERROR;
1177   }
1178 
1179   bool module_load_success = false;
1180   do // Existing control flow used continue, preserve that for this patch
1181   {
1182     {
1183       // Some metadata info is not available through ROCr API, so use custom
1184       // code object metadata parsing to collect such metadata info
1185 
1186       err = get_code_object_custom_metadata(module_bytes, module_size, gpu);
1187       if (err != HSA_STATUS_SUCCESS) {
1188         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1189                     "Getting custom code object metadata",
1190                     get_error_string(err));
1191         continue;
1192       }
1193 
1194       // Deserialize code object.
1195       hsa_code_object_t code_object = {0};
1196       err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
1197                                         &code_object);
1198       if (err != HSA_STATUS_SUCCESS) {
1199         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1200                     "Code Object Deserialization", get_error_string(err));
1201         continue;
1202       }
1203       assert(0 != code_object.handle);
1204 
1205       // Mutating the device image here avoids another allocation & memcpy
1206       void *code_object_alloc_data =
1207           reinterpret_cast<void *>(code_object.handle);
1208       atmi_status_t atmi_err =
1209           on_deserialized_data(code_object_alloc_data, module_size, cb_state);
1210       if (atmi_err != ATMI_STATUS_SUCCESS) {
1211         printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1212                "Error in deserialized_data callback",
1213                get_atmi_error_string(atmi_err));
1214         return atmi_err;
1215       }
1216 
1217       /* Load the code object.  */
1218       err =
1219           hsa_executable_load_code_object(executable, agent, code_object, NULL);
1220       if (err != HSA_STATUS_SUCCESS) {
1221         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1222                     "Loading the code object", get_error_string(err));
1223         continue;
1224       }
1225 
1226       // cannot iterate over symbols until executable is frozen
1227     }
1228     module_load_success = true;
1229   } while (0);
1230   DEBUG_PRINT("Modules loaded successful? %d\n", module_load_success);
1231   if (module_load_success) {
1232     /* Freeze the executable; it can now be queried for symbols.  */
1233     err = hsa_executable_freeze(executable, "");
1234     if (err != HSA_STATUS_SUCCESS) {
1235       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1236              "Freeze the executable", get_error_string(err));
1237       return ATMI_STATUS_ERROR;
1238     }
1239 
1240     err = hsa::executable_iterate_symbols(
1241         executable,
1242         [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
1243           return populate_InfoTables(symbol, gpu);
1244         });
1245 
1246     if (err != HSA_STATUS_SUCCESS) {
1247       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1248              "Iterating over symbols for execuatable", get_error_string(err));
1249       return ATMI_STATUS_ERROR;
1250     }
1251 
1252     // save the executable and destroy during finalize
1253     HSAExecutables.push_back(executable);
1254     return ATMI_STATUS_SUCCESS;
1255   } else {
1256     return ATMI_STATUS_ERROR;
1257   }
1258 }
1259 
1260 } // namespace core
1261