1 /*===--------------------------------------------------------------------------
2  *              ATMI (Asynchronous Task and Memory Interface)
3  *
4  * This file is distributed under the MIT License. See LICENSE.txt for details.
5  *===------------------------------------------------------------------------*/
6 #include <gelf.h>
7 #include <libelf.h>
8 
#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
16 
17 #include "internal.h"
18 #include "machine.h"
19 #include "rt.h"
20 
21 #include "msgpack.h"
22 
23 typedef unsigned char *address;
24 /*
25  * Note descriptors.
26  */
27 typedef struct {
28   uint32_t n_namesz; /* Length of note's name. */
29   uint32_t n_descsz; /* Length of note's value. */
30   uint32_t n_type;   /* Type of note. */
31   // then name
32   // then padding, optional
33   // then desc, at 4 byte alignment (not 8, despite being elf64)
34 } Elf_Note;
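
// Compile-time sanity check (an added assumption: three packed 32-bit
// fields, no padding). The note-walking pointer arithmetic below relies
// on this 12-byte header size.
static_assert(sizeof(Elf_Note) == 12, "Elf_Note header must be 12 bytes");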
35 
// The following include file and the structs/enums below have been
// replicated here on a per-use basis. For example,
// llvm::AMDGPU::HSAMD::Kernel::Metadata has several fields, but we may
// care only about kernargSegmentSize_ for now, so our KernelMD
// implementation includes just that field. We chose this replication
// approach to avoid forcing a dependency on LLVM_INCLUDE_DIR merely to
// compile the runtime.
43 // #include "llvm/Support/AMDGPUMetadata.h"
44 // typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD;
45 // typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
46 // typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
47 // using llvm::AMDGPU::HSAMD::AccessQualifier;
48 // using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
49 // using llvm::AMDGPU::HSAMD::ValueKind;
50 // using llvm::AMDGPU::HSAMD::ValueType;
51 
52 class KernelArgMD {
53 public:
54   enum class ValueKind {
55     HiddenGlobalOffsetX,
56     HiddenGlobalOffsetY,
57     HiddenGlobalOffsetZ,
58     HiddenNone,
59     HiddenPrintfBuffer,
60     HiddenDefaultQueue,
61     HiddenCompletionAction,
62     HiddenMultiGridSyncArg,
63     HiddenHostcallBuffer,
64     Unknown
65   };
66 
67   KernelArgMD()
68       : name_(std::string()), typeName_(std::string()), size_(0), offset_(0),
69         align_(0), valueKind_(ValueKind::Unknown) {}
70 
71   // fields
72   std::string name_;
73   std::string typeName_;
74   uint32_t size_;
75   uint32_t offset_;
76   uint32_t align_;
77   ValueKind valueKind_;
78 };
79 
80 class KernelMD {
81 public:
82   KernelMD() : kernargSegmentSize_(0ull) {}
83 
84   // fields
85   uint64_t kernargSegmentSize_;
86 };
87 
88 static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
89     //    Including only those fields that are relevant to the runtime.
90     //    {"ByValue", KernelArgMD::ValueKind::ByValue},
91     //    {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer},
92     //    {"DynamicSharedPointer",
93     //    KernelArgMD::ValueKind::DynamicSharedPointer},
94     //    {"Sampler", KernelArgMD::ValueKind::Sampler},
95     //    {"Image", KernelArgMD::ValueKind::Image},
96     //    {"Pipe", KernelArgMD::ValueKind::Pipe},
97     //    {"Queue", KernelArgMD::ValueKind::Queue},
98     {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
99     {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
100     {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
101     {"HiddenNone", KernelArgMD::ValueKind::HiddenNone},
102     {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
103     {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue},
104     {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction},
105     {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
106     {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
107     // v3
108     //    {"by_value", KernelArgMD::ValueKind::ByValue},
109     //    {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
110     //    {"dynamic_shared_pointer",
111     //    KernelArgMD::ValueKind::DynamicSharedPointer},
112     //    {"sampler", KernelArgMD::ValueKind::Sampler},
113     //    {"image", KernelArgMD::ValueKind::Image},
114     //    {"pipe", KernelArgMD::ValueKind::Pipe},
115     //    {"queue", KernelArgMD::ValueKind::Queue},
116     {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
117     {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
118     {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
119     {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
120     {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
121     {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
122     {"hidden_completion_action",
123      KernelArgMD::ValueKind::HiddenCompletionAction},
124     {"hidden_multigrid_sync_arg",
125      KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
126     {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
127 };
128 
129 // global variables. TODO: Get rid of these
130 atmi_machine_t g_atmi_machine;
131 ATLMachine g_atl_machine;
132 
133 std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
134 
135 std::map<std::string, std::string> KernelNameMap;
136 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
137 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
138 
139 bool g_atmi_initialized = false;
140 bool g_atmi_hostcall_required = false;
141 
142 /*
143    atlc is all internal global values.
144    The structure atl_context_t is defined in atl_internal.h
145    Most references will use the global structure prefix atlc.
146 */
147 atl_context_t atlc = {.struct_initialized = false};
148 
149 namespace core {
150 /* Machine Info */
151 atmi_machine_t *Runtime::GetMachineInfo() {
152   if (!atlc.g_hsa_initialized)
153     return NULL;
154   return &g_atmi_machine;
155 }
156 
157 static void atl_set_atmi_initialized() {
158   // FIXME: thread safe? locks?
159   g_atmi_initialized = true;
160 }
161 
162 static void atl_reset_atmi_initialized() {
163   // FIXME: thread safe? locks?
164   g_atmi_initialized = false;
165 }
166 
167 bool atl_is_atmi_initialized() { return g_atmi_initialized; }
168 
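// Make a host allocation accessible to every GPU agent known to the
// machine model; exits on failure, like the other fatal error paths here.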
169 void allow_access_to_all_gpu_agents(void *ptr) {
170   hsa_status_t err;
171   std::vector<ATLGPUProcessor> &gpu_procs =
172       g_atl_machine.processors<ATLGPUProcessor>();
173   std::vector<hsa_agent_t> agents;
174   for (uint32_t i = 0; i < gpu_procs.size(); i++) {
175     agents.push_back(gpu_procs[i].agent());
176   }
177   err = hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr);
178   if (err != HSA_STATUS_SUCCESS) {
179     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
180            "Allow agents ptr access", get_error_string(err));
181     exit(1);
182   }
183 }
184 
185 atmi_status_t Runtime::Initialize() {
186   atmi_devtype_t devtype = ATMI_DEVTYPE_GPU;
187   if (atl_is_atmi_initialized())
188     return ATMI_STATUS_SUCCESS;
189 
190   if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) {
    atmi_status_t status = atl_init_gpu_context();
    if (status != ATMI_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "GPU context init",
             get_atmi_error_string(status));
      exit(1);
    }
196   }
197 
198   atl_set_atmi_initialized();
199   return ATMI_STATUS_SUCCESS;
200 }
201 
202 atmi_status_t Runtime::Finalize() {
203   hsa_status_t err;
204 
205   for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) {
206     SymbolInfoTable[i].clear();
207   }
208   SymbolInfoTable.clear();
209   for (uint32_t i = 0; i < KernelInfoTable.size(); i++) {
210     KernelInfoTable[i].clear();
211   }
212   KernelInfoTable.clear();
213 
214   atl_reset_atmi_initialized();
215   err = hsa_shut_down();
216   if (err != HSA_STATUS_SUCCESS) {
217     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA",
218            get_error_string(err));
219     exit(1);
220   }
221 
222   return ATMI_STATUS_SUCCESS;
223 }
224 
225 static void atmi_init_context_structs() {
226   atlc.struct_initialized = true; /* This only gets called one time */
227   atlc.g_hsa_initialized = false;
228   atlc.g_gpu_initialized = false;
229   atlc.g_tasks_initialized = false;
230 }
231 
// Callback invoked by hsa_amd_agent_iterate_memory_pools for each memory
// pool of an agent
233 static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
234                                          void *data) {
235   ATLProcessor *proc = reinterpret_cast<ATLProcessor *>(data);
236   hsa_status_t err = HSA_STATUS_SUCCESS;
237   // Check if the memory_pool is allowed to allocate, i.e. do not return group
238   // memory
239   bool alloc_allowed = false;
240   err = hsa_amd_memory_pool_get_info(
241       memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
242       &alloc_allowed);
243   if (err != HSA_STATUS_SUCCESS) {
244     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
245            "Alloc allowed in memory pool check", get_error_string(err));
246     exit(1);
247   }
248   if (alloc_allowed) {
249     uint32_t global_flag = 0;
250     err = hsa_amd_memory_pool_get_info(
251         memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
252     if (err != HSA_STATUS_SUCCESS) {
253       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
254              "Get memory pool info", get_error_string(err));
255       exit(1);
256     }
257     if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
258       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
259       proc->addMemory(new_mem);
260       if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
261         DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
262         atl_gpu_kernarg_pools.push_back(memory_pool);
263       }
264     } else {
265       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
266       proc->addMemory(new_mem);
267     }
268   }
269 
270   return err;
271 }
272 
273 static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) {
274   hsa_status_t err = HSA_STATUS_SUCCESS;
275   hsa_device_type_t device_type;
276   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
277   if (err != HSA_STATUS_SUCCESS) {
278     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
279            "Get device type info", get_error_string(err));
280     exit(1);
281   }
282   switch (device_type) {
283   case HSA_DEVICE_TYPE_CPU: {
284     ;
285     ATLCPUProcessor new_proc(agent);
286     err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
287                                              &new_proc);
288     if (err != HSA_STATUS_SUCCESS) {
289       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
290              "Iterate all memory pools", get_error_string(err));
291       exit(1);
292     }
293     g_atl_machine.addProcessor(new_proc);
294   } break;
295   case HSA_DEVICE_TYPE_GPU: {
296     ;
297     hsa_profile_t profile;
298     err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile);
299     if (err != HSA_STATUS_SUCCESS) {
300       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
301              "Query the agent profile", get_error_string(err));
302       exit(1);
303     }
304     atmi_devtype_t gpu_type;
305     gpu_type =
306         (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU;
307     ATLGPUProcessor new_proc(agent, gpu_type);
308     err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
309                                              &new_proc);
310     if (err != HSA_STATUS_SUCCESS) {
311       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
312              "Iterate all memory pools", get_error_string(err));
313       exit(1);
314     }
315     g_atl_machine.addProcessor(new_proc);
316   } break;
317   case HSA_DEVICE_TYPE_DSP: {
318     err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
319   } break;
320   }
321 
322   return err;
323 }
324 
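// Region iteration callback: record the first fine-grained global region
// found. Returning HSA_STATUS_INFO_BREAK stops hsa_agent_iterate_regions
// early once a match is found.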
325 hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) {
326   hsa_region_segment_t segment;
327   hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
328   if (segment != HSA_REGION_SEGMENT_GLOBAL) {
329     return HSA_STATUS_SUCCESS;
330   }
331   hsa_region_global_flag_t flags;
332   hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
333   if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
334     hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
335     *ret = region;
336     return HSA_STATUS_INFO_BREAK;
337   }
338   return HSA_STATUS_SUCCESS;
339 }
340 
341 /* Determines if a memory region can be used for kernarg allocations.  */
342 static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) {
343   hsa_region_segment_t segment;
344   hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
345   if (HSA_REGION_SEGMENT_GLOBAL != segment) {
346     return HSA_STATUS_SUCCESS;
347   }
348 
349   hsa_region_global_flag_t flags;
350   hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
351   if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
352     hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
353     *ret = region;
354     return HSA_STATUS_INFO_BREAK;
355   }
356 
357   return HSA_STATUS_SUCCESS;
358 }
359 
360 static hsa_status_t init_compute_and_memory() {
361   hsa_status_t err;
362 
363   /* Iterate over the agents and pick the gpu agent */
364   err = hsa_iterate_agents(get_agent_info, NULL);
365   if (err == HSA_STATUS_INFO_BREAK) {
366     err = HSA_STATUS_SUCCESS;
367   }
368   if (err != HSA_STATUS_SUCCESS) {
369     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent",
370            get_error_string(err));
371     exit(1);
372   }
375 
376   /* Init all devices or individual device types? */
377   std::vector<ATLCPUProcessor> &cpu_procs =
378       g_atl_machine.processors<ATLCPUProcessor>();
379   std::vector<ATLGPUProcessor> &gpu_procs =
380       g_atl_machine.processors<ATLGPUProcessor>();
381   /* For CPU memory pools, add other devices that can access them directly
382    * or indirectly */
383   for (auto &cpu_proc : cpu_procs) {
384     for (auto &cpu_mem : cpu_proc.memories()) {
385       hsa_amd_memory_pool_t pool = cpu_mem.memory();
386       for (auto &gpu_proc : gpu_procs) {
387         hsa_agent_t agent = gpu_proc.agent();
388         hsa_amd_memory_pool_access_t access;
389         hsa_amd_agent_memory_pool_get_info(
390             agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
391         if (access != 0) {
          // access is not NEVER_ALLOWED; the pool is either accessible by
          // default or can be enabled, so add it to this processor
394           gpu_proc.addMemory(cpu_mem);
395         }
396       }
397     }
398   }
399 
400   /* FIXME: are the below combinations of procs and memory pools needed?
401    * all to all compare procs with their memory pools and add those memory
402    * pools that are accessible by the target procs */
403   for (auto &gpu_proc : gpu_procs) {
404     for (auto &gpu_mem : gpu_proc.memories()) {
405       hsa_amd_memory_pool_t pool = gpu_mem.memory();
406       for (auto &cpu_proc : cpu_procs) {
407         hsa_agent_t agent = cpu_proc.agent();
408         hsa_amd_memory_pool_access_t access;
409         hsa_amd_agent_memory_pool_get_info(
410             agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
411         if (access != 0) {
          // access is not NEVER_ALLOWED; the pool is either accessible by
          // default or can be enabled, so add it to this processor
414           cpu_proc.addMemory(gpu_mem);
415         }
416       }
417     }
418   }
419 
420   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_CPU] = cpu_procs.size();
421   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_GPU] = gpu_procs.size();
422 
423   size_t num_procs = cpu_procs.size() + gpu_procs.size();
424   // g_atmi_machine.devices = (atmi_device_t *)malloc(num_procs *
425   // sizeof(atmi_device_t));
426   atmi_device_t *all_devices = reinterpret_cast<atmi_device_t *>(
427       malloc(num_procs * sizeof(atmi_device_t)));
428   int num_iGPUs = 0;
429   int num_dGPUs = 0;
430   for (uint32_t i = 0; i < gpu_procs.size(); i++) {
431     if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU)
432       num_iGPUs++;
433     else
434       num_dGPUs++;
435   }
436   assert(num_iGPUs + num_dGPUs == gpu_procs.size() &&
437          "Number of dGPUs and iGPUs do not add up");
438   DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size());
439   DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs);
440   DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs);
441   DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size());
442 
443   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_iGPU] = num_iGPUs;
444   g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_dGPU] = num_dGPUs;
445 
446   int cpus_begin = 0;
447   int cpus_end = cpu_procs.size();
448   int gpus_begin = cpu_procs.size();
449   int gpus_end = cpu_procs.size() + gpu_procs.size();
450   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_CPU] = &all_devices[cpus_begin];
451   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU] = &all_devices[gpus_begin];
452   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_iGPU] = &all_devices[gpus_begin];
453   g_atmi_machine.devices_by_type[ATMI_DEVTYPE_dGPU] = &all_devices[gpus_begin];
454   int proc_index = 0;
455   for (int i = cpus_begin; i < cpus_end; i++) {
456     all_devices[i].type = cpu_procs[proc_index].type();
457 
458     std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
459     int fine_memories_size = 0;
460     int coarse_memories_size = 0;
461     DEBUG_PRINT("CPU memory types:\t");
462     for (auto &memory : memories) {
463       atmi_memtype_t type = memory.type();
464       if (type == ATMI_MEMTYPE_FINE_GRAINED) {
465         fine_memories_size++;
466         DEBUG_PRINT("Fine\t");
467       } else {
468         coarse_memories_size++;
469         DEBUG_PRINT("Coarse\t");
470       }
471     }
472     DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
473     DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
474     proc_index++;
475   }
476   proc_index = 0;
477   for (int i = gpus_begin; i < gpus_end; i++) {
478     all_devices[i].type = gpu_procs[proc_index].type();
479 
480     std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
481     int fine_memories_size = 0;
482     int coarse_memories_size = 0;
483     DEBUG_PRINT("GPU memory types:\t");
484     for (auto &memory : memories) {
485       atmi_memtype_t type = memory.type();
486       if (type == ATMI_MEMTYPE_FINE_GRAINED) {
487         fine_memories_size++;
488         DEBUG_PRINT("Fine\t");
489       } else {
490         coarse_memories_size++;
491         DEBUG_PRINT("Coarse\t");
492       }
493     }
494     DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
495     DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
496     proc_index++;
497   }
498   proc_index = 0;
499   hsa_region_t atl_cpu_kernarg_region;
500   atl_cpu_kernarg_region.handle = (uint64_t)-1;
501   if (cpu_procs.size() > 0) {
502     err = hsa_agent_iterate_regions(
503         cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region);
504     if (err == HSA_STATUS_INFO_BREAK) {
505       err = HSA_STATUS_SUCCESS;
506     }
507     err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
508                                                           : HSA_STATUS_SUCCESS;
509     if (err != HSA_STATUS_SUCCESS) {
510       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
511              "Finding a CPU kernarg memory region handle",
512              get_error_string(err));
513       exit(1);
514     }
515   }
516   hsa_region_t atl_gpu_kernarg_region;
517   /* Find a memory region that supports kernel arguments.  */
518   atl_gpu_kernarg_region.handle = (uint64_t)-1;
519   if (gpu_procs.size() > 0) {
520     hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region,
521                               &atl_gpu_kernarg_region);
522     err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
523                                                           : HSA_STATUS_SUCCESS;
524     if (err != HSA_STATUS_SUCCESS) {
525       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
526              "Finding a kernarg memory region", get_error_string(err));
527       exit(1);
528     }
529   }
530   if (num_procs > 0)
531     return HSA_STATUS_SUCCESS;
532   else
533     return HSA_STATUS_ERROR_NOT_INITIALIZED;
534 }
535 
536 hsa_status_t init_hsa() {
537   if (atlc.g_hsa_initialized == false) {
538     DEBUG_PRINT("Initializing HSA...");
539     hsa_status_t err = hsa_init();
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Initializing the hsa runtime", get_error_string(err));
      exit(1);
    }

    err = init_compute_and_memory();
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "After initializing compute and memory", get_error_string(err));
      return err;
    }
556 
557     int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
558     KernelInfoTable.resize(gpu_count);
559     SymbolInfoTable.resize(gpu_count);
560     for (uint32_t i = 0; i < SymbolInfoTable.size(); i++)
561       SymbolInfoTable[i].clear();
562     for (uint32_t i = 0; i < KernelInfoTable.size(); i++)
563       KernelInfoTable[i].clear();
564     atlc.g_hsa_initialized = true;
565     DEBUG_PRINT("done\n");
566   }
567   return HSA_STATUS_SUCCESS;
568 }
569 
570 void init_tasks() {
571   if (atlc.g_tasks_initialized != false)
572     return;
573   std::vector<hsa_agent_t> gpu_agents;
574   int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
575   for (int gpu = 0; gpu < gpu_count; gpu++) {
576     atmi_place_t place = ATMI_PLACE_GPU(0, gpu);
577     ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
578     gpu_agents.push_back(proc.agent());
579   }
580   atlc.g_tasks_initialized = true;
581 }
582 
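// System event handler registered via hsa_amd_register_system_event_handler
// in atl_init_gpu_context(); decodes GPU memory faults into a readable
// message and returns an error so the fault is not silently ignored.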
583 hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
584 #if (ROCM_VERSION_MAJOR >= 3) ||                                               \
585     (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3)
586   if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
587 #else
588   if (event->event_type == GPU_MEMORY_FAULT_EVENT) {
589 #endif
590     hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
591     // memory_fault.agent
592     // memory_fault.virtual_address
593     // memory_fault.fault_reason_mask
595     std::stringstream stream;
596     stream << std::hex << (uintptr_t)memory_fault.virtual_address;
597     std::string addr("0x" + stream.str());
598 
599     std::string err_string = "[GPU Memory Error] Addr: " + addr;
600     err_string += " Reason: ";
    // fault_reason_mask is a bitmask with one bit per fault reason
    // (page not present, read-only, NX, host-only, ECC, imprecise).
    if (!(memory_fault.fault_reason_mask & 0x0000003F)) {
      err_string += "No Idea! ";
    } else {
      if (memory_fault.fault_reason_mask & 0x00000001)
        err_string += "Page not present or supervisor privilege. ";
      if (memory_fault.fault_reason_mask & 0x00000002)
        err_string += "Write access to a read-only page. ";
      if (memory_fault.fault_reason_mask & 0x00000004)
        err_string += "Execute access to a page marked NX. ";
      if (memory_fault.fault_reason_mask & 0x00000008)
        err_string += "Host access only. ";
      if (memory_fault.fault_reason_mask & 0x00000010)
        err_string += "ECC failure (if supported by HW). ";
      if (memory_fault.fault_reason_mask & 0x00000020)
        err_string += "Can't determine the exact fault address. ";
616     }
617     fprintf(stderr, "%s\n", err_string.c_str());
618     return HSA_STATUS_ERROR;
619   }
620   return HSA_STATUS_SUCCESS;
621 }
622 
623 atmi_status_t atl_init_gpu_context() {
624   if (atlc.struct_initialized == false)
625     atmi_init_context_structs();
626   if (atlc.g_gpu_initialized != false)
627     return ATMI_STATUS_SUCCESS;
628 
629   hsa_status_t err;
630   err = init_hsa();
631   if (err != HSA_STATUS_SUCCESS)
632     return ATMI_STATUS_ERROR;
633 
634   err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
635   if (err != HSA_STATUS_SUCCESS) {
636     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
637            "Registering the system for memory faults", get_error_string(err));
638     exit(1);
639   }
640 
641   init_tasks();
642   atlc.g_gpu_initialized = true;
643   return ATMI_STATUS_SUCCESS;
644 }
645 
646 static bool isImplicit(KernelArgMD::ValueKind value_kind) {
647   switch (value_kind) {
648   case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
649   case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
650   case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
651   case KernelArgMD::ValueKind::HiddenNone:
652   case KernelArgMD::ValueKind::HiddenPrintfBuffer:
653   case KernelArgMD::ValueKind::HiddenDefaultQueue:
654   case KernelArgMD::ValueKind::HiddenCompletionAction:
655   case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
656   case KernelArgMD::ValueKind::HiddenHostcallBuffer:
657     return true;
658   default:
659     return false;
660   }
661 }
662 
663 static std::pair<unsigned char *, unsigned char *>
664 find_metadata(void *binary, size_t binSize) {
665   std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr};
666 
667   Elf *e = elf_memory(static_cast<char *>(binary), binSize);
668   if (elf_kind(e) != ELF_K_ELF) {
669     return failure;
670   }
671 
672   size_t numpHdrs;
673   if (elf_getphdrnum(e, &numpHdrs) != 0) {
674     return failure;
675   }
676 
677   for (size_t i = 0; i < numpHdrs; ++i) {
678     GElf_Phdr pHdr;
679     if (gelf_getphdr(e, i, &pHdr) != &pHdr) {
680       continue;
681     }
682     // Look for the runtime metadata note
683     if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
684       // Iterate over the notes in this segment
685       address ptr = (address)binary + pHdr.p_offset;
686       address segmentEnd = ptr + pHdr.p_filesz;
687 
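      // Each note record is the 12-byte header, then the name and the
      // descriptor, each padded to 4-byte alignment (see the advance of
      // `ptr` at the bottom of this loop).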
688       while (ptr < segmentEnd) {
689         Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr);
690         address name = (address)&note[1];
691 
692         if (note->n_type == 7 || note->n_type == 8) {
693           return failure;
694         } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ &&
695                    note->n_namesz == sizeof "AMD" &&
696                    !memcmp(name, "AMD", note->n_namesz)) {
697           // code object v2 uses yaml metadata, no longer supported
698           return failure;
699         } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ &&
700                    note->n_namesz == sizeof "AMDGPU" &&
701                    !memcmp(name, "AMDGPU", note->n_namesz)) {
702 
704           // value is padded to 4 byte alignment, may want to move end up to
705           // match
706           size_t offset = sizeof(uint32_t) * 3 /* fields */
707                           + sizeof("AMDGPU")   /* name */
708                           + 1 /* padding to 4 byte alignment */;
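          // i.e. 12 (header) + 7 (sizeof "AMDGPU", including the NUL)
          // + 1 (pad) = 20 bytes from the start of the note record.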
709 
710           // Including the trailing padding means both pointers are 4 bytes
711           // aligned, which may be useful later.
712           unsigned char *metadata_start = (unsigned char *)ptr + offset;
713           unsigned char *metadata_end =
714               metadata_start + core::alignUp(note->n_descsz, 4);
715           return {metadata_start, metadata_end};
716         }
717         ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) +
718                core::alignUp(note->n_descsz, sizeof(int));
719       }
720     }
721   }
722 
723   return failure;
724 }
725 
726 namespace {
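// msgpack lookup helpers. Each returns 0 on success (exactly one match
// found) and nonzero otherwise, so callers can accumulate failures with +=.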
727 int map_lookup_array(msgpack::byte_range message, const char *needle,
728                      msgpack::byte_range *res, uint64_t *size) {
729   unsigned count = 0;
730   struct s : msgpack::functors_defaults<s> {
731     s(unsigned &count, uint64_t *size) : count(count), size(size) {}
732     unsigned &count;
733     uint64_t *size;
734     const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
735       count++;
736       *size = N;
737       return bytes.end;
738     }
739   };
740 
741   msgpack::foreach_map(message,
742                        [&](msgpack::byte_range key, msgpack::byte_range value) {
743                          if (msgpack::message_is_string(key, needle)) {
744                            // If the message is an array, record number of
745                            // elements in *size
746                            msgpack::handle_msgpack<s>(value, {count, size});
747                            // return the whole array
748                            *res = value;
749                          }
750                        });
751   // Only claim success if exactly one key/array pair matched
752   return count != 1;
753 }
754 
755 int map_lookup_string(msgpack::byte_range message, const char *needle,
756                       std::string *res) {
757   unsigned count = 0;
758   struct s : public msgpack::functors_defaults<s> {
759     s(unsigned &count, std::string *res) : count(count), res(res) {}
760     unsigned &count;
761     std::string *res;
762     void handle_string(size_t N, const unsigned char *str) {
763       count++;
764       *res = std::string(str, str + N);
765     }
766   };
767   msgpack::foreach_map(message,
768                        [&](msgpack::byte_range key, msgpack::byte_range value) {
769                          if (msgpack::message_is_string(key, needle)) {
770                            msgpack::handle_msgpack<s>(value, {count, res});
771                          }
772                        });
773   return count != 1;
774 }
775 
776 int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
777                         uint64_t *res) {
778   unsigned count = 0;
779   msgpack::foreach_map(message,
780                        [&](msgpack::byte_range key, msgpack::byte_range value) {
781                          if (msgpack::message_is_string(key, needle)) {
782                            msgpack::foronly_unsigned(value, [&](uint64_t x) {
783                              count++;
784                              *res = x;
785                            });
786                          }
787                        });
788   return count != 1;
789 }
790 
791 int array_lookup_element(msgpack::byte_range message, uint64_t elt,
792                          msgpack::byte_range *res) {
793   int rc = 1;
794   uint64_t i = 0;
795   msgpack::foreach_array(message, [&](msgpack::byte_range value) {
796     if (i == elt) {
797       *res = value;
798       rc = 0;
799     }
800     i++;
801   });
802   return rc;
803 }
804 
805 int populate_kernelArgMD(msgpack::byte_range args_element,
806                          KernelArgMD *kernelarg) {
807   using namespace msgpack;
808   int error = 0;
809   foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
810     if (message_is_string(key, ".name")) {
811       foronly_string(value, [&](size_t N, const unsigned char *str) {
812         kernelarg->name_ = std::string(str, str + N);
813       });
814     } else if (message_is_string(key, ".type_name")) {
815       foronly_string(value, [&](size_t N, const unsigned char *str) {
816         kernelarg->typeName_ = std::string(str, str + N);
817       });
818     } else if (message_is_string(key, ".size")) {
819       foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
820     } else if (message_is_string(key, ".offset")) {
821       foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
822     } else if (message_is_string(key, ".value_kind")) {
823       foronly_string(value, [&](size_t N, const unsigned char *str) {
824         std::string s = std::string(str, str + N);
825         auto itValueKind = ArgValueKind.find(s);
826         if (itValueKind != ArgValueKind.end()) {
827           kernelarg->valueKind_ = itValueKind->second;
828         }
829       });
830     }
831   });
832   return error;
833 }
834 } // namespace
835 
836 static hsa_status_t get_code_object_custom_metadata(void *binary,
837                                                     size_t binSize, int gpu) {
  // Parse the code object, whose metadata uses different keys than v2.
  // Also, the kernel name is not the same as the symbol name, so a
  // symbol->name map is needed.
841 
842   std::pair<unsigned char *, unsigned char *> metadata =
843       find_metadata(binary, binSize);
844   if (!metadata.first) {
845     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
846   }
847 
848   uint64_t kernelsSize = 0;
849   int msgpack_errors = 0;
850   msgpack::byte_range kernel_array;
851   msgpack_errors =
852       map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
853                        &kernel_array, &kernelsSize);
854   if (msgpack_errors != 0) {
855     printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
856            "kernels lookup in program metadata");
857     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
858   }
859 
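  // Each element of amdhsa.kernels is a map describing one kernel; pull
  // out the fields the runtime needs and build an atl_kernel_info_t.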
860   for (size_t i = 0; i < kernelsSize; i++) {
861     assert(msgpack_errors == 0);
862     std::string kernelName;
863     std::string symbolName;
864 
865     msgpack::byte_range element;
866     msgpack_errors += array_lookup_element(kernel_array, i, &element);
867     if (msgpack_errors != 0) {
868       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
869              "element lookup in kernel metadata");
870       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
871     }
872 
873     msgpack_errors += map_lookup_string(element, ".name", &kernelName);
874     msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
875     if (msgpack_errors != 0) {
876       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
877              "strings lookup in kernel metadata");
878       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
879     }
880 
881     atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}};
882 
883     uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
884     msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
885     if (msgpack_errors != 0) {
886       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
887              "sgpr count metadata lookup in kernel metadata");
888       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
889     }
890 
891     info.sgpr_count = sgpr_count;
892 
893     msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
894     if (msgpack_errors != 0) {
895       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
896              "vgpr count metadata lookup in kernel metadata");
897       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
898     }
899 
900     info.vgpr_count = vgpr_count;
901 
902     msgpack_errors +=
903         map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
904     if (msgpack_errors != 0) {
905       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
906              "sgpr spill count metadata lookup in kernel metadata");
907       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
908     }
909 
910     info.sgpr_spill_count = sgpr_spill_count;
911 
912     msgpack_errors +=
913         map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
914     if (msgpack_errors != 0) {
915       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
916              "vgpr spill count metadata lookup in kernel metadata");
917       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
918     }
919 
920     info.vgpr_spill_count = vgpr_spill_count;
921 
922     size_t kernel_explicit_args_size = 0;
923     uint64_t kernel_segment_size;
924     msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
925                                           &kernel_segment_size);
926     if (msgpack_errors != 0) {
927       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
928              "kernarg segment size metadata lookup in kernel metadata");
929       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
930     }
931 
932     // create a map from symbol to name
933     DEBUG_PRINT("Kernel symbol %s; Name: %s; Size: %lu\n", symbolName.c_str(),
934                 kernelName.c_str(), kernel_segment_size);
935     KernelNameMap[symbolName] = kernelName;
936 
937     bool hasHiddenArgs = false;
938     if (kernel_segment_size > 0) {
939       uint64_t argsSize;
940       size_t offset = 0;
941 
942       msgpack::byte_range args_array;
943       msgpack_errors +=
944           map_lookup_array(element, ".args", &args_array, &argsSize);
945       if (msgpack_errors != 0) {
946         printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
947                "kernel args metadata lookup in kernel metadata");
948         return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
949       }
950 
951       info.num_args = argsSize;
952 
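      // Accumulate explicit arg sizes, plus the alignment padding between
      // consecutive args, to reconstruct the packed explicit-kernarg size
      // from the v3 .offset fields.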
      for (size_t j = 0; j < argsSize; ++j) {
954         KernelArgMD lcArg;
955 
956         msgpack::byte_range args_element;
        msgpack_errors += array_lookup_element(args_array, j, &args_element);
958         if (msgpack_errors != 0) {
959           printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
960                  "iterate args map in kernel args metadata");
961           return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
962         }
963 
964         msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
965         if (msgpack_errors != 0) {
966           printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
967                  "iterate args map in kernel args metadata");
968           return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
969         }
970         // populate info with sizes and offsets
971         info.arg_sizes.push_back(lcArg.size_);
972         // v3 has offset field and not align field
973         size_t new_offset = lcArg.offset_;
974         size_t padding = new_offset - offset;
975         offset = new_offset;
976         info.arg_offsets.push_back(lcArg.offset_);
977         DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(),
978                     lcArg.size_, lcArg.offset_);
979         offset += lcArg.size_;
980 
981         // check if the arg is a hidden/implicit arg
982         // this logic assumes that all hidden args are 8-byte aligned
983         if (!isImplicit(lcArg.valueKind_)) {
984           kernel_explicit_args_size += lcArg.size_;
985         } else {
986           hasHiddenArgs = true;
987         }
988         kernel_explicit_args_size += padding;
989       }
990     }
991 
    // Add the size of the implicit args (e.g., global offset x, y and z and
    // the pipe pointer). ATMI does not count the compiler-set implicit args;
    // it discounts them and appends its own implicit args instead.
995     info.kernel_segment_size =
996         (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) +
997         sizeof(atmi_implicit_args_t);
998     DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
999                 kernel_segment_size, info.kernel_segment_size);
1000 
1001     // kernel received, now add it to the kernel info table
1002     KernelInfoTable[gpu][kernelName] = info;
1003   }
1004 
1005   return HSA_STATUS_SUCCESS;
1006 }
1007 
1008 static hsa_status_t populate_InfoTables(hsa_executable_t executable,
1009                                         hsa_executable_symbol_t symbol,
1010                                         void *data) {
1011   int gpu = *static_cast<int *>(data);
1012   hsa_symbol_kind_t type;
1013 
1014   uint32_t name_length;
1015   hsa_status_t err;
1016   err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
1017                                        &type);
1018   if (err != HSA_STATUS_SUCCESS) {
1019     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1020            "Symbol info extraction", get_error_string(err));
1021     exit(1);
1022   }
1023   DEBUG_PRINT("Exec Symbol type: %d\n", type);
1024   if (type == HSA_SYMBOL_KIND_KERNEL) {
1025     err = hsa_executable_symbol_get_info(
1026         symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
1027     if (err != HSA_STATUS_SUCCESS) {
1028       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1029              "Symbol info extraction", get_error_string(err));
1030       exit(1);
1031     }
1032     char *name = reinterpret_cast<char *>(malloc(name_length + 1));
1033     err = hsa_executable_symbol_get_info(symbol,
1034                                          HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
1035     if (err != HSA_STATUS_SUCCESS) {
1036       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1037              "Symbol info extraction", get_error_string(err));
1038       exit(1);
1039     }
1040     name[name_length] = 0;
1041 
    if (KernelNameMap.find(std::string(name)) == KernelNameMap.end()) {
      // Did not find the kernel name in the kernel map; this can happen
      // only if the ROCr API for getting symbol info (name) differs from
      // the comgr method of getting symbol info.
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Invalid kernel name",
             get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT));
      exit(1);
    }
1053     atl_kernel_info_t info;
1054     std::string kernelName = KernelNameMap[std::string(name)];
1055     // by now, the kernel info table should already have an entry
1056     // because the non-ROCr custom code object parsing is called before
1057     // iterating over the code object symbols using ROCr
    if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Finding the entry kernel info table",
             get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT));
      exit(1);
    }
1066     // found, so assign and update
1067     info = KernelInfoTable[gpu][kernelName];
1068 
1069     /* Extract dispatch information from the symbol */
1070     err = hsa_executable_symbol_get_info(
1071         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
1072         &(info.kernel_object));
1073     if (err != HSA_STATUS_SUCCESS) {
1074       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1075              "Extracting the symbol from the executable",
1076              get_error_string(err));
1077       exit(1);
1078     }
1079     err = hsa_executable_symbol_get_info(
1080         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1081         &(info.group_segment_size));
1082     if (err != HSA_STATUS_SUCCESS) {
1083       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1084              "Extracting the group segment size from the executable",
1085              get_error_string(err));
1086       exit(1);
1087     }
1088     err = hsa_executable_symbol_get_info(
1089         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
1090         &(info.private_segment_size));
1091     if (err != HSA_STATUS_SUCCESS) {
1092       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1093              "Extracting the private segment from the executable",
1094              get_error_string(err));
1095       exit(1);
1096     }
1097 
1098     DEBUG_PRINT(
1099         "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes "
1100         "kernarg\n",
1101         kernelName.c_str(), info.kernel_object, info.group_segment_size,
1102         info.private_segment_size, info.kernel_segment_size);
1103 
1104     // assign it back to the kernel info table
1105     KernelInfoTable[gpu][kernelName] = info;
1106     free(name);
1107   } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
1108     err = hsa_executable_symbol_get_info(
1109         symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
1110     if (err != HSA_STATUS_SUCCESS) {
1111       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1112              "Symbol info extraction", get_error_string(err));
1113       exit(1);
1114     }
1115     char *name = reinterpret_cast<char *>(malloc(name_length + 1));
1116     err = hsa_executable_symbol_get_info(symbol,
1117                                          HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
1118     if (err != HSA_STATUS_SUCCESS) {
1119       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1120              "Symbol info extraction", get_error_string(err));
1121       exit(1);
1122     }
1123     name[name_length] = 0;
1124 
1125     atl_symbol_info_t info;
1126 
1127     err = hsa_executable_symbol_get_info(
1128         symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
1129     if (err != HSA_STATUS_SUCCESS) {
1130       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1131              "Symbol info address extraction", get_error_string(err));
1132       exit(1);
1133     }
1134 
1135     err = hsa_executable_symbol_get_info(
1136         symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
1137     if (err != HSA_STATUS_SUCCESS) {
1138       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1139              "Symbol info size extraction", get_error_string(err));
1140       exit(1);
1141     }
1142 
1143     atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0);
1144     DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
1145                 info.size);
1146     register_allocation(reinterpret_cast<void *>(info.addr), (size_t)info.size,
1147                         place);
1148     SymbolInfoTable[gpu][std::string(name)] = info;
1149     if (strcmp(name, "needs_hostcall_buffer") == 0)
1150       g_atmi_hostcall_required = true;
1151     free(name);
1152   } else {
1153     DEBUG_PRINT("Symbol is an indirect function\n");
1154   }
1155   return HSA_STATUS_SUCCESS;
1156 }
1157 
1158 atmi_status_t Runtime::RegisterModuleFromMemory(
1159     void *module_bytes, size_t module_size, atmi_place_t place,
1160     atmi_status_t (*on_deserialized_data)(void *data, size_t size,
1161                                           void *cb_state),
1162     void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
1163   hsa_status_t err;
1164   int gpu = place.device_id;
1165   assert(gpu >= 0);
1166 
1167   DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu);
1168   ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
1169   hsa_agent_t agent = proc.agent();
1170   hsa_executable_t executable = {0};
1171   hsa_profile_t agent_profile;
1172 
1173   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
1174   if (err != HSA_STATUS_SUCCESS) {
1175     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1176            "Query the agent profile", get_error_string(err));
1177     exit(1);
1178   }
1179   // FIXME: Assume that every profile is FULL until we understand how to build
1180   // GCN with base profile
1181   agent_profile = HSA_PROFILE_FULL;
1182   /* Create the empty executable.  */
1183   err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
1184                               &executable);
1185   if (err != HSA_STATUS_SUCCESS) {
1186     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1187            "Create the executable", get_error_string(err));
1188     exit(1);
1189   }
1190 
1191   bool module_load_success = false;
1192   do // Existing control flow used continue, preserve that for this patch
1193   {
1194     {
1195       // Some metadata info is not available through ROCr API, so use custom
1196       // code object metadata parsing to collect such metadata info
1197 
1198       err = get_code_object_custom_metadata(module_bytes, module_size, gpu);
1199       if (err != HSA_STATUS_SUCCESS) {
1200         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1201                     "Getting custom code object metadata",
1202                     get_error_string(err));
1203         continue;
1204       }
1205 
1206       // Deserialize code object.
1207       hsa_code_object_t code_object = {0};
1208       err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
1209                                         &code_object);
1210       if (err != HSA_STATUS_SUCCESS) {
1211         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1212                     "Code Object Deserialization", get_error_string(err));
1213         continue;
1214       }
1215       assert(0 != code_object.handle);
1216 
1217       // Mutating the device image here avoids another allocation & memcpy
1218       void *code_object_alloc_data =
1219           reinterpret_cast<void *>(code_object.handle);
1220       atmi_status_t atmi_err =
1221           on_deserialized_data(code_object_alloc_data, module_size, cb_state);
1222       if (atmi_err != ATMI_STATUS_SUCCESS) {
1223         printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1224                "Error in deserialized_data callback",
1225                get_atmi_error_string(atmi_err));
1226         exit(1);
1227       }
1228 
1229       /* Load the code object.  */
1230       err =
1231           hsa_executable_load_code_object(executable, agent, code_object, NULL);
1232       if (err != HSA_STATUS_SUCCESS) {
1233         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1234                     "Loading the code object", get_error_string(err));
1235         continue;
1236       }
1237 
1238       // cannot iterate over symbols until executable is frozen
1239     }
1240     module_load_success = true;
1241   } while (0);
1242   DEBUG_PRINT("Modules loaded successful? %d\n", module_load_success);
1243   if (module_load_success) {
1244     /* Freeze the executable; it can now be queried for symbols.  */
1245     err = hsa_executable_freeze(executable, "");
1246     if (err != HSA_STATUS_SUCCESS) {
1247       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1248              "Freeze the executable", get_error_string(err));
1249       exit(1);
1250     }
1251 
1252     err = hsa_executable_iterate_symbols(executable, populate_InfoTables,
1253                                          static_cast<void *>(&gpu));
1254     if (err != HSA_STATUS_SUCCESS) {
1255       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1256              "Iterating over symbols for execuatable", get_error_string(err));
1257       exit(1);
1258     }
1259 
1260     // save the executable and destroy during finalize
1261     HSAExecutables.push_back(executable);
1262     return ATMI_STATUS_SUCCESS;
1263   } else {
1264     return ATMI_STATUS_ERROR;
1265   }
1266 }
1267 
1268 } // namespace core
1269