/*===--------------------------------------------------------------------------
 *              ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#include <gelf.h>
#include <libelf.h>

#include <cassert>
#include <cstdarg>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include "internal.h"
#include "machine.h"
#include "rt.h"

#include "msgpack.h"

typedef unsigned char *address;
/*
 * Note descriptors.
 */
typedef struct {
  uint32_t n_namesz; /* Length of note's name. */
  uint32_t n_descsz; /* Length of note's value. */
  uint32_t n_type;   /* Type of note. */
  // then name
  // then padding, optional
  // then desc, at 4 byte alignment (not 8, despite being elf64)
} Elf_Note;
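
// Illustrative sketch only (not used by the runtime): given the layout
// described above, a note's name begins immediately after the fixed header
// and its descriptor follows at the next 4-byte boundary.
static inline address elf_note_desc(Elf_Note *note) {
  address name = (address)&note[1];
  // Round the name size up to the 4-byte alignment noted above.
  size_t aligned_namesz = (note->n_namesz + 3) & ~(size_t)3;
  return name + aligned_namesz;
}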

// The include file and the structs/enums below have been replicated here on
// a per-use basis. For example, llvm::AMDGPU::HSAMD::Kernel::Metadata has
// several fields, but we currently care only about kernargSegmentSize_, so
// our KernelMD carries just that field. We replicate rather than include to
// avoid forcing a dependency on LLVM_INCLUDE_DIR just to compile the runtime.
// #include "llvm/Support/AMDGPUMetadata.h"
// typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD;
// typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
// typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
// using llvm::AMDGPU::HSAMD::AccessQualifier;
// using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
// using llvm::AMDGPU::HSAMD::ValueKind;
// using llvm::AMDGPU::HSAMD::ValueType;
class KernelArgMD {
public:
  enum class ValueKind {
    HiddenGlobalOffsetX,
    HiddenGlobalOffsetY,
    HiddenGlobalOffsetZ,
    HiddenNone,
    HiddenPrintfBuffer,
    HiddenDefaultQueue,
    HiddenCompletionAction,
    HiddenMultiGridSyncArg,
    HiddenHostcallBuffer,
    Unknown
  };

  KernelArgMD()
      : name_(std::string()), typeName_(std::string()), size_(0), offset_(0),
        align_(0), valueKind_(ValueKind::Unknown) {}

  // fields
  std::string name_;
  std::string typeName_;
  uint32_t size_;
  uint32_t offset_;
  uint32_t align_;
  ValueKind valueKind_;
};

class KernelMD {
public:
  KernelMD() : kernargSegmentSize_(0ull) {}

  // fields
  uint64_t kernargSegmentSize_;
};

static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
    //    Including only those fields that are relevant to the runtime.
    //    {"ByValue", KernelArgMD::ValueKind::ByValue},
    //    {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer},
    //    {"DynamicSharedPointer",
    //    KernelArgMD::ValueKind::DynamicSharedPointer},
    //    {"Sampler", KernelArgMD::ValueKind::Sampler},
    //    {"Image", KernelArgMD::ValueKind::Image},
    //    {"Pipe", KernelArgMD::ValueKind::Pipe},
    //    {"Queue", KernelArgMD::ValueKind::Queue},
    {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
    {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
    {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
    {"HiddenNone", KernelArgMD::ValueKind::HiddenNone},
    {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
    {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue},
    {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction},
    {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
    {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
    // v3
    //    {"by_value", KernelArgMD::ValueKind::ByValue},
    //    {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
    //    {"dynamic_shared_pointer",
    //    KernelArgMD::ValueKind::DynamicSharedPointer},
    //    {"sampler", KernelArgMD::ValueKind::Sampler},
    //    {"image", KernelArgMD::ValueKind::Image},
    //    {"pipe", KernelArgMD::ValueKind::Pipe},
    //    {"queue", KernelArgMD::ValueKind::Queue},
    {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
    {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
    {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
    {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
    {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
    {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
    {"hidden_completion_action",
     KernelArgMD::ValueKind::HiddenCompletionAction},
    {"hidden_multigrid_sync_arg",
     KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
    {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
};
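
// Small lookup sketch (an assumption, not part of the original interface):
// map a metadata string to a ValueKind, defaulting to Unknown for kinds the
// runtime does not track. Mirrors the lookup done in populate_kernelArgMD
// below.
static inline KernelArgMD::ValueKind lookupValueKind(const std::string &s) {
  auto it = ArgValueKind.find(s);
  return it == ArgValueKind.end() ? KernelArgMD::ValueKind::Unknown
                                  : it->second;
}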

// global variables. TODO: Get rid of these
atmi_machine_t g_atmi_machine;
ATLMachine g_atl_machine;

std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;

std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

bool g_atmi_initialized = false;
bool g_atmi_hostcall_required = false;
/*
   atlc holds all of the runtime's internal global state.
   The structure atl_context_t is defined in atl_internal.h.
   Most references use the global instance atlc directly.
*/
atl_context_t atlc = {.struct_initialized = false};

namespace core {
/* Machine Info */
atmi_machine_t *Runtime::GetMachineInfo() {
  if (!atlc.g_hsa_initialized)
    return NULL;
  return &g_atmi_machine;
}

static void atl_set_atmi_initialized() {
  // FIXME: thread safe? locks?
  g_atmi_initialized = true;
}

static void atl_reset_atmi_initialized() {
  // FIXME: thread safe? locks?
  g_atmi_initialized = false;
}

bool atl_is_atmi_initialized() { return g_atmi_initialized; }

hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
  std::vector<ATLGPUProcessor> &gpu_procs =
      g_atl_machine.processors<ATLGPUProcessor>();
  std::vector<hsa_agent_t> agents;
  for (uint32_t i = 0; i < gpu_procs.size(); i++) {
    agents.push_back(gpu_procs[i].agent());
  }
  return hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr);
}
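
// Illustrative sketch (an assumption, not part of the original runtime API):
// allocate from the first GPU kernarg pool discovered during init and make
// the allocation visible to every GPU agent via the helper above.
static inline hsa_status_t example_alloc_shared(size_t bytes, void **ptr) {
  if (atl_gpu_kernarg_pools.empty())
    return HSA_STATUS_ERROR;
  hsa_status_t err =
      hsa_amd_memory_pool_allocate(atl_gpu_kernarg_pools[0], bytes, 0, ptr);
  if (err != HSA_STATUS_SUCCESS)
    return err;
  return allow_access_to_all_gpu_agents(*ptr);
}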

atmi_status_t Runtime::Initialize() {
  atmi_devtype_t devtype = ATMI_DEVTYPE_GPU;
  if (atl_is_atmi_initialized())
    return ATMI_STATUS_SUCCESS;

  if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) {
    atmi_status_t rc = atl_init_gpu_context();
    if (rc != ATMI_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "GPU context init",
             get_atmi_error_string(rc));
      return rc;
    }
  }

  atl_set_atmi_initialized();
  return ATMI_STATUS_SUCCESS;
}

atmi_status_t Runtime::Finalize() {
  atmi_status_t rc = ATMI_STATUS_SUCCESS;
  for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) {
    SymbolInfoTable[i].clear();
  }
  SymbolInfoTable.clear();
  for (uint32_t i = 0; i < KernelInfoTable.size(); i++) {
    KernelInfoTable[i].clear();
  }
  KernelInfoTable.clear();

  atl_reset_atmi_initialized();
  hsa_status_t err = hsa_shut_down();
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA",
           get_error_string(err));
    rc = ATMI_STATUS_ERROR;
  }

  return rc;
}

static void atmi_init_context_structs() {
  atlc.struct_initialized = true; /* This only gets called one time */
  atlc.g_hsa_initialized = false;
  atlc.g_gpu_initialized = false;
  atlc.g_tasks_initialized = false;
}

// Implement memory_pool iteration function
static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
                                         void *data) {
  ATLProcessor *proc = reinterpret_cast<ATLProcessor *>(data);
  hsa_status_t err = HSA_STATUS_SUCCESS;
  // Check if the memory_pool is allowed to allocate, i.e. do not return group
  // memory
  bool alloc_allowed = false;
  err = hsa_amd_memory_pool_get_info(
      memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &alloc_allowed);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Alloc allowed in memory pool check", get_error_string(err));
    return err;
  }
  if (alloc_allowed) {
    uint32_t global_flag = 0;
    err = hsa_amd_memory_pool_get_info(
        memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Get memory pool info", get_error_string(err));
      return err;
    }
    if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
      ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
      proc->addMemory(new_mem);
      if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
        DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
        atl_gpu_kernarg_pools.push_back(memory_pool);
      }
    } else {
      ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
      proc->addMemory(new_mem);
    }
  }

  return err;
}

static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) {
  hsa_status_t err = HSA_STATUS_SUCCESS;
  hsa_device_type_t device_type;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Get device type info", get_error_string(err));
    return err;
  }
  switch (device_type) {
  case HSA_DEVICE_TYPE_CPU: {
    ATLCPUProcessor new_proc(agent);
    err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
                                             &new_proc);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(err));
      return err;
    }
    g_atl_machine.addProcessor(new_proc);
  } break;
  case HSA_DEVICE_TYPE_GPU: {
    hsa_profile_t profile;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Query the agent profile", get_error_string(err));
      return err;
    }
    atmi_devtype_t gpu_type;
    gpu_type =
        (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU;
    ATLGPUProcessor new_proc(agent, gpu_type);
    err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
                                             &new_proc);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(err));
      return err;
    }
    g_atl_machine.addProcessor(new_proc);
  } break;
  case HSA_DEVICE_TYPE_DSP: {
    err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  } break;
  }

  return err;
}

hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) {
  hsa_region_segment_t segment;
  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
  if (segment != HSA_REGION_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }
  hsa_region_global_flag_t flags;
  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
    hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
    *ret = region;
    return HSA_STATUS_INFO_BREAK;
  }
  return HSA_STATUS_SUCCESS;
}

/* Determines if a memory region can be used for kernarg allocations.  */
static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) {
  hsa_region_segment_t segment;
  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
  if (HSA_REGION_SEGMENT_GLOBAL != segment) {
    return HSA_STATUS_SUCCESS;
  }

  hsa_region_global_flag_t flags;
  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
    hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
    *ret = region;
    return HSA_STATUS_INFO_BREAK;
  }

  return HSA_STATUS_SUCCESS;
}
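
// Usage sketch mirroring the pattern used later in init_compute_and_memory:
// iterate an agent's regions and treat HSA_STATUS_INFO_BREAK as "found",
// since the callback breaks iteration once a kernarg region is seen.
static inline hsa_status_t example_find_kernarg_region(hsa_agent_t agent,
                                                       hsa_region_t *out) {
  out->handle = (uint64_t)-1;
  hsa_status_t err =
      hsa_agent_iterate_regions(agent, get_kernarg_memory_region, out);
  if (err == HSA_STATUS_INFO_BREAK)
    err = HSA_STATUS_SUCCESS;
  return (out->handle == (uint64_t)-1) ? HSA_STATUS_ERROR : err;
}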

static hsa_status_t init_compute_and_memory() {
  hsa_status_t err;

  /* Iterate over the agents and pick the gpu agent */
  err = hsa_iterate_agents(get_agent_info, NULL);
  if (err == HSA_STATUS_INFO_BREAK) {
    err = HSA_STATUS_SUCCESS;
  }
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent",
           get_error_string(err));
    return err;
  }

  /* Init all devices or individual device types? */
  std::vector<ATLCPUProcessor> &cpu_procs =
      g_atl_machine.processors<ATLCPUProcessor>();
  std::vector<ATLGPUProcessor> &gpu_procs =
      g_atl_machine.processors<ATLGPUProcessor>();
  /* For CPU memory pools, add other devices that can access them directly
   * or indirectly */
  for (auto &cpu_proc : cpu_procs) {
    for (auto &cpu_mem : cpu_proc.memories()) {
      hsa_amd_memory_pool_t pool = cpu_mem.memory();
      for (auto &gpu_proc : gpu_procs) {
        hsa_agent_t agent = gpu_proc.agent();
        hsa_amd_memory_pool_access_t access;
        hsa_amd_agent_memory_pool_get_info(
            agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
        if (access != 0) {
          // this means not NEVER, but could be YES or NO
          // add this memory pool to the proc
          gpu_proc.addMemory(cpu_mem);
        }
      }
    }
  }

  /* FIXME: are the below combinations of procs and memory pools needed?
   * all to all compare procs with their memory pools and add those memory
   * pools that are accessible by the target procs */
  for (auto &gpu_proc : gpu_procs) {
    for (auto &gpu_mem : gpu_proc.memories()) {
      hsa_amd_memory_pool_t pool = gpu_mem.memory();
      for (auto &cpu_proc : cpu_procs) {
        hsa_agent_t agent = cpu_proc.agent();
        hsa_amd_memory_pool_access_t access;
        hsa_amd_agent_memory_pool_get_info(
            agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
        if (access != 0) {
          // this means not NEVER, but could be YES or NO
          // add this memory pool to the proc
          cpu_proc.addMemory(gpu_mem);
        }
      }
    }
  }

  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_CPU] = cpu_procs.size();
  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_GPU] = gpu_procs.size();

  size_t num_procs = cpu_procs.size() + gpu_procs.size();
  // g_atmi_machine.devices = (atmi_device_t *)malloc(num_procs *
  // sizeof(atmi_device_t));
  atmi_device_t *all_devices = reinterpret_cast<atmi_device_t *>(
      malloc(num_procs * sizeof(atmi_device_t)));
  int num_iGPUs = 0;
  int num_dGPUs = 0;
  for (uint32_t i = 0; i < gpu_procs.size(); i++) {
    if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU)
      num_iGPUs++;
    else
      num_dGPUs++;
  }
  assert(static_cast<size_t>(num_iGPUs + num_dGPUs) == gpu_procs.size() &&
         "Number of dGPUs and iGPUs do not add up");
  DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size());
  DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs);
  DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs);
  DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size());

  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_iGPU] = num_iGPUs;
  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_dGPU] = num_dGPUs;

  int cpus_begin = 0;
  int cpus_end = cpu_procs.size();
  int gpus_begin = cpu_procs.size();
  int gpus_end = cpu_procs.size() + gpu_procs.size();
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_CPU] = &all_devices[cpus_begin];
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU] = &all_devices[gpus_begin];
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_iGPU] = &all_devices[gpus_begin];
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_dGPU] = &all_devices[gpus_begin];
  int proc_index = 0;
  for (int i = cpus_begin; i < cpus_end; i++) {
    all_devices[i].type = cpu_procs[proc_index].type();

    std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
    int fine_memories_size = 0;
    int coarse_memories_size = 0;
    DEBUG_PRINT("CPU memory types:\t");
    for (auto &memory : memories) {
      atmi_memtype_t type = memory.type();
      if (type == ATMI_MEMTYPE_FINE_GRAINED) {
        fine_memories_size++;
        DEBUG_PRINT("Fine\t");
      } else {
        coarse_memories_size++;
        DEBUG_PRINT("Coarse\t");
      }
    }
    DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
    DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
    proc_index++;
  }
  proc_index = 0;
  for (int i = gpus_begin; i < gpus_end; i++) {
    all_devices[i].type = gpu_procs[proc_index].type();

    std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
    int fine_memories_size = 0;
    int coarse_memories_size = 0;
    DEBUG_PRINT("GPU memory types:\t");
    for (auto &memory : memories) {
      atmi_memtype_t type = memory.type();
      if (type == ATMI_MEMTYPE_FINE_GRAINED) {
        fine_memories_size++;
        DEBUG_PRINT("Fine\t");
      } else {
        coarse_memories_size++;
        DEBUG_PRINT("Coarse\t");
      }
    }
    DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
    DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
    proc_index++;
  }
  proc_index = 0;
  hsa_region_t atl_cpu_kernarg_region;
  atl_cpu_kernarg_region.handle = (uint64_t)-1;
  if (cpu_procs.size() > 0) {
    err = hsa_agent_iterate_regions(
        cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region);
    if (err == HSA_STATUS_INFO_BREAK) {
      err = HSA_STATUS_SUCCESS;
    }
    err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
                                                          : HSA_STATUS_SUCCESS;
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Finding a CPU kernarg memory region handle",
             get_error_string(err));
      return err;
    }
  }
  hsa_region_t atl_gpu_kernarg_region;
  /* Find a memory region that supports kernel arguments.  */
  atl_gpu_kernarg_region.handle = (uint64_t)-1;
  if (gpu_procs.size() > 0) {
    hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region,
                              &atl_gpu_kernarg_region);
    err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
                                                          : HSA_STATUS_SUCCESS;
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Finding a kernarg memory region", get_error_string(err));
      return err;
    }
  }
  if (num_procs > 0)
    return HSA_STATUS_SUCCESS;
  else
    return HSA_STATUS_ERROR_NOT_INITIALIZED;
}

hsa_status_t init_hsa() {
  if (atlc.g_hsa_initialized == false) {
    DEBUG_PRINT("Initializing HSA...");
    hsa_status_t err = hsa_init();
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Initializing the hsa runtime", get_error_string(err));
      return err;
    }

    err = init_compute_and_memory();
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "After initializing compute and memory", get_error_string(err));
      return err;
    }

    int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
    KernelInfoTable.resize(gpu_count);
    SymbolInfoTable.resize(gpu_count);
    for (uint32_t i = 0; i < SymbolInfoTable.size(); i++)
      SymbolInfoTable[i].clear();
    for (uint32_t i = 0; i < KernelInfoTable.size(); i++)
      KernelInfoTable[i].clear();
    atlc.g_hsa_initialized = true;
    DEBUG_PRINT("done\n");
  }
  return HSA_STATUS_SUCCESS;
}

void init_tasks() {
  if (atlc.g_tasks_initialized != false)
    return;
  std::vector<hsa_agent_t> gpu_agents;
  int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
  for (int gpu = 0; gpu < gpu_count; gpu++) {
    atmi_place_t place = ATMI_PLACE_GPU(0, gpu);
    ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
    gpu_agents.push_back(proc.agent());
  }
  atlc.g_tasks_initialized = true;
}

hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
#if (ROCM_VERSION_MAJOR >= 3) ||                                               \
    (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3)
  if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
#else
  if (event->event_type == GPU_MEMORY_FAULT_EVENT) {
#endif
    hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
    // memory_fault.agent
    // memory_fault.virtual_address
    // memory_fault.fault_reason_mask
    // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
    std::stringstream stream;
    stream << std::hex << (uintptr_t)memory_fault.virtual_address;
    std::string addr("0x" + stream.str());

    std::string err_string = "[GPU Memory Error] Addr: " + addr;
    err_string += " Reason: ";
    // Bit masks below match hsa_amd_memory_fault_reason_t in hsa_ext_amd.h.
    if (!(memory_fault.fault_reason_mask & 0x3F)) {
      err_string += "No Idea! ";
    } else {
      if (memory_fault.fault_reason_mask & 0x01)
        err_string += "Page not present or supervisor privilege. ";
      if (memory_fault.fault_reason_mask & 0x02)
        err_string += "Write access to a read-only page. ";
      if (memory_fault.fault_reason_mask & 0x04)
        err_string += "Execute access to a page marked NX. ";
      if (memory_fault.fault_reason_mask & 0x08)
        err_string += "Host access only. ";
      if (memory_fault.fault_reason_mask & 0x10)
        err_string += "ECC failure (if supported by HW). ";
      if (memory_fault.fault_reason_mask & 0x20)
        err_string += "Can't determine the exact fault address. ";
    }
    fprintf(stderr, "%s\n", err_string.c_str());
    return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}

atmi_status_t atl_init_gpu_context() {
  if (atlc.struct_initialized == false)
    atmi_init_context_structs();
  if (atlc.g_gpu_initialized != false)
    return ATMI_STATUS_SUCCESS;

  hsa_status_t err;
  err = init_hsa();
  if (err != HSA_STATUS_SUCCESS)
    return ATMI_STATUS_ERROR;

  err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Registering the system for memory faults", get_error_string(err));
    return ATMI_STATUS_ERROR;
  }

  init_tasks();
  atlc.g_gpu_initialized = true;
  return ATMI_STATUS_SUCCESS;
}

static bool isImplicit(KernelArgMD::ValueKind value_kind) {
  switch (value_kind) {
  case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
  case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
  case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
  case KernelArgMD::ValueKind::HiddenNone:
  case KernelArgMD::ValueKind::HiddenPrintfBuffer:
  case KernelArgMD::ValueKind::HiddenDefaultQueue:
  case KernelArgMD::ValueKind::HiddenCompletionAction:
  case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
  case KernelArgMD::ValueKind::HiddenHostcallBuffer:
    return true;
  default:
    return false;
  }
}

static std::pair<unsigned char *, unsigned char *>
find_metadata(void *binary, size_t binSize) {
  std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr};

  Elf *e = elf_memory(static_cast<char *>(binary), binSize);
  if (elf_kind(e) != ELF_K_ELF) {
    return failure;
  }

  size_t numpHdrs;
  if (elf_getphdrnum(e, &numpHdrs) != 0) {
    return failure;
  }

  for (size_t i = 0; i < numpHdrs; ++i) {
    GElf_Phdr pHdr;
    if (gelf_getphdr(e, i, &pHdr) != &pHdr) {
      continue;
    }
    // Look for the runtime metadata note
    if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
      // Iterate over the notes in this segment
      address ptr = (address)binary + pHdr.p_offset;
      address segmentEnd = ptr + pHdr.p_filesz;

      while (ptr < segmentEnd) {
        Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr);
        address name = (address)&note[1];

        if (note->n_type == 7 || note->n_type == 8) {
          return failure;
        } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ &&
                   note->n_namesz == sizeof "AMD" &&
                   !memcmp(name, "AMD", note->n_namesz)) {
          // code object v2 uses yaml metadata, no longer supported
          return failure;
        } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ &&
                   note->n_namesz == sizeof "AMDGPU" &&
                   !memcmp(name, "AMDGPU", note->n_namesz)) {

          // The descriptor is padded to 4-byte alignment; the end pointer
          // below is rounded up to match.
          size_t offset = sizeof(uint32_t) * 3 /* header fields */
                          + sizeof("AMDGPU")   /* name, including NUL */
                          + 1 /* padding to 4 byte alignment */;

          // Including the trailing padding means both pointers are 4 bytes
          // aligned, which may be useful later.
          unsigned char *metadata_start = (unsigned char *)ptr + offset;
          unsigned char *metadata_end =
              metadata_start + core::alignUp(note->n_descsz, 4);
          return {metadata_start, metadata_end};
        }
        ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) +
               core::alignUp(note->n_descsz, sizeof(int));
      }
    }
  }

  return failure;
}
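
// Usage sketch (illustrative only; the path argument, e.g. "kernel.hsaco",
// is hypothetical): read a code object from disk and locate its msgpack
// metadata blob.
static inline bool example_print_metadata_size(const char *path) {
  std::ifstream file(path, std::ios::binary | std::ios::ate);
  if (!file)
    return false;
  std::string bytes(static_cast<size_t>(file.tellg()), '\0');
  file.seekg(0);
  file.read(&bytes[0], bytes.size());
  auto metadata = find_metadata(&bytes[0], bytes.size());
  if (!metadata.first)
    return false;
  std::cout << "metadata spans " << (metadata.second - metadata.first)
            << " bytes\n";
  return true;
}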

namespace {
int map_lookup_array(msgpack::byte_range message, const char *needle,
                     msgpack::byte_range *res, uint64_t *size) {
  unsigned count = 0;
  struct s : msgpack::functors_defaults<s> {
    s(unsigned &count, uint64_t *size) : count(count), size(size) {}
    unsigned &count;
    uint64_t *size;
    const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
      count++;
      *size = N;
      return bytes.end;
    }
  };

  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           // If the message is an array, record number of
                           // elements in *size
                           msgpack::handle_msgpack<s>(value, {count, size});
                           // return the whole array
                           *res = value;
                         }
                       });
  // Only claim success if exactly one key/array pair matched
  return count != 1;
}

int map_lookup_string(msgpack::byte_range message, const char *needle,
                      std::string *res) {
  unsigned count = 0;
  struct s : public msgpack::functors_defaults<s> {
    s(unsigned &count, std::string *res) : count(count), res(res) {}
    unsigned &count;
    std::string *res;
    void handle_string(size_t N, const unsigned char *str) {
      count++;
      *res = std::string(str, str + N);
    }
  };
  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           msgpack::handle_msgpack<s>(value, {count, res});
                         }
                       });
  return count != 1;
}

int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
                        uint64_t *res) {
  unsigned count = 0;
  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           msgpack::foronly_unsigned(value, [&](uint64_t x) {
                             count++;
                             *res = x;
                           });
                         }
                       });
  return count != 1;
}

int array_lookup_element(msgpack::byte_range message, uint64_t elt,
                         msgpack::byte_range *res) {
  int rc = 1;
  uint64_t i = 0;
  msgpack::foreach_array(message, [&](msgpack::byte_range value) {
    if (i == elt) {
      *res = value;
      rc = 0;
    }
    i++;
  });
  return rc;
}

int populate_kernelArgMD(msgpack::byte_range args_element,
                         KernelArgMD *kernelarg) {
  using namespace msgpack;
  int error = 0;
  foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
    if (message_is_string(key, ".name")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        kernelarg->name_ = std::string(str, str + N);
      });
    } else if (message_is_string(key, ".type_name")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        kernelarg->typeName_ = std::string(str, str + N);
      });
    } else if (message_is_string(key, ".size")) {
      foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
    } else if (message_is_string(key, ".offset")) {
      foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
    } else if (message_is_string(key, ".value_kind")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        std::string s = std::string(str, str + N);
        auto itValueKind = ArgValueKind.find(s);
        if (itValueKind != ArgValueKind.end()) {
          kernelarg->valueKind_ = itValueKind->second;
        }
      });
    }
  });
  return error;
}
} // namespace
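
// Self-contained check of the lookup helpers above (illustrative only): a
// hand-encoded msgpack map {".name": "foo"}, where 0x81 is a one-entry
// fixmap and 0xa5/0xa3 are fixstrs of length 5 and 3.
static inline bool example_msgpack_lookup() {
  const unsigned char buf[] = {0x81, 0xa5, '.', 'n', 'a', 'm', 'e',
                               0xa3, 'f', 'o', 'o'};
  std::string res;
  int rc = map_lookup_string({buf, buf + sizeof(buf)}, ".name", &res);
  return rc == 0 && res == "foo";
}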

static hsa_status_t get_code_object_custom_metadata(void *binary,
                                                    size_t binSize, int gpu) {
  // parse code object with different keys from v2
  // also, the kernel name is not the same as the symbol name -- so a
  // symbol->name map is needed

  std::pair<unsigned char *, unsigned char *> metadata =
      find_metadata(binary, binSize);
  if (!metadata.first) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }

  uint64_t kernelsSize = 0;
  int msgpack_errors = 0;
  msgpack::byte_range kernel_array;
  msgpack_errors =
      map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
                       &kernel_array, &kernelsSize);
  if (msgpack_errors != 0) {
    printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
           "kernels lookup in program metadata");
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }

  for (size_t i = 0; i < kernelsSize; i++) {
    assert(msgpack_errors == 0);
    std::string kernelName;
    std::string symbolName;

    msgpack::byte_range element;
    msgpack_errors += array_lookup_element(kernel_array, i, &element);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "element lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    msgpack_errors += map_lookup_string(element, ".name", &kernelName);
    msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "strings lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    // Make sure that kernelName + ".kd" == symbolName
    if ((kernelName + ".kd") != symbolName) {
      printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
             __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}};

    uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
    msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "sgpr count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.sgpr_count = sgpr_count;

    msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "vgpr count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.vgpr_count = vgpr_count;

    msgpack_errors +=
        map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "sgpr spill count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.sgpr_spill_count = sgpr_spill_count;

    msgpack_errors +=
        map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "vgpr spill count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.vgpr_spill_count = vgpr_spill_count;

    size_t kernel_explicit_args_size = 0;
    uint64_t kernel_segment_size;
    msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
                                          &kernel_segment_size);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "kernarg segment size metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    bool hasHiddenArgs = false;
    if (kernel_segment_size > 0) {
      uint64_t argsSize;
      size_t offset = 0;

      msgpack::byte_range args_array;
      msgpack_errors +=
          map_lookup_array(element, ".args", &args_array, &argsSize);
      if (msgpack_errors != 0) {
        printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
               "kernel args metadata lookup in kernel metadata");
        return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
      }

      info.num_args = argsSize;

      for (size_t i = 0; i < argsSize; ++i) {
        KernelArgMD lcArg;

        msgpack::byte_range args_element;
        msgpack_errors += array_lookup_element(args_array, i, &args_element);
        if (msgpack_errors != 0) {
          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
                 "iterate args map in kernel args metadata");
          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
        }

        msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
        if (msgpack_errors != 0) {
          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
                 "iterate args map in kernel args metadata");
          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
        }
        // populate info with sizes and offsets
        info.arg_sizes.push_back(lcArg.size_);
        // v3 has offset field and not align field
        size_t new_offset = lcArg.offset_;
        size_t padding = new_offset - offset;
        offset = new_offset;
        info.arg_offsets.push_back(lcArg.offset_);
        DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(),
                    lcArg.size_, lcArg.offset_);
        offset += lcArg.size_;

        // check if the arg is a hidden/implicit arg
        // this logic assumes that all hidden args are 8-byte aligned
        if (!isImplicit(lcArg.valueKind_)) {
          kernel_explicit_args_size += lcArg.size_;
        } else {
          hasHiddenArgs = true;
        }
        kernel_explicit_args_size += padding;
      }
    }

    // Add the size of the implicit args. ATMI does not count the
    // compiler-generated implicit args; instead it appends its own implicit
    // args (atmi_implicit_args_t) after the explicit ones.
    info.kernel_segment_size =
        (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) +
        sizeof(atmi_implicit_args_t);
    DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
                kernel_segment_size, info.kernel_segment_size);
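    // Worked example with hypothetical numbers: two explicit args, a 4-byte
    // int at offset 0 and an 8-byte pointer at offset 8, plus hidden args.
    // The loop above yields kernel_explicit_args_size = 16 (12 bytes of args
    // + 4 bytes of padding), so kernel_segment_size becomes
    // 16 + sizeof(atmi_implicit_args_t) rather than the compiler-reported
    // size, which includes the compiler's own hidden args.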

    // kernel received, now add it to the kernel info table
    KernelInfoTable[gpu][kernelName] = info;
  }

  return HSA_STATUS_SUCCESS;
}

static hsa_status_t populate_InfoTables(hsa_executable_t executable,
                                        hsa_executable_symbol_t symbol,
                                        void *data) {
  int gpu = *static_cast<int *>(data);
  hsa_symbol_kind_t type;

  uint32_t name_length;
  hsa_status_t err;
  err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
                                       &type);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Symbol info extraction", get_error_string(err));
    return err;
  }
  DEBUG_PRINT("Exec Symbol type: %d\n", type);
  if (type == HSA_SYMBOL_KIND_KERNEL) {
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
    err = hsa_executable_symbol_get_info(symbol,
                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    // remove the suffix .kd from symbol name.
    name[name_length - 3] = 0;
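    // e.g. for the 9-character symbol "vecadd.kd", name_length is 9, so
    // writing name[6] = 0 leaves the kernel name "vecadd".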

    atl_kernel_info_t info;
    std::string kernelName(name);
    // by now, the kernel info table should already have an entry
    // because the non-ROCr custom code object parsing is called before
    // iterating over the code object symbols using ROCr
    if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) {
      return HSA_STATUS_ERROR;
    }
    // found, so assign and update
    info = KernelInfoTable[gpu][kernelName];

    /* Extract dispatch information from the symbol */
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
        &(info.kernel_object));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the symbol from the executable",
             get_error_string(err));
      return err;
    }
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
        &(info.group_segment_size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the group segment size from the executable",
             get_error_string(err));
      return err;
    }
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
        &(info.private_segment_size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the private segment from the executable",
             get_error_string(err));
      return err;
    }

    DEBUG_PRINT(
        "Kernel %s --> object %lx, group segment %u bytes, private segment "
        "%u bytes, kernarg segment %u bytes\n",
        kernelName.c_str(), info.kernel_object, info.group_segment_size,
        info.private_segment_size, info.kernel_segment_size);

    // assign it back to the kernel info table
    KernelInfoTable[gpu][kernelName] = info;
    free(name);
  } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
    err = hsa_executable_symbol_get_info(symbol,
                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    name[name_length] = 0;

    atl_symbol_info_t info;

    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info address extraction", get_error_string(err));
      return err;
    }

    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info size extraction", get_error_string(err));
      return err;
    }

    atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0);
    DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
                info.size);
    err = register_allocation(reinterpret_cast<void *>(info.addr),
                              (size_t)info.size, place);
    if (err != HSA_STATUS_SUCCESS) {
      return err;
    }
    SymbolInfoTable[gpu][std::string(name)] = info;
    if (strcmp(name, "needs_hostcall_buffer") == 0)
      g_atmi_hostcall_required = true;
    free(name);
  } else {
    DEBUG_PRINT("Symbol is an indirect function\n");
  }
  return HSA_STATUS_SUCCESS;
}

atmi_status_t Runtime::RegisterModuleFromMemory(
    void *module_bytes, size_t module_size, atmi_place_t place,
    atmi_status_t (*on_deserialized_data)(void *data, size_t size,
                                          void *cb_state),
    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
  hsa_status_t err;
  int gpu = place.device_id;
  assert(gpu >= 0);

  DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu);
  ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
  hsa_agent_t agent = proc.agent();
  hsa_executable_t executable = {0};
  hsa_profile_t agent_profile;

  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Query the agent profile", get_error_string(err));
    return ATMI_STATUS_ERROR;
  }
  // FIXME: Assume that every profile is FULL until we understand how to build
  // GCN with base profile
  agent_profile = HSA_PROFILE_FULL;
  /* Create the empty executable.  */
  err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
                              &executable);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Create the executable", get_error_string(err));
    return ATMI_STATUS_ERROR;
  }

  bool module_load_success = false;
  do // Existing control flow used continue, preserve that for this patch
  {
    {
      // Some metadata info is not available through ROCr API, so use custom
      // code object metadata parsing to collect such metadata info

      err = get_code_object_custom_metadata(module_bytes, module_size, gpu);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Getting custom code object metadata",
                    get_error_string(err));
        continue;
      }

      // Deserialize code object.
      hsa_code_object_t code_object = {0};
      err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
                                        &code_object);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Code Object Deserialization", get_error_string(err));
        continue;
      }
      assert(0 != code_object.handle);

      // Mutating the device image here avoids another allocation & memcpy
      void *code_object_alloc_data =
          reinterpret_cast<void *>(code_object.handle);
      atmi_status_t atmi_err =
          on_deserialized_data(code_object_alloc_data, module_size, cb_state);
      if (atmi_err != ATMI_STATUS_SUCCESS) {
        printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
               "Error in deserialized_data callback",
               get_atmi_error_string(atmi_err));
        return atmi_err;
      }

      /* Load the code object.  */
      err =
          hsa_executable_load_code_object(executable, agent, code_object, NULL);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Loading the code object", get_error_string(err));
        continue;
      }

      // cannot iterate over symbols until executable is frozen
    }
    module_load_success = true;
  } while (0);
  DEBUG_PRINT("Module loaded successfully? %d\n", module_load_success);
  if (module_load_success) {
    /* Freeze the executable; it can now be queried for symbols.  */
    err = hsa_executable_freeze(executable, "");
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Freeze the executable", get_error_string(err));
      return ATMI_STATUS_ERROR;
    }

    err = hsa_executable_iterate_symbols(executable, populate_InfoTables,
                                         static_cast<void *>(&gpu));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterating over symbols for executable", get_error_string(err));
      return ATMI_STATUS_ERROR;
    }

    // save the executable and destroy during finalize
    HSAExecutables.push_back(executable);
    return ATMI_STATUS_SUCCESS;
  } else {
    return ATMI_STATUS_ERROR;
  }
}
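
// Usage sketch (illustrative only; assumes RegisterModuleFromMemory is
// callable as a static member, per the declaration in rt.h): register a code
// object already resident in memory for GPU 0 of node 0, with a no-op
// deserialization callback and a caller-owned executables vector.
static inline atmi_status_t
example_register_module(void *image, size_t image_size,
                        std::vector<hsa_executable_t> &executables) {
  auto no_op = [](void *, size_t, void *) { return ATMI_STATUS_SUCCESS; };
  return Runtime::RegisterModuleFromMemory(image, image_size,
                                           ATMI_PLACE_GPU(0, 0), no_op,
                                           /*cb_state=*/nullptr, executables);
}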

} // namespace core