1 /*===--------------------------------------------------------------------------
2  *              ATMI (Asynchronous Task and Memory Interface)
3  *
4  * This file is distributed under the MIT License. See LICENSE.txt for details.
5  *===------------------------------------------------------------------------*/
6 #include <gelf.h>
7 #include <libelf.h>
8 
9 #include <cassert>
10 #include <cstdarg>
11 #include <fstream>
12 #include <iomanip>
13 #include <iostream>
14 #include <set>
15 #include <string>
16 
17 #include "internal.h"
18 #include "machine.h"
19 #include "rt.h"
20 
21 #include "msgpack.h"
22 
23 namespace hsa {
24 // Wrap HSA iterate API in a shim that allows passing general callables
25 template <typename C>
26 hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
27   auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol,
28               void *data) -> hsa_status_t {
29     C *unwrapped = static_cast<C *>(data);
30     return (*unwrapped)(executable, symbol);
31   };
32   return hsa_executable_iterate_symbols(executable, L,
33                                         static_cast<void *>(&cb));
34 }
35 } // namespace hsa
36 
// Byte pointer used for raw offset arithmetic into a binary image.
using address = unsigned char *;
/*
 * Fixed-size header that starts every ELF note record.
 */
struct Elf_Note {
  uint32_t n_namesz; /* Byte length of the note's name. */
  uint32_t n_descsz; /* Byte length of the note's value (descriptor). */
  uint32_t n_type;   /* Note type tag. */
  // Following the header, in order:
  //   the name,
  //   optional padding,
  //   the desc, at 4 byte alignment (not 8, despite being elf64)
};
49 
50 // The following include file and following structs/enums
51 // have been replicated on a per-use basis below. For example,
52 // llvm::AMDGPU::HSAMD::Kernel::Metadata has several fields,
53 // but we may care only about kernargSegmentSize_ for now, so
54 // we just include that field in our KernelMD implementation. We
55 // chose this approach to replicate in order to avoid forcing
56 // a dependency on LLVM_INCLUDE_DIR just to compile the runtime.
57 // #include "llvm/Support/AMDGPUMetadata.h"
58 // typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD;
59 // typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
60 // typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
61 // using llvm::AMDGPU::HSAMD::AccessQualifier;
62 // using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
63 // using llvm::AMDGPU::HSAMD::ValueKind;
64 // using llvm::AMDGPU::HSAMD::ValueType;
65 
// Per-argument kernel metadata. Only the fields this runtime consumes are
// mirrored here (see the note above about not depending on LLVM headers).
class KernelArgMD {
public:
  // Value kinds of the hidden arguments the runtime cares about; every other
  // kind is represented by Unknown.
  enum class ValueKind {
    HiddenGlobalOffsetX,
    HiddenGlobalOffsetY,
    HiddenGlobalOffsetZ,
    HiddenNone,
    HiddenPrintfBuffer,
    HiddenDefaultQueue,
    HiddenCompletionAction,
    HiddenMultiGridSyncArg,
    HiddenHostcallBuffer,
    Unknown
  };

  // Produces empty strings, zeroed numeric fields and an Unknown value kind
  // via the default member initializers below.
  KernelArgMD() {}

  // fields
  std::string name_;
  std::string typeName_;
  uint32_t size_ = 0;
  uint32_t offset_ = 0;
  uint32_t align_ = 0;
  ValueKind valueKind_ = ValueKind::Unknown;
};
93 
// Per-kernel metadata; currently only the kernarg segment size is mirrored.
class KernelMD {
public:
  // Leaves kernargSegmentSize_ at its default of zero.
  KernelMD() {}

  // fields
  uint64_t kernargSegmentSize_ = 0ull;
};
101 
102 static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
103     //    Including only those fields that are relevant to the runtime.
104     //    {"ByValue", KernelArgMD::ValueKind::ByValue},
105     //    {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer},
106     //    {"DynamicSharedPointer",
107     //    KernelArgMD::ValueKind::DynamicSharedPointer},
108     //    {"Sampler", KernelArgMD::ValueKind::Sampler},
109     //    {"Image", KernelArgMD::ValueKind::Image},
110     //    {"Pipe", KernelArgMD::ValueKind::Pipe},
111     //    {"Queue", KernelArgMD::ValueKind::Queue},
112     {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
113     {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
114     {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
115     {"HiddenNone", KernelArgMD::ValueKind::HiddenNone},
116     {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
117     {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue},
118     {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction},
119     {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
120     {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
121     // v3
122     //    {"by_value", KernelArgMD::ValueKind::ByValue},
123     //    {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
124     //    {"dynamic_shared_pointer",
125     //    KernelArgMD::ValueKind::DynamicSharedPointer},
126     //    {"sampler", KernelArgMD::ValueKind::Sampler},
127     //    {"image", KernelArgMD::ValueKind::Image},
128     //    {"pipe", KernelArgMD::ValueKind::Pipe},
129     //    {"queue", KernelArgMD::ValueKind::Queue},
130     {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
131     {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
132     {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
133     {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
134     {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
135     {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
136     {"hidden_completion_action",
137      KernelArgMD::ValueKind::HiddenCompletionAction},
138     {"hidden_multigrid_sync_arg",
139      KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
140     {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
141 };
142 
// Global machine description. get_agent_info() adds the discovered CPU and GPU
// processors to it, and other functions in this file query them through
// processors<T>().
ATLMachine g_atl_machine;
144 
145 namespace core {
146 
147 hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
148   std::vector<ATLGPUProcessor> &gpu_procs =
149       g_atl_machine.processors<ATLGPUProcessor>();
150   std::vector<hsa_agent_t> agents;
151   for (uint32_t i = 0; i < gpu_procs.size(); i++) {
152     agents.push_back(gpu_procs[i].agent());
153   }
154   return hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr);
155 }
156 
157 // Implement memory_pool iteration function
158 static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
159                                          void *data) {
160   ATLProcessor *proc = reinterpret_cast<ATLProcessor *>(data);
161   hsa_status_t err = HSA_STATUS_SUCCESS;
162   // Check if the memory_pool is allowed to allocate, i.e. do not return group
163   // memory
164   bool alloc_allowed = false;
165   err = hsa_amd_memory_pool_get_info(
166       memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
167       &alloc_allowed);
168   if (err != HSA_STATUS_SUCCESS) {
169     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
170            "Alloc allowed in memory pool check", get_error_string(err));
171     return err;
172   }
173   if (alloc_allowed) {
174     uint32_t global_flag = 0;
175     err = hsa_amd_memory_pool_get_info(
176         memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
177     if (err != HSA_STATUS_SUCCESS) {
178       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
179              "Get memory pool info", get_error_string(err));
180       return err;
181     }
182     if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
183       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
184       proc->addMemory(new_mem);
185     } else {
186       ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
187       proc->addMemory(new_mem);
188     }
189   }
190 
191   return err;
192 }
193 
194 static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) {
195   hsa_status_t err = HSA_STATUS_SUCCESS;
196   hsa_device_type_t device_type;
197   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
198   if (err != HSA_STATUS_SUCCESS) {
199     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
200            "Get device type info", get_error_string(err));
201     return err;
202   }
203   switch (device_type) {
204   case HSA_DEVICE_TYPE_CPU: {
205     ATLCPUProcessor new_proc(agent);
206     err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
207                                              &new_proc);
208     if (err != HSA_STATUS_SUCCESS) {
209       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
210              "Iterate all memory pools", get_error_string(err));
211       return err;
212     }
213     g_atl_machine.addProcessor(new_proc);
214   } break;
215   case HSA_DEVICE_TYPE_GPU: {
216     hsa_profile_t profile;
217     err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile);
218     if (err != HSA_STATUS_SUCCESS) {
219       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
220              "Query the agent profile", get_error_string(err));
221       return err;
222     }
223     atmi_devtype_t gpu_type;
224     gpu_type =
225         (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU;
226     ATLGPUProcessor new_proc(agent, gpu_type);
227     err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
228                                              &new_proc);
229     if (err != HSA_STATUS_SUCCESS) {
230       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
231              "Iterate all memory pools", get_error_string(err));
232       return err;
233     }
234     g_atl_machine.addProcessor(new_proc);
235   } break;
236   case HSA_DEVICE_TYPE_DSP: {
237     err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
238   } break;
239   }
240 
241   return err;
242 }
243 
244 hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) {
245   hsa_region_segment_t segment;
246   hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
247   if (segment != HSA_REGION_SEGMENT_GLOBAL) {
248     return HSA_STATUS_SUCCESS;
249   }
250   hsa_region_global_flag_t flags;
251   hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
252   if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
253     hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
254     *ret = region;
255     return HSA_STATUS_INFO_BREAK;
256   }
257   return HSA_STATUS_SUCCESS;
258 }
259 
260 /* Determines if a memory region can be used for kernarg allocations.  */
261 static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) {
262   hsa_region_segment_t segment;
263   hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
264   if (HSA_REGION_SEGMENT_GLOBAL != segment) {
265     return HSA_STATUS_SUCCESS;
266   }
267 
268   hsa_region_global_flag_t flags;
269   hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
270   if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
271     hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
272     *ret = region;
273     return HSA_STATUS_INFO_BREAK;
274   }
275 
276   return HSA_STATUS_SUCCESS;
277 }
278 
279 static hsa_status_t init_compute_and_memory() {
280   hsa_status_t err;
281 
282   /* Iterate over the agents and pick the gpu agent */
283   err = hsa_iterate_agents(get_agent_info, NULL);
284   if (err == HSA_STATUS_INFO_BREAK) {
285     err = HSA_STATUS_SUCCESS;
286   }
287   if (err != HSA_STATUS_SUCCESS) {
288     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent",
289            get_error_string(err));
290     return err;
291   }
292 
293   /* Init all devices or individual device types? */
294   std::vector<ATLCPUProcessor> &cpu_procs =
295       g_atl_machine.processors<ATLCPUProcessor>();
296   std::vector<ATLGPUProcessor> &gpu_procs =
297       g_atl_machine.processors<ATLGPUProcessor>();
298   /* For CPU memory pools, add other devices that can access them directly
299    * or indirectly */
300   for (auto &cpu_proc : cpu_procs) {
301     for (auto &cpu_mem : cpu_proc.memories()) {
302       hsa_amd_memory_pool_t pool = cpu_mem.memory();
303       for (auto &gpu_proc : gpu_procs) {
304         hsa_agent_t agent = gpu_proc.agent();
305         hsa_amd_memory_pool_access_t access;
306         hsa_amd_agent_memory_pool_get_info(
307             agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
308         if (access != 0) {
309           // this means not NEVER, but could be YES or NO
310           // add this memory pool to the proc
311           gpu_proc.addMemory(cpu_mem);
312         }
313       }
314     }
315   }
316 
317   /* FIXME: are the below combinations of procs and memory pools needed?
318    * all to all compare procs with their memory pools and add those memory
319    * pools that are accessible by the target procs */
320   for (auto &gpu_proc : gpu_procs) {
321     for (auto &gpu_mem : gpu_proc.memories()) {
322       hsa_amd_memory_pool_t pool = gpu_mem.memory();
323       for (auto &cpu_proc : cpu_procs) {
324         hsa_agent_t agent = cpu_proc.agent();
325         hsa_amd_memory_pool_access_t access;
326         hsa_amd_agent_memory_pool_get_info(
327             agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
328         if (access != 0) {
329           // this means not NEVER, but could be YES or NO
330           // add this memory pool to the proc
331           cpu_proc.addMemory(gpu_mem);
332         }
333       }
334     }
335   }
336 
337   size_t num_procs = cpu_procs.size() + gpu_procs.size();
338   int num_iGPUs = 0;
339   int num_dGPUs = 0;
340   for (uint32_t i = 0; i < gpu_procs.size(); i++) {
341     if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU)
342       num_iGPUs++;
343     else
344       num_dGPUs++;
345   }
346   assert(num_iGPUs + num_dGPUs == gpu_procs.size() &&
347          "Number of dGPUs and iGPUs do not add up");
348   DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size());
349   DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs);
350   DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs);
351   DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size());
352 
353   int cpus_begin = 0;
354   int cpus_end = cpu_procs.size();
355   int gpus_begin = cpu_procs.size();
356   int gpus_end = cpu_procs.size() + gpu_procs.size();
357   int proc_index = 0;
358   for (int i = cpus_begin; i < cpus_end; i++) {
359     std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
360     int fine_memories_size = 0;
361     int coarse_memories_size = 0;
362     DEBUG_PRINT("CPU memory types:\t");
363     for (auto &memory : memories) {
364       atmi_memtype_t type = memory.type();
365       if (type == ATMI_MEMTYPE_FINE_GRAINED) {
366         fine_memories_size++;
367         DEBUG_PRINT("Fine\t");
368       } else {
369         coarse_memories_size++;
370         DEBUG_PRINT("Coarse\t");
371       }
372     }
373     DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
374     DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
375     proc_index++;
376   }
377   proc_index = 0;
378   for (int i = gpus_begin; i < gpus_end; i++) {
379     std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
380     int fine_memories_size = 0;
381     int coarse_memories_size = 0;
382     DEBUG_PRINT("GPU memory types:\t");
383     for (auto &memory : memories) {
384       atmi_memtype_t type = memory.type();
385       if (type == ATMI_MEMTYPE_FINE_GRAINED) {
386         fine_memories_size++;
387         DEBUG_PRINT("Fine\t");
388       } else {
389         coarse_memories_size++;
390         DEBUG_PRINT("Coarse\t");
391       }
392     }
393     DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
394     DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
395     proc_index++;
396   }
397   proc_index = 0;
398   hsa_region_t atl_cpu_kernarg_region;
399   atl_cpu_kernarg_region.handle = (uint64_t)-1;
400   if (cpu_procs.size() > 0) {
401     err = hsa_agent_iterate_regions(
402         cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region);
403     if (err == HSA_STATUS_INFO_BREAK) {
404       err = HSA_STATUS_SUCCESS;
405     }
406     err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
407                                                           : HSA_STATUS_SUCCESS;
408     if (err != HSA_STATUS_SUCCESS) {
409       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
410              "Finding a CPU kernarg memory region handle",
411              get_error_string(err));
412       return err;
413     }
414   }
415   hsa_region_t atl_gpu_kernarg_region;
416   /* Find a memory region that supports kernel arguments.  */
417   atl_gpu_kernarg_region.handle = (uint64_t)-1;
418   if (gpu_procs.size() > 0) {
419     hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region,
420                               &atl_gpu_kernarg_region);
421     err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
422                                                           : HSA_STATUS_SUCCESS;
423     if (err != HSA_STATUS_SUCCESS) {
424       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
425              "Finding a kernarg memory region", get_error_string(err));
426       return err;
427     }
428   }
429   if (num_procs > 0)
430     return HSA_STATUS_SUCCESS;
431   else
432     return HSA_STATUS_ERROR_NOT_INITIALIZED;
433 }
434 
435 hsa_status_t init_hsa() {
436   DEBUG_PRINT("Initializing HSA...");
437   hsa_status_t err = hsa_init();
438   if (err != HSA_STATUS_SUCCESS) {
439     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
440            "Initializing the hsa runtime", get_error_string(err));
441     return err;
442   }
443   if (err != HSA_STATUS_SUCCESS)
444     return err;
445 
446   err = init_compute_and_memory();
447   if (err != HSA_STATUS_SUCCESS)
448     return err;
449   if (err != HSA_STATUS_SUCCESS) {
450     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
451            "After initializing compute and memory", get_error_string(err));
452     return err;
453   }
454 
455   DEBUG_PRINT("done\n");
456   return HSA_STATUS_SUCCESS;
457 }
458 
459 hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
460 #if (ROCM_VERSION_MAJOR >= 3) ||                                               \
461     (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3)
462   if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
463 #else
464   if (event->event_type == GPU_MEMORY_FAULT_EVENT) {
465 #endif
466     hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
467     // memory_fault.agent
468     // memory_fault.virtual_address
469     // memory_fault.fault_reason_mask
470     // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
471     std::stringstream stream;
472     stream << std::hex << (uintptr_t)memory_fault.virtual_address;
473     std::string addr("0x" + stream.str());
474 
475     std::string err_string = "[GPU Memory Error] Addr: " + addr;
476     err_string += " Reason: ";
477     if (!(memory_fault.fault_reason_mask & 0x00111111)) {
478       err_string += "No Idea! ";
479     } else {
480       if (memory_fault.fault_reason_mask & 0x00000001)
481         err_string += "Page not present or supervisor privilege. ";
482       if (memory_fault.fault_reason_mask & 0x00000010)
483         err_string += "Write access to a read-only page. ";
484       if (memory_fault.fault_reason_mask & 0x00000100)
485         err_string += "Execute access to a page marked NX. ";
486       if (memory_fault.fault_reason_mask & 0x00001000)
487         err_string += "Host access only. ";
488       if (memory_fault.fault_reason_mask & 0x00010000)
489         err_string += "ECC failure (if supported by HW). ";
490       if (memory_fault.fault_reason_mask & 0x00100000)
491         err_string += "Can't determine the exact fault address. ";
492     }
493     fprintf(stderr, "%s\n", err_string.c_str());
494     return HSA_STATUS_ERROR;
495   }
496   return HSA_STATUS_SUCCESS;
497 }
498 
499 hsa_status_t atl_init_gpu_context() {
500   hsa_status_t err;
501   err = init_hsa();
502   if (err != HSA_STATUS_SUCCESS)
503     return HSA_STATUS_ERROR;
504 
505   err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
506   if (err != HSA_STATUS_SUCCESS) {
507     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
508            "Registering the system for memory faults", get_error_string(err));
509     return HSA_STATUS_ERROR;
510   }
511 
512   return HSA_STATUS_SUCCESS;
513 }
514 
515 static bool isImplicit(KernelArgMD::ValueKind value_kind) {
516   switch (value_kind) {
517   case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
518   case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
519   case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
520   case KernelArgMD::ValueKind::HiddenNone:
521   case KernelArgMD::ValueKind::HiddenPrintfBuffer:
522   case KernelArgMD::ValueKind::HiddenDefaultQueue:
523   case KernelArgMD::ValueKind::HiddenCompletionAction:
524   case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
525   case KernelArgMD::ValueKind::HiddenHostcallBuffer:
526     return true;
527   default:
528     return false;
529   }
530 }
531 
532 static std::pair<unsigned char *, unsigned char *>
533 find_metadata(void *binary, size_t binSize) {
534   std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr};
535 
536   Elf *e = elf_memory(static_cast<char *>(binary), binSize);
537   if (elf_kind(e) != ELF_K_ELF) {
538     return failure;
539   }
540 
541   size_t numpHdrs;
542   if (elf_getphdrnum(e, &numpHdrs) != 0) {
543     return failure;
544   }
545 
546   for (size_t i = 0; i < numpHdrs; ++i) {
547     GElf_Phdr pHdr;
548     if (gelf_getphdr(e, i, &pHdr) != &pHdr) {
549       continue;
550     }
551     // Look for the runtime metadata note
552     if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
553       // Iterate over the notes in this segment
554       address ptr = (address)binary + pHdr.p_offset;
555       address segmentEnd = ptr + pHdr.p_filesz;
556 
557       while (ptr < segmentEnd) {
558         Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr);
559         address name = (address)&note[1];
560 
561         if (note->n_type == 7 || note->n_type == 8) {
562           return failure;
563         } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ &&
564                    note->n_namesz == sizeof "AMD" &&
565                    !memcmp(name, "AMD", note->n_namesz)) {
566           // code object v2 uses yaml metadata, no longer supported
567           return failure;
568         } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ &&
569                    note->n_namesz == sizeof "AMDGPU" &&
570                    !memcmp(name, "AMDGPU", note->n_namesz)) {
571 
572           // n_descsz = 485
573           // value is padded to 4 byte alignment, may want to move end up to
574           // match
575           size_t offset = sizeof(uint32_t) * 3 /* fields */
576                           + sizeof("AMDGPU")   /* name */
577                           + 1 /* padding to 4 byte alignment */;
578 
579           // Including the trailing padding means both pointers are 4 bytes
580           // aligned, which may be useful later.
581           unsigned char *metadata_start = (unsigned char *)ptr + offset;
582           unsigned char *metadata_end =
583               metadata_start + core::alignUp(note->n_descsz, 4);
584           return {metadata_start, metadata_end};
585         }
586         ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) +
587                core::alignUp(note->n_descsz, sizeof(int));
588       }
589     }
590   }
591 
592   return failure;
593 }
594 
595 namespace {
596 int map_lookup_array(msgpack::byte_range message, const char *needle,
597                      msgpack::byte_range *res, uint64_t *size) {
598   unsigned count = 0;
599   struct s : msgpack::functors_defaults<s> {
600     s(unsigned &count, uint64_t *size) : count(count), size(size) {}
601     unsigned &count;
602     uint64_t *size;
603     const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
604       count++;
605       *size = N;
606       return bytes.end;
607     }
608   };
609 
610   msgpack::foreach_map(message,
611                        [&](msgpack::byte_range key, msgpack::byte_range value) {
612                          if (msgpack::message_is_string(key, needle)) {
613                            // If the message is an array, record number of
614                            // elements in *size
615                            msgpack::handle_msgpack<s>(value, {count, size});
616                            // return the whole array
617                            *res = value;
618                          }
619                        });
620   // Only claim success if exactly one key/array pair matched
621   return count != 1;
622 }
623 
624 int map_lookup_string(msgpack::byte_range message, const char *needle,
625                       std::string *res) {
626   unsigned count = 0;
627   struct s : public msgpack::functors_defaults<s> {
628     s(unsigned &count, std::string *res) : count(count), res(res) {}
629     unsigned &count;
630     std::string *res;
631     void handle_string(size_t N, const unsigned char *str) {
632       count++;
633       *res = std::string(str, str + N);
634     }
635   };
636   msgpack::foreach_map(message,
637                        [&](msgpack::byte_range key, msgpack::byte_range value) {
638                          if (msgpack::message_is_string(key, needle)) {
639                            msgpack::handle_msgpack<s>(value, {count, res});
640                          }
641                        });
642   return count != 1;
643 }
644 
645 int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
646                         uint64_t *res) {
647   unsigned count = 0;
648   msgpack::foreach_map(message,
649                        [&](msgpack::byte_range key, msgpack::byte_range value) {
650                          if (msgpack::message_is_string(key, needle)) {
651                            msgpack::foronly_unsigned(value, [&](uint64_t x) {
652                              count++;
653                              *res = x;
654                            });
655                          }
656                        });
657   return count != 1;
658 }
659 
660 int array_lookup_element(msgpack::byte_range message, uint64_t elt,
661                          msgpack::byte_range *res) {
662   int rc = 1;
663   uint64_t i = 0;
664   msgpack::foreach_array(message, [&](msgpack::byte_range value) {
665     if (i == elt) {
666       *res = value;
667       rc = 0;
668     }
669     i++;
670   });
671   return rc;
672 }
673 
674 int populate_kernelArgMD(msgpack::byte_range args_element,
675                          KernelArgMD *kernelarg) {
676   using namespace msgpack;
677   int error = 0;
678   foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
679     if (message_is_string(key, ".name")) {
680       foronly_string(value, [&](size_t N, const unsigned char *str) {
681         kernelarg->name_ = std::string(str, str + N);
682       });
683     } else if (message_is_string(key, ".type_name")) {
684       foronly_string(value, [&](size_t N, const unsigned char *str) {
685         kernelarg->typeName_ = std::string(str, str + N);
686       });
687     } else if (message_is_string(key, ".size")) {
688       foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
689     } else if (message_is_string(key, ".offset")) {
690       foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
691     } else if (message_is_string(key, ".value_kind")) {
692       foronly_string(value, [&](size_t N, const unsigned char *str) {
693         std::string s = std::string(str, str + N);
694         auto itValueKind = ArgValueKind.find(s);
695         if (itValueKind != ArgValueKind.end()) {
696           kernelarg->valueKind_ = itValueKind->second;
697         }
698       });
699     }
700   });
701   return error;
702 }
703 } // namespace
704 
705 static hsa_status_t get_code_object_custom_metadata(
706     void *binary, size_t binSize, int gpu,
707     std::map<std::string, atl_kernel_info_t> &KernelInfoTable) {
708   // parse code object with different keys from v2
709   // also, the kernel name is not the same as the symbol name -- so a
710   // symbol->name map is needed
711 
712   std::pair<unsigned char *, unsigned char *> metadata =
713       find_metadata(binary, binSize);
714   if (!metadata.first) {
715     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
716   }
717 
718   uint64_t kernelsSize = 0;
719   int msgpack_errors = 0;
720   msgpack::byte_range kernel_array;
721   msgpack_errors =
722       map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
723                        &kernel_array, &kernelsSize);
724   if (msgpack_errors != 0) {
725     printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
726            "kernels lookup in program metadata");
727     return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
728   }
729 
730   for (size_t i = 0; i < kernelsSize; i++) {
731     assert(msgpack_errors == 0);
732     std::string kernelName;
733     std::string symbolName;
734 
735     msgpack::byte_range element;
736     msgpack_errors += array_lookup_element(kernel_array, i, &element);
737     if (msgpack_errors != 0) {
738       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
739              "element lookup in kernel metadata");
740       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
741     }
742 
743     msgpack_errors += map_lookup_string(element, ".name", &kernelName);
744     msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
745     if (msgpack_errors != 0) {
746       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
747              "strings lookup in kernel metadata");
748       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
749     }
750 
751     // Make sure that kernelName + ".kd" == symbolName
752     if ((kernelName + ".kd") != symbolName) {
753       printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
754              __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
755       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
756     }
757 
758     atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}};
759 
760     uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
761     msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
762     if (msgpack_errors != 0) {
763       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
764              "sgpr count metadata lookup in kernel metadata");
765       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
766     }
767 
768     info.sgpr_count = sgpr_count;
769 
770     msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
771     if (msgpack_errors != 0) {
772       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
773              "vgpr count metadata lookup in kernel metadata");
774       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
775     }
776 
777     info.vgpr_count = vgpr_count;
778 
779     msgpack_errors +=
780         map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
781     if (msgpack_errors != 0) {
782       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
783              "sgpr spill count metadata lookup in kernel metadata");
784       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
785     }
786 
787     info.sgpr_spill_count = sgpr_spill_count;
788 
789     msgpack_errors +=
790         map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
791     if (msgpack_errors != 0) {
792       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
793              "vgpr spill count metadata lookup in kernel metadata");
794       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
795     }
796 
797     info.vgpr_spill_count = vgpr_spill_count;
798 
799     size_t kernel_explicit_args_size = 0;
800     uint64_t kernel_segment_size;
801     msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
802                                           &kernel_segment_size);
803     if (msgpack_errors != 0) {
804       printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
805              "kernarg segment size metadata lookup in kernel metadata");
806       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
807     }
808 
809     bool hasHiddenArgs = false;
810     if (kernel_segment_size > 0) {
811       uint64_t argsSize;
812       size_t offset = 0;
813 
814       msgpack::byte_range args_array;
815       msgpack_errors +=
816           map_lookup_array(element, ".args", &args_array, &argsSize);
817       if (msgpack_errors != 0) {
818         printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
819                "kernel args metadata lookup in kernel metadata");
820         return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
821       }
822 
823       info.num_args = argsSize;
824 
825       for (size_t i = 0; i < argsSize; ++i) {
826         KernelArgMD lcArg;
827 
828         msgpack::byte_range args_element;
829         msgpack_errors += array_lookup_element(args_array, i, &args_element);
830         if (msgpack_errors != 0) {
831           printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
832                  "iterate args map in kernel args metadata");
833           return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
834         }
835 
836         msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
837         if (msgpack_errors != 0) {
838           printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
839                  "iterate args map in kernel args metadata");
840           return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
841         }
842         // populate info with sizes and offsets
843         info.arg_sizes.push_back(lcArg.size_);
844         // v3 has offset field and not align field
845         size_t new_offset = lcArg.offset_;
846         size_t padding = new_offset - offset;
847         offset = new_offset;
848         info.arg_offsets.push_back(lcArg.offset_);
849         DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(),
850                     lcArg.size_, lcArg.offset_);
851         offset += lcArg.size_;
852 
853         // check if the arg is a hidden/implicit arg
854         // this logic assumes that all hidden args are 8-byte aligned
855         if (!isImplicit(lcArg.valueKind_)) {
856           kernel_explicit_args_size += lcArg.size_;
857         } else {
858           hasHiddenArgs = true;
859         }
860         kernel_explicit_args_size += padding;
861       }
862     }
863 
864     // add size of implicit args, e.g.: offset x, y and z and pipe pointer, but
865     // in ATMI, do not count the compiler set implicit args, but set your own
866     // implicit args by discounting the compiler set implicit args
867     info.kernel_segment_size =
868         (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) +
869         sizeof(atmi_implicit_args_t);
870     DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
871                 kernel_segment_size, info.kernel_segment_size);
872 
873     // kernel received, now add it to the kernel info table
874     KernelInfoTable[kernelName] = info;
875   }
876 
877   return HSA_STATUS_SUCCESS;
878 }
879 
880 static hsa_status_t
881 populate_InfoTables(hsa_executable_symbol_t symbol, int gpu,
882                     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
883                     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable) {
884   hsa_symbol_kind_t type;
885 
886   uint32_t name_length;
887   hsa_status_t err;
888   err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
889                                        &type);
890   if (err != HSA_STATUS_SUCCESS) {
891     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
892            "Symbol info extraction", get_error_string(err));
893     return err;
894   }
895   DEBUG_PRINT("Exec Symbol type: %d\n", type);
896   if (type == HSA_SYMBOL_KIND_KERNEL) {
897     err = hsa_executable_symbol_get_info(
898         symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
899     if (err != HSA_STATUS_SUCCESS) {
900       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
901              "Symbol info extraction", get_error_string(err));
902       return err;
903     }
904     char *name = reinterpret_cast<char *>(malloc(name_length + 1));
905     err = hsa_executable_symbol_get_info(symbol,
906                                          HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
907     if (err != HSA_STATUS_SUCCESS) {
908       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
909              "Symbol info extraction", get_error_string(err));
910       return err;
911     }
912     // remove the suffix .kd from symbol name.
913     name[name_length - 3] = 0;
914 
915     atl_kernel_info_t info;
916     std::string kernelName(name);
917     // by now, the kernel info table should already have an entry
918     // because the non-ROCr custom code object parsing is called before
919     // iterating over the code object symbols using ROCr
920     if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) {
921       if (HSA_STATUS_ERROR_INVALID_CODE_OBJECT != HSA_STATUS_SUCCESS) {
922         printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
923                "Finding the entry kernel info table",
924                get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT));
925         exit(1);
926       }
927     }
928     // found, so assign and update
929     info = KernelInfoTable[kernelName];
930 
931     /* Extract dispatch information from the symbol */
932     err = hsa_executable_symbol_get_info(
933         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
934         &(info.kernel_object));
935     if (err != HSA_STATUS_SUCCESS) {
936       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
937              "Extracting the symbol from the executable",
938              get_error_string(err));
939       return err;
940     }
941     err = hsa_executable_symbol_get_info(
942         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
943         &(info.group_segment_size));
944     if (err != HSA_STATUS_SUCCESS) {
945       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
946              "Extracting the group segment size from the executable",
947              get_error_string(err));
948       return err;
949     }
950     err = hsa_executable_symbol_get_info(
951         symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
952         &(info.private_segment_size));
953     if (err != HSA_STATUS_SUCCESS) {
954       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
955              "Extracting the private segment from the executable",
956              get_error_string(err));
957       return err;
958     }
959 
960     DEBUG_PRINT(
961         "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes "
962         "kernarg\n",
963         kernelName.c_str(), info.kernel_object, info.group_segment_size,
964         info.private_segment_size, info.kernel_segment_size);
965 
966     // assign it back to the kernel info table
967     KernelInfoTable[kernelName] = info;
968     free(name);
969   } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
970     err = hsa_executable_symbol_get_info(
971         symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
972     if (err != HSA_STATUS_SUCCESS) {
973       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
974              "Symbol info extraction", get_error_string(err));
975       return err;
976     }
977     char *name = reinterpret_cast<char *>(malloc(name_length + 1));
978     err = hsa_executable_symbol_get_info(symbol,
979                                          HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
980     if (err != HSA_STATUS_SUCCESS) {
981       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
982              "Symbol info extraction", get_error_string(err));
983       return err;
984     }
985     name[name_length] = 0;
986 
987     atl_symbol_info_t info;
988 
989     err = hsa_executable_symbol_get_info(
990         symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
991     if (err != HSA_STATUS_SUCCESS) {
992       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
993              "Symbol info address extraction", get_error_string(err));
994       return err;
995     }
996 
997     err = hsa_executable_symbol_get_info(
998         symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
999     if (err != HSA_STATUS_SUCCESS) {
1000       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1001              "Symbol info size extraction", get_error_string(err));
1002       return err;
1003     }
1004 
1005     DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
1006                 info.size);
1007     err = register_allocation(reinterpret_cast<void *>(info.addr),
1008                               (size_t)info.size, ATMI_DEVTYPE_GPU);
1009     if (err != HSA_STATUS_SUCCESS) {
1010       return err;
1011     }
1012     SymbolInfoTable[std::string(name)] = info;
1013     free(name);
1014   } else {
1015     DEBUG_PRINT("Symbol is an indirect function\n");
1016   }
1017   return HSA_STATUS_SUCCESS;
1018 }
1019 
1020 hsa_status_t RegisterModuleFromMemory(
1021     std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1022     std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1023     void *module_bytes, size_t module_size, int gpu,
1024     hsa_status_t (*on_deserialized_data)(void *data, size_t size,
1025                                          void *cb_state),
1026     void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
1027   hsa_status_t err;
1028   assert(gpu >= 0);
1029 
1030   DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu);
1031   ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(gpu);
1032   hsa_agent_t agent = proc.agent();
1033   hsa_executable_t executable = {0};
1034   hsa_profile_t agent_profile;
1035 
1036   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
1037   if (err != HSA_STATUS_SUCCESS) {
1038     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1039            "Query the agent profile", get_error_string(err));
1040     return HSA_STATUS_ERROR;
1041   }
1042   // FIXME: Assume that every profile is FULL until we understand how to build
1043   // GCN with base profile
1044   agent_profile = HSA_PROFILE_FULL;
1045   /* Create the empty executable.  */
1046   err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
1047                               &executable);
1048   if (err != HSA_STATUS_SUCCESS) {
1049     printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1050            "Create the executable", get_error_string(err));
1051     return HSA_STATUS_ERROR;
1052   }
1053 
1054   bool module_load_success = false;
1055   do // Existing control flow used continue, preserve that for this patch
1056   {
1057     {
1058       // Some metadata info is not available through ROCr API, so use custom
1059       // code object metadata parsing to collect such metadata info
1060 
1061       err = get_code_object_custom_metadata(module_bytes, module_size, gpu,
1062                                             KernelInfoTable);
1063       if (err != HSA_STATUS_SUCCESS) {
1064         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1065                     "Getting custom code object metadata",
1066                     get_error_string(err));
1067         continue;
1068       }
1069 
1070       // Deserialize code object.
1071       hsa_code_object_t code_object = {0};
1072       err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
1073                                         &code_object);
1074       if (err != HSA_STATUS_SUCCESS) {
1075         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1076                     "Code Object Deserialization", get_error_string(err));
1077         continue;
1078       }
1079       assert(0 != code_object.handle);
1080 
1081       // Mutating the device image here avoids another allocation & memcpy
1082       void *code_object_alloc_data =
1083           reinterpret_cast<void *>(code_object.handle);
1084       hsa_status_t atmi_err =
1085           on_deserialized_data(code_object_alloc_data, module_size, cb_state);
1086       if (atmi_err != HSA_STATUS_SUCCESS) {
1087         printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1088                "Error in deserialized_data callback",
1089                get_atmi_error_string(atmi_err));
1090         return atmi_err;
1091       }
1092 
1093       /* Load the code object.  */
1094       err =
1095           hsa_executable_load_code_object(executable, agent, code_object, NULL);
1096       if (err != HSA_STATUS_SUCCESS) {
1097         DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1098                     "Loading the code object", get_error_string(err));
1099         continue;
1100       }
1101 
1102       // cannot iterate over symbols until executable is frozen
1103     }
1104     module_load_success = true;
1105   } while (0);
1106   DEBUG_PRINT("Modules loaded successful? %d\n", module_load_success);
1107   if (module_load_success) {
1108     /* Freeze the executable; it can now be queried for symbols.  */
1109     err = hsa_executable_freeze(executable, "");
1110     if (err != HSA_STATUS_SUCCESS) {
1111       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1112              "Freeze the executable", get_error_string(err));
1113       return HSA_STATUS_ERROR;
1114     }
1115 
1116     err = hsa::executable_iterate_symbols(
1117         executable,
1118         [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
1119           return populate_InfoTables(symbol, gpu, KernelInfoTable,
1120                                      SymbolInfoTable);
1121         });
1122     if (err != HSA_STATUS_SUCCESS) {
1123       printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1124              "Iterating over symbols for execuatable", get_error_string(err));
1125       return HSA_STATUS_ERROR;
1126     }
1127 
1128     // save the executable and destroy during finalize
1129     HSAExecutables.push_back(executable);
1130     return HSA_STATUS_SUCCESS;
1131   } else {
1132     return HSA_STATUS_ERROR;
1133   }
1134 }
1135 
1136 } // namespace core
1137