//===--- amdgpu/impl/system.cpp ----------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <libelf.h>

#include <cassert>
#include <map>
#include <sstream>
#include <string>
#include <vector>

#include "internal.h"
#include "machine.h"
#include "rt.h"

#include "msgpack.h"

namespace hsa {
// Wrap HSA iterate API in a shim that allows passing general callables
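// Example use (with a hypothetical callback); the callable is forwarded
// through the void *user-data pointer of the underlying C API:
//   hsa::executable_iterate_symbols(
//       exec, [&](hsa_executable_t, hsa_executable_symbol_t s) -> hsa_status_t {
//         return inspect_symbol(s); // hypothetical helper
//       });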
template <typename C>
hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
  auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol,
              void *data) -> hsa_status_t {
    C *unwrapped = static_cast<C *>(data);
    return (*unwrapped)(executable, symbol);
  };
  return hsa_executable_iterate_symbols(executable, L,
                                        static_cast<void *>(&cb));
}
} // namespace hsa

typedef unsigned char *address;
/*
 * Note descriptors.
 */
typedef struct {
  uint32_t n_namesz; /* Length of note's name. */
  uint32_t n_descsz; /* Length of note's value. */
  uint32_t n_type;   /* Type of note. */
  // then name
  // then padding, optional
  // then desc, at 4 byte alignment (not 8, despite being elf64)
} Elf_Note;
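// A PT_NOTE segment is a packed sequence of Elf_Note records: each header is
// followed by its name and descriptor, both padded to 4-byte boundaries.
// find_metadata() below walks these records to locate the AMDGPU metadata.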

// The following include file and the structs/enums it provides have been
// replicated on a per-use basis below. For example,
// llvm::AMDGPU::HSAMD::Kernel::Metadata has several fields, but we may care
// only about kernargSegmentSize_ for now, so we include just that field in
// our KernelMD implementation. We chose to replicate rather than include in
// order to avoid forcing a dependency on LLVM_INCLUDE_DIR just to compile
// the runtime.
// #include "llvm/Support/AMDGPUMetadata.h"
// typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD;
// typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
// typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
// using llvm::AMDGPU::HSAMD::AccessQualifier;
// using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
// using llvm::AMDGPU::HSAMD::ValueKind;
// using llvm::AMDGPU::HSAMD::ValueType;

class KernelArgMD {
public:
  enum class ValueKind {
    HiddenGlobalOffsetX,
    HiddenGlobalOffsetY,
    HiddenGlobalOffsetZ,
    HiddenNone,
    HiddenPrintfBuffer,
    HiddenDefaultQueue,
    HiddenCompletionAction,
    HiddenMultiGridSyncArg,
    HiddenHostcallBuffer,
    Unknown
  };

  KernelArgMD()
      : name_(std::string()), typeName_(std::string()), size_(0), offset_(0),
        align_(0), valueKind_(ValueKind::Unknown) {}

  // fields
  std::string name_;
  std::string typeName_;
  uint32_t size_;
  uint32_t offset_;
  uint32_t align_;
  ValueKind valueKind_;
};

class KernelMD {
public:
  KernelMD() : kernargSegmentSize_(0ull) {}

  // fields
  uint64_t kernargSegmentSize_;
};

static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
    //    Including only those fields that are relevant to the runtime.
    //    {"ByValue", KernelArgMD::ValueKind::ByValue},
    //    {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer},
    //    {"DynamicSharedPointer",
    //    KernelArgMD::ValueKind::DynamicSharedPointer},
    //    {"Sampler", KernelArgMD::ValueKind::Sampler},
    //    {"Image", KernelArgMD::ValueKind::Image},
    //    {"Pipe", KernelArgMD::ValueKind::Pipe},
    //    {"Queue", KernelArgMD::ValueKind::Queue},
    {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
    {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
    {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
    {"HiddenNone", KernelArgMD::ValueKind::HiddenNone},
    {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
    {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue},
    {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction},
    {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
    {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
    // v3: code object v3 and later use snake_case names for the same kinds
    //    {"by_value", KernelArgMD::ValueKind::ByValue},
    //    {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
    //    {"dynamic_shared_pointer",
    //    KernelArgMD::ValueKind::DynamicSharedPointer},
    //    {"sampler", KernelArgMD::ValueKind::Sampler},
    //    {"image", KernelArgMD::ValueKind::Image},
    //    {"pipe", KernelArgMD::ValueKind::Pipe},
    //    {"queue", KernelArgMD::ValueKind::Queue},
    {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
    {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
    {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
    {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
    {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
    {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
    {"hidden_completion_action",
     KernelArgMD::ValueKind::HiddenCompletionAction},
    {"hidden_multigrid_sync_arg",
     KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
    {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
};

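// Global machine model: the CPU/GPU processors (HSA agents) discovered during
// initialization, together with the memory pools each one may allocate from.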
ATLMachine g_atl_machine;

namespace core {

// Callback for hsa_amd_agent_iterate_memory_pools: record each pool the
// runtime is allowed to allocate from on the given processor.
static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
                                         void *data) {
  ATLProcessor *proc = reinterpret_cast<ATLProcessor *>(data);
  hsa_status_t err = HSA_STATUS_SUCCESS;
  // Check if the memory_pool is allowed to allocate, i.e. do not return group
  // memory
  bool alloc_allowed = false;
  err = hsa_amd_memory_pool_get_info(
      memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &alloc_allowed);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Alloc allowed in memory pool check", get_error_string(err));
    return err;
  }
  if (alloc_allowed) {
    uint32_t global_flag = 0;
    err = hsa_amd_memory_pool_get_info(
        memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Get memory pool info", get_error_string(err));
      return err;
    }
    if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
      ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
      proc->addMemory(new_mem);
    } else {
      ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
      proc->addMemory(new_mem);
    }
  }

  return err;
}

static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) {
  hsa_status_t err = HSA_STATUS_SUCCESS;
  hsa_device_type_t device_type;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Get device type info", get_error_string(err));
    return err;
  }
  switch (device_type) {
  case HSA_DEVICE_TYPE_CPU: {
    ATLCPUProcessor new_proc(agent);
    err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
                                             &new_proc);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(err));
      return err;
    }
    g_atl_machine.addProcessor(new_proc);
  } break;
  case HSA_DEVICE_TYPE_GPU: {
    hsa_profile_t profile;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Query the agent profile", get_error_string(err));
      return err;
    }
    atmi_devtype_t gpu_type =
        (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU;
    ATLGPUProcessor new_proc(agent, gpu_type);
    err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
                                             &new_proc);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(err));
      return err;
    }
    g_atl_machine.addProcessor(new_proc);
  } break;
  case HSA_DEVICE_TYPE_DSP: {
    err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  } break;
  }

  return err;
}

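// Discover all HSA agents and their memory pools, then cross-register pools
// across devices: each GPU learns about the CPU pools it can access and each
// CPU learns about accessible GPU pools, so later allocations can be placed
// where both sides can reach them.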
static hsa_status_t init_compute_and_memory() {
  hsa_status_t err;

  /* Iterate over the agents and pick the gpu agent */
  err = hsa_iterate_agents(get_agent_info, NULL);
  if (err == HSA_STATUS_INFO_BREAK) {
    err = HSA_STATUS_SUCCESS;
  }
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent",
           get_error_string(err));
    return err;
  }

  /* Init all devices or individual device types? */
  std::vector<ATLCPUProcessor> &cpu_procs =
      g_atl_machine.processors<ATLCPUProcessor>();
  std::vector<ATLGPUProcessor> &gpu_procs =
      g_atl_machine.processors<ATLGPUProcessor>();
  /* For CPU memory pools, add other devices that can access them directly
   * or indirectly */
  for (auto &cpu_proc : cpu_procs) {
    for (auto &cpu_mem : cpu_proc.memories()) {
      hsa_amd_memory_pool_t pool = cpu_mem.memory();
      for (auto &gpu_proc : gpu_procs) {
        hsa_agent_t agent = gpu_proc.agent();
        hsa_amd_memory_pool_access_t access;
        hsa_amd_agent_memory_pool_get_info(
            agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
        if (access != 0) {
          // this means not NEVER, but could be YES or NO
          // add this memory pool to the proc
          gpu_proc.addMemory(cpu_mem);
        }
      }
    }
  }

  /* FIXME: are the below combinations of procs and memory pools needed?
   * all to all compare procs with their memory pools and add those memory
   * pools that are accessible by the target procs */
  for (auto &gpu_proc : gpu_procs) {
    for (auto &gpu_mem : gpu_proc.memories()) {
      hsa_amd_memory_pool_t pool = gpu_mem.memory();
      for (auto &cpu_proc : cpu_procs) {
        hsa_agent_t agent = cpu_proc.agent();
        hsa_amd_memory_pool_access_t access;
        hsa_amd_agent_memory_pool_get_info(
            agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
        if (access != 0) {
          // this means not NEVER, but could be YES or NO
          // add this memory pool to the proc
          cpu_proc.addMemory(gpu_mem);
        }
      }
    }
  }

  size_t num_procs = cpu_procs.size() + gpu_procs.size();
  int num_iGPUs = 0;
  int num_dGPUs = 0;
  for (uint32_t i = 0; i < gpu_procs.size(); i++) {
    if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU)
      num_iGPUs++;
    else
      num_dGPUs++;
  }
  assert(num_iGPUs + num_dGPUs == gpu_procs.size() &&
         "Number of dGPUs and iGPUs do not add up");
  DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size());
  DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs);
  DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs);
  DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size());

  int cpus_begin = 0;
  int cpus_end = cpu_procs.size();
  int gpus_begin = cpu_procs.size();
  int gpus_end = cpu_procs.size() + gpu_procs.size();
  int proc_index = 0;
  for (int i = cpus_begin; i < cpus_end; i++) {
    std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
    int fine_memories_size = 0;
    int coarse_memories_size = 0;
    DEBUG_PRINT("CPU memory types:\t");
    for (auto &memory : memories) {
      atmi_memtype_t type = memory.type();
      if (type == ATMI_MEMTYPE_FINE_GRAINED) {
        fine_memories_size++;
        DEBUG_PRINT("Fine\t");
      } else {
        coarse_memories_size++;
        DEBUG_PRINT("Coarse\t");
      }
    }
    DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
    DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
    proc_index++;
  }
  proc_index = 0;
  for (int i = gpus_begin; i < gpus_end; i++) {
    std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
    int fine_memories_size = 0;
    int coarse_memories_size = 0;
    DEBUG_PRINT("GPU memory types:\t");
    for (auto &memory : memories) {
      atmi_memtype_t type = memory.type();
      if (type == ATMI_MEMTYPE_FINE_GRAINED) {
        fine_memories_size++;
        DEBUG_PRINT("Fine\t");
      } else {
        coarse_memories_size++;
        DEBUG_PRINT("Coarse\t");
      }
    }
    DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
    DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
    proc_index++;
  }
  if (num_procs > 0)
    return HSA_STATUS_SUCCESS;
  else
    return HSA_STATUS_ERROR_NOT_INITIALIZED;
}

hsa_status_t init_hsa() {
  DEBUG_PRINT("Initializing HSA...");
  hsa_status_t err = hsa_init();
  if (err != HSA_STATUS_SUCCESS) {
    return err;
  }

  err = init_compute_and_memory();
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "After initializing compute and memory", get_error_string(err));
    return err;
  }

  DEBUG_PRINT("done\n");
  return HSA_STATUS_SUCCESS;
}

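// System event handler registered in atl_init_gpu_context(): on a GPU memory
// fault it decodes the fault-reason mask into a readable message on stderr
// and reports an error status back to ROCr.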
hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
#if (ROCM_VERSION_MAJOR >= 3) ||                                               \
    (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3)
  if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
#else
  if (event->event_type == GPU_MEMORY_FAULT_EVENT) {
#endif
    hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
    // memory_fault.agent
    // memory_fault.virtual_address
    // memory_fault.fault_reason_mask
    // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
    std::stringstream stream;
    stream << std::hex << (uintptr_t)memory_fault.virtual_address;
    std::string addr("0x" + stream.str());

    std::string err_string = "[GPU Memory Error] Addr: " + addr;
    err_string += " Reason: ";
    if (!(memory_fault.fault_reason_mask & 0x00111111)) {
      err_string += "No Idea! ";
    } else {
      if (memory_fault.fault_reason_mask & 0x00000001)
        err_string += "Page not present or supervisor privilege. ";
      if (memory_fault.fault_reason_mask & 0x00000010)
        err_string += "Write access to a read-only page. ";
      if (memory_fault.fault_reason_mask & 0x00000100)
        err_string += "Execute access to a page marked NX. ";
      if (memory_fault.fault_reason_mask & 0x00001000)
        err_string += "Host access only. ";
      if (memory_fault.fault_reason_mask & 0x00010000)
        err_string += "ECC failure (if supported by HW). ";
      if (memory_fault.fault_reason_mask & 0x00100000)
        err_string += "Can't determine the exact fault address. ";
    }
    fprintf(stderr, "%s\n", err_string.c_str());
    return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}

hsa_status_t atl_init_gpu_context() {
  hsa_status_t err;
  err = init_hsa();
  if (err != HSA_STATUS_SUCCESS)
    return HSA_STATUS_ERROR;

  err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Registering the system for memory faults", get_error_string(err));
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}

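// Hidden (implicit) kernel arguments are appended by the compiler after the
// explicit ones. They are identified here so the metadata parser below can
// count only explicit-argument bytes and let ATMI append its own
// atmi_implicit_args_t block instead.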
static bool isImplicit(KernelArgMD::ValueKind value_kind) {
  switch (value_kind) {
  case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
  case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
  case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
  case KernelArgMD::ValueKind::HiddenNone:
  case KernelArgMD::ValueKind::HiddenPrintfBuffer:
  case KernelArgMD::ValueKind::HiddenDefaultQueue:
  case KernelArgMD::ValueKind::HiddenCompletionAction:
  case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
  case KernelArgMD::ValueKind::HiddenHostcallBuffer:
    return true;
  default:
    return false;
  }
}

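// Scan the ELF program headers of the code object for the PT_NOTE segment
// carrying the msgpack-encoded AMDGPU metadata (NT_AMDGPU_METADATA) and
// return [start, end) pointers into the binary, or {nullptr, nullptr} if the
// note is absent or uses the unsupported code object v2 YAML encoding.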
static std::pair<unsigned char *, unsigned char *>
find_metadata(void *binary, size_t binSize) {
  std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr};

  Elf *e = elf_memory(static_cast<char *>(binary), binSize);
  if (elf_kind(e) != ELF_K_ELF) {
    return failure;
  }

  size_t numpHdrs;
  if (elf_getphdrnum(e, &numpHdrs) != 0) {
    return failure;
  }

  Elf64_Phdr *pHdrs = elf64_getphdr(e);
  for (size_t i = 0; i < numpHdrs; ++i) {
    Elf64_Phdr pHdr = pHdrs[i];

    // Look for the runtime metadata note
    if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
      // Iterate over the notes in this segment
      address ptr = (address)binary + pHdr.p_offset;
      address segmentEnd = ptr + pHdr.p_filesz;

      while (ptr < segmentEnd) {
        Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr);
        address name = (address)&note[1];

        if (note->n_type == 7 || note->n_type == 8) {
          return failure;
        } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ &&
                   note->n_namesz == sizeof "AMD" &&
                   !memcmp(name, "AMD", note->n_namesz)) {
          // code object v2 uses yaml metadata, no longer supported
          return failure;
        } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ &&
                   note->n_namesz == sizeof "AMDGPU" &&
                   !memcmp(name, "AMDGPU", note->n_namesz)) {

          // n_descsz = 485
          // value is padded to 4 byte alignment, may want to move end up to
          // match
          size_t offset = sizeof(uint32_t) * 3 /* fields */
                          + sizeof("AMDGPU")   /* name */
                          + 1 /* padding to 4 byte alignment */;

          // Including the trailing padding means both pointers are 4 bytes
          // aligned, which may be useful later.
          unsigned char *metadata_start = (unsigned char *)ptr + offset;
          unsigned char *metadata_end =
              metadata_start + core::alignUp(note->n_descsz, 4);
          return {metadata_start, metadata_end};
        }
        ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) +
               core::alignUp(note->n_descsz, sizeof(int));
      }
    }
  }

  return failure;
}

namespace {
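// Minimal msgpack lookup helpers over the metadata blob. Each returns 0 on
// success and non-zero otherwise; a lookup succeeds only if exactly one
// matching key was found.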
int map_lookup_array(msgpack::byte_range message, const char *needle,
                     msgpack::byte_range *res, uint64_t *size) {
  unsigned count = 0;
  struct s : msgpack::functors_defaults<s> {
    s(unsigned &count, uint64_t *size) : count(count), size(size) {}
    unsigned &count;
    uint64_t *size;
    const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
      count++;
      *size = N;
      return bytes.end;
    }
  };

  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           // If the message is an array, record number of
                           // elements in *size
                           msgpack::handle_msgpack<s>(value, {count, size});
                           // return the whole array
                           *res = value;
                         }
                       });
  // Only claim success if exactly one key/array pair matched
  return count != 1;
}

int map_lookup_string(msgpack::byte_range message, const char *needle,
                      std::string *res) {
  unsigned count = 0;
  struct s : public msgpack::functors_defaults<s> {
    s(unsigned &count, std::string *res) : count(count), res(res) {}
    unsigned &count;
    std::string *res;
    void handle_string(size_t N, const unsigned char *str) {
      count++;
      *res = std::string(str, str + N);
    }
  };
  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           msgpack::handle_msgpack<s>(value, {count, res});
                         }
                       });
  return count != 1;
}

int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
                        uint64_t *res) {
  unsigned count = 0;
  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           msgpack::foronly_unsigned(value, [&](uint64_t x) {
                             count++;
                             *res = x;
                           });
                         }
                       });
  return count != 1;
}

int array_lookup_element(msgpack::byte_range message, uint64_t elt,
                         msgpack::byte_range *res) {
  int rc = 1;
  uint64_t i = 0;
  msgpack::foreach_array(message, [&](msgpack::byte_range value) {
    if (i == elt) {
      *res = value;
      rc = 0;
    }
    i++;
  });
  return rc;
}

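// populate_kernelArgMD (below) reads one element of the ".args" array. For
// reference, such an element roughly carries the following keys when rendered
// as YAML (illustrative only):
//   .name: x, .offset: 24, .size: 4, .value_kind: by_value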
int populate_kernelArgMD(msgpack::byte_range args_element,
                         KernelArgMD *kernelarg) {
  using namespace msgpack;
  int error = 0;
  foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
    if (message_is_string(key, ".name")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        kernelarg->name_ = std::string(str, str + N);
      });
    } else if (message_is_string(key, ".type_name")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        kernelarg->typeName_ = std::string(str, str + N);
      });
    } else if (message_is_string(key, ".size")) {
      foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
    } else if (message_is_string(key, ".offset")) {
      foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
    } else if (message_is_string(key, ".value_kind")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        std::string s = std::string(str, str + N);
        auto itValueKind = ArgValueKind.find(s);
        if (itValueKind != ArgValueKind.end()) {
          kernelarg->valueKind_ = itValueKind->second;
        }
      });
    }
  });
  return error;
}
} // namespace

static hsa_status_t get_code_object_custom_metadata(
    void *binary, size_t binSize,
    std::map<std::string, atl_kernel_info_t> &KernelInfoTable) {
  // parse code object with different keys from v2
  // also, the kernel name is not the same as the symbol name -- so a
  // symbol->name map is needed

  std::pair<unsigned char *, unsigned char *> metadata =
      find_metadata(binary, binSize);
  if (!metadata.first) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }

  uint64_t kernelsSize = 0;
  int msgpack_errors = 0;
  msgpack::byte_range kernel_array;
  msgpack_errors =
      map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
                       &kernel_array, &kernelsSize);
  if (msgpack_errors != 0) {
    printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
           "kernels lookup in program metadata");
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }

  for (size_t i = 0; i < kernelsSize; i++) {
    assert(msgpack_errors == 0);
    std::string kernelName;
    std::string symbolName;

    msgpack::byte_range element;
    msgpack_errors += array_lookup_element(kernel_array, i, &element);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "element lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    msgpack_errors += map_lookup_string(element, ".name", &kernelName);
    msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "strings lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    // Make sure that kernelName + ".kd" == symbolName
    if ((kernelName + ".kd") != symbolName) {
      printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
             __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}};

    uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
    msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "sgpr count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.sgpr_count = sgpr_count;

    msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "vgpr count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.vgpr_count = vgpr_count;

    msgpack_errors +=
        map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "sgpr spill count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.sgpr_spill_count = sgpr_spill_count;

    msgpack_errors +=
        map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "vgpr spill count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.vgpr_spill_count = vgpr_spill_count;

    size_t kernel_explicit_args_size = 0;
    uint64_t kernel_segment_size;
    msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
                                          &kernel_segment_size);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "kernarg segment size metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    bool hasHiddenArgs = false;
    if (kernel_segment_size > 0) {
      uint64_t argsSize;
      size_t offset = 0;

      msgpack::byte_range args_array;
      msgpack_errors +=
          map_lookup_array(element, ".args", &args_array, &argsSize);
      if (msgpack_errors != 0) {
        printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
               "kernel args metadata lookup in kernel metadata");
        return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
      }

      info.num_args = argsSize;

      for (size_t i = 0; i < argsSize; ++i) {
        KernelArgMD lcArg;

        msgpack::byte_range args_element;
        msgpack_errors += array_lookup_element(args_array, i, &args_element);
        if (msgpack_errors != 0) {
          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
                 "iterate args map in kernel args metadata");
          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
        }

        msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
        if (msgpack_errors != 0) {
          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
                 "iterate args map in kernel args metadata");
          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
        }
        // populate info with sizes and offsets
        info.arg_sizes.push_back(lcArg.size_);
        // v3 has offset field and not align field
        size_t new_offset = lcArg.offset_;
        size_t padding = new_offset - offset;
        offset = new_offset;
        info.arg_offsets.push_back(lcArg.offset_);
        DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(),
                    lcArg.size_, lcArg.offset_);
        offset += lcArg.size_;

        // check if the arg is a hidden/implicit arg
        // this logic assumes that all hidden args are 8-byte aligned
        if (!isImplicit(lcArg.valueKind_)) {
          kernel_explicit_args_size += lcArg.size_;
        } else {
          hasHiddenArgs = true;
        }
        kernel_explicit_args_size += padding;
      }
    }

    // Add the size of the implicit (hidden) args, e.g. global offset x/y/z
    // and the pipe pointer. ATMI does not count the compiler-set implicit
    // args; when hidden args are present it keeps only the explicit-arg size
    // and appends its own atmi_implicit_args_t block instead.
    info.kernel_segment_size =
        (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) +
        sizeof(atmi_implicit_args_t);
    DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
                kernel_segment_size, info.kernel_segment_size);

    // kernel received, now add it to the kernel info table
    KernelInfoTable[kernelName] = info;
  }

  return HSA_STATUS_SUCCESS;
}

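// Callback for each symbol of a frozen executable. Kernel symbols update the
// entry already created in KernelInfoTable by the custom metadata parser with
// the kernel object handle and segment sizes; variable symbols are recorded
// in SymbolInfoTable.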
static hsa_status_t
populate_InfoTables(hsa_executable_symbol_t symbol,
                    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
                    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable) {
  hsa_symbol_kind_t type;

  uint32_t name_length;
  hsa_status_t err;
  err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
                                       &type);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Symbol info extraction", get_error_string(err));
    return err;
  }
  DEBUG_PRINT("Exec Symbol type: %d\n", type);
  if (type == HSA_SYMBOL_KIND_KERNEL) {
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
    err = hsa_executable_symbol_get_info(symbol,
                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    // remove the suffix .kd from symbol name.
    name[name_length - 3] = 0;

    atl_kernel_info_t info;
    std::string kernelName(name);
    // by now, the kernel info table should already have an entry
    // because the non-ROCr custom code object parsing is called before
    // iterating over the code object symbols using ROCr
    if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Finding the entry kernel info table",
             get_error_string(HSA_STATUS_ERROR_INVALID_CODE_OBJECT));
      exit(1);
    }
    // found, so assign and update
    info = KernelInfoTable[kernelName];

    /* Extract dispatch information from the symbol */
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
        &(info.kernel_object));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the symbol from the executable",
             get_error_string(err));
      return err;
    }
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
        &(info.group_segment_size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the group segment size from the executable",
             get_error_string(err));
      return err;
    }
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
        &(info.private_segment_size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the private segment from the executable",
             get_error_string(err));
      return err;
    }

    DEBUG_PRINT(
        "Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes "
        "kernarg\n",
        kernelName.c_str(), info.kernel_object, info.group_segment_size,
        info.private_segment_size, info.kernel_segment_size);

    // assign it back to the kernel info table
    KernelInfoTable[kernelName] = info;
    free(name);
  } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
    err = hsa_executable_symbol_get_info(symbol,
                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    name[name_length] = 0;

    atl_symbol_info_t info;

    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info address extraction", get_error_string(err));
      return err;
    }

    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info size extraction", get_error_string(err));
      return err;
    }

    DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
                info.size);
    SymbolInfoTable[std::string(name)] = info;
    free(name);
  } else {
    DEBUG_PRINT("Symbol is an indirect function\n");
  }
  return HSA_STATUS_SUCCESS;
}

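// Register one code-object image with HSA: parse the custom metadata, let the
// caller post-process the deserialized image through on_deserialized_data,
// load it into a fresh executable for the agent, freeze it, and walk its
// symbols to fill the kernel and symbol info tables. The executable is kept
// in HSAExecutables so it can be destroyed at finalization.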
hsa_status_t RegisterModuleFromMemory(
    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
    void *module_bytes, size_t module_size, hsa_agent_t agent,
    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
                                         void *cb_state),
    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
  hsa_status_t err;
  hsa_executable_t executable = {0};
  hsa_profile_t agent_profile;

  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Query the agent profile", get_error_string(err));
    return HSA_STATUS_ERROR;
  }
  // FIXME: Assume that every profile is FULL until we understand how to build
  // GCN with base profile
  agent_profile = HSA_PROFILE_FULL;
  /* Create the empty executable.  */
  err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
                              &executable);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Create the executable", get_error_string(err));
    return HSA_STATUS_ERROR;
  }

  bool module_load_success = false;
  do // Existing control flow used continue, preserve that for this patch
  {
    {
      // Some metadata info is not available through ROCr API, so use custom
      // code object metadata parsing to collect such metadata info

      err = get_code_object_custom_metadata(module_bytes, module_size,
                                            KernelInfoTable);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Getting custom code object metadata",
                    get_error_string(err));
        continue;
      }

      // Deserialize code object.
      hsa_code_object_t code_object = {0};
      err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
                                        &code_object);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Code Object Deserialization", get_error_string(err));
        continue;
      }
      assert(0 != code_object.handle);

      // Mutating the device image here avoids another allocation & memcpy
      void *code_object_alloc_data =
          reinterpret_cast<void *>(code_object.handle);
      hsa_status_t atmi_err =
          on_deserialized_data(code_object_alloc_data, module_size, cb_state);
      if (atmi_err != HSA_STATUS_SUCCESS) {
        printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
               "Error in deserialized_data callback",
               get_error_string(atmi_err));
        return atmi_err;
      }

      /* Load the code object.  */
      err =
          hsa_executable_load_code_object(executable, agent, code_object, NULL);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Loading the code object", get_error_string(err));
        continue;
      }

      // cannot iterate over symbols until executable is frozen
    }
    module_load_success = true;
  } while (0);
  DEBUG_PRINT("Module load successful? %d\n", module_load_success);
  if (module_load_success) {
    /* Freeze the executable; it can now be queried for symbols.  */
    err = hsa_executable_freeze(executable, "");
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Freeze the executable", get_error_string(err));
      return HSA_STATUS_ERROR;
    }

    err = hsa::executable_iterate_symbols(
        executable,
        [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
          return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable);
        });
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterating over symbols for executable", get_error_string(err));
      return HSA_STATUS_ERROR;
    }

    // save the executable and destroy during finalize
    HSAExecutables.push_back(executable);
    return HSA_STATUS_SUCCESS;
  } else {
    return HSA_STATUS_ERROR;
  }
}

} // namespace core