/*===--------------------------------------------------------------------------
 *              ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/
#include <gelf.h>
#include <libelf.h>

#include <cassert>
#include <cstdarg>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "internal.h"
#include "machine.h"
#include "rt.h"

#include "msgpack.h"

typedef unsigned char *address;
/*
 * Note descriptors.
 */
typedef struct {
  uint32_t n_namesz; /* Length of note's name. */
  uint32_t n_descsz; /* Length of note's value. */
  uint32_t n_type;   /* Type of note. */
  // then name
  // then padding, optional
  // then desc, at 4 byte alignment (not 8, despite being elf64)
} Elf_Note;

// The following include file and the structs/enums it defines have been
// replicated on a per-use basis below. For example,
// llvm::AMDGPU::HSAMD::Kernel::Metadata has several fields, but we may care
// only about kernargSegmentSize_ for now, so our KernelMD replicates just
// that field. We replicate rather than include to avoid forcing a dependency
// on LLVM_INCLUDE_DIR just to compile the runtime.
// #include "llvm/Support/AMDGPUMetadata.h"
// typedef llvm::AMDGPU::HSAMD::Metadata CodeObjectMD;
// typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD;
// typedef llvm::AMDGPU::HSAMD::Kernel::Arg::Metadata KernelArgMD;
// using llvm::AMDGPU::HSAMD::AccessQualifier;
// using llvm::AMDGPU::HSAMD::AddressSpaceQualifier;
// using llvm::AMDGPU::HSAMD::ValueKind;
// using llvm::AMDGPU::HSAMD::ValueType;

class KernelArgMD {
public:
  enum class ValueKind {
    HiddenGlobalOffsetX,
    HiddenGlobalOffsetY,
    HiddenGlobalOffsetZ,
    HiddenNone,
    HiddenPrintfBuffer,
    HiddenDefaultQueue,
    HiddenCompletionAction,
    HiddenMultiGridSyncArg,
    HiddenHostcallBuffer,
    Unknown
  };

  KernelArgMD()
      : name_(std::string()), typeName_(std::string()), size_(0), offset_(0),
        align_(0), valueKind_(ValueKind::Unknown) {}

  // fields
  std::string name_;
  std::string typeName_;
  uint32_t size_;
  uint32_t offset_;
  uint32_t align_;
  ValueKind valueKind_;
};

class KernelMD {
public:
  KernelMD() : kernargSegmentSize_(0ull) {}

  // fields
  uint64_t kernargSegmentSize_;
};

static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
    // Including only those fields that are relevant to the runtime.
    // {"ByValue", KernelArgMD::ValueKind::ByValue},
    // {"GlobalBuffer", KernelArgMD::ValueKind::GlobalBuffer},
    // {"DynamicSharedPointer",
    //  KernelArgMD::ValueKind::DynamicSharedPointer},
    // {"Sampler", KernelArgMD::ValueKind::Sampler},
    // {"Image", KernelArgMD::ValueKind::Image},
    // {"Pipe", KernelArgMD::ValueKind::Pipe},
    // {"Queue", KernelArgMD::ValueKind::Queue},
    {"HiddenGlobalOffsetX", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
    {"HiddenGlobalOffsetY", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
    {"HiddenGlobalOffsetZ", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
    {"HiddenNone", KernelArgMD::ValueKind::HiddenNone},
    {"HiddenPrintfBuffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
    {"HiddenDefaultQueue", KernelArgMD::ValueKind::HiddenDefaultQueue},
    {"HiddenCompletionAction", KernelArgMD::ValueKind::HiddenCompletionAction},
    {"HiddenMultiGridSyncArg", KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
    {"HiddenHostcallBuffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
    // v3
    // {"by_value", KernelArgMD::ValueKind::ByValue},
    // {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
    // {"dynamic_shared_pointer",
    //  KernelArgMD::ValueKind::DynamicSharedPointer},
    // {"sampler", KernelArgMD::ValueKind::Sampler},
    // {"image", KernelArgMD::ValueKind::Image},
    // {"pipe", KernelArgMD::ValueKind::Pipe},
    // {"queue", KernelArgMD::ValueKind::Queue},
    {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
    {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
    {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
    {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
    {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
    {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
    {"hidden_completion_action",
     KernelArgMD::ValueKind::HiddenCompletionAction},
    {"hidden_multigrid_sync_arg",
     KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
    {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
};
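/*
 * Illustrative sketch (not part of the runtime): how the table above is
 * consumed when walking ".value_kind" strings from the msgpack metadata
 * (see populate_kernelArgMD below). Keys absent from the table, e.g.
 * explicit args like "by_value", simply keep the ValueKind::Unknown default:
 *
 *   KernelArgMD md;
 *   auto it = ArgValueKind.find("hidden_global_offset_x");
 *   if (it != ArgValueKind.end())
 *     md.valueKind_ = it->second; // HiddenGlobalOffsetX
 *   // md.valueKind_ stays Unknown for keys we do not track
 */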
// global variables. TODO: Get rid of these
atmi_machine_t g_atmi_machine;
ATLMachine g_atl_machine;

std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;

std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

bool g_atmi_initialized = false;
bool g_atmi_hostcall_required = false;

/*
   atlc holds all of the internal global state. The structure atl_context_t
   is defined in internal.h. Most references use the global instance atlc.
*/
atl_context_t atlc = {.struct_initialized = false};

namespace core {
/* Machine Info */
atmi_machine_t *Runtime::GetMachineInfo() {
  if (!atlc.g_hsa_initialized)
    return NULL;
  return &g_atmi_machine;
}

static void atl_set_atmi_initialized() {
  // FIXME: thread safe? locks?
  g_atmi_initialized = true;
}

static void atl_reset_atmi_initialized() {
  // FIXME: thread safe? locks?
  g_atmi_initialized = false;
}

bool atl_is_atmi_initialized() { return g_atmi_initialized; }

hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
  std::vector<ATLGPUProcessor> &gpu_procs =
      g_atl_machine.processors<ATLGPUProcessor>();
  std::vector<hsa_agent_t> agents;
  for (uint32_t i = 0; i < gpu_procs.size(); i++) {
    agents.push_back(gpu_procs[i].agent());
  }
  return hsa_amd_agents_allow_access(agents.size(), &agents[0], NULL, ptr);
}
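/*
 * Usage sketch (hypothetical; `cpu_pool` and `nbytes` are stand-ins):
 * a host allocation from a fine-grained pool is made visible to every GPU
 * agent before any kernel dereferences it.
 *
 *   void *host_buf = nullptr;
 *   hsa_status_t err =
 *       hsa_amd_memory_pool_allocate(cpu_pool, nbytes, 0, &host_buf);
 *   if (err == HSA_STATUS_SUCCESS)
 *     err = allow_access_to_all_gpu_agents(host_buf);
 */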
atmi_status_t Runtime::Initialize() {
  atmi_devtype_t devtype = ATMI_DEVTYPE_GPU;
  if (atl_is_atmi_initialized())
    return ATMI_STATUS_SUCCESS;

  if (devtype == ATMI_DEVTYPE_ALL || devtype & ATMI_DEVTYPE_GPU) {
    atmi_status_t rc = atl_init_gpu_context();
    if (rc != ATMI_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "GPU context init",
             get_atmi_error_string(rc));
      return rc;
    }
  }

  atl_set_atmi_initialized();
  return ATMI_STATUS_SUCCESS;
}

atmi_status_t Runtime::Finalize() {
  atmi_status_t rc = ATMI_STATUS_SUCCESS;
  for (uint32_t i = 0; i < SymbolInfoTable.size(); i++) {
    SymbolInfoTable[i].clear();
  }
  SymbolInfoTable.clear();
  for (uint32_t i = 0; i < KernelInfoTable.size(); i++) {
    KernelInfoTable[i].clear();
  }
  KernelInfoTable.clear();

  atl_reset_atmi_initialized();
  hsa_status_t err = hsa_shut_down();
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA",
           get_error_string(err));
    rc = ATMI_STATUS_ERROR;
  }

  return rc;
}
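/*
 * Lifecycle sketch: Initialize() is idempotent (it returns success once
 * g_atmi_initialized is set) and Finalize() clears the info tables before
 * shutting HSA down, so the expected pairing is simply (exact call path
 * depends on how Runtime is exposed in rt.h):
 *
 *   if (Runtime::Initialize() != ATMI_STATUS_SUCCESS)
 *     return 1; // no usable GPU context
 *   // ... register modules, launch work ...
 *   Runtime::Finalize();
 */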
static void atmi_init_context_structs() {
  atlc.struct_initialized = true; /* This only gets called one time */
  atlc.g_hsa_initialized = false;
  atlc.g_gpu_initialized = false;
  atlc.g_tasks_initialized = false;
}

// Implement memory_pool iteration function
static hsa_status_t get_memory_pool_info(hsa_amd_memory_pool_t memory_pool,
                                         void *data) {
  ATLProcessor *proc = reinterpret_cast<ATLProcessor *>(data);
  hsa_status_t err = HSA_STATUS_SUCCESS;
  // Check if the memory_pool is allowed to allocate, i.e. do not return
  // group memory
  bool alloc_allowed = false;
  err = hsa_amd_memory_pool_get_info(
      memory_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &alloc_allowed);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Alloc allowed in memory pool check", get_error_string(err));
    return err;
  }
  if (alloc_allowed) {
    uint32_t global_flag = 0;
    err = hsa_amd_memory_pool_get_info(
        memory_pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Get memory pool info", get_error_string(err));
      return err;
    }
    if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & global_flag) {
      ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_FINE_GRAINED);
      proc->addMemory(new_mem);
      if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & global_flag) {
        DEBUG_PRINT("GPU kernel args pool handle: %lu\n", memory_pool.handle);
        atl_gpu_kernarg_pools.push_back(memory_pool);
      }
    } else {
      ATLMemory new_mem(memory_pool, *proc, ATMI_MEMTYPE_COARSE_GRAINED);
      proc->addMemory(new_mem);
    }
  }

  return err;
}
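/*
 * Sketch (illustrative only): the same hsa_amd_memory_pool_get_info entry
 * point used above also reports a pool's capacity, which can be handy when
 * deciding where to place large allocations:
 *
 *   size_t pool_size = 0;
 *   hsa_status_t e = hsa_amd_memory_pool_get_info(
 *       memory_pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &pool_size);
 *   if (e == HSA_STATUS_SUCCESS)
 *     DEBUG_PRINT("pool %lu holds %zu bytes\n", memory_pool.handle,
 *                 pool_size);
 */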
static hsa_status_t get_agent_info(hsa_agent_t agent, void *data) {
  hsa_status_t err = HSA_STATUS_SUCCESS;
  hsa_device_type_t device_type;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Get device type info", get_error_string(err));
    return err;
  }
  switch (device_type) {
  case HSA_DEVICE_TYPE_CPU: {
    ATLCPUProcessor new_proc(agent);
    err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
                                             &new_proc);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(err));
      return err;
    }
    g_atl_machine.addProcessor(new_proc);
  } break;
  case HSA_DEVICE_TYPE_GPU: {
    hsa_profile_t profile;
    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &profile);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Query the agent profile", get_error_string(err));
      return err;
    }
    atmi_devtype_t gpu_type;
    gpu_type =
        (profile == HSA_PROFILE_FULL) ? ATMI_DEVTYPE_iGPU : ATMI_DEVTYPE_dGPU;
    ATLGPUProcessor new_proc(agent, gpu_type);
    err = hsa_amd_agent_iterate_memory_pools(agent, get_memory_pool_info,
                                             &new_proc);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(err));
      return err;
    }
    g_atl_machine.addProcessor(new_proc);
  } break;
  case HSA_DEVICE_TYPE_DSP: {
    err = HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  } break;
  }

  return err;
}

hsa_status_t get_fine_grained_region(hsa_region_t region, void *data) {
  hsa_region_segment_t segment;
  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
  if (segment != HSA_REGION_SEGMENT_GLOBAL) {
    return HSA_STATUS_SUCCESS;
  }
  hsa_region_global_flag_t flags;
  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
    hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
    *ret = region;
    return HSA_STATUS_INFO_BREAK;
  }
  return HSA_STATUS_SUCCESS;
}

/* Determines if a memory region can be used for kernarg allocations. */
static hsa_status_t get_kernarg_memory_region(hsa_region_t region, void *data) {
  hsa_region_segment_t segment;
  hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
  if (HSA_REGION_SEGMENT_GLOBAL != segment) {
    return HSA_STATUS_SUCCESS;
  }

  hsa_region_global_flag_t flags;
  hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
    hsa_region_t *ret = reinterpret_cast<hsa_region_t *>(data);
    *ret = region;
    return HSA_STATUS_INFO_BREAK;
  }

  return HSA_STATUS_SUCCESS;
}
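/*
 * Iteration sketch: both region callbacks above follow the HSA convention of
 * returning HSA_STATUS_INFO_BREAK to stop early, which the caller must
 * translate back to success, as init_compute_and_memory does below. E.g. for
 * the kernarg region:
 *
 *   hsa_region_t region;
 *   region.handle = (uint64_t)-1;
 *   hsa_status_t e =
 *       hsa_agent_iterate_regions(agent, get_kernarg_memory_region, &region);
 *   if (e == HSA_STATUS_INFO_BREAK)
 *     e = HSA_STATUS_SUCCESS;          // a region was found
 *   if (region.handle == (uint64_t)-1)
 *     e = HSA_STATUS_ERROR;            // agent has no kernarg region
 */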
static hsa_status_t init_compute_and_memory() {
  hsa_status_t err;

  /* Iterate over the agents and pick the gpu agent */
  err = hsa_iterate_agents(get_agent_info, NULL);
  if (err == HSA_STATUS_INFO_BREAK) {
    err = HSA_STATUS_SUCCESS;
  }
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Getting a gpu agent",
           get_error_string(err));
    return err;
  }

  /* Init all devices or individual device types? */
  std::vector<ATLCPUProcessor> &cpu_procs =
      g_atl_machine.processors<ATLCPUProcessor>();
  std::vector<ATLGPUProcessor> &gpu_procs =
      g_atl_machine.processors<ATLGPUProcessor>();
  /* For CPU memory pools, add other devices that can access them directly
   * or indirectly */
  for (auto &cpu_proc : cpu_procs) {
    for (auto &cpu_mem : cpu_proc.memories()) {
      hsa_amd_memory_pool_t pool = cpu_mem.memory();
      for (auto &gpu_proc : gpu_procs) {
        hsa_agent_t agent = gpu_proc.agent();
        hsa_amd_memory_pool_access_t access;
        hsa_amd_agent_memory_pool_get_info(
            agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
        if (access != 0) {
          // access is not NEVER_ALLOWED; it is allowed either by default or
          // after an explicit request, so record this memory pool for the proc
          gpu_proc.addMemory(cpu_mem);
        }
      }
    }
  }

  /* FIXME: Are the below combinations of procs and memory pools needed?
   * All-to-all compare procs with their memory pools and add those memory
   * pools that are accessible by the target procs. */
  for (auto &gpu_proc : gpu_procs) {
    for (auto &gpu_mem : gpu_proc.memories()) {
      hsa_amd_memory_pool_t pool = gpu_mem.memory();
      for (auto &cpu_proc : cpu_procs) {
        hsa_agent_t agent = cpu_proc.agent();
        hsa_amd_memory_pool_access_t access;
        hsa_amd_agent_memory_pool_get_info(
            agent, pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
        if (access != 0) {
          // access is not NEVER_ALLOWED; it is allowed either by default or
          // after an explicit request, so record this memory pool for the proc
          cpu_proc.addMemory(gpu_mem);
        }
      }
    }
  }

  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_CPU] = cpu_procs.size();
  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_GPU] = gpu_procs.size();

  size_t num_procs = cpu_procs.size() + gpu_procs.size();
  // g_atmi_machine.devices = (atmi_device_t *)malloc(num_procs *
  // sizeof(atmi_device_t));
  atmi_device_t *all_devices = reinterpret_cast<atmi_device_t *>(
      malloc(num_procs * sizeof(atmi_device_t)));
  int num_iGPUs = 0;
  int num_dGPUs = 0;
  for (uint32_t i = 0; i < gpu_procs.size(); i++) {
    if (gpu_procs[i].type() == ATMI_DEVTYPE_iGPU)
      num_iGPUs++;
    else
      num_dGPUs++;
  }
  assert(num_iGPUs + num_dGPUs == gpu_procs.size() &&
         "Number of dGPUs and iGPUs do not add up");
  DEBUG_PRINT("CPU Agents: %lu\n", cpu_procs.size());
  DEBUG_PRINT("iGPU Agents: %d\n", num_iGPUs);
  DEBUG_PRINT("dGPU Agents: %d\n", num_dGPUs);
  DEBUG_PRINT("GPU Agents: %lu\n", gpu_procs.size());

  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_iGPU] = num_iGPUs;
  g_atmi_machine.device_count_by_type[ATMI_DEVTYPE_dGPU] = num_dGPUs;

  int cpus_begin = 0;
  int cpus_end = cpu_procs.size();
  int gpus_begin = cpu_procs.size();
  int gpus_end = cpu_procs.size() + gpu_procs.size();
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_CPU] = &all_devices[cpus_begin];
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU] = &all_devices[gpus_begin];
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_iGPU] = &all_devices[gpus_begin];
  g_atmi_machine.devices_by_type[ATMI_DEVTYPE_dGPU] = &all_devices[gpus_begin];
  int proc_index = 0;
  for (int i = cpus_begin; i < cpus_end; i++) {
    all_devices[i].type = cpu_procs[proc_index].type();

    std::vector<ATLMemory> memories = cpu_procs[proc_index].memories();
    int fine_memories_size = 0;
    int coarse_memories_size = 0;
    DEBUG_PRINT("CPU memory types:\t");
    for (auto &memory : memories) {
      atmi_memtype_t type = memory.type();
      if (type == ATMI_MEMTYPE_FINE_GRAINED) {
        fine_memories_size++;
        DEBUG_PRINT("Fine\t");
      } else {
        coarse_memories_size++;
        DEBUG_PRINT("Coarse\t");
      }
    }
    DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
    DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
    proc_index++;
  }
  proc_index = 0;
  for (int i = gpus_begin; i < gpus_end; i++) {
    all_devices[i].type = gpu_procs[proc_index].type();

    std::vector<ATLMemory> memories = gpu_procs[proc_index].memories();
    int fine_memories_size = 0;
    int coarse_memories_size = 0;
    DEBUG_PRINT("GPU memory types:\t");
    for (auto &memory : memories) {
      atmi_memtype_t type = memory.type();
      if (type == ATMI_MEMTYPE_FINE_GRAINED) {
        fine_memories_size++;
        DEBUG_PRINT("Fine\t");
      } else {
        coarse_memories_size++;
        DEBUG_PRINT("Coarse\t");
      }
    }
    DEBUG_PRINT("\nFine Memories : %d", fine_memories_size);
    DEBUG_PRINT("\tCoarse Memories : %d\n", coarse_memories_size);
    proc_index++;
  }
  proc_index = 0;
  hsa_region_t atl_cpu_kernarg_region;
  atl_cpu_kernarg_region.handle = (uint64_t)-1;
  if (cpu_procs.size() > 0) {
    err = hsa_agent_iterate_regions(
        cpu_procs[0].agent(), get_fine_grained_region, &atl_cpu_kernarg_region);
    if (err == HSA_STATUS_INFO_BREAK) {
      err = HSA_STATUS_SUCCESS;
    }
    err = (atl_cpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
                                                          : HSA_STATUS_SUCCESS;
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Finding a CPU kernarg memory region handle",
             get_error_string(err));
      return err;
    }
  }
  hsa_region_t atl_gpu_kernarg_region;
  /* Find a memory region that supports kernel arguments. */
  atl_gpu_kernarg_region.handle = (uint64_t)-1;
  if (gpu_procs.size() > 0) {
    hsa_agent_iterate_regions(gpu_procs[0].agent(), get_kernarg_memory_region,
                              &atl_gpu_kernarg_region);
    err = (atl_gpu_kernarg_region.handle == (uint64_t)-1) ? HSA_STATUS_ERROR
                                                          : HSA_STATUS_SUCCESS;
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Finding a kernarg memory region", get_error_string(err));
      return err;
    }
  }
  if (num_procs > 0)
    return HSA_STATUS_SUCCESS;
  else
    return HSA_STATUS_ERROR_NOT_INITIALIZED;
}
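/*
 * Layout note (sketch): all_devices is one contiguous array with CPUs first
 * and GPUs second, and devices_by_type[] holds interior pointers into it, so
 * device i of a given type is reachable as:
 *
 *   atmi_device_t *gpu0 =
 *       &g_atmi_machine.devices_by_type[ATMI_DEVTYPE_GPU][0];
 *
 * iGPU/dGPU share the GPU sub-array: they are counted separately in
 * device_count_by_type but are not partitioned within all_devices.
 */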
hsa_status_t init_hsa() {
  if (atlc.g_hsa_initialized == false) {
    DEBUG_PRINT("Initializing HSA...");
    hsa_status_t err = hsa_init();
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Initializing the hsa runtime", get_error_string(err));
      return err;
    }

    err = init_compute_and_memory();
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "After initializing compute and memory", get_error_string(err));
      return err;
    }

    int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
    KernelInfoTable.resize(gpu_count);
    SymbolInfoTable.resize(gpu_count);
    for (uint32_t i = 0; i < SymbolInfoTable.size(); i++)
      SymbolInfoTable[i].clear();
    for (uint32_t i = 0; i < KernelInfoTable.size(); i++)
      KernelInfoTable[i].clear();
    atlc.g_hsa_initialized = true;
    DEBUG_PRINT("done\n");
  }
  return HSA_STATUS_SUCCESS;
}

void init_tasks() {
  if (atlc.g_tasks_initialized != false)
    return;
  std::vector<hsa_agent_t> gpu_agents;
  int gpu_count = g_atl_machine.processorCount<ATLGPUProcessor>();
  for (int gpu = 0; gpu < gpu_count; gpu++) {
    atmi_place_t place = ATMI_PLACE_GPU(0, gpu);
    ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
    gpu_agents.push_back(proc.agent());
  }
  atlc.g_tasks_initialized = true;
}

hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
#if (ROCM_VERSION_MAJOR >= 3) ||                                               \
    (ROCM_VERSION_MAJOR >= 2 && ROCM_VERSION_MINOR >= 3)
  if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
#else
  if (event->event_type == GPU_MEMORY_FAULT_EVENT) {
#endif
    hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
    // memory_fault.agent
    // memory_fault.virtual_address
    // memory_fault.fault_reason_mask
    // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
    std::stringstream stream;
    stream << std::hex << (uintptr_t)memory_fault.virtual_address;
    std::string addr("0x" + stream.str());

    std::string err_string = "[GPU Memory Error] Addr: " + addr;
    err_string += " Reason: ";
    // Single-bit fault reasons, per hsa_amd_memory_fault_reason_t.
    if (!(memory_fault.fault_reason_mask & 0x3F)) {
      err_string += "No Idea! ";
    } else {
      if (memory_fault.fault_reason_mask & 0x1)
        err_string += "Page not present or supervisor privilege. ";
      if (memory_fault.fault_reason_mask & 0x2)
        err_string += "Write access to a read-only page. ";
      if (memory_fault.fault_reason_mask & 0x4)
        err_string += "Execute access to a page marked NX. ";
      if (memory_fault.fault_reason_mask & 0x8)
        err_string += "Host access only. ";
      if (memory_fault.fault_reason_mask & 0x10)
        err_string += "ECC failure (if supported by HW). ";
      if (memory_fault.fault_reason_mask & 0x20)
        err_string += "Can't determine the exact fault address. ";
    }
    fprintf(stderr, "%s\n", err_string.c_str());
    return HSA_STATUS_ERROR;
  }
  return HSA_STATUS_SUCCESS;
}
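/*
 * Decode sketch (illustrative only): the single-bit reason masks above can
 * also be kept in one table-driven mapping, which makes the reason list
 * easier to extend:
 *
 *   static const std::pair<uint32_t, const char *> fault_reasons[] = {
 *       {0x1, "page not present"}, {0x2, "read-only"},  {0x4, "NX"},
 *       {0x8, "host access only"}, {0x10, "ECC"},       {0x20, "imprecise"}};
 *   for (const auto &r : fault_reasons)
 *     if (memory_fault.fault_reason_mask & r.first)
 *       err_string += r.second;
 */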
", memory_fault.virtual_address); 584 std::stringstream stream; 585 stream << std::hex << (uintptr_t)memory_fault.virtual_address; 586 std::string addr("0x" + stream.str()); 587 588 std::string err_string = "[GPU Memory Error] Addr: " + addr; 589 err_string += " Reason: "; 590 if (!(memory_fault.fault_reason_mask & 0x00111111)) { 591 err_string += "No Idea! "; 592 } else { 593 if (memory_fault.fault_reason_mask & 0x00000001) 594 err_string += "Page not present or supervisor privilege. "; 595 if (memory_fault.fault_reason_mask & 0x00000010) 596 err_string += "Write access to a read-only page. "; 597 if (memory_fault.fault_reason_mask & 0x00000100) 598 err_string += "Execute access to a page marked NX. "; 599 if (memory_fault.fault_reason_mask & 0x00001000) 600 err_string += "Host access only. "; 601 if (memory_fault.fault_reason_mask & 0x00010000) 602 err_string += "ECC failure (if supported by HW). "; 603 if (memory_fault.fault_reason_mask & 0x00100000) 604 err_string += "Can't determine the exact fault address. "; 605 } 606 fprintf(stderr, "%s\n", err_string.c_str()); 607 return HSA_STATUS_ERROR; 608 } 609 return HSA_STATUS_SUCCESS; 610 } 611 612 atmi_status_t atl_init_gpu_context() { 613 if (atlc.struct_initialized == false) 614 atmi_init_context_structs(); 615 if (atlc.g_gpu_initialized != false) 616 return ATMI_STATUS_SUCCESS; 617 618 hsa_status_t err; 619 err = init_hsa(); 620 if (err != HSA_STATUS_SUCCESS) 621 return ATMI_STATUS_ERROR; 622 623 err = hsa_amd_register_system_event_handler(callbackEvent, NULL); 624 if (err != HSA_STATUS_SUCCESS) { 625 printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 626 "Registering the system for memory faults", get_error_string(err)); 627 return ATMI_STATUS_ERROR; 628 } 629 630 init_tasks(); 631 atlc.g_gpu_initialized = true; 632 return ATMI_STATUS_SUCCESS; 633 } 634 635 static bool isImplicit(KernelArgMD::ValueKind value_kind) { 636 switch (value_kind) { 637 case KernelArgMD::ValueKind::HiddenGlobalOffsetX: 638 case KernelArgMD::ValueKind::HiddenGlobalOffsetY: 639 case KernelArgMD::ValueKind::HiddenGlobalOffsetZ: 640 case KernelArgMD::ValueKind::HiddenNone: 641 case KernelArgMD::ValueKind::HiddenPrintfBuffer: 642 case KernelArgMD::ValueKind::HiddenDefaultQueue: 643 case KernelArgMD::ValueKind::HiddenCompletionAction: 644 case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: 645 case KernelArgMD::ValueKind::HiddenHostcallBuffer: 646 return true; 647 default: 648 return false; 649 } 650 } 651 652 static std::pair<unsigned char *, unsigned char *> 653 find_metadata(void *binary, size_t binSize) { 654 std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr}; 655 656 Elf *e = elf_memory(static_cast<char *>(binary), binSize); 657 if (elf_kind(e) != ELF_K_ELF) { 658 return failure; 659 } 660 661 size_t numpHdrs; 662 if (elf_getphdrnum(e, &numpHdrs) != 0) { 663 return failure; 664 } 665 666 for (size_t i = 0; i < numpHdrs; ++i) { 667 GElf_Phdr pHdr; 668 if (gelf_getphdr(e, i, &pHdr) != &pHdr) { 669 continue; 670 } 671 // Look for the runtime metadata note 672 if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) { 673 // Iterate over the notes in this segment 674 address ptr = (address)binary + pHdr.p_offset; 675 address segmentEnd = ptr + pHdr.p_filesz; 676 677 while (ptr < segmentEnd) { 678 Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr); 679 address name = (address)¬e[1]; 680 681 if (note->n_type == 7 || note->n_type == 8) { 682 return failure; 683 } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ && 684 
static std::pair<unsigned char *, unsigned char *>
find_metadata(void *binary, size_t binSize) {
  std::pair<unsigned char *, unsigned char *> failure = {nullptr, nullptr};

  // elf_memory does not copy the buffer, so pointers into binary stay valid
  // after elf_end.
  Elf *e = elf_memory(static_cast<char *>(binary), binSize);
  if (elf_kind(e) != ELF_K_ELF) {
    elf_end(e);
    return failure;
  }

  size_t numpHdrs;
  if (elf_getphdrnum(e, &numpHdrs) != 0) {
    elf_end(e);
    return failure;
  }

  for (size_t i = 0; i < numpHdrs; ++i) {
    GElf_Phdr pHdr;
    if (gelf_getphdr(e, i, &pHdr) != &pHdr) {
      continue;
    }
    // Look for the runtime metadata note
    if (pHdr.p_type == PT_NOTE && pHdr.p_align >= sizeof(int)) {
      // Iterate over the notes in this segment
      address ptr = (address)binary + pHdr.p_offset;
      address segmentEnd = ptr + pHdr.p_filesz;

      while (ptr < segmentEnd) {
        Elf_Note *note = reinterpret_cast<Elf_Note *>(ptr);
        address name = (address)&note[1];

        if (note->n_type == 7 || note->n_type == 8) {
          elf_end(e);
          return failure;
        } else if (note->n_type == 10 /* NT_AMD_AMDGPU_HSA_METADATA */ &&
                   note->n_namesz == sizeof "AMD" &&
                   !memcmp(name, "AMD", note->n_namesz)) {
          // code object v2 uses yaml metadata, no longer supported
          elf_end(e);
          return failure;
        } else if (note->n_type == 32 /* NT_AMDGPU_METADATA */ &&
                   note->n_namesz == sizeof "AMDGPU" &&
                   !memcmp(name, "AMDGPU", note->n_namesz)) {
          // n_descsz = 485
          // value is padded to 4 byte alignment, may want to move end up to
          // match
          size_t offset = sizeof(uint32_t) * 3 /* fields */
                          + sizeof("AMDGPU")   /* name */
                          + 1 /* padding to 4 byte alignment */;

          // Including the trailing padding means both pointers are 4 byte
          // aligned, which may be useful later.
          unsigned char *metadata_start = (unsigned char *)ptr + offset;
          unsigned char *metadata_end =
              metadata_start + core::alignUp(note->n_descsz, 4);
          elf_end(e);
          return {metadata_start, metadata_end};
        }
        ptr += sizeof(*note) + core::alignUp(note->n_namesz, sizeof(int)) +
               core::alignUp(note->n_descsz, sizeof(int));
      }
    }
  }

  elf_end(e);
  return failure;
}

namespace {
int map_lookup_array(msgpack::byte_range message, const char *needle,
                     msgpack::byte_range *res, uint64_t *size) {
  unsigned count = 0;
  struct s : msgpack::functors_defaults<s> {
    s(unsigned &count, uint64_t *size) : count(count), size(size) {}
    unsigned &count;
    uint64_t *size;
    const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
      count++;
      *size = N;
      return bytes.end;
    }
  };

  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           // If the message is an array, record number of
                           // elements in *size
                           msgpack::handle_msgpack<s>(value, {count, size});
                           // return the whole array
                           *res = value;
                         }
                       });
  // Only claim success if exactly one key/array pair matched
  return count != 1;
}

int map_lookup_string(msgpack::byte_range message, const char *needle,
                      std::string *res) {
  unsigned count = 0;
  struct s : public msgpack::functors_defaults<s> {
    s(unsigned &count, std::string *res) : count(count), res(res) {}
    unsigned &count;
    std::string *res;
    void handle_string(size_t N, const unsigned char *str) {
      count++;
      *res = std::string(str, str + N);
    }
  };
  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           msgpack::handle_msgpack<s>(value, {count, res});
                         }
                       });
  return count != 1;
}

int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
                        uint64_t *res) {
  unsigned count = 0;
  msgpack::foreach_map(message,
                       [&](msgpack::byte_range key, msgpack::byte_range value) {
                         if (msgpack::message_is_string(key, needle)) {
                           msgpack::foronly_unsigned(value, [&](uint64_t x) {
                             count++;
                             *res = x;
                           });
                         }
                       });
  return count != 1;
}

int array_lookup_element(msgpack::byte_range message, uint64_t elt,
                         msgpack::byte_range *res) {
  int rc = 1;
  uint64_t i = 0;
  msgpack::foreach_array(message, [&](msgpack::byte_range value) {
    if (i == elt) {
      *res = value;
      rc = 0;
    }
    i++;
  });
  return rc;
}

int populate_kernelArgMD(msgpack::byte_range args_element,
                         KernelArgMD *kernelarg) {
  using namespace msgpack;
  int error = 0;
  foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
    if (message_is_string(key, ".name")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        kernelarg->name_ = std::string(str, str + N);
      });
    } else if (message_is_string(key, ".type_name")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        kernelarg->typeName_ = std::string(str, str + N);
      });
    } else if (message_is_string(key, ".size")) {
      foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
    } else if (message_is_string(key, ".offset")) {
      foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
    } else if (message_is_string(key, ".value_kind")) {
      foronly_string(value, [&](size_t N, const unsigned char *str) {
        std::string s = std::string(str, str + N);
        auto itValueKind = ArgValueKind.find(s);
        if (itValueKind != ArgValueKind.end()) {
          kernelarg->valueKind_ = itValueKind->second;
        }
      });
    }
  });
  return error;
}
} // namespace
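/*
 * Helper usage sketch: given one `element` of the "amdhsa.kernels" array,
 * the lookups compose as in the parsing below (error handling elided):
 *
 *   std::string name;
 *   uint64_t kernarg_size = 0;
 *   int rc = 0;
 *   rc += map_lookup_string(element, ".name", &name);
 *   rc += map_lookup_uint64_t(element, ".kernarg_segment_size",
 *                             &kernarg_size);
 *   // rc == 0 iff each key matched exactly once
 */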
static hsa_status_t get_code_object_custom_metadata(void *binary,
                                                    size_t binSize, int gpu) {
  // parse code object with different keys from v2
  // also, the kernel name is not the same as the symbol name -- so a
  // symbol->name map is needed

  std::pair<unsigned char *, unsigned char *> metadata =
      find_metadata(binary, binSize);
  if (!metadata.first) {
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }

  uint64_t kernelsSize = 0;
  int msgpack_errors = 0;
  msgpack::byte_range kernel_array;
  msgpack_errors =
      map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
                       &kernel_array, &kernelsSize);
  if (msgpack_errors != 0) {
    printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
           "kernels lookup in program metadata");
    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
  }

  for (size_t i = 0; i < kernelsSize; i++) {
    assert(msgpack_errors == 0);
    std::string kernelName;
    std::string symbolName;

    msgpack::byte_range element;
    msgpack_errors += array_lookup_element(kernel_array, i, &element);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "element lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    msgpack_errors += map_lookup_string(element, ".name", &kernelName);
    msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "strings lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    // Make sure that kernelName + ".kd" == symbolName
    if ((kernelName + ".kd") != symbolName) {
      printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
             __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}};

    uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
    msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "sgpr count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.sgpr_count = sgpr_count;

    msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "vgpr count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.vgpr_count = vgpr_count;

    msgpack_errors +=
        map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "sgpr spill count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.sgpr_spill_count = sgpr_spill_count;

    msgpack_errors +=
        map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "vgpr spill count metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    info.vgpr_spill_count = vgpr_spill_count;

    size_t kernel_explicit_args_size = 0;
    uint64_t kernel_segment_size;
    msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
                                          &kernel_segment_size);
    if (msgpack_errors != 0) {
      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
             "kernarg segment size metadata lookup in kernel metadata");
      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
    }

    bool hasHiddenArgs = false;
    if (kernel_segment_size > 0) {
      uint64_t argsSize;
      size_t offset = 0;

      msgpack::byte_range args_array;
      msgpack_errors +=
          map_lookup_array(element, ".args", &args_array, &argsSize);
      if (msgpack_errors != 0) {
        printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
               "kernel args metadata lookup in kernel metadata");
        return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
      }

      info.num_args = argsSize;

      for (size_t i = 0; i < argsSize; ++i) {
        KernelArgMD lcArg;

        msgpack::byte_range args_element;
        msgpack_errors += array_lookup_element(args_array, i, &args_element);
        if (msgpack_errors != 0) {
          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
                 "iterate args map in kernel args metadata");
          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
        }

        msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
        if (msgpack_errors != 0) {
          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
                 "parse single arg map in kernel args metadata");
          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
        }
        // populate info with sizes and offsets
        info.arg_sizes.push_back(lcArg.size_);
        // v3 has offset field and not align field
        size_t new_offset = lcArg.offset_;
        size_t padding = new_offset - offset;
        offset = new_offset;
        info.arg_offsets.push_back(lcArg.offset_);
        DEBUG_PRINT("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(),
                    lcArg.size_, lcArg.offset_);
        offset += lcArg.size_;

        // check if the arg is a hidden/implicit arg
        // this logic assumes that all hidden args are 8-byte aligned
        if (!isImplicit(lcArg.valueKind_)) {
          kernel_explicit_args_size += lcArg.size_;
        } else {
          hasHiddenArgs = true;
        }
        kernel_explicit_args_size += padding;
      }
    }

    // Add the size of the implicit args (e.g. global offsets x, y, z and the
    // pipe pointer). ATMI does not count the compiler-generated implicit
    // args: when hidden args are present, it uses the explicit-args footprint
    // as the base and appends its own atmi_implicit_args_t instead.
    info.kernel_segment_size =
        (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size) +
        sizeof(atmi_implicit_args_t);
    DEBUG_PRINT("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
                kernel_segment_size, info.kernel_segment_size);

    // kernel received, now add it to the kernel info table
    KernelInfoTable[gpu][kernelName] = info;
  }

  return HSA_STATUS_SUCCESS;
}
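/*
 * Worked example of the size computation above (hypothetical kernel):
 * explicit args {i32 at offset 0, ptr at offset 8} plus six 8-byte hidden
 * args give a metadata .kernarg_segment_size of 64, while the explicit
 * footprint is 4 + 4 (alignment padding) + 8 = 16 bytes. Since hidden args
 * are present, kernel_segment_size becomes
 * 16 + sizeof(atmi_implicit_args_t): ATMI's own implicit block replaces the
 * compiler's hidden args.
 */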
static hsa_status_t populate_InfoTables(hsa_executable_t executable,
                                        hsa_executable_symbol_t symbol,
                                        void *data) {
  int gpu = *static_cast<int *>(data);
  hsa_symbol_kind_t type;

  uint32_t name_length;
  hsa_status_t err;
  err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
                                       &type);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Symbol info extraction", get_error_string(err));
    return err;
  }
  DEBUG_PRINT("Exec Symbol type: %d\n", type);
  if (type == HSA_SYMBOL_KIND_KERNEL) {
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
    err = hsa_executable_symbol_get_info(symbol,
                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    // remove the suffix .kd from symbol name.
    name[name_length - 3] = 0;

    atl_kernel_info_t info;
    std::string kernelName(name);
    // by now, the kernel info table should already have an entry
    // because the non-ROCr custom code object parsing is called before
    // iterating over the code object symbols using ROCr
    if (KernelInfoTable[gpu].find(kernelName) == KernelInfoTable[gpu].end()) {
      return HSA_STATUS_ERROR;
    }
    // found, so assign and update
    info = KernelInfoTable[gpu][kernelName];

    /* Extract dispatch information from the symbol */
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
        &(info.kernel_object));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the symbol from the executable",
             get_error_string(err));
      return err;
    }
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
        &(info.group_segment_size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the group segment size from the executable",
             get_error_string(err));
      return err;
    }
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
        &(info.private_segment_size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Extracting the private segment from the executable",
             get_error_string(err));
      return err;
    }

    DEBUG_PRINT("Kernel %s --> symbol %lx, group segsize %u, pvt segsize %u, "
                "kernarg size %u bytes\n",
                kernelName.c_str(), info.kernel_object,
                info.group_segment_size, info.private_segment_size,
                info.kernel_segment_size);

    // assign it back to the kernel info table
    KernelInfoTable[gpu][kernelName] = info;
    free(name);
  } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
    err = hsa_executable_symbol_get_info(symbol,
                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info extraction", get_error_string(err));
      return err;
    }
    name[name_length] = 0;

    atl_symbol_info_t info;

    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info address extraction", get_error_string(err));
      return err;
    }

    err = hsa_executable_symbol_get_info(
        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Symbol info size extraction", get_error_string(err));
      return err;
    }

    atmi_mem_place_t place = ATMI_MEM_PLACE(ATMI_DEVTYPE_GPU, gpu, 0);
    DEBUG_PRINT("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr,
                info.size);
    err = register_allocation(reinterpret_cast<void *>(info.addr),
                              (size_t)info.size, place);
    if (err != HSA_STATUS_SUCCESS) {
      return err;
    }
    SymbolInfoTable[gpu][std::string(name)] = info;
    if (strcmp(name, "needs_hostcall_buffer") == 0)
      g_atmi_hostcall_required = true;
    free(name);
  } else {
    DEBUG_PRINT("Symbol is an indirect function\n");
  }
  return HSA_STATUS_SUCCESS;
}
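/*
 * Consumption sketch: the fields stored above are what a later dispatch
 * needs; a launch path would read them back roughly like this ("vector_add"
 * is a hypothetical kernel name; queue/kernarg setup elided):
 *
 *   atl_kernel_info_t ki = KernelInfoTable[gpu]["vector_add"];
 *   hsa_kernel_dispatch_packet_t pkt = {};
 *   pkt.kernel_object = ki.kernel_object;
 *   pkt.group_segment_size = ki.group_segment_size;
 *   pkt.private_segment_size = ki.private_segment_size;
 *   // kernarg buffer sized from ki.kernel_segment_size
 */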
atmi_status_t Runtime::RegisterModuleFromMemory(
    void *module_bytes, size_t module_size, atmi_place_t place,
    atmi_status_t (*on_deserialized_data)(void *data, size_t size,
                                          void *cb_state),
    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
  hsa_status_t err;
  int gpu = place.device_id;
  assert(gpu >= 0);

  DEBUG_PRINT("Trying to load module to GPU-%d\n", gpu);
  ATLGPUProcessor &proc = get_processor<ATLGPUProcessor>(place);
  hsa_agent_t agent = proc.agent();
  hsa_executable_t executable = {0};
  hsa_profile_t agent_profile;

  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Query the agent profile", get_error_string(err));
    return ATMI_STATUS_ERROR;
  }
  // FIXME: Assume that every profile is FULL until we understand how to build
  // GCN with base profile
  agent_profile = HSA_PROFILE_FULL;
  /* Create the empty executable. */
  err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
                              &executable);
  if (err != HSA_STATUS_SUCCESS) {
    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Create the executable", get_error_string(err));
    return ATMI_STATUS_ERROR;
  }
  bool module_load_success = false;
  do // Existing control flow used continue, preserve that for this patch
  {
    {
      // Some metadata info is not available through the ROCr API, so use
      // custom code object metadata parsing to collect it.

      err = get_code_object_custom_metadata(module_bytes, module_size, gpu);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Getting custom code object metadata",
                    get_error_string(err));
        continue;
      }

      // Deserialize code object.
      hsa_code_object_t code_object = {0};
      err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
                                        &code_object);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Code Object Deserialization", get_error_string(err));
        continue;
      }
      assert(0 != code_object.handle);

      // Mutating the device image here avoids another allocation & memcpy
      void *code_object_alloc_data =
          reinterpret_cast<void *>(code_object.handle);
      atmi_status_t atmi_err =
          on_deserialized_data(code_object_alloc_data, module_size, cb_state);
      if (atmi_err != ATMI_STATUS_SUCCESS) {
        printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
               "Error in deserialized_data callback",
               get_atmi_error_string(atmi_err));
        return atmi_err;
      }

      /* Load the code object. */
      err =
          hsa_executable_load_code_object(executable, agent, code_object, NULL);
      if (err != HSA_STATUS_SUCCESS) {
        DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
                    "Loading the code object", get_error_string(err));
        continue;
      }

      // cannot iterate over symbols until executable is frozen
    }
    module_load_success = true;
  } while (0);
  DEBUG_PRINT("Module loaded successfully? %d\n", module_load_success);
  if (module_load_success) {
    /* Freeze the executable; it can now be queried for symbols. */
    err = hsa_executable_freeze(executable, "");
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Freeze the executable", get_error_string(err));
      return ATMI_STATUS_ERROR;
    }

    err = hsa_executable_iterate_symbols(executable, populate_InfoTables,
                                         static_cast<void *>(&gpu));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterating over symbols for executable", get_error_string(err));
      return ATMI_STATUS_ERROR;
    }

    // save the executable and destroy during finalize
    HSAExecutables.push_back(executable);
    return ATMI_STATUS_SUCCESS;
  } else {
    return ATMI_STATUS_ERROR;
  }
}
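/*
 * Caller sketch (hypothetical): load a code object from disk and register
 * it. "kernels.hsaco" and `rt` (a Runtime accessor, per rt.h) are
 * stand-ins, and `no_op` stands in for the deserialization callback; the
 * executables vector owns the result until finalization.
 *
 *   std::vector<hsa_executable_t> execs;
 *   std::ifstream f("kernels.hsaco", std::ios::binary);
 *   std::string img((std::istreambuf_iterator<char>(f)),
 *                   std::istreambuf_iterator<char>());
 *   auto no_op = [](void *, size_t, void *) { return ATMI_STATUS_SUCCESS; };
 *   rt.RegisterModuleFromMemory(&img[0], img.size(), ATMI_PLACE_GPU(0, 0),
 *                               no_op, nullptr, execs);
 */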
} // namespace core