1 //===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for hsa machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <algorithm> 14 #include <assert.h> 15 #include <cstdio> 16 #include <cstdlib> 17 #include <cstring> 18 #include <dlfcn.h> 19 #include <elf.h> 20 #include <ffi.h> 21 #include <fstream> 22 #include <iostream> 23 #include <libelf.h> 24 #include <list> 25 #include <memory> 26 #include <mutex> 27 #include <shared_mutex> 28 #include <thread> 29 #include <unordered_map> 30 #include <vector> 31 32 // Header from ATMI interface 33 #include "atmi_interop_hsa.h" 34 #include "atmi_runtime.h" 35 36 #include "internal.h" 37 38 #include "Debug.h" 39 #include "get_elf_mach_gfx_name.h" 40 #include "machine.h" 41 #include "omptargetplugin.h" 42 #include "print_tracing.h" 43 44 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 45 46 #ifndef TARGET_NAME 47 #define TARGET_NAME AMDHSA 48 #endif 49 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 50 51 // hostrpc interface, FIXME: consider moving to its own include these are 52 // statically linked into amdgpu/plugin if present from hostrpc_services.a, 53 // linked as --whole-archive to override the weak symbols that are used to 54 // implement a fallback for toolchains that do not yet have a hostrpc library. 55 extern "C" { 56 unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q, 57 uint32_t device_id); 58 hsa_status_t hostrpc_init(); 59 hsa_status_t hostrpc_terminate(); 60 61 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } 62 __attribute__((weak)) hsa_status_t hostrpc_terminate() { 63 return HSA_STATUS_SUCCESS; 64 } 65 __attribute__((weak)) unsigned long 66 hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) { 67 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " 68 "missing\n", 69 device_id); 70 return 0; 71 } 72 } 73 74 int print_kernel_trace; 75 76 #ifdef OMPTARGET_DEBUG 77 #define check(msg, status) \ 78 if (status != HSA_STATUS_SUCCESS) { \ 79 DP(#msg " failed\n"); \ 80 } else { \ 81 DP(#msg " succeeded\n"); \ 82 } 83 #else 84 #define check(msg, status) \ 85 {} 86 #endif 87 88 #include "elf_common.h" 89 90 namespace core { 91 hsa_status_t RegisterModuleFromMemory( 92 std::map<std::string, atl_kernel_info_t> &KernelInfo, 93 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, void *, size_t, 94 int DeviceId, 95 hsa_status_t (*on_deserialized_data)(void *data, size_t size, 96 void *cb_state), 97 void *cb_state, std::vector<hsa_executable_t> &HSAExecutables); 98 } 99 100 /// Keep entries table per device 101 struct FuncOrGblEntryTy { 102 __tgt_target_table Table; 103 std::vector<__tgt_offload_entry> Entries; 104 }; 105 106 enum ExecutionModeType { 107 SPMD, // constructors, destructors, 108 // combined constructs (`teams distribute parallel for [simd]`) 109 GENERIC, // everything else 110 NONE 111 }; 112 113 struct KernelArgPool { 114 private: 115 static pthread_mutex_t mutex; 116 117 public: 118 uint32_t kernarg_segment_size; 119 void *kernarg_region = nullptr; 120 std::queue<int> free_kernarg_segments; 121 122 uint32_t 
kernarg_size_including_implicit() {
    return kernarg_segment_size + sizeof(atmi_implicit_args_t);
  }

  ~KernelArgPool() {
    if (kernarg_region) {
      auto r = hsa_amd_memory_pool_free(kernarg_region);
      if (r != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(r));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t kernarg_segment_size,
                hsa_amd_memory_pool_t &memory_pool)
      : kernarg_segment_size(kernarg_segment_size) {

    // atmi uses one pool per kernel for all gpus, with a fixed upper size
    // preserving that exact scheme here, including the queue<int>

    hsa_status_t err = hsa_amd_memory_pool_allocate(
        memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
        &kernarg_region);

    if (err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(err));
      kernarg_region = nullptr; // paranoid
      return;
    }

    err = core::allow_access_to_all_gpu_agents(kernarg_region);
    if (err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(err));
      auto r = hsa_amd_memory_pool_free(kernarg_region);
      if (r != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(r));
      }
      kernarg_region = nullptr;
      return;
    }

    for (int i = 0; i < MAX_NUM_KERNELS; i++) {
      free_kernarg_segments.push(i);
    }
  }

  void *allocate(uint64_t arg_num) {
    assert((arg_num * sizeof(void *)) == kernarg_segment_size);
    lock l(&mutex);
    void *res = nullptr;
    if (!free_kernarg_segments.empty()) {
      int free_idx = free_kernarg_segments.front();
      res = static_cast<void *>(static_cast<char *>(kernarg_region) +
                                (free_idx * kernarg_size_including_implicit()));
      assert(free_idx == pointer_to_index(res));
      free_kernarg_segments.pop();
    }
    return res;
  }

  void deallocate(void *ptr) {
    lock l(&mutex);
    int idx = pointer_to_index(ptr);
    free_kernarg_segments.push(idx);
  }

private:
  int pointer_to_index(void *ptr) {
    ptrdiff_t bytes =
        static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
    assert(bytes >= 0);
    assert(bytes % kernarg_size_including_implicit() == 0);
    return bytes / kernarg_size_including_implicit();
  }
  struct lock {
    lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
    ~lock() { pthread_mutex_unlock(m); }
    pthread_mutex_t *m;
  };
};
pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;
  int16_t ConstWGSize;
  int32_t device_id;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
           void *_CallStackAddr, const char *_Name,
           uint32_t _kernarg_segment_size,
           hsa_amd_memory_pool_t &KernArgMemoryPool)
      : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) { 232 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); 233 234 std::string N(_Name); 235 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { 236 KernelArgPoolMap.insert( 237 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( 238 _kernarg_segment_size, KernArgMemoryPool)))); 239 } 240 } 241 }; 242 243 /// List that contains all the kernels. 244 /// FIXME: we may need this to be per device and per library. 245 std::list<KernelTy> KernelsList; 246 247 static std::vector<hsa_agent_t> find_gpu_agents() { 248 std::vector<hsa_agent_t> res; 249 250 hsa_status_t err = hsa_iterate_agents( 251 [](hsa_agent_t agent, void *data) -> hsa_status_t { 252 std::vector<hsa_agent_t> *res = 253 static_cast<std::vector<hsa_agent_t> *>(data); 254 255 hsa_device_type_t device_type; 256 // get_info fails iff HSA runtime not yet initialized 257 hsa_status_t err = 258 hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type); 259 if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) 260 printf("rtl.cpp: err %d\n", err); 261 assert(err == HSA_STATUS_SUCCESS); 262 263 if (device_type == HSA_DEVICE_TYPE_GPU) { 264 res->push_back(agent); 265 } 266 return HSA_STATUS_SUCCESS; 267 }, 268 &res); 269 270 // iterate_agents fails iff HSA runtime not yet initialized 271 if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) 272 printf("rtl.cpp: err %d\n", err); 273 assert(err == HSA_STATUS_SUCCESS); 274 return res; 275 } 276 277 static void callbackQueue(hsa_status_t status, hsa_queue_t *source, 278 void *data) { 279 if (status != HSA_STATUS_SUCCESS) { 280 const char *status_string; 281 if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) { 282 status_string = "unavailable"; 283 } 284 fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, 285 __LINE__, source, status, status_string); 286 abort(); 287 } 288 } 289 290 namespace core { 291 namespace { 292 void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) { 293 __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE); 294 } 295 296 uint16_t create_header() { 297 uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; 298 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; 299 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; 300 return header; 301 } 302 303 hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { 304 std::vector<hsa_amd_memory_pool_t> *Result = 305 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); 306 bool AllocAllowed = false; 307 hsa_status_t err = hsa_amd_memory_pool_get_info( 308 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, 309 &AllocAllowed); 310 if (err != HSA_STATUS_SUCCESS) { 311 fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n", 312 get_error_string(err)); 313 return err; 314 } 315 316 if (!AllocAllowed) { 317 // nothing needs to be done here. 
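    // Returning success from this callback lets
    // hsa_amd_agent_iterate_memory_pools continue with the next pool;
    // returning an error status would stop the iteration altogether.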
318 return HSA_STATUS_SUCCESS; 319 } 320 321 uint32_t GlobalFlags = 0; 322 err = hsa_amd_memory_pool_get_info( 323 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); 324 if (err != HSA_STATUS_SUCCESS) { 325 fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err)); 326 return err; 327 } 328 329 fprintf(stderr, "Flags : %d\n", GlobalFlags); 330 if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) && 331 (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) { 332 size_t size = 0; 333 err = hsa_amd_memory_pool_get_info(MemoryPool, 334 HSA_AMD_MEMORY_POOL_INFO_SIZE, &size); 335 if (err != HSA_STATUS_SUCCESS) { 336 fprintf(stderr, "Get memory pool size failed: %s\n", 337 get_error_string(err)); 338 return err; 339 } 340 if (size > 0) 341 Result->push_back(MemoryPool); 342 } 343 344 return HSA_STATUS_SUCCESS; 345 } 346 347 std::pair<hsa_status_t, hsa_amd_memory_pool_t> 348 FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) { 349 std::vector<hsa_amd_memory_pool_t> KernArgPools; 350 for (const auto &processor : g_atl_machine.processors<ATLCPUProcessor>()) { 351 hsa_agent_t Agent = processor.agent(); 352 hsa_status_t err = HSA_STATUS_SUCCESS; 353 err = hsa_amd_agent_iterate_memory_pools( 354 Agent, addKernArgPool, static_cast<void *>(&KernArgPools)); 355 if (err != HSA_STATUS_SUCCESS) { 356 printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 357 "Iterate all memory pools", get_error_string(err)); 358 return {err, hsa_amd_memory_pool_t{}}; 359 } 360 } 361 362 if (KernArgPools.empty()) { 363 fprintf(stderr, "Unable to find any valid kernarg pool\n"); 364 return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}}; 365 } 366 367 return {HSA_STATUS_SUCCESS, KernArgPools[0]}; 368 } 369 370 } // namespace 371 } // namespace core 372 373 /// Class containing all the device information 374 class RTLDeviceInfoTy { 375 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; 376 377 public: 378 // load binary populates symbol tables and mutates various global state 379 // run uses those symbol tables 380 std::shared_timed_mutex load_run_lock; 381 382 int NumberOfDevices; 383 384 // GPU devices 385 std::vector<hsa_agent_t> HSAAgents; 386 std::vector<hsa_queue_t *> HSAQueues; // one per gpu 387 388 // Device properties 389 std::vector<int> ComputeUnits; 390 std::vector<int> GroupsPerDevice; 391 std::vector<int> ThreadsPerGroup; 392 std::vector<int> WarpSize; 393 std::vector<std::string> GPUName; 394 395 // OpenMP properties 396 std::vector<int> NumTeams; 397 std::vector<int> NumThreads; 398 399 // OpenMP Environment properties 400 int EnvNumTeams; 401 int EnvTeamLimit; 402 int EnvMaxTeamsDefault; 403 404 // OpenMP Requires Flags 405 int64_t RequiresFlags; 406 407 // Resource pools 408 SignalPoolT FreeSignalPool; 409 410 bool hostcall_required = false; 411 412 std::vector<hsa_executable_t> HSAExecutables; 413 414 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; 415 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; 416 417 hsa_amd_memory_pool_t KernArgPool; 418 419 struct atmiFreePtrDeletor { 420 void operator()(void *p) { 421 core::Runtime::Memfree(p); // ignore failure to free 422 } 423 }; 424 425 // device_State shared across loaded binaries, error if inconsistent size 426 std::vector<std::pair<std::unique_ptr<void, atmiFreePtrDeletor>, uint64_t>> 427 deviceStateStore; 428 429 static const unsigned HardTeamLimit = 430 (1 << 16) - 1; // 64K needed to fit in uint16 431 static const int DefaultNumTeams = 128; 432 
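  // The freesignalpool_memcpy_{h2d,d2h} helpers further down borrow a
  // completion signal from FreeSignalPool, run the ATMI copy, then return the
  // signal to the pool. A minimal usage sketch (illustrative only; the buffer
  // and device id are hypothetical, error handling trimmed), mirroring what
  // dataSubmit/dataRetrieve below do:
  //
  //   char host[64] = {};
  //   void *tgt = nullptr;
  //   if (core::Runtime::DeviceMalloc(&tgt, sizeof(host), /*DeviceId=*/0) ==
  //           HSA_STATUS_SUCCESS &&
  //       DeviceInfo.freesignalpool_memcpy_h2d(tgt, host, sizeof(host), 0) ==
  //           HSA_STATUS_SUCCESS) {
  //     // data is now resident on device 0
  //   }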
static const int Max_Teams = 433 llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams]; 434 static const int Warp_Size = 435 llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; 436 static const int Max_WG_Size = 437 llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size]; 438 static const int Default_WG_Size = 439 llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size]; 440 441 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *, 442 size_t size, hsa_agent_t); 443 hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size, 444 MemcpyFunc Func, int32_t deviceId) { 445 hsa_agent_t agent = HSAAgents[deviceId]; 446 hsa_signal_t s = FreeSignalPool.pop(); 447 if (s.handle == 0) { 448 return HSA_STATUS_ERROR; 449 } 450 hsa_status_t r = Func(s, dest, src, size, agent); 451 FreeSignalPool.push(s); 452 return r; 453 } 454 455 hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src, 456 size_t size, int32_t deviceId) { 457 return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId); 458 } 459 460 hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src, 461 size_t size, int32_t deviceId) { 462 return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId); 463 } 464 465 // Record entry point associated with device 466 void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { 467 assert(device_id < (int32_t)FuncGblEntries.size() && 468 "Unexpected device id!"); 469 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 470 471 E.Entries.push_back(entry); 472 } 473 474 // Return true if the entry is associated with device 475 bool findOffloadEntry(int32_t device_id, void *addr) { 476 assert(device_id < (int32_t)FuncGblEntries.size() && 477 "Unexpected device id!"); 478 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 479 480 for (auto &it : E.Entries) { 481 if (it.addr == addr) 482 return true; 483 } 484 485 return false; 486 } 487 488 // Return the pointer to the target entries table 489 __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { 490 assert(device_id < (int32_t)FuncGblEntries.size() && 491 "Unexpected device id!"); 492 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 493 494 int32_t size = E.Entries.size(); 495 496 // Table is empty 497 if (!size) 498 return 0; 499 500 __tgt_offload_entry *begin = &E.Entries[0]; 501 __tgt_offload_entry *end = &E.Entries[size - 1]; 502 503 // Update table info according to the entries and return the pointer 504 E.Table.EntriesBegin = begin; 505 E.Table.EntriesEnd = ++end; 506 507 return &E.Table; 508 } 509 510 // Clear entries table for a device 511 void clearOffloadEntriesTable(int device_id) { 512 assert(device_id < (int32_t)FuncGblEntries.size() && 513 "Unexpected device id!"); 514 FuncGblEntries[device_id].emplace_back(); 515 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 516 // KernelArgPoolMap.clear(); 517 E.Entries.clear(); 518 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 519 } 520 521 RTLDeviceInfoTy() { 522 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr 523 // anytime. You do not need a debug library build. 
524 // 0 => no tracing 525 // 1 => tracing dispatch only 526 // >1 => verbosity increase 527 if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) 528 print_kernel_trace = atoi(envStr); 529 else 530 print_kernel_trace = 0; 531 532 DP("Start initializing HSA-ATMI\n"); 533 hsa_status_t err = core::atl_init_gpu_context(); 534 if (err != HSA_STATUS_SUCCESS) { 535 DP("Error when initializing HSA-ATMI\n"); 536 return; 537 } 538 539 // Init hostcall soon after initializing ATMI 540 hostrpc_init(); 541 542 HSAAgents = find_gpu_agents(); 543 NumberOfDevices = (int)HSAAgents.size(); 544 545 if (NumberOfDevices == 0) { 546 DP("There are no devices supporting HSA.\n"); 547 return; 548 } else { 549 DP("There are %d devices supporting HSA.\n", NumberOfDevices); 550 } 551 552 std::tie(err, KernArgPool) = core::FindKernargPool(HSAAgents); 553 if (err != HSA_STATUS_SUCCESS) { 554 DP("Error when reading memory pools\n"); 555 return; 556 } 557 558 // Init the device info 559 HSAQueues.resize(NumberOfDevices); 560 FuncGblEntries.resize(NumberOfDevices); 561 ThreadsPerGroup.resize(NumberOfDevices); 562 ComputeUnits.resize(NumberOfDevices); 563 GPUName.resize(NumberOfDevices); 564 GroupsPerDevice.resize(NumberOfDevices); 565 WarpSize.resize(NumberOfDevices); 566 NumTeams.resize(NumberOfDevices); 567 NumThreads.resize(NumberOfDevices); 568 deviceStateStore.resize(NumberOfDevices); 569 KernelInfoTable.resize(NumberOfDevices); 570 SymbolInfoTable.resize(NumberOfDevices); 571 572 for (int i = 0; i < NumberOfDevices; i++) { 573 HSAQueues[i] = nullptr; 574 } 575 576 for (int i = 0; i < NumberOfDevices; i++) { 577 uint32_t queue_size = 0; 578 { 579 hsa_status_t err = hsa_agent_get_info( 580 HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size); 581 if (err != HSA_STATUS_SUCCESS) { 582 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i); 583 return; 584 } 585 if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) { 586 queue_size = core::Runtime::getInstance().getMaxQueueSize(); 587 } 588 } 589 590 hsa_status_t rc = hsa_queue_create( 591 HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL, 592 UINT32_MAX, UINT32_MAX, &HSAQueues[i]); 593 if (rc != HSA_STATUS_SUCCESS) { 594 DP("Failed to create HSA queue %d\n", i); 595 return; 596 } 597 598 deviceStateStore[i] = {nullptr, 0}; 599 } 600 601 for (int i = 0; i < NumberOfDevices; i++) { 602 ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size; 603 GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams; 604 ComputeUnits[i] = 1; 605 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i, 606 GroupsPerDevice[i], ThreadsPerGroup[i]); 607 } 608 609 // Get environment variables regarding teams 610 char *envStr = getenv("OMP_TEAM_LIMIT"); 611 if (envStr) { 612 // OMP_TEAM_LIMIT has been set 613 EnvTeamLimit = std::stoi(envStr); 614 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 615 } else { 616 EnvTeamLimit = -1; 617 } 618 envStr = getenv("OMP_NUM_TEAMS"); 619 if (envStr) { 620 // OMP_NUM_TEAMS has been set 621 EnvNumTeams = std::stoi(envStr); 622 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 623 } else { 624 EnvNumTeams = -1; 625 } 626 // Get environment variables regarding expMaxTeams 627 envStr = getenv("OMP_MAX_TEAMS_DEFAULT"); 628 if (envStr) { 629 EnvMaxTeamsDefault = std::stoi(envStr); 630 DP("Parsed OMP_MAX_TEAMS_DEFAULT=%d\n", EnvMaxTeamsDefault); 631 } else { 632 EnvMaxTeamsDefault = -1; 633 } 634 635 // Default state. 
636 RequiresFlags = OMP_REQ_UNDEFINED; 637 } 638 639 ~RTLDeviceInfoTy() { 640 DP("Finalizing the HSA-ATMI DeviceInfo.\n"); 641 // Run destructors on types that use HSA before 642 // atmi_finalize removes access to it 643 deviceStateStore.clear(); 644 KernelArgPoolMap.clear(); 645 // Terminate hostrpc before finalizing ATMI 646 hostrpc_terminate(); 647 648 hsa_status_t Err; 649 for (uint32_t I = 0; I < HSAExecutables.size(); I++) { 650 Err = hsa_executable_destroy(HSAExecutables[I]); 651 if (Err != HSA_STATUS_SUCCESS) { 652 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 653 "Destroying executable", get_error_string(Err)); 654 } 655 } 656 657 Err = hsa_shut_down(); 658 if (Err != HSA_STATUS_SUCCESS) { 659 printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA", 660 get_error_string(Err)); 661 } 662 } 663 }; 664 665 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; 666 667 // TODO: May need to drop the trailing to fields until deviceRTL is updated 668 struct omptarget_device_environmentTy { 669 int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG 670 // only useful for Debug build of deviceRTLs 671 int32_t num_devices; // gets number of active offload devices 672 int32_t device_num; // gets a value 0 to num_devices-1 673 }; 674 675 static RTLDeviceInfoTy DeviceInfo; 676 677 namespace { 678 679 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, 680 __tgt_async_info *AsyncInfo) { 681 assert(AsyncInfo && "AsyncInfo is nullptr"); 682 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 683 // Return success if we are not copying back to host from target. 684 if (!HstPtr) 685 return OFFLOAD_SUCCESS; 686 hsa_status_t err; 687 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 688 (long long unsigned)(Elf64_Addr)TgtPtr, 689 (long long unsigned)(Elf64_Addr)HstPtr); 690 691 err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size, 692 DeviceId); 693 694 if (err != HSA_STATUS_SUCCESS) { 695 DP("Error when copying data from device to host. Pointers: " 696 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 697 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 698 return OFFLOAD_FAIL; 699 } 700 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 701 (long long unsigned)(Elf64_Addr)TgtPtr, 702 (long long unsigned)(Elf64_Addr)HstPtr); 703 return OFFLOAD_SUCCESS; 704 } 705 706 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, 707 __tgt_async_info *AsyncInfo) { 708 assert(AsyncInfo && "AsyncInfo is nullptr"); 709 hsa_status_t err; 710 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 711 // Return success if we are not doing host to target. 712 if (!HstPtr) 713 return OFFLOAD_SUCCESS; 714 715 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, 716 (long long unsigned)(Elf64_Addr)HstPtr, 717 (long long unsigned)(Elf64_Addr)TgtPtr); 718 err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size, 719 DeviceId); 720 if (err != HSA_STATUS_SUCCESS) { 721 DP("Error when copying data from host to device. Pointers: " 722 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 723 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 724 return OFFLOAD_FAIL; 725 } 726 return OFFLOAD_SUCCESS; 727 } 728 729 // Async. 730 // The implementation was written with cuda streams in mind. 
The semantics of
// that are to execute kernels on a queue in order of insertion. A synchronize
// call then makes writes visible between host and device. This means a series
// of N data_submit_async calls are expected to execute serially. HSA offers
// various options to run the data copies concurrently. This may require
// changes to libomptarget.

// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate
// that there are no outstanding kernels that need to be synchronized. Any
// async call may be passed a Queue == 0, at which point the cuda
// implementation will set it to non-null (see getStream). The cuda streams
// are per-device. Upstream may change this interface to explicitly initialize
// the AsyncInfo_pointer, but until then hsa lazily initializes it as well.

void initAsyncInfo(__tgt_async_info *AsyncInfo) {
  // set non-null while using async calls, return to null to indicate
  // completion
  assert(AsyncInfo);
  if (!AsyncInfo->Queue) {
    AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
  }
}
void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
  assert(AsyncInfo);
  assert(AsyncInfo->Queue);
  AsyncInfo->Queue = 0;
}

bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
  const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
  int32_t r = elf_check_machine(image, amdgcnMachineID);
  if (!r) {
    DP("Supported machine ID not found\n");
  }
  return r;
}

uint32_t elf_e_flags(__tgt_device_image *image) {
  char *img_begin = (char *)image->ImageStart;
  size_t img_size = (char *)image->ImageEnd - img_begin;

  Elf *e = elf_memory(img_begin, img_size);
  if (!e) {
    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
    return 0;
  }

  Elf64_Ehdr *eh64 = elf64_getehdr(e);

  if (!eh64) {
    DP("Unable to get machine ID from ELF file!\n");
    elf_end(e);
    return 0;
  }

  uint32_t Flags = eh64->e_flags;

  elf_end(e);
  DP("ELF Flags: 0x%x\n", Flags);
  return Flags;
}
} // namespace

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_machine_id_is_amdgcn(image);
}

int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

namespace {
template <typename T> bool enforce_upper_bound(T *value, T upper) {
  bool changed = *value > upper;
  if (changed) {
    *value = upper;
  }
  return changed;
}
} // namespace

int32_t __tgt_rtl_init_device(int device_id) {
  hsa_status_t err;

  // this is per device id init
  DP("Initialize the device id: %d\n", device_id);

  hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];

  // Get the number of compute units
  uint32_t compute_units = 0;
  err = hsa_agent_get_info(
      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
      &compute_units);
  if (err != HSA_STATUS_SUCCESS) {
    DeviceInfo.ComputeUnits[device_id] = 1;
    DP("Error getting compute units : setting to 1\n");
  } else {
    DeviceInfo.ComputeUnits[device_id] = compute_units;
    DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[device_id]);
  }

  char GetInfoName[64]; // 64 is the max size returned by get info
  err =
hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, 837 (void *)GetInfoName); 838 if (err) 839 DeviceInfo.GPUName[device_id] = "--unknown gpu--"; 840 else { 841 DeviceInfo.GPUName[device_id] = GetInfoName; 842 } 843 844 if (print_kernel_trace & STARTUP_DETAILS) 845 fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id, 846 DeviceInfo.ComputeUnits[device_id], 847 DeviceInfo.GPUName[device_id].c_str()); 848 849 // Query attributes to determine number of threads/block and blocks/grid. 850 uint16_t workgroup_max_dim[3]; 851 err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 852 &workgroup_max_dim); 853 if (err != HSA_STATUS_SUCCESS) { 854 DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 855 DP("Error getting grid dims: num groups : %d\n", 856 RTLDeviceInfoTy::DefaultNumTeams); 857 } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) { 858 DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0]; 859 DP("Using %d ROCm blocks per grid\n", 860 DeviceInfo.GroupsPerDevice[device_id]); 861 } else { 862 DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit; 863 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " 864 "at the hard limit\n", 865 workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit); 866 } 867 868 // Get thread limit 869 hsa_dim3_t grid_max_dim; 870 err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim); 871 if (err == HSA_STATUS_SUCCESS) { 872 DeviceInfo.ThreadsPerGroup[device_id] = 873 reinterpret_cast<uint32_t *>(&grid_max_dim)[0] / 874 DeviceInfo.GroupsPerDevice[device_id]; 875 876 if (DeviceInfo.ThreadsPerGroup[device_id] == 0) { 877 DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; 878 DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); 879 } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id], 880 RTLDeviceInfoTy::Max_WG_Size)) { 881 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size); 882 } else { 883 DP("Using ROCm Queried thread limit: %d\n", 884 DeviceInfo.ThreadsPerGroup[device_id]); 885 } 886 } else { 887 DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size; 888 DP("Error getting max block dimension, use default:%d \n", 889 RTLDeviceInfoTy::Max_WG_Size); 890 } 891 892 // Get wavefront size 893 uint32_t wavefront_size = 0; 894 err = 895 hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size); 896 if (err == HSA_STATUS_SUCCESS) { 897 DP("Queried wavefront size: %d\n", wavefront_size); 898 DeviceInfo.WarpSize[device_id] = wavefront_size; 899 } else { 900 DP("Default wavefront size: %d\n", 901 llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]); 902 DeviceInfo.WarpSize[device_id] = 903 llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]; 904 } 905 906 // Adjust teams to the env variables 907 908 if (DeviceInfo.EnvTeamLimit > 0 && 909 (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id], 910 DeviceInfo.EnvTeamLimit))) { 911 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", 912 DeviceInfo.EnvTeamLimit); 913 } 914 915 // Set default number of teams 916 if (DeviceInfo.EnvNumTeams > 0) { 917 DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; 918 DP("Default number of teams set according to environment %d\n", 919 DeviceInfo.EnvNumTeams); 920 } else { 921 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); 922 int TeamsPerCU = 1; // default number of teams per CU is 1 923 if (TeamsPerCUEnvStr) { 924 TeamsPerCU 
= std::stoi(TeamsPerCUEnvStr); 925 } 926 927 DeviceInfo.NumTeams[device_id] = 928 TeamsPerCU * DeviceInfo.ComputeUnits[device_id]; 929 DP("Default number of teams = %d * number of compute units %d\n", 930 TeamsPerCU, DeviceInfo.ComputeUnits[device_id]); 931 } 932 933 if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id], 934 DeviceInfo.GroupsPerDevice[device_id])) { 935 DP("Default number of teams exceeds device limit, capping at %d\n", 936 DeviceInfo.GroupsPerDevice[device_id]); 937 } 938 939 // Set default number of threads 940 DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size; 941 DP("Default number of threads set according to library's default %d\n", 942 RTLDeviceInfoTy::Default_WG_Size); 943 if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id], 944 DeviceInfo.ThreadsPerGroup[device_id])) { 945 DP("Default number of threads exceeds device limit, capping at %d\n", 946 DeviceInfo.ThreadsPerGroup[device_id]); 947 } 948 949 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", 950 device_id, DeviceInfo.GroupsPerDevice[device_id], 951 DeviceInfo.ThreadsPerGroup[device_id]); 952 953 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id, 954 DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id], 955 DeviceInfo.GroupsPerDevice[device_id], 956 DeviceInfo.GroupsPerDevice[device_id] * 957 DeviceInfo.ThreadsPerGroup[device_id]); 958 959 return OFFLOAD_SUCCESS; 960 } 961 962 namespace { 963 Elf64_Shdr *find_only_SHT_HASH(Elf *elf) { 964 size_t N; 965 int rc = elf_getshdrnum(elf, &N); 966 if (rc != 0) { 967 return nullptr; 968 } 969 970 Elf64_Shdr *result = nullptr; 971 for (size_t i = 0; i < N; i++) { 972 Elf_Scn *scn = elf_getscn(elf, i); 973 if (scn) { 974 Elf64_Shdr *shdr = elf64_getshdr(scn); 975 if (shdr) { 976 if (shdr->sh_type == SHT_HASH) { 977 if (result == nullptr) { 978 result = shdr; 979 } else { 980 // multiple SHT_HASH sections not handled 981 return nullptr; 982 } 983 } 984 } 985 } 986 } 987 return result; 988 } 989 990 const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash, 991 const char *symname) { 992 993 assert(section_hash); 994 size_t section_symtab_index = section_hash->sh_link; 995 Elf64_Shdr *section_symtab = 996 elf64_getshdr(elf_getscn(elf, section_symtab_index)); 997 size_t section_strtab_index = section_symtab->sh_link; 998 999 const Elf64_Sym *symtab = 1000 reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset); 1001 1002 const uint32_t *hashtab = 1003 reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset); 1004 1005 // Layout: 1006 // nbucket 1007 // nchain 1008 // bucket[nbucket] 1009 // chain[nchain] 1010 uint32_t nbucket = hashtab[0]; 1011 const uint32_t *bucket = &hashtab[2]; 1012 const uint32_t *chain = &hashtab[nbucket + 2]; 1013 1014 const size_t max = strlen(symname) + 1; 1015 const uint32_t hash = elf_hash(symname); 1016 for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) { 1017 char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name); 1018 if (strncmp(symname, n, max) == 0) { 1019 return &symtab[i]; 1020 } 1021 } 1022 1023 return nullptr; 1024 } 1025 1026 typedef struct { 1027 void *addr = nullptr; 1028 uint32_t size = UINT32_MAX; 1029 uint32_t sh_type = SHT_NULL; 1030 } symbol_info; 1031 1032 int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname, 1033 symbol_info *res) { 1034 if (elf_kind(elf) != ELF_K_ELF) { 1035 return 1; 1036 } 1037 1038 Elf64_Shdr *section_hash = 
find_only_SHT_HASH(elf); 1039 if (!section_hash) { 1040 return 1; 1041 } 1042 1043 const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname); 1044 if (!sym) { 1045 return 1; 1046 } 1047 1048 if (sym->st_size > UINT32_MAX) { 1049 return 1; 1050 } 1051 1052 if (sym->st_shndx == SHN_UNDEF) { 1053 return 1; 1054 } 1055 1056 Elf_Scn *section = elf_getscn(elf, sym->st_shndx); 1057 if (!section) { 1058 return 1; 1059 } 1060 1061 Elf64_Shdr *header = elf64_getshdr(section); 1062 if (!header) { 1063 return 1; 1064 } 1065 1066 res->addr = sym->st_value + base; 1067 res->size = static_cast<uint32_t>(sym->st_size); 1068 res->sh_type = header->sh_type; 1069 return 0; 1070 } 1071 1072 int get_symbol_info_without_loading(char *base, size_t img_size, 1073 const char *symname, symbol_info *res) { 1074 Elf *elf = elf_memory(base, img_size); 1075 if (elf) { 1076 int rc = get_symbol_info_without_loading(elf, base, symname, res); 1077 elf_end(elf); 1078 return rc; 1079 } 1080 return 1; 1081 } 1082 1083 hsa_status_t interop_get_symbol_info(char *base, size_t img_size, 1084 const char *symname, void **var_addr, 1085 uint32_t *var_size) { 1086 symbol_info si; 1087 int rc = get_symbol_info_without_loading(base, img_size, symname, &si); 1088 if (rc == 0) { 1089 *var_addr = si.addr; 1090 *var_size = si.size; 1091 return HSA_STATUS_SUCCESS; 1092 } else { 1093 return HSA_STATUS_ERROR; 1094 } 1095 } 1096 1097 template <typename C> 1098 hsa_status_t module_register_from_memory_to_place( 1099 std::map<std::string, atl_kernel_info_t> &KernelInfoTable, 1100 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, 1101 void *module_bytes, size_t module_size, int DeviceId, C cb, 1102 std::vector<hsa_executable_t> &HSAExecutables) { 1103 auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t { 1104 C *unwrapped = static_cast<C *>(cb_state); 1105 return (*unwrapped)(data, size); 1106 }; 1107 return core::RegisterModuleFromMemory( 1108 KernelInfoTable, SymbolInfoTable, module_bytes, module_size, DeviceId, L, 1109 static_cast<void *>(&cb), HSAExecutables); 1110 } 1111 } // namespace 1112 1113 static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) { 1114 uint64_t device_State_bytes = 0; 1115 { 1116 // If this is the deviceRTL, get the state variable size 1117 symbol_info size_si; 1118 int rc = get_symbol_info_without_loading( 1119 ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si); 1120 1121 if (rc == 0) { 1122 if (size_si.size != sizeof(uint64_t)) { 1123 DP("Found device_State_size variable with wrong size\n"); 1124 return 0; 1125 } 1126 1127 // Read number of bytes directly from the elf 1128 memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t)); 1129 } 1130 } 1131 return device_State_bytes; 1132 } 1133 1134 static __tgt_target_table * 1135 __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); 1136 1137 static __tgt_target_table * 1138 __tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image); 1139 1140 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1141 __tgt_device_image *image) { 1142 DeviceInfo.load_run_lock.lock(); 1143 __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image); 1144 DeviceInfo.load_run_lock.unlock(); 1145 return res; 1146 } 1147 1148 struct device_environment { 1149 // initialise an omptarget_device_environmentTy in the deviceRTL 1150 // patches around differences in the deviceRTL between trunk, aomp, 1151 // rocmcc. 
Over time these differences will tend to zero and this class 1152 // simplified. 1153 // Symbol may be in .data or .bss, and may be missing fields: 1154 // - aomp has debug_level, num_devices, device_num 1155 // - trunk has debug_level 1156 // - under review in trunk is debug_level, device_num 1157 // - rocmcc matches aomp, patch to swap num_devices and device_num 1158 1159 // The symbol may also have been deadstripped because the device side 1160 // accessors were unused. 1161 1162 // If the symbol is in .data (aomp, rocm) it can be written directly. 1163 // If it is in .bss, we must wait for it to be allocated space on the 1164 // gpu (trunk) and initialize after loading. 1165 const char *sym() { return "omptarget_device_environment"; } 1166 1167 omptarget_device_environmentTy host_device_env; 1168 symbol_info si; 1169 bool valid = false; 1170 1171 __tgt_device_image *image; 1172 const size_t img_size; 1173 1174 device_environment(int device_id, int number_devices, 1175 __tgt_device_image *image, const size_t img_size) 1176 : image(image), img_size(img_size) { 1177 1178 host_device_env.num_devices = number_devices; 1179 host_device_env.device_num = device_id; 1180 host_device_env.debug_level = 0; 1181 #ifdef OMPTARGET_DEBUG 1182 if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { 1183 host_device_env.debug_level = std::stoi(envStr); 1184 } 1185 #endif 1186 1187 int rc = get_symbol_info_without_loading((char *)image->ImageStart, 1188 img_size, sym(), &si); 1189 if (rc != 0) { 1190 DP("Finding global device environment '%s' - symbol missing.\n", sym()); 1191 return; 1192 } 1193 1194 if (si.size > sizeof(host_device_env)) { 1195 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size, 1196 sizeof(host_device_env)); 1197 return; 1198 } 1199 1200 valid = true; 1201 } 1202 1203 bool in_image() { return si.sh_type != SHT_NOBITS; } 1204 1205 hsa_status_t before_loading(void *data, size_t size) { 1206 if (valid) { 1207 if (in_image()) { 1208 DP("Setting global device environment before load (%u bytes)\n", 1209 si.size); 1210 uint64_t offset = (char *)si.addr - (char *)image->ImageStart; 1211 void *pos = (char *)data + offset; 1212 memcpy(pos, &host_device_env, si.size); 1213 } 1214 } 1215 return HSA_STATUS_SUCCESS; 1216 } 1217 1218 hsa_status_t after_loading() { 1219 if (valid) { 1220 if (!in_image()) { 1221 DP("Setting global device environment after load (%u bytes)\n", 1222 si.size); 1223 int device_id = host_device_env.device_num; 1224 auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; 1225 void *state_ptr; 1226 uint32_t state_ptr_size; 1227 hsa_status_t err = atmi_interop_hsa_get_symbol_info( 1228 SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size); 1229 if (err != HSA_STATUS_SUCCESS) { 1230 DP("failed to find %s in loaded image\n", sym()); 1231 return err; 1232 } 1233 1234 if (state_ptr_size != si.size) { 1235 DP("Symbol had size %u before loading, %u after\n", state_ptr_size, 1236 si.size); 1237 return HSA_STATUS_ERROR; 1238 } 1239 1240 return DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &host_device_env, 1241 state_ptr_size, device_id); 1242 } 1243 } 1244 return HSA_STATUS_SUCCESS; 1245 } 1246 }; 1247 1248 static hsa_status_t atmi_calloc(void **ret_ptr, size_t size, int DeviceId) { 1249 uint64_t rounded = 4 * ((size + 3) / 4); 1250 void *ptr; 1251 hsa_status_t err = core::Runtime::DeviceMalloc(&ptr, rounded, DeviceId); 1252 if (err != HSA_STATUS_SUCCESS) { 1253 return err; 1254 } 1255 1256 hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4); 
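  // hsa_amd_memory_fill takes a count of 32-bit words, not bytes, hence
  // rounded / 4; rounding the byte count up to a multiple of 4 above keeps
  // the two consistent.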
1257 if (rc != HSA_STATUS_SUCCESS) { 1258 fprintf(stderr, "zero fill device_state failed with %u\n", rc); 1259 core::Runtime::Memfree(ptr); 1260 return HSA_STATUS_ERROR; 1261 } 1262 1263 *ret_ptr = ptr; 1264 return HSA_STATUS_SUCCESS; 1265 } 1266 1267 static bool image_contains_symbol(void *data, size_t size, const char *sym) { 1268 symbol_info si; 1269 int rc = get_symbol_info_without_loading((char *)data, size, sym, &si); 1270 return (rc == 0) && (si.addr != nullptr); 1271 } 1272 1273 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, 1274 __tgt_device_image *image) { 1275 // This function loads the device image onto gpu[device_id] and does other 1276 // per-image initialization work. Specifically: 1277 // 1278 // - Initialize an omptarget_device_environmentTy instance embedded in the 1279 // image at the symbol "omptarget_device_environment" 1280 // Fields debug_level, device_num, num_devices. Used by the deviceRTL. 1281 // 1282 // - Allocate a large array per-gpu (could be moved to init_device) 1283 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size 1284 // - Allocate at least that many bytes of gpu memory 1285 // - Zero initialize it 1286 // - Write the pointer to the symbol omptarget_nvptx_device_State 1287 // 1288 // - Pulls some per-kernel information together from various sources and 1289 // records it in the KernelsList for quicker access later 1290 // 1291 // The initialization can be done before or after loading the image onto the 1292 // gpu. This function presently does a mixture. Using the hsa api to get/set 1293 // the information is simpler to implement, in exchange for more complicated 1294 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes 1295 // back from the gpu vs a hashtable lookup on the host. 1296 1297 const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart; 1298 1299 DeviceInfo.clearOffloadEntriesTable(device_id); 1300 1301 // We do not need to set the ELF version because the caller of this function 1302 // had to do that to decide the right runtime to use 1303 1304 if (!elf_machine_id_is_amdgcn(image)) { 1305 return NULL; 1306 } 1307 1308 { 1309 auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image, 1310 img_size); 1311 1312 auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id]; 1313 auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id]; 1314 hsa_status_t err = module_register_from_memory_to_place( 1315 KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id, 1316 [&](void *data, size_t size) { 1317 if (image_contains_symbol(data, size, "needs_hostcall_buffer")) { 1318 __atomic_store_n(&DeviceInfo.hostcall_required, true, 1319 __ATOMIC_RELEASE); 1320 } 1321 return env.before_loading(data, size); 1322 }, 1323 DeviceInfo.HSAExecutables); 1324 1325 check("Module registering", err); 1326 if (err != HSA_STATUS_SUCCESS) { 1327 fprintf(stderr, 1328 "Possible gpu arch mismatch: device:%s, image:%s please check" 1329 " compiler flag: -march=<gpu>\n", 1330 DeviceInfo.GPUName[device_id].c_str(), 1331 get_elf_mach_gfx_name(elf_e_flags(image))); 1332 return NULL; 1333 } 1334 1335 err = env.after_loading(); 1336 if (err != HSA_STATUS_SUCCESS) { 1337 return NULL; 1338 } 1339 } 1340 1341 DP("ATMI module successfully loaded!\n"); 1342 1343 { 1344 // the device_State array is either large value in bss or a void* that 1345 // needs to be assigned to a pointer to an array of size device_state_bytes 1346 // If absent, it has been deadstripped and needs no setup. 
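    // In outline, the code below does the following (the symbol names are the
    // deviceRTL's own):
    //   1. read the uint64_t "omptarget_nvptx_device_State_size" directly out
    //      of the ELF image on the host (no kernel launch needed),
    //   2. atmi_calloc that many zero-initialized bytes on this device,
    //      reusing a previous allocation if one already exists,
    //   3. write the resulting device pointer into the loaded
    //      "omptarget_nvptx_device_State" global so later kernels can use it.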

    void *state_ptr;
    uint32_t state_ptr_size;
    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
    hsa_status_t err = atmi_interop_hsa_get_symbol_info(
        SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr,
        &state_ptr_size);

    if (err != HSA_STATUS_SUCCESS) {
      DP("No device_state symbol found, skipping initialization\n");
    } else {
      if (state_ptr_size < sizeof(void *)) {
        DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size,
           sizeof(void *));
        return NULL;
      }

      // if it's larger than a void*, assume it's a bss array and no further
      // initialization is required. Only try to set up a pointer for
      // sizeof(void*)
      if (state_ptr_size == sizeof(void *)) {
        uint64_t device_State_bytes =
            get_device_State_bytes((char *)image->ImageStart, img_size);
        if (device_State_bytes == 0) {
          DP("Can't initialize device_State, missing size information\n");
          return NULL;
        }

        auto &dss = DeviceInfo.deviceStateStore[device_id];
        if (dss.first.get() == nullptr) {
          assert(dss.second == 0);
          void *ptr = NULL;
          hsa_status_t err = atmi_calloc(&ptr, device_State_bytes, device_id);
          if (err != HSA_STATUS_SUCCESS) {
            DP("Failed to allocate device_state array\n");
            return NULL;
          }
          dss = {
              std::unique_ptr<void, RTLDeviceInfoTy::atmiFreePtrDeletor>{ptr},
              device_State_bytes,
          };
        }

        void *ptr = dss.first.get();
        if (device_State_bytes != dss.second) {
          DP("Inconsistent sizes of device_State unsupported\n");
          return NULL;
        }

        // write ptr to device memory so it can be used by later kernels
        err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
                                                   sizeof(void *), device_id);
        if (err != HSA_STATUS_SUCCESS) {
          DP("memcpy install of state_ptr failed\n");
          return NULL;
        }
      }
    }
  }

  // Here, we take advantage of the data that is appended after img_end to get
  // the names of the symbols we need to load. This data consists of the host
  // entries' begin and end as well as the target name (see the offloading
  // linker script creation in the clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name.

  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // The host should always have something in the address to
      // uniquely identify the target region.
1423 fprintf(stderr, "Analyzing host entry '<null>' (size = %lld)...\n", 1424 (unsigned long long)e->size); 1425 return NULL; 1426 } 1427 1428 if (e->size) { 1429 __tgt_offload_entry entry = *e; 1430 1431 void *varptr; 1432 uint32_t varsize; 1433 1434 auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id]; 1435 hsa_status_t err = atmi_interop_hsa_get_symbol_info( 1436 SymbolInfoMap, device_id, e->name, &varptr, &varsize); 1437 1438 if (err != HSA_STATUS_SUCCESS) { 1439 // Inform the user what symbol prevented offloading 1440 DP("Loading global '%s' (Failed)\n", e->name); 1441 return NULL; 1442 } 1443 1444 if (varsize != e->size) { 1445 DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name, 1446 varsize, e->size); 1447 return NULL; 1448 } 1449 1450 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 1451 DPxPTR(e - HostBegin), e->name, DPxPTR(varptr)); 1452 entry.addr = (void *)varptr; 1453 1454 DeviceInfo.addOffloadEntry(device_id, entry); 1455 1456 if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 1457 e->flags & OMP_DECLARE_TARGET_LINK) { 1458 // If unified memory is present any target link variables 1459 // can access host addresses directly. There is no longer a 1460 // need for device copies. 1461 err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr, 1462 sizeof(void *), device_id); 1463 if (err != HSA_STATUS_SUCCESS) 1464 DP("Error when copying USM\n"); 1465 DP("Copy linked variable host address (" DPxMOD ")" 1466 "to device address (" DPxMOD ")\n", 1467 DPxPTR(*((void **)e->addr)), DPxPTR(varptr)); 1468 } 1469 1470 continue; 1471 } 1472 1473 DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name)); 1474 1475 uint32_t kernarg_segment_size; 1476 auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id]; 1477 hsa_status_t err = atmi_interop_hsa_get_kernel_info( 1478 KernelInfoMap, device_id, e->name, 1479 HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, 1480 &kernarg_segment_size); 1481 1482 // each arg is a void * in this openmp implementation 1483 uint32_t arg_num = kernarg_segment_size / sizeof(void *); 1484 std::vector<size_t> arg_sizes(arg_num); 1485 for (std::vector<size_t>::iterator it = arg_sizes.begin(); 1486 it != arg_sizes.end(); it++) { 1487 *it = sizeof(void *); 1488 } 1489 1490 // default value GENERIC (in case symbol is missing from cubin file) 1491 int8_t ExecModeVal = ExecutionModeType::GENERIC; 1492 1493 // get flat group size if present, else Default_WG_Size 1494 int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; 1495 1496 // get Kernel Descriptor if present. 
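    // The compiler emits per-kernel metadata as extra globals named after the
    // entry. For a hypothetical entry "__omp_offloading_xy_foo_l10" the code
    // below looks for, in order of preference:
    //   __omp_offloading_xy_foo_l10_kern_desc  packed descriptor (see struct)
    //   __omp_offloading_xy_foo_l10_exec_mode  int8_t, SPMD (0) / GENERIC (1)
    //   __omp_offloading_xy_foo_l10_wg_size    int16_t flat work-group size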
    // Keep this struct in sync with getTgtAttributeStructQTy in
    // CGOpenMPRuntime.cpp
    struct KernDescValType {
      uint16_t Version;
      uint16_t TSize;
      uint16_t WG_Size;
      uint8_t Mode;
    };
    struct KernDescValType KernDescVal;
    std::string KernDescNameStr(e->name);
    KernDescNameStr += "_kern_desc";
    const char *KernDescName = KernDescNameStr.c_str();

    void *KernDescPtr;
    uint32_t KernDescSize;
    void *CallStackAddr = nullptr;
    err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                  KernDescName, &KernDescPtr, &KernDescSize);

    if (err == HSA_STATUS_SUCCESS) {
      if ((size_t)KernDescSize != sizeof(KernDescVal))
        DP("Loading global computation properties '%s' - size mismatch (%u != "
           "%lu)\n",
           KernDescName, KernDescSize, sizeof(KernDescVal));

      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);

      // Check structure size against recorded size.
      if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);

      DP("After loading global for %s KernDesc\n", KernDescName);
      DP("KernDesc: Version: %d\n", KernDescVal.Version);
      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
      DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
      DP("KernDesc: Mode: %d\n", KernDescVal.Mode);

      // Get ExecMode
      ExecModeVal = KernDescVal.Mode;
      DP("ExecModeVal %d\n", ExecModeVal);
      if (KernDescVal.WG_Size == 0) {
        KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
      }
      WGSizeVal = KernDescVal.WG_Size;
      DP("WGSizeVal %d\n", WGSizeVal);
      check("Loading KernDesc computation property", err);
    } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);

      // Generic
      std::string ExecModeNameStr(e->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      void *ExecModePtr;
      uint32_t varsize;
      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                    ExecModeName, &ExecModePtr, &varsize);

      if (err == HSA_STATUS_SUCCESS) {
        if ((size_t)varsize != sizeof(int8_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
             "!= %lu)\n",
             ExecModeName, varsize, sizeof(int8_t));
          return NULL;
        }

        memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);

        DP("After loading global for %s ExecMode = %d\n", ExecModeName,
           ExecModeVal);

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error wrong exec_mode value specified in HSA code object file: "
             "%d\n",
             ExecModeVal);
          return NULL;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }
      check("Loading computation property", err);

      // Flat group size
      std::string WGSizeNameStr(e->name);
      WGSizeNameStr += "_wg_size";
      const char *WGSizeName = WGSizeNameStr.c_str();

      void *WGSizePtr;
      uint32_t WGSize;
      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                    WGSizeName, &WGSizePtr, &WGSize);

      if (err == HSA_STATUS_SUCCESS) {
        if ((size_t)WGSize != sizeof(int16_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
"!= " 1598 "%lu)\n", 1599 WGSizeName, WGSize, sizeof(int16_t)); 1600 return NULL; 1601 } 1602 1603 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); 1604 1605 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); 1606 1607 if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size || 1608 WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) { 1609 DP("Error wrong WGSize value specified in HSA code object file: " 1610 "%d\n", 1611 WGSizeVal); 1612 WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; 1613 } 1614 } else { 1615 DP("Warning: Loading WGSize '%s' - symbol not found, " 1616 "using default value %d\n", 1617 WGSizeName, WGSizeVal); 1618 } 1619 1620 check("Loading WGSize computation property", err); 1621 } 1622 1623 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, 1624 CallStackAddr, e->name, kernarg_segment_size, 1625 DeviceInfo.KernArgPool)); 1626 __tgt_offload_entry entry = *e; 1627 entry.addr = (void *)&KernelsList.back(); 1628 DeviceInfo.addOffloadEntry(device_id, entry); 1629 DP("Entry point %ld maps to %s\n", e - HostBegin, e->name); 1630 } 1631 1632 return DeviceInfo.getOffloadEntriesTable(device_id); 1633 } 1634 1635 void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) { 1636 void *ptr = NULL; 1637 assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); 1638 1639 if (kind != TARGET_ALLOC_DEFAULT) { 1640 REPORT("Invalid target data allocation kind or requested allocator not " 1641 "implemented yet\n"); 1642 return NULL; 1643 } 1644 1645 hsa_status_t err = core::Runtime::DeviceMalloc(&ptr, size, device_id); 1646 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size, 1647 (long long unsigned)(Elf64_Addr)ptr); 1648 ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL; 1649 return ptr; 1650 } 1651 1652 int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr, 1653 int64_t size) { 1654 assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); 1655 __tgt_async_info AsyncInfo; 1656 int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); 1657 if (rc != OFFLOAD_SUCCESS) 1658 return OFFLOAD_FAIL; 1659 1660 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1661 } 1662 1663 int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr, 1664 int64_t size, __tgt_async_info *AsyncInfo) { 1665 assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); 1666 if (AsyncInfo) { 1667 initAsyncInfo(AsyncInfo); 1668 return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo); 1669 } else { 1670 return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size); 1671 } 1672 } 1673 1674 int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr, 1675 int64_t size) { 1676 assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); 1677 __tgt_async_info AsyncInfo; 1678 int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); 1679 if (rc != OFFLOAD_SUCCESS) 1680 return OFFLOAD_FAIL; 1681 1682 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1683 } 1684 1685 int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr, 1686 void *tgt_ptr, int64_t size, 1687 __tgt_async_info *AsyncInfo) { 1688 assert(AsyncInfo && "AsyncInfo is nullptr"); 1689 assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large"); 1690 initAsyncInfo(AsyncInfo); 1691 return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo); 1692 } 1693 1694 int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) { 1695 assert(device_id < 
         DeviceInfo.NumberOfDevices && "Device ID too large");
  hsa_status_t err;
  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
  err = core::Runtime::Memfree(tgt_ptr);
  if (err != HSA_STATUS_SUCCESS) {
    DP("Error when freeing HSA memory\n");
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Determine launch values for threadsPerGroup and num_groups.
// Outputs: threadsPerGroup, num_groups
// Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode,
//         EnvTeamLimit, EnvNumTeams, num_teams, thread_limit,
//         loop_tripcount.
void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize,
                   int ExecutionMode, int EnvTeamLimit, int EnvNumTeams,
                   int num_teams, int thread_limit, uint64_t loop_tripcount,
                   int32_t device_id) {

  int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0
                      ? DeviceInfo.EnvMaxTeamsDefault
                      : DeviceInfo.NumTeams[device_id];
  if (Max_Teams > DeviceInfo.HardTeamLimit)
    Max_Teams = DeviceInfo.HardTeamLimit;

  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
            RTLDeviceInfoTy::Max_Teams);
    fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
    fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
            RTLDeviceInfoTy::Warp_Size);
    fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
            RTLDeviceInfoTy::Max_WG_Size);
    fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n",
            RTLDeviceInfoTy::Default_WG_Size);
    fprintf(stderr, "thread_limit: %d\n", thread_limit);
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
    fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
  }
  // check for thread_limit() clause
  if (thread_limit > 0) {
    threadsPerGroup = thread_limit;
    DP("Setting threads per block to requested %d\n", thread_limit);
    if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
      threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
      DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
    }
    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
    }
  }
  // check flat_max_work_group_size attr here
  if (threadsPerGroup > ConstWGSize) {
    threadsPerGroup = ConstWGSize;
    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
       threadsPerGroup);
  }
  if (print_kernel_trace & STARTUP_DETAILS)
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
  DP("Preparing %d threads\n", threadsPerGroup);

  // Set default num_groups (teams)
  if (DeviceInfo.EnvTeamLimit > 0)
    num_groups = (Max_Teams < DeviceInfo.EnvTeamLimit)
                     ? Max_Teams
                     : DeviceInfo.EnvTeamLimit;
  else
    num_groups = Max_Teams;
  DP("Set default num of groups %d\n", num_groups);

  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "num_teams: %d\n", num_teams);
  }

  // Adjust num_groups if threadsPerGroup exceeds
  // RTLDeviceInfoTy::Default_WG_Size. This is typical for the default case
  // (no thread_limit clause) or when the user requests an excessively large
  // num_teams clause.
  // FIXME: We can't distinguish between a constant and a variable thread
  // limit, so we only handle constant thread_limits.
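  // Illustrative only (the values below are assumptions, not taken from any
  // particular device): with Max_Teams = 128 and Max_WG_Size = 1024, a
  // threadsPerGroup of 512 gives num_groups = (128 * 1024) / 512 = 256, i.e.
  // the group count scales inversely with the group size chosen above.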
  if (threadsPerGroup >
      RTLDeviceInfoTy::Default_WG_Size) // 256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
    // here?
    num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;

  // check for num_teams() clause
  if (num_teams > 0) {
    num_groups = (num_teams < num_groups) ? num_teams : num_groups;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "DeviceInfo.EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
    fprintf(stderr, "DeviceInfo.EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
  }

  if (DeviceInfo.EnvNumTeams > 0) {
    num_groups = (DeviceInfo.EnvNumTeams < num_groups) ? DeviceInfo.EnvNumTeams
                                                       : num_groups;
    DP("Modifying teams based on EnvNumTeams %d\n", DeviceInfo.EnvNumTeams);
  } else if (DeviceInfo.EnvTeamLimit > 0) {
    num_groups = (DeviceInfo.EnvTeamLimit < num_groups)
                     ? DeviceInfo.EnvTeamLimit
                     : num_groups;
    DP("Modifying teams based on EnvTeamLimit %d\n", DeviceInfo.EnvTeamLimit);
  } else {
    if (num_teams <= 0) {
      if (loop_tripcount > 0) {
        if (ExecutionMode == SPMD) {
          // round up to the nearest integer
          num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
        } else {
          num_groups = loop_tripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           num_groups, loop_tripcount, threadsPerGroup);
      }
    } else {
      num_groups = num_teams;
    }
    if (num_groups > Max_Teams) {
      num_groups = Max_Teams;
      if (print_kernel_trace & STARTUP_DETAILS)
        fprintf(stderr, "Limiting num_groups %d to Max_Teams %d\n", num_groups,
                Max_Teams);
    }
    if (num_groups > num_teams && num_teams > 0) {
      num_groups = num_teams;
      if (print_kernel_trace & STARTUP_DETAILS)
        fprintf(stderr, "Limiting num_groups %d to clause num_teams %d\n",
                num_groups, num_teams);
    }
  }

  // The num_teams clause is always honored, no matter what, unless DEFAULT is
  // active.
  if (num_teams > 0) {
    num_groups = num_teams;
    // Cap num_groups to EnvMaxTeamsDefault if set.
    if (DeviceInfo.EnvMaxTeamsDefault > 0 &&
        num_groups > DeviceInfo.EnvMaxTeamsDefault)
      num_groups = DeviceInfo.EnvMaxTeamsDefault;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "loop_tripcount: %" PRIu64 "\n", loop_tripcount);
  }
  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
     threadsPerGroup);
}

static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  bool full = true;
  while (full) {
    full =
        packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
  }
  return packet_id;
}

static int32_t __tgt_rtl_run_target_team_region_locked(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
    int32_t thread_limit, uint64_t loop_tripcount);

int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
                                         void *tgt_entry_ptr, void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t num_teams,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {

  DeviceInfo.load_run_lock.lock_shared();
  int32_t res = __tgt_rtl_run_target_team_region_locked(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
      thread_limit, loop_tripcount);

  DeviceInfo.load_run_lock.unlock_shared();
  return res;
}

int32_t __tgt_rtl_run_target_team_region_locked(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using
  // update thread limit content in gpu memory if un-initialized or specified
  // from host

  DP("Run target team region thread_limit %d\n", thread_limit);

  // All args are references.
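  // The loop below builds, for each kernel argument, the offset-adjusted
  // device pointer (ptrs[i]) and records the address of that local slot in
  // args[i]; the pointed-to value is later memcpy'd into the kernarg segment.
  // Illustrative only (the addresses are made up): tgt_args[0] == 0x1000 with
  // tgt_offsets[0] == 16 gives ptrs[0] == 0x1010, and 0x1010 is the value
  // written into kernarg slot 0.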
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  DP("Arg_num: %d\n", arg_num);
  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
    DP("Offsetted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  std::string kernel_name = std::string(KernelInfo->Name);
  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
  if (KernelInfoTable[device_id].find(kernel_name) ==
      KernelInfoTable[device_id].end()) {
    DP("Kernel %s not found\n", kernel_name.c_str());
    return OFFLOAD_FAIL;
  }

  const atl_kernel_info_t KernelInfoEntry =
      KernelInfoTable[device_id][kernel_name];
  const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
  const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
  const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
  const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
  const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;

  assert(arg_num == (int)KernelInfoEntry.num_args);

  /*
   * Set limit based on ThreadsPerGroup and GroupsPerDevice
   */
  int num_groups = 0;

  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;

  getLaunchVals(threadsPerGroup, num_groups, KernelInfo->ConstWGSize,
                KernelInfo->ExecutionMode, DeviceInfo.EnvTeamLimit,
                DeviceInfo.EnvNumTeams,
                num_teams,      // From run_region arg
                thread_limit,   // From run_region arg
                loop_tripcount, // From run_region arg
                KernelInfo->device_id);

  if (print_kernel_trace >= LAUNCH) {
    // enum modes are SPMD, GENERIC, NONE 0,1,2
    // if doing rtl timing, print to stderr, unless stdout requested.
    bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
    fprintf(traceToStdout ? stdout : stderr,
            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
            "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
            "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
            device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
            arg_num, num_groups, threadsPerGroup, num_teams, thread_limit,
            group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
            vgpr_spill_count, loop_tripcount, KernelInfo->Name);
  }

  // Run on the device.
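  // The block below follows the usual user-mode HSA dispatch sequence:
  // reserve a packet slot by bumping the queue write index, fill in the
  // hsa_kernel_dispatch_packet_t (sizes, kernel object, kernarg address,
  // completion signal initialized to 1), publish the packet header last with
  // release semantics, ring the doorbell, and then block until the completion
  // signal drops to 0 when the kernel finishes.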
  {
    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id];
    if (!queue) {
      return OFFLOAD_FAIL;
    }
    uint64_t packet_id = acquire_available_packet_id(queue);

    const uint32_t mask = queue->size - 1; // size is a power of 2
    hsa_kernel_dispatch_packet_t *packet =
        (hsa_kernel_dispatch_packet_t *)queue->base_address +
        (packet_id & mask);

    // packet->header is written last
    packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    packet->workgroup_size_x = threadsPerGroup;
    packet->workgroup_size_y = 1;
    packet->workgroup_size_z = 1;
    packet->reserved0 = 0;
    packet->grid_size_x = num_groups * threadsPerGroup;
    packet->grid_size_y = 1;
    packet->grid_size_z = 1;
    packet->private_segment_size = KernelInfoEntry.private_segment_size;
    packet->group_segment_size = KernelInfoEntry.group_segment_size;
    packet->kernel_object = KernelInfoEntry.kernel_object;
    packet->kernarg_address = 0;     // use the block allocator
    packet->reserved2 = 0;           // atmi writes id_ here
    packet->completion_signal = {0}; // may want a pool of signals

    KernelArgPool *ArgPool = nullptr;
    {
      auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
      if (it != KernelArgPoolMap.end()) {
        ArgPool = (it->second).get();
      }
    }
    if (!ArgPool) {
      DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
         device_id);
    }
    {
      void *kernarg = nullptr;
      if (ArgPool) {
        assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
        kernarg = ArgPool->allocate(arg_num);
      }
      if (!kernarg) {
        DP("Allocate kernarg failed\n");
        return OFFLOAD_FAIL;
      }

      // Copy explicit arguments
      for (int i = 0; i < arg_num; i++) {
        memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
      }

      // Initialize implicit arguments. ATMI seems to leave most fields
      // uninitialized
      atmi_implicit_args_t *impl_args =
          reinterpret_cast<atmi_implicit_args_t *>(
              static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
      memset(impl_args, 0,
             sizeof(atmi_implicit_args_t)); // may not be necessary
      impl_args->offset_x = 0;
      impl_args->offset_y = 0;
      impl_args->offset_z = 0;

      // assign a hostcall buffer for the selected Q
      if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) {
        // hostrpc_assign_buffer is not thread safe, and this function is
        // under a multiple reader lock, not a writer lock.
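        // The static mutex below therefore serializes only the buffer
        // assignment; the rest of the dispatch path still runs concurrently
        // under the shared (reader) load_run_lock held by the caller.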
        static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_lock(&hostcall_init_lock);
        impl_args->hostcall_ptr = hostrpc_assign_buffer(
            DeviceInfo.HSAAgents[device_id], queue, device_id);
        pthread_mutex_unlock(&hostcall_init_lock);
        if (!impl_args->hostcall_ptr) {
          DP("hostrpc_assign_buffer failed, the GPU would dereference null "
             "and fault\n");
          return OFFLOAD_FAIL;
        }
      }

      packet->kernarg_address = kernarg;
    }

    {
      hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
      if (s.handle == 0) {
        DP("Failed to get signal instance\n");
        return OFFLOAD_FAIL;
      }
      packet->completion_signal = s;
      hsa_signal_store_relaxed(packet->completion_signal, 1);
    }

    core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
                               core::create_header(), packet->setup);

    hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

    while (hsa_signal_wait_scacquire(packet->completion_signal,
                                     HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                     HSA_WAIT_STATE_BLOCKED) != 0)
      ;

    assert(ArgPool);
    ArgPool->deallocate(packet->kernarg_address);
    DeviceInfo.FreeSignalPool.push(packet->completion_signal);
  }

  DP("Kernel completed\n");
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  // use one team and one thread
  // fix thread num
  int32_t team_num = 1;
  int32_t thread_limit = 0; // use default
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
                                          tgt_offsets, arg_num, team_num,
                                          thread_limit, 0);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  initAsyncInfo(AsyncInfo);

  // use one team and one thread
  // fix thread num
  int32_t team_num = 1;
  int32_t thread_limit = 0; // use default
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
                                          tgt_offsets, arg_num, team_num,
                                          thread_limit, 0);
}

int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");

  // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
  // invariant is not ensured by devices.cpp for amdgcn
  // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
  if (AsyncInfo->Queue) {
    finiAsyncInfo(AsyncInfo);
  }
  return OFFLOAD_SUCCESS;
}