1 //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for AMD hsa machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <algorithm> 14 #include <assert.h> 15 #include <cstdio> 16 #include <cstdlib> 17 #include <cstring> 18 #include <functional> 19 #include <libelf.h> 20 #include <list> 21 #include <memory> 22 #include <mutex> 23 #include <shared_mutex> 24 #include <unordered_map> 25 #include <vector> 26 27 #include "impl_runtime.h" 28 #include "interop_hsa.h" 29 30 #include "internal.h" 31 #include "rt.h" 32 33 #include "DeviceEnvironment.h" 34 #include "get_elf_mach_gfx_name.h" 35 #include "omptargetplugin.h" 36 #include "print_tracing.h" 37 38 #include "llvm/Frontend/OpenMP/OMPConstants.h" 39 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 40 41 // hostrpc interface, FIXME: consider moving to its own include these are 42 // statically linked into amdgpu/plugin if present from hostrpc_services.a, 43 // linked as --whole-archive to override the weak symbols that are used to 44 // implement a fallback for toolchains that do not yet have a hostrpc library. 45 extern "C" { 46 uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, 47 uint32_t DeviceId); 48 hsa_status_t hostrpc_init(); 49 hsa_status_t hostrpc_terminate(); 50 51 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } 52 __attribute__((weak)) hsa_status_t hostrpc_terminate() { 53 return HSA_STATUS_SUCCESS; 54 } 55 __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, 56 uint32_t DeviceId) { 57 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " 58 "missing\n", 59 DeviceId); 60 return 0; 61 } 62 } 63 64 // Heuristic parameters used for kernel launch 65 // Number of teams per CU to allow scheduling flexibility 66 static const unsigned DefaultTeamsPerCU = 4; 67 68 int print_kernel_trace; 69 70 #ifdef OMPTARGET_DEBUG 71 #define check(msg, status) \ 72 if (status != HSA_STATUS_SUCCESS) { \ 73 DP(#msg " failed\n"); \ 74 } else { \ 75 DP(#msg " succeeded\n"); \ 76 } 77 #else 78 #define check(msg, status) \ 79 {} 80 #endif 81 82 #include "elf_common.h" 83 84 namespace hsa { 85 template <typename C> hsa_status_t iterate_agents(C Cb) { 86 auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { 87 C *Unwrapped = static_cast<C *>(Data); 88 return (*Unwrapped)(Agent); 89 }; 90 return hsa_iterate_agents(L, static_cast<void *>(&Cb)); 91 } 92 93 template <typename C> 94 hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { 95 auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { 96 C *Unwrapped = static_cast<C *>(Data); 97 return (*Unwrapped)(MemoryPool); 98 }; 99 100 return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb)); 101 } 102 103 } // namespace hsa 104 105 /// Keep entries table per device 106 struct FuncOrGblEntryTy { 107 __tgt_target_table Table; 108 std::vector<__tgt_offload_entry> Entries; 109 }; 110 111 struct KernelArgPool { 112 private: 113 static pthread_mutex_t Mutex; 114 115 public: 116 uint32_t KernargSegmentSize; 117 void *KernargRegion = nullptr; 118 std::queue<int> 
      FreeKernargSegments;

  uint32_t kernargSizeIncludingImplicit() {
    return KernargSegmentSize + sizeof(impl_implicit_args_t);
  }

  ~KernelArgPool() {
    if (KernargRegion) {
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
      : KernargSegmentSize(KernargSegmentSize) {

    // impl uses one pool per kernel for all GPUs, with a fixed upper size.
    // That exact scheme is preserved here, including the queue<int>.

    hsa_status_t Err = hsa_amd_memory_pool_allocate(
        MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
        &KernargRegion);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
      KernargRegion = nullptr; // paranoid
      return;
    }

    Err = core::allow_access_to_all_gpu_agents(KernargRegion);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(Err));
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(R));
      }
      KernargRegion = nullptr;
      return;
    }

    for (int I = 0; I < MAX_NUM_KERNELS; I++) {
      FreeKernargSegments.push(I);
    }
  }

  void *allocate(uint64_t ArgNum) {
    assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
    Lock L(&Mutex);
    void *Res = nullptr;
    if (!FreeKernargSegments.empty()) {

      int FreeIdx = FreeKernargSegments.front();
      Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
                                (FreeIdx * kernargSizeIncludingImplicit()));
      assert(FreeIdx == pointerToIndex(Res));
      FreeKernargSegments.pop();
    }
    return Res;
  }

  void deallocate(void *Ptr) {
    Lock L(&Mutex);
    int Idx = pointerToIndex(Ptr);
    FreeKernargSegments.push(Idx);
  }

private:
  int pointerToIndex(void *Ptr) {
    ptrdiff_t Bytes =
        static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
    assert(Bytes >= 0);
    assert(Bytes % kernargSizeIncludingImplicit() == 0);
    return Bytes / kernargSizeIncludingImplicit();
  }
  struct Lock {
    Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
    ~Lock() { pthread_mutex_unlock(M); }
    pthread_mutex_t *M;
  };
};
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
  int16_t ConstWGSize;
  int32_t DeviceId;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
           int32_t DeviceId, void *CallStackAddr, const char *Name,
           uint32_t KernargSegmentSize,
           hsa_amd_memory_pool_t &KernArgMemoryPool)
      : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
        DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
    DP("Construct kernelinfo: ExecMode %d\n",
ExecutionMode); 227 228 std::string N(Name); 229 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { 230 KernelArgPoolMap.insert( 231 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( 232 KernargSegmentSize, KernArgMemoryPool)))); 233 } 234 } 235 }; 236 237 /// List that contains all the kernels. 238 /// FIXME: we may need this to be per device and per library. 239 std::list<KernelTy> KernelsList; 240 241 template <typename Callback> static hsa_status_t findAgents(Callback CB) { 242 243 hsa_status_t Err = 244 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { 245 hsa_device_type_t DeviceType; 246 // get_info fails iff HSA runtime not yet initialized 247 hsa_status_t Err = 248 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); 249 250 if (Err != HSA_STATUS_SUCCESS) { 251 if (print_kernel_trace > 0) 252 DP("rtl.cpp: err %s\n", get_error_string(Err)); 253 254 return Err; 255 } 256 257 CB(DeviceType, Agent); 258 return HSA_STATUS_SUCCESS; 259 }); 260 261 // iterate_agents fails iff HSA runtime not yet initialized 262 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { 263 DP("rtl.cpp: err %s\n", get_error_string(Err)); 264 } 265 266 return Err; 267 } 268 269 static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, 270 void *Data) { 271 if (Status != HSA_STATUS_SUCCESS) { 272 const char *StatusString; 273 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { 274 StatusString = "unavailable"; 275 } 276 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, 277 Status, StatusString); 278 abort(); 279 } 280 } 281 282 namespace core { 283 namespace { 284 285 bool checkResult(hsa_status_t Err, const char *ErrMsg) { 286 if (Err == HSA_STATUS_SUCCESS) 287 return true; 288 289 REPORT("%s", ErrMsg); 290 REPORT("%s", get_error_string(Err)); 291 return false; 292 } 293 294 void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { 295 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); 296 } 297 298 uint16_t createHeader() { 299 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; 300 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; 301 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; 302 return Header; 303 } 304 305 hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { 306 bool AllocAllowed = false; 307 hsa_status_t Err = hsa_amd_memory_pool_get_info( 308 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, 309 &AllocAllowed); 310 if (Err != HSA_STATUS_SUCCESS) { 311 DP("Alloc allowed in memory pool check failed: %s\n", 312 get_error_string(Err)); 313 return Err; 314 } 315 316 size_t Size = 0; 317 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, 318 &Size); 319 if (Err != HSA_STATUS_SUCCESS) { 320 DP("Get memory pool size failed: %s\n", get_error_string(Err)); 321 return Err; 322 } 323 324 return (AllocAllowed && Size > 0) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; 325 } 326 327 hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { 328 std::vector<hsa_amd_memory_pool_t> *Result = 329 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); 330 331 hsa_status_t Err; 332 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { 333 return Err; 334 } 335 336 Result->push_back(MemoryPool); 337 return HSA_STATUS_SUCCESS; 338 } 339 340 } // namespace 341 } // namespace core 342 343 struct EnvironmentVariables { 344 int NumTeams; 345 int TeamLimit; 346 int TeamThreadLimit; 347 int MaxTeamsDefault; 348 int DynamicMemSize; 349 }; 350 351 template <uint32_t wavesize> 352 static constexpr const llvm::omp::GV &getGridValue() { 353 return llvm::omp::getAMDGPUGridValues<wavesize>(); 354 } 355 356 struct HSALifetime { 357 // Wrapper around HSA used to ensure it is constructed before other types 358 // and destructed after, which means said other types can use raii for 359 // cleanup without risking running outside of the lifetime of HSA 360 const hsa_status_t S; 361 362 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } 363 HSALifetime() : S(hsa_init()) {} 364 365 ~HSALifetime() { 366 if (S == HSA_STATUS_SUCCESS) { 367 hsa_status_t Err = hsa_shut_down(); 368 if (Err != HSA_STATUS_SUCCESS) { 369 // Can't call into HSA to get a string from the integer 370 DP("Shutting down HSA failed: %d\n", Err); 371 } 372 } 373 } 374 }; 375 376 // Handle scheduling of multiple hsa_queue's per device to 377 // multiple threads (one scheduler per device) 378 class HSAQueueScheduler { 379 public: 380 HSAQueueScheduler() : Current(0) {} 381 382 HSAQueueScheduler(const HSAQueueScheduler &) = delete; 383 384 HSAQueueScheduler(HSAQueueScheduler &&Q) { 385 Current = Q.Current.load(); 386 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 387 HSAQueues[I] = Q.HSAQueues[I]; 388 Q.HSAQueues[I] = nullptr; 389 } 390 } 391 392 // \return false if any HSA queue creation fails 393 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { 394 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 395 hsa_queue_t *Q = nullptr; 396 hsa_status_t Rc = 397 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, 398 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); 399 if (Rc != HSA_STATUS_SUCCESS) { 400 DP("Failed to create HSA queue %d\n", I); 401 return false; 402 } 403 HSAQueues[I] = Q; 404 } 405 return true; 406 } 407 408 ~HSAQueueScheduler() { 409 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 410 if (HSAQueues[I]) { 411 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); 412 if (Err != HSA_STATUS_SUCCESS) 413 DP("Error destroying HSA queue"); 414 } 415 } 416 } 417 418 // \return next queue to use for device 419 hsa_queue_t *next() { 420 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % 421 NUM_QUEUES_PER_DEVICE]; 422 } 423 424 private: 425 // Number of queues per device 426 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; 427 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; 428 std::atomic<uint8_t> Current; 429 }; 430 431 /// Class containing all the device information 432 class RTLDeviceInfoTy : HSALifetime { 433 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; 434 435 struct QueueDeleter { 436 void operator()(hsa_queue_t *Q) { 437 if (Q) { 438 hsa_status_t Err = hsa_queue_destroy(Q); 439 if (Err != HSA_STATUS_SUCCESS) { 440 DP("Error destroying hsa queue: %s\n", get_error_string(Err)); 441 } 442 } 443 } 444 }; 445 446 public: 447 bool ConstructionSucceeded = false; 448 449 // 
load binary populates symbol tables and mutates various global state 450 // run uses those symbol tables 451 std::shared_timed_mutex LoadRunLock; 452 453 int NumberOfDevices = 0; 454 455 // GPU devices 456 std::vector<hsa_agent_t> HSAAgents; 457 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu 458 459 // CPUs 460 std::vector<hsa_agent_t> CPUAgents; 461 462 // Device properties 463 std::vector<int> ComputeUnits; 464 std::vector<int> GroupsPerDevice; 465 std::vector<int> ThreadsPerGroup; 466 std::vector<int> WarpSize; 467 std::vector<std::string> GPUName; 468 469 // OpenMP properties 470 std::vector<int> NumTeams; 471 std::vector<int> NumThreads; 472 473 // OpenMP Environment properties 474 EnvironmentVariables Env; 475 476 // OpenMP Requires Flags 477 int64_t RequiresFlags; 478 479 // Resource pools 480 SignalPoolT FreeSignalPool; 481 482 bool HostcallRequired = false; 483 484 std::vector<hsa_executable_t> HSAExecutables; 485 486 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; 487 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; 488 489 hsa_amd_memory_pool_t KernArgPool; 490 491 // fine grained memory pool for host allocations 492 hsa_amd_memory_pool_t HostFineGrainedMemoryPool; 493 494 // fine and coarse-grained memory pools per offloading device 495 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools; 496 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools; 497 498 struct ImplFreePtrDeletor { 499 void operator()(void *P) { 500 core::Runtime::Memfree(P); // ignore failure to free 501 } 502 }; 503 504 // device_State shared across loaded binaries, error if inconsistent size 505 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>> 506 DeviceStateStore; 507 508 static const unsigned HardTeamLimit = 509 (1 << 16) - 1; // 64K needed to fit in uint16 510 static const int DefaultNumTeams = 128; 511 512 // These need to be per-device since different devices can have different 513 // wave sizes, but are currently the same number for each so that refactor 514 // can be postponed. 
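  // A minimal sketch of what the per-device lookup might look like once wave
  // size is tracked per device (the helper name is hypothetical and not part
  // of this plugin):
  //
  //   static const llvm::omp::GV &gridValueForWave(int WarpSize) {
  //     return WarpSize == 32 ? getGridValue<32>() : getGridValue<64>();
  //   }
  //
  // Until then, the wave32 and wave64 values are asserted to agree: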
515 static_assert(getGridValue<32>().GV_Max_Teams == 516 getGridValue<64>().GV_Max_Teams, 517 ""); 518 static const int MaxTeams = getGridValue<64>().GV_Max_Teams; 519 520 static_assert(getGridValue<32>().GV_Max_WG_Size == 521 getGridValue<64>().GV_Max_WG_Size, 522 ""); 523 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; 524 525 static_assert(getGridValue<32>().GV_Default_WG_Size == 526 getGridValue<64>().GV_Default_WG_Size, 527 ""); 528 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; 529 530 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, 531 hsa_agent_t, hsa_amd_memory_pool_t); 532 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, 533 MemcpyFunc Func, int32_t DeviceId) { 534 hsa_agent_t Agent = HSAAgents[DeviceId]; 535 hsa_signal_t S = FreeSignalPool.pop(); 536 if (S.handle == 0) { 537 return HSA_STATUS_ERROR; 538 } 539 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); 540 FreeSignalPool.push(S); 541 return R; 542 } 543 544 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, 545 int32_t DeviceId) { 546 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); 547 } 548 549 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, 550 int32_t DeviceId) { 551 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); 552 } 553 554 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { 555 char TmpChar[1000]; 556 uint16_t Major, Minor; 557 uint32_t TmpUInt; 558 uint32_t TmpUInt2; 559 uint32_t CacheSize[4]; 560 bool TmpBool; 561 uint16_t WorkgroupMaxDim[3]; 562 hsa_dim3_t GridMaxDim; 563 564 // Getting basic information about HSA and Device 565 core::checkResult( 566 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), 567 "Error from hsa_system_get_info when obtaining " 568 "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); 569 core::checkResult( 570 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), 571 "Error from hsa_system_get_info when obtaining " 572 "HSA_SYSTEM_INFO_VERSION_MINOR\n"); 573 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); 574 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); 575 core::checkResult( 576 hsa_agent_get_info( 577 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), 578 "Error returned from hsa_agent_get_info when obtaining " 579 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); 580 printf(" Product Name: \t\t\t%s \n", TmpChar); 581 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), 582 "Error returned from hsa_agent_get_info when obtaining " 583 "HSA_AGENT_INFO_NAME\n"); 584 printf(" Device Name: \t\t\t%s \n", TmpChar); 585 core::checkResult( 586 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), 587 "Error returned from hsa_agent_get_info when obtaining " 588 "HSA_AGENT_INFO_NAME\n"); 589 printf(" Vendor Name: \t\t\t%s \n", TmpChar); 590 hsa_device_type_t DevType; 591 core::checkResult( 592 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), 593 "Error returned from hsa_agent_get_info when obtaining " 594 "HSA_AGENT_INFO_DEVICE\n"); 595 printf(" Device Type: \t\t\t%s \n", 596 DevType == HSA_DEVICE_TYPE_CPU 597 ? "CPU" 598 : (DevType == HSA_DEVICE_TYPE_GPU 599 ? "GPU" 600 : (DevType == HSA_DEVICE_TYPE_DSP ? 
"DSP" : "UNKNOWN"))); 601 core::checkResult( 602 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), 603 "Error returned from hsa_agent_get_info when obtaining " 604 "HSA_AGENT_INFO_QUEUES_MAX\n"); 605 printf(" Max Queues: \t\t\t%u \n", TmpUInt); 606 core::checkResult( 607 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), 608 "Error returned from hsa_agent_get_info when obtaining " 609 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); 610 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); 611 core::checkResult( 612 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), 613 "Error returned from hsa_agent_get_info when obtaining " 614 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); 615 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); 616 617 // Getting cache information 618 printf(" Cache:\n"); 619 620 // FIXME: This is deprecated according to HSA documentation. But using 621 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during 622 // runtime. 623 core::checkResult( 624 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), 625 "Error returned from hsa_agent_get_info when obtaining " 626 "HSA_AGENT_INFO_CACHE_SIZE\n"); 627 628 for (int I = 0; I < 4; I++) { 629 if (CacheSize[I]) { 630 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); 631 } 632 } 633 634 core::checkResult( 635 hsa_agent_get_info(Agent, 636 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, 637 &TmpUInt), 638 "Error returned from hsa_agent_get_info when obtaining " 639 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); 640 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); 641 core::checkResult( 642 hsa_agent_get_info( 643 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, 644 &TmpUInt), 645 "Error returned from hsa_agent_get_info when obtaining " 646 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); 647 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); 648 core::checkResult( 649 hsa_agent_get_info( 650 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, 651 &TmpUInt), 652 "Error returned from hsa_agent_get_info when obtaining " 653 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); 654 printf(" Compute Units: \t\t\t%u \n", TmpUInt); 655 core::checkResult(hsa_agent_get_info( 656 Agent, 657 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, 658 &TmpUInt), 659 "Error returned from hsa_agent_get_info when obtaining " 660 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 661 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); 662 core::checkResult( 663 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), 664 "Error returned from hsa_agent_get_info when obtaining " 665 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 666 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? 
"TRUE" : "FALSE")); 667 core::checkResult( 668 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), 669 "Error returned from hsa_agent_get_info when obtaining " 670 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); 671 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); 672 core::checkResult( 673 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), 674 "Error returned from hsa_agent_get_info when obtaining " 675 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); 676 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); 677 core::checkResult(hsa_agent_get_info(Agent, 678 HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 679 WorkgroupMaxDim), 680 "Error returned from hsa_agent_get_info when obtaining " 681 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); 682 printf(" Workgroup Max Size per Dimension:\n"); 683 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); 684 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); 685 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); 686 core::checkResult(hsa_agent_get_info( 687 Agent, 688 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, 689 &TmpUInt), 690 "Error returned from hsa_agent_get_info when obtaining " 691 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); 692 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); 693 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); 694 core::checkResult( 695 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), 696 "Error returned from hsa_agent_get_info when obtaining " 697 "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); 698 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); 699 core::checkResult( 700 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), 701 "Error returned from hsa_agent_get_info when obtaining " 702 "HSA_AGENT_INFO_GRID_MAX_DIM\n"); 703 printf(" Grid Max Size per Dimension: \t\t\n"); 704 printf(" x: \t\t\t\t%u\n", GridMaxDim.x); 705 printf(" y: \t\t\t\t%u\n", GridMaxDim.y); 706 printf(" z: \t\t\t\t%u\n", GridMaxDim.z); 707 core::checkResult( 708 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), 709 "Error returned from hsa_agent_get_info when obtaining " 710 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); 711 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); 712 713 printf(" Memory Pools:\n"); 714 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { 715 std::string TmpStr; 716 size_t Size; 717 bool Alloc, Access; 718 hsa_amd_segment_t Segment; 719 hsa_amd_memory_pool_global_flag_t GlobalFlags; 720 core::checkResult( 721 hsa_amd_memory_pool_get_info( 722 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), 723 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 724 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); 725 core::checkResult(hsa_amd_memory_pool_get_info( 726 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), 727 "Error returned from hsa_amd_memory_pool_get_info when " 728 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); 729 730 switch (Segment) { 731 case HSA_AMD_SEGMENT_GLOBAL: 732 TmpStr = "GLOBAL; FLAGS: "; 733 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) 734 TmpStr += "KERNARG, "; 735 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) 736 TmpStr += "FINE GRAINED, "; 737 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) 738 TmpStr += "COARSE GRAINED, "; 739 break; 740 case HSA_AMD_SEGMENT_READONLY: 741 TmpStr = "READONLY"; 742 break; 743 case HSA_AMD_SEGMENT_PRIVATE: 744 TmpStr = "PRIVATE"; 745 break; 746 case HSA_AMD_SEGMENT_GROUP: 747 TmpStr = "GROUP"; 748 break; 749 } 750 printf(" 
Pool %s: \n", TmpStr.c_str()); 751 752 core::checkResult(hsa_amd_memory_pool_get_info( 753 Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), 754 "Error returned from hsa_amd_memory_pool_get_info when " 755 "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); 756 printf(" Size: \t\t\t\t %zu bytes\n", Size); 757 core::checkResult( 758 hsa_amd_memory_pool_get_info( 759 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), 760 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 761 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); 762 printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); 763 core::checkResult( 764 hsa_amd_memory_pool_get_info( 765 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), 766 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 767 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); 768 printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); 769 core::checkResult( 770 hsa_amd_memory_pool_get_info( 771 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), 772 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 773 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); 774 printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); 775 core::checkResult( 776 hsa_amd_memory_pool_get_info( 777 Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), 778 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 779 "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); 780 printf(" Accessable by all: \t\t %s\n", 781 (Access ? "TRUE" : "FALSE")); 782 783 return HSA_STATUS_SUCCESS; 784 }; 785 // Iterate over all the memory regions for this agent. Get the memory region 786 // type and size 787 hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); 788 789 printf(" ISAs:\n"); 790 auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { 791 char TmpChar[1000]; 792 core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), 793 "Error returned from hsa_isa_get_info_alt when " 794 "obtaining HSA_ISA_INFO_NAME\n"); 795 printf(" Name: \t\t\t\t %s\n", TmpChar); 796 797 return HSA_STATUS_SUCCESS; 798 }; 799 // Iterate over all the memory regions for this agent. 
Get the memory region 800 // type and size 801 hsa_agent_iterate_isas(Agent, CBIsas, nullptr); 802 } 803 804 // Record entry point associated with device 805 void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { 806 assert(DeviceId < (int32_t)FuncGblEntries.size() && 807 "Unexpected device id!"); 808 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 809 810 E.Entries.push_back(Entry); 811 } 812 813 // Return true if the entry is associated with device 814 bool findOffloadEntry(int32_t DeviceId, void *Addr) { 815 assert(DeviceId < (int32_t)FuncGblEntries.size() && 816 "Unexpected device id!"); 817 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 818 819 for (auto &It : E.Entries) { 820 if (It.addr == Addr) 821 return true; 822 } 823 824 return false; 825 } 826 827 // Return the pointer to the target entries table 828 __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { 829 assert(DeviceId < (int32_t)FuncGblEntries.size() && 830 "Unexpected device id!"); 831 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 832 833 int32_t Size = E.Entries.size(); 834 835 // Table is empty 836 if (!Size) 837 return 0; 838 839 __tgt_offload_entry *Begin = &E.Entries[0]; 840 __tgt_offload_entry *End = &E.Entries[Size - 1]; 841 842 // Update table info according to the entries and return the pointer 843 E.Table.EntriesBegin = Begin; 844 E.Table.EntriesEnd = ++End; 845 846 return &E.Table; 847 } 848 849 // Clear entries table for a device 850 void clearOffloadEntriesTable(int DeviceId) { 851 assert(DeviceId < (int32_t)FuncGblEntries.size() && 852 "Unexpected device id!"); 853 FuncGblEntries[DeviceId].emplace_back(); 854 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 855 // KernelArgPoolMap.clear(); 856 E.Entries.clear(); 857 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 858 } 859 860 hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, 861 unsigned int DeviceId) { 862 assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here."); 863 uint32_t GlobalFlags = 0; 864 hsa_status_t Err = hsa_amd_memory_pool_get_info( 865 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); 866 867 if (Err != HSA_STATUS_SUCCESS) { 868 return Err; 869 } 870 871 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { 872 DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; 873 } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { 874 DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; 875 } 876 877 return HSA_STATUS_SUCCESS; 878 } 879 880 hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) { 881 for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { 882 hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( 883 Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { 884 hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); 885 if (ValidStatus != HSA_STATUS_SUCCESS) { 886 DP("Alloc allowed in memory pool check failed: %s\n", 887 get_error_string(ValidStatus)); 888 return HSA_STATUS_SUCCESS; 889 } 890 return addDeviceMemoryPool(MemoryPool, DeviceId); 891 }); 892 893 if (Err != HSA_STATUS_SUCCESS) { 894 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 895 "Iterate all memory pools", get_error_string(Err)); 896 return Err; 897 } 898 } 899 return HSA_STATUS_SUCCESS; 900 } 901 902 hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) { 903 std::vector<hsa_amd_memory_pool_t> HostPools; 904 905 // collect all the "valid" pools for all the given agents. 
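    // Here "valid" means core::isValidMemoryPool accepted the pool, i.e.
    // HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED is set and the reported
    // pool size is nonzero (see core::addMemoryPool above).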
906 for (const auto &Agent : Agents) { 907 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( 908 Agent, core::addMemoryPool, static_cast<void *>(&HostPools)); 909 if (Err != HSA_STATUS_SUCCESS) { 910 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); 911 } 912 } 913 914 // We need two fine-grained pools. 915 // 1. One with kernarg flag set for storing kernel arguments 916 // 2. Second for host allocations 917 bool FineGrainedMemoryPoolSet = false; 918 bool KernArgPoolSet = false; 919 for (const auto &MemoryPool : HostPools) { 920 hsa_status_t Err = HSA_STATUS_SUCCESS; 921 uint32_t GlobalFlags = 0; 922 Err = hsa_amd_memory_pool_get_info( 923 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); 924 if (Err != HSA_STATUS_SUCCESS) { 925 DP("Get memory pool info failed: %s\n", get_error_string(Err)); 926 return Err; 927 } 928 929 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { 930 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { 931 KernArgPool = MemoryPool; 932 KernArgPoolSet = true; 933 } 934 HostFineGrainedMemoryPool = MemoryPool; 935 FineGrainedMemoryPoolSet = true; 936 } 937 } 938 939 if (FineGrainedMemoryPoolSet && KernArgPoolSet) 940 return HSA_STATUS_SUCCESS; 941 942 return HSA_STATUS_ERROR; 943 } 944 945 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { 946 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && 947 "Invalid device Id"); 948 return DeviceCoarseGrainedMemoryPools[DeviceId]; 949 } 950 951 hsa_amd_memory_pool_t getHostMemoryPool() { 952 return HostFineGrainedMemoryPool; 953 } 954 955 static int readEnv(const char *Env, int Default = -1) { 956 const char *EnvStr = getenv(Env); 957 int Res = Default; 958 if (EnvStr) { 959 Res = std::stoi(EnvStr); 960 DP("Parsed %s=%d\n", Env, Res); 961 } 962 return Res; 963 } 964 965 RTLDeviceInfoTy() { 966 DP("Start initializing " GETNAME(TARGET_NAME) "\n"); 967 968 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr 969 // anytime. You do not need a debug library build. 
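    // Typical usage (the application name is illustrative):
    //   LIBOMPTARGET_KERNEL_TRACE=1 ./my_offload_app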
970 // 0 => no tracing 971 // 1 => tracing dispatch only 972 // >1 => verbosity increase 973 974 if (!HSAInitSuccess()) { 975 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); 976 return; 977 } 978 979 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) 980 print_kernel_trace = atoi(EnvStr); 981 else 982 print_kernel_trace = 0; 983 984 hsa_status_t Err = core::atl_init_gpu_context(); 985 if (Err != HSA_STATUS_SUCCESS) { 986 DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); 987 return; 988 } 989 990 // Init hostcall soon after initializing hsa 991 hostrpc_init(); 992 993 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { 994 if (DeviceType == HSA_DEVICE_TYPE_CPU) { 995 CPUAgents.push_back(Agent); 996 } else { 997 HSAAgents.push_back(Agent); 998 } 999 }); 1000 if (Err != HSA_STATUS_SUCCESS) 1001 return; 1002 1003 NumberOfDevices = (int)HSAAgents.size(); 1004 1005 if (NumberOfDevices == 0) { 1006 DP("There are no devices supporting HSA.\n"); 1007 return; 1008 } 1009 DP("There are %d devices supporting HSA.\n", NumberOfDevices); 1010 1011 // Init the device info 1012 HSAQueueSchedulers.reserve(NumberOfDevices); 1013 FuncGblEntries.resize(NumberOfDevices); 1014 ThreadsPerGroup.resize(NumberOfDevices); 1015 ComputeUnits.resize(NumberOfDevices); 1016 GPUName.resize(NumberOfDevices); 1017 GroupsPerDevice.resize(NumberOfDevices); 1018 WarpSize.resize(NumberOfDevices); 1019 NumTeams.resize(NumberOfDevices); 1020 NumThreads.resize(NumberOfDevices); 1021 DeviceStateStore.resize(NumberOfDevices); 1022 KernelInfoTable.resize(NumberOfDevices); 1023 SymbolInfoTable.resize(NumberOfDevices); 1024 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); 1025 DeviceFineGrainedMemoryPools.resize(NumberOfDevices); 1026 1027 Err = setupDevicePools(HSAAgents); 1028 if (Err != HSA_STATUS_SUCCESS) { 1029 DP("Setup for Device Memory Pools failed\n"); 1030 return; 1031 } 1032 1033 Err = setupHostMemoryPools(CPUAgents); 1034 if (Err != HSA_STATUS_SUCCESS) { 1035 DP("Setup for Host Memory Pools failed\n"); 1036 return; 1037 } 1038 1039 for (int I = 0; I < NumberOfDevices; I++) { 1040 uint32_t QueueSize = 0; 1041 { 1042 hsa_status_t Err = hsa_agent_get_info( 1043 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); 1044 if (Err != HSA_STATUS_SUCCESS) { 1045 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); 1046 return; 1047 } 1048 enum { MaxQueueSize = 4096 }; 1049 if (QueueSize > MaxQueueSize) { 1050 QueueSize = MaxQueueSize; 1051 } 1052 } 1053 1054 { 1055 HSAQueueScheduler QSched; 1056 if (!QSched.createQueues(HSAAgents[I], QueueSize)) 1057 return; 1058 HSAQueueSchedulers.emplace_back(std::move(QSched)); 1059 } 1060 1061 DeviceStateStore[I] = {nullptr, 0}; 1062 } 1063 1064 for (int I = 0; I < NumberOfDevices; I++) { 1065 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; 1066 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; 1067 ComputeUnits[I] = 1; 1068 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, 1069 GroupsPerDevice[I], ThreadsPerGroup[I]); 1070 } 1071 1072 // Get environment variables regarding teams 1073 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); 1074 Env.NumTeams = readEnv("OMP_NUM_TEAMS"); 1075 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); 1076 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); 1077 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); 1078 1079 // Default state. 
1080 RequiresFlags = OMP_REQ_UNDEFINED; 1081 1082 ConstructionSucceeded = true; 1083 } 1084 1085 ~RTLDeviceInfoTy() { 1086 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); 1087 if (!HSAInitSuccess()) { 1088 // Then none of these can have been set up and they can't be torn down 1089 return; 1090 } 1091 // Run destructors on types that use HSA before 1092 // impl_finalize removes access to it 1093 DeviceStateStore.clear(); 1094 KernelArgPoolMap.clear(); 1095 // Terminate hostrpc before finalizing hsa 1096 hostrpc_terminate(); 1097 1098 hsa_status_t Err; 1099 for (uint32_t I = 0; I < HSAExecutables.size(); I++) { 1100 Err = hsa_executable_destroy(HSAExecutables[I]); 1101 if (Err != HSA_STATUS_SUCCESS) { 1102 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 1103 "Destroying executable", get_error_string(Err)); 1104 } 1105 } 1106 } 1107 }; 1108 1109 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; 1110 1111 static RTLDeviceInfoTy DeviceInfo; 1112 1113 namespace { 1114 1115 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, 1116 __tgt_async_info *AsyncInfo) { 1117 assert(AsyncInfo && "AsyncInfo is nullptr"); 1118 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 1119 // Return success if we are not copying back to host from target. 1120 if (!HstPtr) 1121 return OFFLOAD_SUCCESS; 1122 hsa_status_t Err; 1123 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 1124 (long long unsigned)(Elf64_Addr)TgtPtr, 1125 (long long unsigned)(Elf64_Addr)HstPtr); 1126 1127 Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, 1128 DeviceId); 1129 1130 if (Err != HSA_STATUS_SUCCESS) { 1131 DP("Error when copying data from device to host. Pointers: " 1132 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 1133 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 1134 return OFFLOAD_FAIL; 1135 } 1136 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 1137 (long long unsigned)(Elf64_Addr)TgtPtr, 1138 (long long unsigned)(Elf64_Addr)HstPtr); 1139 return OFFLOAD_SUCCESS; 1140 } 1141 1142 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, 1143 __tgt_async_info *AsyncInfo) { 1144 assert(AsyncInfo && "AsyncInfo is nullptr"); 1145 hsa_status_t Err; 1146 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 1147 // Return success if we are not doing host to target. 1148 if (!HstPtr) 1149 return OFFLOAD_SUCCESS; 1150 1151 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, 1152 (long long unsigned)(Elf64_Addr)HstPtr, 1153 (long long unsigned)(Elf64_Addr)TgtPtr); 1154 Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, 1155 DeviceId); 1156 if (Err != HSA_STATUS_SUCCESS) { 1157 DP("Error when copying data from host to device. Pointers: " 1158 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 1159 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 1160 return OFFLOAD_FAIL; 1161 } 1162 return OFFLOAD_SUCCESS; 1163 } 1164 1165 // Async. 1166 // The implementation was written with cuda streams in mind. The semantics of 1167 // that are to execute kernels on a queue in order of insertion. A synchronise 1168 // call then makes writes visible between host and device. This means a series 1169 // of N data_submit_async calls are expected to execute serially. HSA offers 1170 // various options to run the data copies concurrently. This may require changes 1171 // to libomptarget. 
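// For reference, the host-side sequence this plugin is written against looks
// roughly like the following (illustrative only; the actual call sites live
// in libomptarget, and the entry points are declared in omptargetplugin.h):
//
//   __tgt_async_info AsyncInfo;
//   __tgt_rtl_data_submit_async(DeviceId, Tgt1, Hst1, Size1, &AsyncInfo);
//   __tgt_rtl_data_submit_async(DeviceId, Tgt2, Hst2, Size2, &AsyncInfo);
//   __tgt_rtl_run_target_team_region_async(/*...*/); // executes in order
//   __tgt_rtl_synchronize(DeviceId, &AsyncInfo);     // writes now visible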
1172 1173 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that 1174 // there are no outstanding kernels that need to be synchronized. Any async call 1175 // may be passed a Queue==0, at which point the cuda implementation will set it 1176 // to non-null (see getStream). The cuda streams are per-device. Upstream may 1177 // change this interface to explicitly initialize the AsyncInfo_pointer, but 1178 // until then hsa lazily initializes it as well. 1179 1180 void initAsyncInfo(__tgt_async_info *AsyncInfo) { 1181 // set non-null while using async calls, return to null to indicate completion 1182 assert(AsyncInfo); 1183 if (!AsyncInfo->Queue) { 1184 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX); 1185 } 1186 } 1187 void finiAsyncInfo(__tgt_async_info *AsyncInfo) { 1188 assert(AsyncInfo); 1189 assert(AsyncInfo->Queue); 1190 AsyncInfo->Queue = 0; 1191 } 1192 1193 // Determine launch values for kernel. 1194 struct LaunchVals { 1195 int WorkgroupSize; 1196 int GridSize; 1197 }; 1198 LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, 1199 int ConstWGSize, 1200 llvm::omp::OMPTgtExecModeFlags ExecutionMode, 1201 int NumTeams, int ThreadLimit, uint64_t LoopTripcount, 1202 int DeviceNumTeams) { 1203 1204 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; 1205 int NumGroups = 0; 1206 1207 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; 1208 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit)) 1209 MaxTeams = RTLDeviceInfoTy::HardTeamLimit; 1210 1211 if (print_kernel_trace & STARTUP_DETAILS) { 1212 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); 1213 DP("Max_Teams: %d\n", MaxTeams); 1214 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); 1215 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); 1216 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", 1217 RTLDeviceInfoTy::DefaultWgSize); 1218 DP("thread_limit: %d\n", ThreadLimit); 1219 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1220 DP("ConstWGSize: %d\n", ConstWGSize); 1221 } 1222 // check for thread_limit() clause 1223 if (ThreadLimit > 0) { 1224 ThreadsPerGroup = ThreadLimit; 1225 DP("Setting threads per block to requested %d\n", ThreadLimit); 1226 // Add master warp for GENERIC 1227 if (ExecutionMode == 1228 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { 1229 ThreadsPerGroup += WarpSize; 1230 DP("Adding master wavefront: +%d threads\n", WarpSize); 1231 } 1232 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max 1233 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; 1234 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); 1235 } 1236 } 1237 // check flat_max_work_group_size attr here 1238 if (ThreadsPerGroup > ConstWGSize) { 1239 ThreadsPerGroup = ConstWGSize; 1240 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", 1241 ThreadsPerGroup); 1242 } 1243 if (print_kernel_trace & STARTUP_DETAILS) 1244 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1245 DP("Preparing %d threads\n", ThreadsPerGroup); 1246 1247 // Set default num_groups (teams) 1248 if (Env.TeamLimit > 0) 1249 NumGroups = (MaxTeams < Env.TeamLimit) ? 
                    MaxTeams : Env.TeamLimit;
  else
    NumGroups = MaxTeams;
  DP("Set default num of groups %d\n", NumGroups);

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("num_teams: %d\n", NumTeams);
  }

  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::DefaultWgSize.
  // This reduction is typical for the default case (no thread_limit clause)
  // or when the user requests a very large num_teams value.
  // FIXME: We can't distinguish between a constant and a variable thread
  // limit, so we only handle constant thread_limits.
  if (ThreadsPerGroup >
      RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to nearest WarpSize
    // here?
    NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;

  // check for num_teams() clause
  if (NumTeams > 0) {
    NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("Env.NumTeams %d\n", Env.NumTeams);
    DP("Env.TeamLimit %d\n", Env.TeamLimit);
  }

  if (Env.NumTeams > 0) {
    NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
  } else if (Env.TeamLimit > 0) {
    NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
  } else {
    if (NumTeams <= 0) {
      if (LoopTripcount > 0) {
        if (ExecutionMode ==
            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
          // round up to the nearest integer
          NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
        } else if (ExecutionMode ==
                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
          NumGroups = LoopTripcount;
        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
          // This is a generic kernel that was transformed to use SPMD-mode
          // execution but uses Generic-mode semantics for scheduling.
          NumGroups = LoopTripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           NumGroups, LoopTripcount, ThreadsPerGroup);
      }
    } else {
      NumGroups = NumTeams;
    }
    if (NumGroups > MaxTeams) {
      NumGroups = MaxTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
    }
    if (NumGroups > NumTeams && NumTeams > 0) {
      NumGroups = NumTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
           NumTeams);
    }
  }

  // num_teams clause always honored, no matter what, unless DEFAULT is active.
  if (NumTeams > 0) {
    NumGroups = NumTeams;
    // Cap num_groups to EnvMaxTeamsDefault if set.
1325 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) 1326 NumGroups = Env.MaxTeamsDefault; 1327 } 1328 if (print_kernel_trace & STARTUP_DETAILS) { 1329 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1330 DP("num_groups: %d\n", NumGroups); 1331 DP("loop_tripcount: %ld\n", LoopTripcount); 1332 } 1333 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, 1334 ThreadsPerGroup); 1335 1336 LaunchVals Res; 1337 Res.WorkgroupSize = ThreadsPerGroup; 1338 Res.GridSize = ThreadsPerGroup * NumGroups; 1339 return Res; 1340 } 1341 1342 static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { 1343 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); 1344 bool Full = true; 1345 while (Full) { 1346 Full = 1347 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); 1348 } 1349 return PacketId; 1350 } 1351 1352 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, 1353 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, 1354 int32_t ThreadLimit, uint64_t LoopTripcount) { 1355 // Set the context we are using 1356 // update thread limit content in gpu memory if un-initialized or specified 1357 // from host 1358 1359 DP("Run target team region thread_limit %d\n", ThreadLimit); 1360 1361 // All args are references. 1362 std::vector<void *> Args(ArgNum); 1363 std::vector<void *> Ptrs(ArgNum); 1364 1365 DP("Arg_num: %d\n", ArgNum); 1366 for (int32_t I = 0; I < ArgNum; ++I) { 1367 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1368 Args[I] = &Ptrs[I]; 1369 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); 1370 } 1371 1372 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; 1373 1374 std::string KernelName = std::string(KernelInfo->Name); 1375 auto &KernelInfoTable = DeviceInfo.KernelInfoTable; 1376 if (KernelInfoTable[DeviceId].find(KernelName) == 1377 KernelInfoTable[DeviceId].end()) { 1378 DP("Kernel %s not found\n", KernelName.c_str()); 1379 return OFFLOAD_FAIL; 1380 } 1381 1382 const atl_kernel_info_t KernelInfoEntry = 1383 KernelInfoTable[DeviceId][KernelName]; 1384 const uint32_t GroupSegmentSize = 1385 KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize; 1386 const uint32_t SgprCount = KernelInfoEntry.sgpr_count; 1387 const uint32_t VgprCount = KernelInfoEntry.vgpr_count; 1388 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; 1389 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; 1390 1391 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); 1392 1393 /* 1394 * Set limit based on ThreadsPerGroup and GroupsPerDevice 1395 */ 1396 LaunchVals LV = 1397 getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env, 1398 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, 1399 NumTeams, // From run_region arg 1400 ThreadLimit, // From run_region arg 1401 LoopTripcount, // From run_region arg 1402 DeviceInfo.NumTeams[KernelInfo->DeviceId]); 1403 const int GridSize = LV.GridSize; 1404 const int WorkgroupSize = LV.WorkgroupSize; 1405 1406 if (print_kernel_trace >= LAUNCH) { 1407 int NumGroups = GridSize / WorkgroupSize; 1408 // enum modes are SPMD, GENERIC, NONE 0,1,2 1409 // if doing rtl timing, print to stderr, unless stdout requested. 1410 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); 1411 fprintf(TraceToStdout ? 
stdout : stderr, 1412 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " 1413 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " 1414 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", 1415 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, 1416 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, 1417 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, 1418 VgprSpillCount, LoopTripcount, KernelInfo->Name); 1419 } 1420 1421 // Run on the device. 1422 { 1423 hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next(); 1424 if (!Queue) { 1425 return OFFLOAD_FAIL; 1426 } 1427 uint64_t PacketId = acquireAvailablePacketId(Queue); 1428 1429 const uint32_t Mask = Queue->size - 1; // size is a power of 2 1430 hsa_kernel_dispatch_packet_t *Packet = 1431 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); 1432 1433 // packet->header is written last 1434 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; 1435 Packet->workgroup_size_x = WorkgroupSize; 1436 Packet->workgroup_size_y = 1; 1437 Packet->workgroup_size_z = 1; 1438 Packet->reserved0 = 0; 1439 Packet->grid_size_x = GridSize; 1440 Packet->grid_size_y = 1; 1441 Packet->grid_size_z = 1; 1442 Packet->private_segment_size = KernelInfoEntry.private_segment_size; 1443 Packet->group_segment_size = GroupSegmentSize; 1444 Packet->kernel_object = KernelInfoEntry.kernel_object; 1445 Packet->kernarg_address = 0; // use the block allocator 1446 Packet->reserved2 = 0; // impl writes id_ here 1447 Packet->completion_signal = {0}; // may want a pool of signals 1448 1449 KernelArgPool *ArgPool = nullptr; 1450 void *KernArg = nullptr; 1451 { 1452 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); 1453 if (It != KernelArgPoolMap.end()) { 1454 ArgPool = (It->second).get(); 1455 } 1456 } 1457 if (!ArgPool) { 1458 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, 1459 DeviceId); 1460 } 1461 { 1462 if (ArgPool) { 1463 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); 1464 KernArg = ArgPool->allocate(ArgNum); 1465 } 1466 if (!KernArg) { 1467 DP("Allocate kernarg failed\n"); 1468 return OFFLOAD_FAIL; 1469 } 1470 1471 // Copy explicit arguments 1472 for (int I = 0; I < ArgNum; I++) { 1473 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); 1474 } 1475 1476 // Initialize implicit arguments. TODO: Which of these can be dropped 1477 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>( 1478 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize); 1479 memset(ImplArgs, 0, 1480 sizeof(impl_implicit_args_t)); // may not be necessary 1481 ImplArgs->offset_x = 0; 1482 ImplArgs->offset_y = 0; 1483 ImplArgs->offset_z = 0; 1484 1485 // assign a hostcall buffer for the selected Q 1486 if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) { 1487 // hostrpc_assign_buffer is not thread safe, and this function is 1488 // under a multiple reader lock, not a writer lock. 
1489 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; 1490 pthread_mutex_lock(&HostcallInitLock); 1491 uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId], 1492 Queue, DeviceId); 1493 pthread_mutex_unlock(&HostcallInitLock); 1494 if (!Buffer) { 1495 DP("hostrpc_assign_buffer failed, gpu would dereference null and " 1496 "error\n"); 1497 return OFFLOAD_FAIL; 1498 } 1499 1500 DP("Implicit argument count: %d\n", 1501 KernelInfoEntry.implicit_argument_count); 1502 if (KernelInfoEntry.implicit_argument_count >= 4) { 1503 // Initialise pointer for implicit_argument_count != 0 ABI 1504 // Guess that the right implicit argument is at offset 24 after 1505 // the explicit arguments. In the future, should be able to read 1506 // the offset from msgpack. Clang is not annotating it at present. 1507 uint64_t Offset = 1508 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); 1509 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { 1510 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " 1511 "args: %d\n", 1512 Offset + 8, ArgPool->kernargSizeIncludingImplicit()); 1513 } else { 1514 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); 1515 } 1516 } 1517 1518 // initialise pointer for implicit_argument_count == 0 ABI 1519 ImplArgs->hostcall_ptr = Buffer; 1520 } 1521 1522 Packet->kernarg_address = KernArg; 1523 } 1524 1525 hsa_signal_t S = DeviceInfo.FreeSignalPool.pop(); 1526 if (S.handle == 0) { 1527 DP("Failed to get signal instance\n"); 1528 return OFFLOAD_FAIL; 1529 } 1530 Packet->completion_signal = S; 1531 hsa_signal_store_relaxed(Packet->completion_signal, 1); 1532 1533 // Publish the packet indicating it is ready to be processed 1534 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet), 1535 core::createHeader(), Packet->setup); 1536 1537 // Since the packet is already published, its contents must not be 1538 // accessed any more 1539 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); 1540 1541 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, 1542 HSA_WAIT_STATE_BLOCKED) != 0) 1543 ; 1544 1545 assert(ArgPool); 1546 ArgPool->deallocate(KernArg); 1547 DeviceInfo.FreeSignalPool.push(S); 1548 } 1549 1550 DP("Kernel completed\n"); 1551 return OFFLOAD_SUCCESS; 1552 } 1553 1554 bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { 1555 const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h 1556 int32_t R = elf_check_machine(Image, AmdgcnMachineID); 1557 if (!R) { 1558 DP("Supported machine ID not found\n"); 1559 } 1560 return R; 1561 } 1562 1563 uint32_t elfEFlags(__tgt_device_image *Image) { 1564 char *ImgBegin = (char *)Image->ImageStart; 1565 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; 1566 1567 Elf *E = elf_memory(ImgBegin, ImgSize); 1568 if (!E) { 1569 DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); 1570 return 0; 1571 } 1572 1573 Elf64_Ehdr *Eh64 = elf64_getehdr(E); 1574 1575 if (!Eh64) { 1576 DP("Unable to get machine ID from ELF file!\n"); 1577 elf_end(E); 1578 return 0; 1579 } 1580 1581 uint32_t Flags = Eh64->e_flags; 1582 1583 elf_end(E); 1584 DP("ELF Flags: 0x%x\n", Flags); 1585 return Flags; 1586 } 1587 1588 template <typename T> bool enforceUpperBound(T *Value, T Upper) { 1589 bool Changed = *Value > Upper; 1590 if (Changed) { 1591 *Value = Upper; 1592 } 1593 return Changed; 1594 } 1595 1596 Elf64_Shdr *findOnlyShtHash(Elf *Elf) { 1597 size_t N; 1598 int Rc = elf_getshdrnum(Elf, &N); 1599 if (Rc != 0) { 1600 return nullptr; 1601 
} 1602 1603 Elf64_Shdr *Result = nullptr; 1604 for (size_t I = 0; I < N; I++) { 1605 Elf_Scn *Scn = elf_getscn(Elf, I); 1606 if (Scn) { 1607 Elf64_Shdr *Shdr = elf64_getshdr(Scn); 1608 if (Shdr) { 1609 if (Shdr->sh_type == SHT_HASH) { 1610 if (Result == nullptr) { 1611 Result = Shdr; 1612 } else { 1613 // multiple SHT_HASH sections not handled 1614 return nullptr; 1615 } 1616 } 1617 } 1618 } 1619 } 1620 return Result; 1621 } 1622 1623 const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash, 1624 const char *Symname) { 1625 1626 assert(SectionHash); 1627 size_t SectionSymtabIndex = SectionHash->sh_link; 1628 Elf64_Shdr *SectionSymtab = 1629 elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex)); 1630 size_t SectionStrtabIndex = SectionSymtab->sh_link; 1631 1632 const Elf64_Sym *Symtab = 1633 reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset); 1634 1635 const uint32_t *Hashtab = 1636 reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset); 1637 1638 // Layout: 1639 // nbucket 1640 // nchain 1641 // bucket[nbucket] 1642 // chain[nchain] 1643 uint32_t Nbucket = Hashtab[0]; 1644 const uint32_t *Bucket = &Hashtab[2]; 1645 const uint32_t *Chain = &Hashtab[Nbucket + 2]; 1646 1647 const size_t Max = strlen(Symname) + 1; 1648 const uint32_t Hash = elf_hash(Symname); 1649 for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) { 1650 char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name); 1651 if (strncmp(Symname, N, Max) == 0) { 1652 return &Symtab[I]; 1653 } 1654 } 1655 1656 return nullptr; 1657 } 1658 1659 struct SymbolInfo { 1660 void *Addr = nullptr; 1661 uint32_t Size = UINT32_MAX; 1662 uint32_t ShType = SHT_NULL; 1663 }; 1664 1665 int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname, 1666 SymbolInfo *Res) { 1667 if (elf_kind(Elf) != ELF_K_ELF) { 1668 return 1; 1669 } 1670 1671 Elf64_Shdr *SectionHash = findOnlyShtHash(Elf); 1672 if (!SectionHash) { 1673 return 1; 1674 } 1675 1676 const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname); 1677 if (!Sym) { 1678 return 1; 1679 } 1680 1681 if (Sym->st_size > UINT32_MAX) { 1682 return 1; 1683 } 1684 1685 if (Sym->st_shndx == SHN_UNDEF) { 1686 return 1; 1687 } 1688 1689 Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx); 1690 if (!Section) { 1691 return 1; 1692 } 1693 1694 Elf64_Shdr *Header = elf64_getshdr(Section); 1695 if (!Header) { 1696 return 1; 1697 } 1698 1699 Res->Addr = Sym->st_value + Base; 1700 Res->Size = static_cast<uint32_t>(Sym->st_size); 1701 Res->ShType = Header->sh_type; 1702 return 0; 1703 } 1704 1705 int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname, 1706 SymbolInfo *Res) { 1707 Elf *Elf = elf_memory(Base, ImgSize); 1708 if (Elf) { 1709 int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res); 1710 elf_end(Elf); 1711 return Rc; 1712 } 1713 return 1; 1714 } 1715 1716 hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, 1717 const char *SymName, void **VarAddr, 1718 uint32_t *VarSize) { 1719 SymbolInfo SI; 1720 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); 1721 if (Rc == 0) { 1722 *VarAddr = SI.Addr; 1723 *VarSize = SI.Size; 1724 return HSA_STATUS_SUCCESS; 1725 } 1726 return HSA_STATUS_ERROR; 1727 } 1728 1729 template <typename C> 1730 hsa_status_t moduleRegisterFromMemoryToPlace( 1731 std::map<std::string, atl_kernel_info_t> &KernelInfoTable, 1732 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, 1733 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, 1734 
std::vector<hsa_executable_t> &HSAExecutables) { 1735 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { 1736 C *Unwrapped = static_cast<C *>(CbState); 1737 return (*Unwrapped)(Data, Size); 1738 }; 1739 return core::RegisterModuleFromMemory( 1740 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, 1741 DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb), 1742 HSAExecutables); 1743 } 1744 1745 uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { 1746 uint64_t DeviceStateBytes = 0; 1747 { 1748 // If this is the deviceRTL, get the state variable size 1749 SymbolInfo SizeSi; 1750 int Rc = getSymbolInfoWithoutLoading( 1751 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); 1752 1753 if (Rc == 0) { 1754 if (SizeSi.Size != sizeof(uint64_t)) { 1755 DP("Found device_State_size variable with wrong size\n"); 1756 return 0; 1757 } 1758 1759 // Read number of bytes directly from the elf 1760 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); 1761 } 1762 } 1763 return DeviceStateBytes; 1764 } 1765 1766 struct DeviceEnvironment { 1767 // initialise an DeviceEnvironmentTy in the deviceRTL 1768 // patches around differences in the deviceRTL between trunk, aomp, 1769 // rocmcc. Over time these differences will tend to zero and this class 1770 // simplified. 1771 // Symbol may be in .data or .bss, and may be missing fields, todo: 1772 // review aomp/trunk/rocm and simplify the following 1773 1774 // The symbol may also have been deadstripped because the device side 1775 // accessors were unused. 1776 1777 // If the symbol is in .data (aomp, rocm) it can be written directly. 1778 // If it is in .bss, we must wait for it to be allocated space on the 1779 // gpu (trunk) and initialize after loading. 1780 const char *sym() { return "omptarget_device_environment"; } 1781 1782 DeviceEnvironmentTy HostDeviceEnv; 1783 SymbolInfo SI; 1784 bool Valid = false; 1785 1786 __tgt_device_image *Image; 1787 const size_t ImgSize; 1788 1789 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, 1790 __tgt_device_image *Image, const size_t ImgSize) 1791 : Image(Image), ImgSize(ImgSize) { 1792 1793 HostDeviceEnv.NumDevices = NumberDevices; 1794 HostDeviceEnv.DeviceNum = DeviceId; 1795 HostDeviceEnv.DebugKind = 0; 1796 HostDeviceEnv.DynamicMemSize = DynamicMemSize; 1797 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 1798 HostDeviceEnv.DebugKind = std::stoi(EnvStr); 1799 1800 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, 1801 sym(), &SI); 1802 if (Rc != 0) { 1803 DP("Finding global device environment '%s' - symbol missing.\n", sym()); 1804 return; 1805 } 1806 1807 if (SI.Size > sizeof(HostDeviceEnv)) { 1808 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, 1809 sizeof(HostDeviceEnv)); 1810 return; 1811 } 1812 1813 Valid = true; 1814 } 1815 1816 bool inImage() { return SI.ShType != SHT_NOBITS; } 1817 1818 hsa_status_t beforeLoading(void *Data, size_t Size) { 1819 if (Valid) { 1820 if (inImage()) { 1821 DP("Setting global device environment before load (%u bytes)\n", 1822 SI.Size); 1823 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; 1824 void *Pos = (char *)Data + Offset; 1825 memcpy(Pos, &HostDeviceEnv, SI.Size); 1826 } 1827 } 1828 return HSA_STATUS_SUCCESS; 1829 } 1830 1831 hsa_status_t afterLoading() { 1832 if (Valid) { 1833 if (!inImage()) { 1834 DP("Setting global device environment after load (%u bytes)\n", 1835 SI.Size); 1836 int DeviceId = HostDeviceEnv.DeviceNum; 
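          // The symbol lives in .bss, so the loader has only now given it
          // device storage. Look up where it landed via the symbol table and
          // copy the host-side DeviceEnvironmentTy over with an H2D memcpy.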
1837 auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; 1838 void *StatePtr; 1839 uint32_t StatePtrSize; 1840 hsa_status_t Err = interop_hsa_get_symbol_info( 1841 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); 1842 if (Err != HSA_STATUS_SUCCESS) { 1843 DP("failed to find %s in loaded image\n", sym()); 1844 return Err; 1845 } 1846 1847 if (StatePtrSize != SI.Size) { 1848 DP("Symbol had size %u before loading, %u after\n", StatePtrSize, 1849 SI.Size); 1850 return HSA_STATUS_ERROR; 1851 } 1852 1853 return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, 1854 StatePtrSize, DeviceId); 1855 } 1856 } 1857 return HSA_STATUS_SUCCESS; 1858 } 1859 }; 1860 1861 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { 1862 uint64_t Rounded = 4 * ((Size + 3) / 4); 1863 void *Ptr; 1864 hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); 1865 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); 1866 if (Err != HSA_STATUS_SUCCESS) { 1867 return Err; 1868 } 1869 1870 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); 1871 if (Rc != HSA_STATUS_SUCCESS) { 1872 DP("zero fill device_state failed with %u\n", Rc); 1873 core::Runtime::Memfree(Ptr); 1874 return HSA_STATUS_ERROR; 1875 } 1876 1877 *RetPtr = Ptr; 1878 return HSA_STATUS_SUCCESS; 1879 } 1880 1881 bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { 1882 SymbolInfo SI; 1883 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); 1884 return (Rc == 0) && (SI.Addr != nullptr); 1885 } 1886 1887 } // namespace 1888 1889 namespace core { 1890 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { 1891 return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(), 1892 &DeviceInfo.HSAAgents[0], NULL, Ptr); 1893 } 1894 } // namespace core 1895 1896 extern "C" { 1897 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { 1898 return elfMachineIdIsAmdgcn(Image); 1899 } 1900 1901 int __tgt_rtl_number_of_devices() { 1902 // If the construction failed, no methods are safe to call 1903 if (DeviceInfo.ConstructionSucceeded) { 1904 return DeviceInfo.NumberOfDevices; 1905 } 1906 DP("AMDGPU plugin construction failed. 
Zero devices available\n");
1907   return 0;
1908 }
1909
1910 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1911   DP("Init requires flags to %ld\n", RequiresFlags);
1912   DeviceInfo.RequiresFlags = RequiresFlags;
1913   return RequiresFlags;
1914 }
1915
1916 int32_t __tgt_rtl_init_device(int DeviceId) {
1917   hsa_status_t Err;
1918
1919   // this is per device id init
1920   DP("Initialize the device id: %d\n", DeviceId);
1921
1922   hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId];
1923
1924   // Get number of Compute Units
1925   uint32_t ComputeUnits = 0;
1926   Err = hsa_agent_get_info(
1927       Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
1928       &ComputeUnits);
1929   if (Err != HSA_STATUS_SUCCESS) {
1930     DeviceInfo.ComputeUnits[DeviceId] = 1;
1931     DP("Error getting compute units: setting to 1\n");
1932   } else {
1933     DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits;
1934     DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[DeviceId]);
1935   }
1936
1937   char GetInfoName[64]; // 64 max size returned by get info
1938   Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
1939                            (void *)GetInfoName);
1940   if (Err)
1941     DeviceInfo.GPUName[DeviceId] = "--unknown gpu--";
1942   else {
1943     DeviceInfo.GPUName[DeviceId] = GetInfoName;
1944   }
1945
1946   if (print_kernel_trace & STARTUP_DETAILS)
1947     DP("Device#%-2d CU's: %2d %s\n", DeviceId,
1948        DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str());
1949
1950   // Query attributes to determine number of threads/block and blocks/grid.
1951   uint16_t WorkgroupMaxDim[3];
1952   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
1953                            &WorkgroupMaxDim);
1954   if (Err != HSA_STATUS_SUCCESS) {
1955     DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
1956     DP("Error getting grid dims: num groups: %d\n",
1957        RTLDeviceInfoTy::DefaultNumTeams);
1958   } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
1959     DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
1960     DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]);
1961   } else {
1962     DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
1963     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
1964        "at the hard limit\n",
1965        WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
1966   }
1967
1968   // Get thread limit
1969   hsa_dim3_t GridMaxDim;
1970   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
1971   if (Err == HSA_STATUS_SUCCESS) {
1972     DeviceInfo.ThreadsPerGroup[DeviceId] =
1973         reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
1974         DeviceInfo.GroupsPerDevice[DeviceId];
1975
1976     if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) {
1977       DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
1978       DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
1979     } else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId],
1980                                  RTLDeviceInfoTy::MaxWgSize)) {
1981       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
1982     } else {
1983       DP("Using ROCm Queried thread limit: %d\n",
1984          DeviceInfo.ThreadsPerGroup[DeviceId]);
1985     }
1986   } else {
1987     DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
1988     DP("Error getting max block dimension, use default: %d\n",
1989        RTLDeviceInfoTy::MaxWgSize);
1990   }
1991
1992   // Get wavefront size
1993   uint32_t WavefrontSize = 0;
1994   Err =
1995       hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
1996   if (Err == HSA_STATUS_SUCCESS) {
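    // Note: HSA reports the agent's native wavefront size here; this is
    // typically 64 on GCN/CDNA devices, while RDNA devices may report 32.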
1997 DP("Queried wavefront size: %d\n", WavefrontSize); 1998 DeviceInfo.WarpSize[DeviceId] = WavefrontSize; 1999 } else { 2000 // TODO: Burn the wavefront size into the code object 2001 DP("Warning: Unknown wavefront size, assuming 64\n"); 2002 DeviceInfo.WarpSize[DeviceId] = 64; 2003 } 2004 2005 // Adjust teams to the env variables 2006 2007 if (DeviceInfo.Env.TeamLimit > 0 && 2008 (enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId], 2009 DeviceInfo.Env.TeamLimit))) { 2010 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", 2011 DeviceInfo.Env.TeamLimit); 2012 } 2013 2014 // Set default number of teams 2015 if (DeviceInfo.Env.NumTeams > 0) { 2016 DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams; 2017 DP("Default number of teams set according to environment %d\n", 2018 DeviceInfo.Env.NumTeams); 2019 } else { 2020 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); 2021 int TeamsPerCU = DefaultTeamsPerCU; 2022 if (TeamsPerCUEnvStr) { 2023 TeamsPerCU = std::stoi(TeamsPerCUEnvStr); 2024 } 2025 2026 DeviceInfo.NumTeams[DeviceId] = 2027 TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId]; 2028 DP("Default number of teams = %d * number of compute units %d\n", 2029 TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]); 2030 } 2031 2032 if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId], 2033 DeviceInfo.GroupsPerDevice[DeviceId])) { 2034 DP("Default number of teams exceeds device limit, capping at %d\n", 2035 DeviceInfo.GroupsPerDevice[DeviceId]); 2036 } 2037 2038 // Adjust threads to the env variables 2039 if (DeviceInfo.Env.TeamThreadLimit > 0 && 2040 (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], 2041 DeviceInfo.Env.TeamThreadLimit))) { 2042 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", 2043 DeviceInfo.Env.TeamThreadLimit); 2044 } 2045 2046 // Set default number of threads 2047 DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; 2048 DP("Default number of threads set according to library's default %d\n", 2049 RTLDeviceInfoTy::DefaultWgSize); 2050 if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], 2051 DeviceInfo.ThreadsPerGroup[DeviceId])) { 2052 DP("Default number of threads exceeds device limit, capping at %d\n", 2053 DeviceInfo.ThreadsPerGroup[DeviceId]); 2054 } 2055 2056 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", 2057 DeviceId, DeviceInfo.GroupsPerDevice[DeviceId], 2058 DeviceInfo.ThreadsPerGroup[DeviceId]); 2059 2060 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, 2061 DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId], 2062 DeviceInfo.GroupsPerDevice[DeviceId], 2063 DeviceInfo.GroupsPerDevice[DeviceId] * 2064 DeviceInfo.ThreadsPerGroup[DeviceId]); 2065 2066 return OFFLOAD_SUCCESS; 2067 } 2068 2069 static __tgt_target_table * 2070 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); 2071 2072 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, 2073 __tgt_device_image *Image) { 2074 DeviceInfo.LoadRunLock.lock(); 2075 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); 2076 DeviceInfo.LoadRunLock.unlock(); 2077 return Res; 2078 } 2079 2080 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, 2081 __tgt_device_image *Image) { 2082 // This function loads the device image onto gpu[DeviceId] and does other 2083 // per-image initialization work. 
Specifically: 2084 // 2085 // - Initialize an DeviceEnvironmentTy instance embedded in the 2086 // image at the symbol "omptarget_device_environment" 2087 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. 2088 // 2089 // - Allocate a large array per-gpu (could be moved to init_device) 2090 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size 2091 // - Allocate at least that many bytes of gpu memory 2092 // - Zero initialize it 2093 // - Write the pointer to the symbol omptarget_nvptx_device_State 2094 // 2095 // - Pulls some per-kernel information together from various sources and 2096 // records it in the KernelsList for quicker access later 2097 // 2098 // The initialization can be done before or after loading the image onto the 2099 // gpu. This function presently does a mixture. Using the hsa api to get/set 2100 // the information is simpler to implement, in exchange for more complicated 2101 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes 2102 // back from the gpu vs a hashtable lookup on the host. 2103 2104 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; 2105 2106 DeviceInfo.clearOffloadEntriesTable(DeviceId); 2107 2108 // We do not need to set the ELF version because the caller of this function 2109 // had to do that to decide the right runtime to use 2110 2111 if (!elfMachineIdIsAmdgcn(Image)) 2112 return NULL; 2113 2114 { 2115 auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices, 2116 DeviceInfo.Env.DynamicMemSize, Image, ImgSize); 2117 2118 auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId]; 2119 auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; 2120 hsa_status_t Err = moduleRegisterFromMemoryToPlace( 2121 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, 2122 [&](void *Data, size_t Size) { 2123 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { 2124 __atomic_store_n(&DeviceInfo.HostcallRequired, true, 2125 __ATOMIC_RELEASE); 2126 } 2127 return Env.beforeLoading(Data, Size); 2128 }, 2129 DeviceInfo.HSAExecutables); 2130 2131 check("Module registering", Err); 2132 if (Err != HSA_STATUS_SUCCESS) { 2133 const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str(); 2134 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); 2135 2136 if (strcmp(DeviceName, ElfName) != 0) { 2137 DP("Possible gpu arch mismatch: device:%s, image:%s please check" 2138 " compiler flag: -march=<gpu>\n", 2139 DeviceName, ElfName); 2140 } else { 2141 DP("Error loading image onto GPU: %s\n", get_error_string(Err)); 2142 } 2143 2144 return NULL; 2145 } 2146 2147 Err = Env.afterLoading(); 2148 if (Err != HSA_STATUS_SUCCESS) { 2149 return NULL; 2150 } 2151 } 2152 2153 DP("AMDGPU module successfully loaded!\n"); 2154 2155 { 2156 // the device_State array is either large value in bss or a void* that 2157 // needs to be assigned to a pointer to an array of size device_state_bytes 2158 // If absent, it has been deadstripped and needs no setup. 
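    // Two layouts are handled below: if the symbol is pointer-sized it is a
    // void* that must be pointed at freshly allocated, zeroed device memory;
    // if it is larger, it is the bss array itself and needs no further setup.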
2159
2160     void *StatePtr;
2161     uint32_t StatePtrSize;
2162     auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
2163     hsa_status_t Err = interop_hsa_get_symbol_info(
2164         SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2165         &StatePtrSize);
2166
2167     if (Err != HSA_STATUS_SUCCESS) {
2168       DP("No device_state symbol found, skipping initialization\n");
2169     } else {
2170       if (StatePtrSize < sizeof(void *)) {
2171         DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
2172            sizeof(void *));
2173         return NULL;
2174       }
2175
2176       // if it's larger than a void*, assume it's a bss array and no further
2177       // initialization is required. Only try to set up a pointer for
2178       // sizeof(void*)
2179       if (StatePtrSize == sizeof(void *)) {
2180         uint64_t DeviceStateBytes =
2181             getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2182         if (DeviceStateBytes == 0) {
2183           DP("Can't initialize device_State, missing size information\n");
2184           return NULL;
2185         }
2186
2187         auto &DSS = DeviceInfo.DeviceStateStore[DeviceId];
2188         if (DSS.first.get() == nullptr) {
2189           assert(DSS.second == 0);
2190           void *Ptr = NULL;
2191           hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2192           if (Err != HSA_STATUS_SUCCESS) {
2193             DP("Failed to allocate device_state array\n");
2194             return NULL;
2195           }
2196           DSS = {
2197               std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2198               DeviceStateBytes,
2199           };
2200         }
2201
2202         void *Ptr = DSS.first.get();
2203         if (DeviceStateBytes != DSS.second) {
2204           DP("Inconsistent sizes of device_State unsupported\n");
2205           return NULL;
2206         }
2207
2208         // write ptr to device memory so it can be used by later kernels
2209         Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
2210                                                  DeviceId);
2211         if (Err != HSA_STATUS_SUCCESS) {
2212           DP("memcpy install of state_ptr failed\n");
2213           return NULL;
2214         }
2215       }
2216     }
2217   }
2218
2219   // Here, we take advantage of the data that is appended after img_end to get
2220   // the names of the symbols we need to load. This data consists of the host
2221   // entries begin and end as well as the target name (see the offloading
2222   // linker script creation in the clang compiler).
2223
2224   // Find the symbols in the module by name. The name can be obtained by
2225   // concatenating the host entry name with the target name.
2226
2227   __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2228   __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2229
2230   for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2231
2232     if (!E->addr) {
2233       // The host should always have something in the address to
2234       // uniquely identify the target region.
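      // Abort the load here rather than registering a partially initialized
      // offload table.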
2235 DP("Analyzing host entry '<null>' (size = %lld)...\n", 2236 (unsigned long long)E->size); 2237 return NULL; 2238 } 2239 2240 if (E->size) { 2241 __tgt_offload_entry Entry = *E; 2242 2243 void *Varptr; 2244 uint32_t Varsize; 2245 2246 auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId]; 2247 hsa_status_t Err = interop_hsa_get_symbol_info( 2248 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); 2249 2250 if (Err != HSA_STATUS_SUCCESS) { 2251 // Inform the user what symbol prevented offloading 2252 DP("Loading global '%s' (Failed)\n", E->name); 2253 return NULL; 2254 } 2255 2256 if (Varsize != E->size) { 2257 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, 2258 Varsize, E->size); 2259 return NULL; 2260 } 2261 2262 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 2263 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); 2264 Entry.addr = (void *)Varptr; 2265 2266 DeviceInfo.addOffloadEntry(DeviceId, Entry); 2267 2268 if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 2269 E->flags & OMP_DECLARE_TARGET_LINK) { 2270 // If unified memory is present any target link variables 2271 // can access host addresses directly. There is no longer a 2272 // need for device copies. 2273 Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr, 2274 sizeof(void *), DeviceId); 2275 if (Err != HSA_STATUS_SUCCESS) 2276 DP("Error when copying USM\n"); 2277 DP("Copy linked variable host address (" DPxMOD ")" 2278 "to device address (" DPxMOD ")\n", 2279 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); 2280 } 2281 2282 continue; 2283 } 2284 2285 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); 2286 2287 // errors in kernarg_segment_size previously treated as = 0 (or as undef) 2288 uint32_t KernargSegmentSize = 0; 2289 auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId]; 2290 hsa_status_t Err = HSA_STATUS_SUCCESS; 2291 if (!E->name) { 2292 Err = HSA_STATUS_ERROR; 2293 } else { 2294 std::string KernelStr = std::string(E->name); 2295 auto It = KernelInfoMap.find(KernelStr); 2296 if (It != KernelInfoMap.end()) { 2297 atl_kernel_info_t Info = It->second; 2298 KernargSegmentSize = Info.kernel_segment_size; 2299 } else { 2300 Err = HSA_STATUS_ERROR; 2301 } 2302 } 2303 2304 // default value GENERIC (in case symbol is missing from cubin file) 2305 llvm::omp::OMPTgtExecModeFlags ExecModeVal = 2306 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; 2307 2308 // get flat group size if present, else Default_WG_Size 2309 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2310 2311 // get Kernel Descriptor if present. 2312 // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp 2313 struct KernDescValType { 2314 uint16_t Version; 2315 uint16_t TSize; 2316 uint16_t WGSize; 2317 }; 2318 struct KernDescValType KernDescVal; 2319 std::string KernDescNameStr(E->name); 2320 KernDescNameStr += "_kern_desc"; 2321 const char *KernDescName = KernDescNameStr.c_str(); 2322 2323 void *KernDescPtr; 2324 uint32_t KernDescSize; 2325 void *CallStackAddr = nullptr; 2326 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, 2327 &KernDescPtr, &KernDescSize); 2328 2329 if (Err == HSA_STATUS_SUCCESS) { 2330 if ((size_t)KernDescSize != sizeof(KernDescVal)) 2331 DP("Loading global computation properties '%s' - size mismatch (%u != " 2332 "%lu)\n", 2333 KernDescName, KernDescSize, sizeof(KernDescVal)); 2334 2335 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); 2336 2337 // Check structure size against recorded size. 
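      // A mismatch is only reported for debugging; execution continues with
      // the values read above.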
2338 if ((size_t)KernDescSize != KernDescVal.TSize) 2339 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", 2340 sizeof(KernDescVal), KernDescVal.TSize, KernDescName); 2341 2342 DP("After loading global for %s KernDesc \n", KernDescName); 2343 DP("KernDesc: Version: %d\n", KernDescVal.Version); 2344 DP("KernDesc: TSize: %d\n", KernDescVal.TSize); 2345 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); 2346 2347 if (KernDescVal.WGSize == 0) { 2348 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; 2349 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); 2350 } 2351 WGSizeVal = KernDescVal.WGSize; 2352 DP("WGSizeVal %d\n", WGSizeVal); 2353 check("Loading KernDesc computation property", Err); 2354 } else { 2355 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); 2356 2357 // Flat group size 2358 std::string WGSizeNameStr(E->name); 2359 WGSizeNameStr += "_wg_size"; 2360 const char *WGSizeName = WGSizeNameStr.c_str(); 2361 2362 void *WGSizePtr; 2363 uint32_t WGSize; 2364 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, 2365 &WGSizePtr, &WGSize); 2366 2367 if (Err == HSA_STATUS_SUCCESS) { 2368 if ((size_t)WGSize != sizeof(int16_t)) { 2369 DP("Loading global computation properties '%s' - size mismatch (%u " 2370 "!= " 2371 "%lu)\n", 2372 WGSizeName, WGSize, sizeof(int16_t)); 2373 return NULL; 2374 } 2375 2376 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); 2377 2378 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); 2379 2380 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || 2381 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { 2382 DP("Error wrong WGSize value specified in HSA code object file: " 2383 "%d\n", 2384 WGSizeVal); 2385 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2386 } 2387 } else { 2388 DP("Warning: Loading WGSize '%s' - symbol not found, " 2389 "using default value %d\n", 2390 WGSizeName, WGSizeVal); 2391 } 2392 2393 check("Loading WGSize computation property", Err); 2394 } 2395 2396 // Read execution mode from global in binary 2397 std::string ExecModeNameStr(E->name); 2398 ExecModeNameStr += "_exec_mode"; 2399 const char *ExecModeName = ExecModeNameStr.c_str(); 2400 2401 void *ExecModePtr; 2402 uint32_t VarSize; 2403 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, 2404 &ExecModePtr, &VarSize); 2405 2406 if (Err == HSA_STATUS_SUCCESS) { 2407 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 2408 DP("Loading global computation properties '%s' - size mismatch(%u != " 2409 "%lu)\n", 2410 ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); 2411 return NULL; 2412 } 2413 2414 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); 2415 2416 DP("After loading global for %s ExecMode = %d\n", ExecModeName, 2417 ExecModeVal); 2418 2419 if (ExecModeVal < 0 || 2420 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { 2421 DP("Error wrong exec_mode value specified in HSA code object file: " 2422 "%d\n", 2423 ExecModeVal); 2424 return NULL; 2425 } 2426 } else { 2427 DP("Loading global exec_mode '%s' - symbol missing, using default " 2428 "value " 2429 "GENERIC (1)\n", 2430 ExecModeName); 2431 } 2432 check("Loading computation property", Err); 2433 2434 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, 2435 CallStackAddr, E->name, KernargSegmentSize, 2436 DeviceInfo.KernArgPool)); 2437 __tgt_offload_entry Entry = *E; 2438 Entry.addr = (void *)&KernelsList.back(); 2439 DeviceInfo.addOffloadEntry(DeviceId, Entry); 2440 
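    // Kernel entries are registered with Entry.addr pointing at the KernelTy
    // record rather than a device address; libomptarget hands this pointer
    // back as TgtEntryPtr when launching the region.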
DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); 2441 } 2442 2443 return DeviceInfo.getOffloadEntriesTable(DeviceId); 2444 } 2445 2446 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { 2447 void *Ptr = NULL; 2448 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2449 2450 if (Kind != TARGET_ALLOC_DEFAULT) { 2451 REPORT("Invalid target data allocation kind or requested allocator not " 2452 "implemented yet\n"); 2453 return NULL; 2454 } 2455 2456 hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); 2457 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); 2458 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, 2459 (long long unsigned)(Elf64_Addr)Ptr); 2460 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; 2461 return Ptr; 2462 } 2463 2464 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, 2465 int64_t Size) { 2466 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2467 __tgt_async_info AsyncInfo; 2468 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); 2469 if (Rc != OFFLOAD_SUCCESS) 2470 return OFFLOAD_FAIL; 2471 2472 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2473 } 2474 2475 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, 2476 int64_t Size, __tgt_async_info *AsyncInfo) { 2477 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2478 if (AsyncInfo) { 2479 initAsyncInfo(AsyncInfo); 2480 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); 2481 } 2482 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); 2483 } 2484 2485 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, 2486 int64_t Size) { 2487 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2488 __tgt_async_info AsyncInfo; 2489 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); 2490 if (Rc != OFFLOAD_SUCCESS) 2491 return OFFLOAD_FAIL; 2492 2493 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2494 } 2495 2496 int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, 2497 int64_t Size, 2498 __tgt_async_info *AsyncInfo) { 2499 assert(AsyncInfo && "AsyncInfo is nullptr"); 2500 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2501 initAsyncInfo(AsyncInfo); 2502 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); 2503 } 2504 2505 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) { 2506 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2507 hsa_status_t Err; 2508 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); 2509 Err = core::Runtime::Memfree(TgtPtr); 2510 if (Err != HSA_STATUS_SUCCESS) { 2511 DP("Error when freeing CUDA memory\n"); 2512 return OFFLOAD_FAIL; 2513 } 2514 return OFFLOAD_SUCCESS; 2515 } 2516 2517 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, 2518 void **TgtArgs, ptrdiff_t *TgtOffsets, 2519 int32_t ArgNum, int32_t NumTeams, 2520 int32_t ThreadLimit, 2521 uint64_t LoopTripcount) { 2522 2523 DeviceInfo.LoadRunLock.lock_shared(); 2524 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, 2525 ArgNum, NumTeams, ThreadLimit, LoopTripcount); 2526 2527 DeviceInfo.LoadRunLock.unlock_shared(); 2528 return Res; 2529 } 2530 2531 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, 2532 void **TgtArgs, ptrdiff_t *TgtOffsets, 2533 int32_t ArgNum) { 
2534 // use one team and one thread 2535 // fix thread num 2536 int32_t TeamNum = 1; 2537 int32_t ThreadLimit = 0; // use default 2538 return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs, 2539 TgtOffsets, ArgNum, TeamNum, 2540 ThreadLimit, 0); 2541 } 2542 2543 int32_t __tgt_rtl_run_target_team_region_async( 2544 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, 2545 int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, 2546 uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) { 2547 assert(AsyncInfo && "AsyncInfo is nullptr"); 2548 initAsyncInfo(AsyncInfo); 2549 2550 DeviceInfo.LoadRunLock.lock_shared(); 2551 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, 2552 ArgNum, NumTeams, ThreadLimit, LoopTripcount); 2553 2554 DeviceInfo.LoadRunLock.unlock_shared(); 2555 return Res; 2556 } 2557 2558 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, 2559 void **TgtArgs, ptrdiff_t *TgtOffsets, 2560 int32_t ArgNum, 2561 __tgt_async_info *AsyncInfo) { 2562 // use one team and one thread 2563 // fix thread num 2564 int32_t TeamNum = 1; 2565 int32_t ThreadLimit = 0; // use default 2566 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs, 2567 TgtOffsets, ArgNum, TeamNum, 2568 ThreadLimit, 0, AsyncInfo); 2569 } 2570 2571 int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { 2572 assert(AsyncInfo && "AsyncInfo is nullptr"); 2573 2574 // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant 2575 // is not ensured by devices.cpp for amdgcn 2576 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); 2577 if (AsyncInfo->Queue) { 2578 finiAsyncInfo(AsyncInfo); 2579 } 2580 return OFFLOAD_SUCCESS; 2581 } 2582 2583 void __tgt_rtl_print_device_info(int32_t DeviceId) { 2584 // TODO: Assertion to see if DeviceId is correct 2585 // NOTE: We don't need to set context for print device info. 2586 2587 DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]); 2588 } 2589 2590 } // extern "C" 2591