//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for AMD hsa machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <libelf.h>
#include <list>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

#include "impl_runtime.h"
#include "interop_hsa.h"

#include "internal.h"
#include "rt.h"

#include "DeviceEnvironment.h"
#include "get_elf_mach_gfx_name.h"
#include "omptargetplugin.h"
#include "print_tracing.h"

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"

using namespace llvm;

// hostrpc interface, FIXME: consider moving to its own include. These are
// statically linked into amdgpu/plugin if present from hostrpc_services.a,
// linked as --whole-archive to override the weak symbols that are used to
// implement a fallback for toolchains that do not yet have a hostrpc library.
extern "C" {
uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
                               uint32_t DeviceId);
hsa_status_t hostrpc_init();
hsa_status_t hostrpc_terminate();

__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
  return HSA_STATUS_SUCCESS;
}
__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
                                                     uint32_t DeviceId) {
  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
     "missing\n",
     DeviceId);
  return 0;
}
}

// Heuristic parameters used for kernel launch
// Number of teams per CU to allow scheduling flexibility
static const unsigned DefaultTeamsPerCU = 4;

int print_kernel_trace;

#ifdef OMPTARGET_DEBUG
#define check(msg, status)                                                     \
  if (status != HSA_STATUS_SUCCESS) {                                          \
    DP(#msg " failed\n");                                                      \
  } else {                                                                     \
    DP(#msg " succeeded\n");                                                   \
  }
#else
#define check(msg, status)                                                     \
  {}
#endif

#include "elf_common.h"

namespace hsa {
template <typename C> hsa_status_t iterate_agents(C Cb) {
  auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(Agent);
  };
  return hsa_iterate_agents(L, static_cast<void *>(&Cb));
}

template <typename C>
hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
  auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(MemoryPool);
  };

  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
}

} // namespace hsa

/// Keep entries table per device
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

struct KernelArgPool {
private:
  static pthread_mutex_t Mutex;

public:
  uint32_t KernargSegmentSize;
  void *KernargRegion = nullptr;
  std::queue<int> FreeKernargSegments;

  uint32_t kernargSizeIncludingImplicit() {
    return KernargSegmentSize + sizeof(impl_implicit_args_t);
  }

  ~KernelArgPool() {
    if (KernargRegion) {
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
      : KernargSegmentSize(KernargSegmentSize) {

    // impl uses one pool per kernel for all gpus, with a fixed upper size
    // preserving that exact scheme here, including the queue<int>

    hsa_status_t Err = hsa_amd_memory_pool_allocate(
        MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
        &KernargRegion);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
      KernargRegion = nullptr; // paranoid
      return;
    }

    Err = core::allow_access_to_all_gpu_agents(KernargRegion);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(Err));
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(R));
      }
      KernargRegion = nullptr;
      return;
    }

    for (int I = 0; I < MAX_NUM_KERNELS; I++) {
      FreeKernargSegments.push(I);
    }
  }

  void *allocate(uint64_t ArgNum) {
    assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
    Lock L(&Mutex);
    void *Res = nullptr;
    if (!FreeKernargSegments.empty()) {

      int FreeIdx = FreeKernargSegments.front();
      Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
                                (FreeIdx * kernargSizeIncludingImplicit()));
      assert(FreeIdx == pointerToIndex(Res));
      FreeKernargSegments.pop();
    }
    return Res;
  }

  void deallocate(void *Ptr) {
    Lock L(&Mutex);
    int Idx = pointerToIndex(Ptr);
    FreeKernargSegments.push(Idx);
  }

private:
  int pointerToIndex(void *Ptr) {
    ptrdiff_t Bytes =
        static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
    assert(Bytes >= 0);
    assert(Bytes % kernargSizeIncludingImplicit() == 0);
    return Bytes / kernargSizeIncludingImplicit();
  }
  struct Lock {
    Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
    ~Lock() { pthread_mutex_unlock(M); }
    pthread_mutex_t *M;
  };
};
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;
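
// Illustrative sketch (not compiled into the plugin): how a caller is
// expected to use KernelArgPool. Each slot returned by allocate() holds the
// explicit pointer arguments followed by the impl_implicit_args_t block;
// deallocate() returns the slot to the free list. The kernel name
// "example_kernel" and the two device pointers are hypothetical.
#if 0
static void exampleKernargRoundTrip(void *DevPtrA, void *DevPtrB) {
  KernelArgPool *Pool = KernelArgPoolMap.at("example_kernel").get();
  // Two explicit arguments => KernargSegmentSize == 2 * sizeof(void *).
  void *Slot = Pool->allocate(2);
  if (!Slot)
    return; // all MAX_NUM_KERNELS slots are currently in use
  void *Args[2] = {DevPtrA, DevPtrB};
  memcpy(Slot, Args, sizeof(Args)); // explicit args go at the start of the slot
  // ... enqueue a dispatch packet whose kernarg_address is Slot ...
  Pool->deallocate(Slot); // once the completion signal has fired
}
#endif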

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
  int16_t ConstWGSize;
  int32_t DeviceId;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
           int32_t DeviceId, void *CallStackAddr, const char *Name,
           uint32_t KernargSegmentSize,
           hsa_amd_memory_pool_t &KernArgMemoryPool)
      : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
        DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
    DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);

    std::string N(Name);
    if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
      KernelArgPoolMap.insert(
          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
                                KernargSegmentSize, KernArgMemoryPool)))); 
    }
  }
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

template <typename Callback> static hsa_status_t findAgents(Callback CB) {

  hsa_status_t Err =
      hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
        hsa_device_type_t DeviceType;
        // get_info fails iff HSA runtime not yet initialized
        hsa_status_t Err =
            hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);

        if (Err != HSA_STATUS_SUCCESS) {
          if (print_kernel_trace > 0)
            DP("rtl.cpp: err %s\n", get_error_string(Err));

          return Err;
        }

        CB(DeviceType, Agent);
        return HSA_STATUS_SUCCESS;
      });

  // iterate_agents fails iff HSA runtime not yet initialized
  if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
    DP("rtl.cpp: err %s\n", get_error_string(Err));
  }

  return Err;
}

static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
                          void *Data) {
  if (Status != HSA_STATUS_SUCCESS) {
    const char *StatusString;
    if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
      StatusString = "unavailable";
    }
    DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
       Status, StatusString);
    abort();
  }
}

namespace core {
namespace {

bool checkResult(hsa_status_t Err, const char *ErrMsg) {
  if (Err == HSA_STATUS_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  REPORT("%s", get_error_string(Err));
  return false;
}

void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
  __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
}

uint16_t createHeader() {
  uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
  return Header;
}

hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
  bool AllocAllowed = false;
  hsa_status_t Err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &AllocAllowed);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Alloc allowed in memory pool check failed: %s\n",
       get_error_string(Err));
    return Err;
  }

  size_t Size = 0;
  Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                     &Size);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Get memory pool size failed: %s\n", get_error_string(Err));
    return Err;
  }

  return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}

hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
  std::vector<hsa_amd_memory_pool_t> *Result =
      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);

  hsa_status_t Err;
  if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
    return Err;
  }

  Result->push_back(MemoryPool);
  return HSA_STATUS_SUCCESS;
}

} // namespace
} // namespace core

struct EnvironmentVariables {
  int NumTeams;
  int TeamLimit;
  int TeamThreadLimit;
  int MaxTeamsDefault;
  int DynamicMemSize;
};

template <uint32_t wavesize>
static constexpr const llvm::omp::GV &getGridValue() {
  return llvm::omp::getAMDGPUGridValues<wavesize>();
}

struct HSALifetime {
  // Wrapper around HSA used to ensure it is constructed before other types
  // and destructed after, which means said other types can use raii for
  // cleanup without risking running outside of the lifetime of HSA
  const hsa_status_t S;

  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
  HSALifetime() : S(hsa_init()) {}

  ~HSALifetime() {
    if (S == HSA_STATUS_SUCCESS) {
      hsa_status_t Err = hsa_shut_down();
      if (Err != HSA_STATUS_SUCCESS) {
        // Can't call into HSA to get a string from the integer
        DP("Shutting down HSA failed: %d\n", Err);
      }
    }
  }
};

// Handle scheduling of multiple hsa_queue's per device to
// multiple threads (one scheduler per device)
class HSAQueueScheduler {
public:
  HSAQueueScheduler() : Current(0) {}

  HSAQueueScheduler(const HSAQueueScheduler &) = delete;

  HSAQueueScheduler(HSAQueueScheduler &&Q) {
    Current = Q.Current.load();
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      HSAQueues[I] = Q.HSAQueues[I];
      Q.HSAQueues[I] = nullptr;
    }
  }

  // \return false if any HSA queue creation fails
  bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      hsa_queue_t *Q = nullptr;
      hsa_status_t Rc =
          hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
      if (Rc != HSA_STATUS_SUCCESS) {
        DP("Failed to create HSA queue %d\n", I);
        return false;
      }
      HSAQueues[I] = Q;
    }
    return true;
  }

  ~HSAQueueScheduler() {
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      if (HSAQueues[I]) {
        hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
        if (Err != HSA_STATUS_SUCCESS)
          DP("Error destroying HSA queue\n");
      }
    }
  }

  // \return next queue to use for device
  hsa_queue_t *next() {
    return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
                     NUM_QUEUES_PER_DEVICE];
  }

private:
  // Number of queues per device
  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
  std::atomic<uint8_t> Current;
};

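// Illustrative sketch (not compiled): consecutive next() calls rotate through
// the NUM_QUEUES_PER_DEVICE queues, so host threads launching concurrently on
// the same device tend to land on different hardware queues. The Scheduler
// argument below is hypothetical.
#if 0
static void exampleQueueRoundRobin(HSAQueueScheduler &Scheduler) {
  hsa_queue_t *Q0 = Scheduler.next(); // queue 0
  hsa_queue_t *Q1 = Scheduler.next(); // queue 1
  hsa_queue_t *Q2 = Scheduler.next(); // queue 2
  hsa_queue_t *Q3 = Scheduler.next(); // queue 3
  hsa_queue_t *Q4 = Scheduler.next(); // wraps back around to queue 0
  (void)Q0; (void)Q1; (void)Q2; (void)Q3; (void)Q4;
}
#endif
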
/// Class containing all the device information
class RTLDeviceInfoTy : HSALifetime {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

  struct QueueDeleter {
    void operator()(hsa_queue_t *Q) {
      if (Q) {
        hsa_status_t Err = hsa_queue_destroy(Q);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("Error destroying hsa queue: %s\n", get_error_string(Err));
        }
      }
    }
  };

public:
  bool ConstructionSucceeded = false;

  // load binary populates symbol tables and mutates various global state
  // run uses those symbol tables
  std::shared_timed_mutex LoadRunLock;

  int NumberOfDevices = 0;

  // GPU devices
  std::vector<hsa_agent_t> HSAAgents;
  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu

  // CPUs
  std::vector<hsa_agent_t> CPUAgents;

  // Device properties
  std::vector<int> ComputeUnits;
  std::vector<int> GroupsPerDevice;
  std::vector<int> ThreadsPerGroup;
  std::vector<int> WarpSize;
  std::vector<std::string> GPUName;
  std::vector<std::string> TargetID;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  EnvironmentVariables Env;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  // Resource pools
  SignalPoolT FreeSignalPool;

  bool HostcallRequired = false;

  std::vector<hsa_executable_t> HSAExecutables;

  std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
  std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

  hsa_amd_memory_pool_t KernArgPool;

  // fine grained memory pool for host allocations
  hsa_amd_memory_pool_t HostFineGrainedMemoryPool;

  // fine and coarse-grained memory pools per offloading device
  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;

  struct ImplFreePtrDeletor {
    void operator()(void *P) {
      core::Runtime::Memfree(P); // ignore failure to free
    }
  };

  // device_State shared across loaded binaries, error if inconsistent size
  std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
      DeviceStateStore;

  static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K needed to fit in uint16
  static const int DefaultNumTeams = 128;

  // These need to be per-device since different devices can have different
  // wave sizes, but are currently the same number for each so that refactor
  // can be postponed.
  static_assert(getGridValue<32>().GV_Max_Teams ==
                    getGridValue<64>().GV_Max_Teams,
                "");
  static const int MaxTeams = getGridValue<64>().GV_Max_Teams;

  static_assert(getGridValue<32>().GV_Max_WG_Size ==
                    getGridValue<64>().GV_Max_WG_Size,
                "");
  static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;

  static_assert(getGridValue<32>().GV_Default_WG_Size ==
                    getGridValue<64>().GV_Default_WG_Size,
                "");
  static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;

  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
                                      hsa_agent_t, hsa_amd_memory_pool_t);
  hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
                                    MemcpyFunc Func, int32_t DeviceId) {
    hsa_agent_t Agent = HSAAgents[DeviceId];
    hsa_signal_t S = FreeSignalPool.pop();
    if (S.handle == 0) {
      return HSA_STATUS_ERROR;
    }
    hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
    FreeSignalPool.push(S);
    return R;
  }

  hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
                                       int32_t DeviceId) {
    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
  }

  hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
                                       int32_t DeviceId) {
    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
  }

  static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
    char TmpChar[1000];
    uint16_t Major, Minor;
    uint32_t TmpUInt;
    uint32_t TmpUInt2;
    uint32_t CacheSize[4];
    bool TmpBool;
    uint16_t WorkgroupMaxDim[3];
    hsa_dim3_t GridMaxDim;

    // Getting basic information about HSA and Device
    core::checkResult(
        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
        "Error from hsa_system_get_info when obtaining "
        "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
    core::checkResult(
        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
        "Error from hsa_system_get_info when obtaining "
        "HSA_SYSTEM_INFO_VERSION_MINOR\n");
    printf("    HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
    printf("    HSA OpenMP Device Number: \t\t%d \n", DeviceId);
    core::checkResult(
        hsa_agent_get_info(
            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
    printf("    Product Name: \t\t\t%s \n", TmpChar);
    core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AGENT_INFO_NAME\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_VENDOR_NAME\n");
    printf("    Vendor Name: \t\t\t%s \n", TmpChar);
    hsa_device_type_t DevType;
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_DEVICE\n");
    printf("    Device Type: \t\t\t%s \n",
           DevType == HSA_DEVICE_TYPE_CPU
               ? "CPU"
               : (DevType == HSA_DEVICE_TYPE_GPU
                      ? "GPU"
                      : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
"DSP" : "UNKNOWN"))); 606 core::checkResult( 607 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), 608 "Error returned from hsa_agent_get_info when obtaining " 609 "HSA_AGENT_INFO_QUEUES_MAX\n"); 610 printf(" Max Queues: \t\t\t%u \n", TmpUInt); 611 core::checkResult( 612 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), 613 "Error returned from hsa_agent_get_info when obtaining " 614 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); 615 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); 616 core::checkResult( 617 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), 618 "Error returned from hsa_agent_get_info when obtaining " 619 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); 620 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); 621 622 // Getting cache information 623 printf(" Cache:\n"); 624 625 // FIXME: This is deprecated according to HSA documentation. But using 626 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during 627 // runtime. 628 core::checkResult( 629 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), 630 "Error returned from hsa_agent_get_info when obtaining " 631 "HSA_AGENT_INFO_CACHE_SIZE\n"); 632 633 for (int I = 0; I < 4; I++) { 634 if (CacheSize[I]) { 635 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); 636 } 637 } 638 639 core::checkResult( 640 hsa_agent_get_info(Agent, 641 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, 642 &TmpUInt), 643 "Error returned from hsa_agent_get_info when obtaining " 644 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); 645 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); 646 core::checkResult( 647 hsa_agent_get_info( 648 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, 649 &TmpUInt), 650 "Error returned from hsa_agent_get_info when obtaining " 651 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); 652 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); 653 core::checkResult( 654 hsa_agent_get_info( 655 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, 656 &TmpUInt), 657 "Error returned from hsa_agent_get_info when obtaining " 658 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); 659 printf(" Compute Units: \t\t\t%u \n", TmpUInt); 660 core::checkResult(hsa_agent_get_info( 661 Agent, 662 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, 663 &TmpUInt), 664 "Error returned from hsa_agent_get_info when obtaining " 665 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 666 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); 667 core::checkResult( 668 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), 669 "Error returned from hsa_agent_get_info when obtaining " 670 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 671 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? 
"TRUE" : "FALSE")); 672 core::checkResult( 673 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), 674 "Error returned from hsa_agent_get_info when obtaining " 675 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); 676 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); 677 core::checkResult( 678 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), 679 "Error returned from hsa_agent_get_info when obtaining " 680 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); 681 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); 682 core::checkResult(hsa_agent_get_info(Agent, 683 HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 684 WorkgroupMaxDim), 685 "Error returned from hsa_agent_get_info when obtaining " 686 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); 687 printf(" Workgroup Max Size per Dimension:\n"); 688 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); 689 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); 690 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); 691 core::checkResult(hsa_agent_get_info( 692 Agent, 693 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, 694 &TmpUInt), 695 "Error returned from hsa_agent_get_info when obtaining " 696 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); 697 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); 698 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); 699 core::checkResult( 700 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), 701 "Error returned from hsa_agent_get_info when obtaining " 702 "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); 703 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); 704 core::checkResult( 705 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), 706 "Error returned from hsa_agent_get_info when obtaining " 707 "HSA_AGENT_INFO_GRID_MAX_DIM\n"); 708 printf(" Grid Max Size per Dimension: \t\t\n"); 709 printf(" x: \t\t\t\t%u\n", GridMaxDim.x); 710 printf(" y: \t\t\t\t%u\n", GridMaxDim.y); 711 printf(" z: \t\t\t\t%u\n", GridMaxDim.z); 712 core::checkResult( 713 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), 714 "Error returned from hsa_agent_get_info when obtaining " 715 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); 716 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); 717 718 printf(" Memory Pools:\n"); 719 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { 720 std::string TmpStr; 721 size_t Size; 722 bool Alloc, Access; 723 hsa_amd_segment_t Segment; 724 hsa_amd_memory_pool_global_flag_t GlobalFlags; 725 core::checkResult( 726 hsa_amd_memory_pool_get_info( 727 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), 728 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 729 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); 730 core::checkResult(hsa_amd_memory_pool_get_info( 731 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), 732 "Error returned from hsa_amd_memory_pool_get_info when " 733 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); 734 735 switch (Segment) { 736 case HSA_AMD_SEGMENT_GLOBAL: 737 TmpStr = "GLOBAL; FLAGS: "; 738 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) 739 TmpStr += "KERNARG, "; 740 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) 741 TmpStr += "FINE GRAINED, "; 742 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) 743 TmpStr += "COARSE GRAINED, "; 744 break; 745 case HSA_AMD_SEGMENT_READONLY: 746 TmpStr = "READONLY"; 747 break; 748 case HSA_AMD_SEGMENT_PRIVATE: 749 TmpStr = "PRIVATE"; 750 break; 751 case HSA_AMD_SEGMENT_GROUP: 752 TmpStr = "GROUP"; 753 break; 754 } 755 printf(" 

      core::checkResult(hsa_amd_memory_pool_get_info(
                            Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
                        "Error returned from hsa_amd_memory_pool_get_info when "
                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
      printf("        Size: \t\t\t\t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
      printf("        Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
      printf("        Runtime Alloc Granule: \t\t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
      printf("        Runtime Alloc alignment: \t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
      printf("        Accessible by all: \t\t %s\n",
             (Access ? "TRUE" : "FALSE"));

      return HSA_STATUS_SUCCESS;
    };
    // Iterate over all the memory regions for this agent. Get the memory
    // region type and size
    hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);

    printf("    ISAs:\n");
    auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
      char TmpChar[1000];
      core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
                        "Error returned from hsa_isa_get_info_alt when "
                        "obtaining HSA_ISA_INFO_NAME\n");
      printf("        Name: \t\t\t\t %s\n", TmpChar);

      return HSA_STATUS_SUCCESS;
    };
    // Iterate over all the ISAs supported by this agent.
    hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
  }

  // Record entry point associated with device
  void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    E.Entries.push_back(Entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    for (auto &It : E.Entries) {
      if (It.addr == Addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    int32_t Size = E.Entries.size();

    // Table is empty
    if (!Size)
      return 0;

    __tgt_offload_entry *Begin = &E.Entries[0];
    __tgt_offload_entry *End = &E.Entries[Size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = Begin;
    E.Table.EntriesEnd = ++End;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[DeviceId].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
    // KernelArgPoolMap.clear();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                                   unsigned int DeviceId) {
    assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
    uint32_t GlobalFlags = 0;
    hsa_status_t Err = hsa_amd_memory_pool_get_info(
        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

    if (Err != HSA_STATUS_SUCCESS) {
      return Err;
    }

    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
    }

    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
    for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
      hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
          Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
            hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
            if (ValidStatus != HSA_STATUS_SUCCESS) {
              DP("Alloc allowed in memory pool check failed: %s\n",
                 get_error_string(ValidStatus));
              return HSA_STATUS_SUCCESS;
            }
            return addDeviceMemoryPool(MemoryPool, DeviceId);
          });

      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Iterate all memory pools", get_error_string(Err));
        return Err;
      }
    }
    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
    std::vector<hsa_amd_memory_pool_t> HostPools;

    // collect all the "valid" pools for all the given agents.
    for (const auto &Agent : Agents) {
      hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
          Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
      if (Err != HSA_STATUS_SUCCESS) {
        DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
      }
    }

    // We need two fine-grained pools.
    //  1. One with kernarg flag set for storing kernel arguments
    //  2. Second for host allocations
    bool FineGrainedMemoryPoolSet = false;
    bool KernArgPoolSet = false;
    for (const auto &MemoryPool : HostPools) {
      hsa_status_t Err = HSA_STATUS_SUCCESS;
      uint32_t GlobalFlags = 0;
      Err = hsa_amd_memory_pool_get_info(
          MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
      if (Err != HSA_STATUS_SUCCESS) {
        DP("Get memory pool info failed: %s\n", get_error_string(Err));
        return Err;
      }

      if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
        if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
          KernArgPool = MemoryPool;
          KernArgPoolSet = true;
        }
        HostFineGrainedMemoryPool = MemoryPool;
        FineGrainedMemoryPoolSet = true;
      }
    }

    if (FineGrainedMemoryPoolSet && KernArgPoolSet)
      return HSA_STATUS_SUCCESS;

    return HSA_STATUS_ERROR;
  }

  hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
    assert(DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
           "Invalid device Id");
    return DeviceCoarseGrainedMemoryPools[DeviceId];
  }

  hsa_amd_memory_pool_t getHostMemoryPool() {
    return HostFineGrainedMemoryPool;
  }

  static int readEnv(const char *Env, int Default = -1) {
    const char *EnvStr = getenv(Env);
    int Res = Default;
    if (EnvStr) {
      Res = std::stoi(EnvStr);
      DP("Parsed %s=%d\n", Env, Res);
    }
    return Res;
  }

  RTLDeviceInfoTy() {
    DP("Start initializing " GETNAME(TARGET_NAME) "\n");

    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
    // anytime. You do not need a debug library build.
    //    0 => no tracing
    //    1 => tracing dispatch only
    //   >1 => verbosity increase
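    //
    // Example invocation (the application name is hypothetical and shown for
    // illustration only):
    //   LIBOMPTARGET_KERNEL_TRACE=1 ./offload_app  # one line per dispatch
    //   LIBOMPTARGET_KERNEL_TRACE=2 ./offload_app  # larger values add detail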

    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }

    if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
      print_kernel_trace = atoi(EnvStr);
    else
      print_kernel_trace = 0;

    hsa_status_t Err = core::atl_init_gpu_context();
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
      return;
    }

    // Init hostcall soon after initializing hsa
    hostrpc_init();

    Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
      if (DeviceType == HSA_DEVICE_TYPE_CPU) {
        CPUAgents.push_back(Agent);
      } else {
        HSAAgents.push_back(Agent);
      }
    });
    if (Err != HSA_STATUS_SUCCESS)
      return;

    NumberOfDevices = (int)HSAAgents.size();

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting HSA.\n");
      return;
    }
    DP("There are %d devices supporting HSA.\n", NumberOfDevices);

    // Init the device info
    HSAQueueSchedulers.reserve(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
    GPUName.resize(NumberOfDevices);
    GroupsPerDevice.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);
    DeviceStateStore.resize(NumberOfDevices);
    KernelInfoTable.resize(NumberOfDevices);
    SymbolInfoTable.resize(NumberOfDevices);
    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);

    Err = setupDevicePools(HSAAgents);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Setup for Device Memory Pools failed\n");
      return;
    }

    Err = setupHostMemoryPools(CPUAgents);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Setup for Host Memory Pools failed\n");
      return;
    }

    for (int I = 0; I < NumberOfDevices; I++) {
      uint32_t QueueSize = 0;
      {
        hsa_status_t Err = hsa_agent_get_info(
            HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
          return;
        }
        enum { MaxQueueSize = 4096 };
        if (QueueSize > MaxQueueSize) {
          QueueSize = MaxQueueSize;
        }
      }

      {
        HSAQueueScheduler QSched;
        if (!QSched.createQueues(HSAAgents[I], QueueSize))
          return;
        HSAQueueSchedulers.emplace_back(std::move(QSched));
      }

      DeviceStateStore[I] = {nullptr, 0};
    }

    for (int I = 0; I < NumberOfDevices; I++) {
      ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
      GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
      ComputeUnits[I] = 1;
      DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
         GroupsPerDevice[I], ThreadsPerGroup[I]);
    }

    // Get environment variables regarding teams
    Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
    Env.NumTeams = readEnv("OMP_NUM_TEAMS");
    Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
    Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
    Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;

    ConstructionSucceeded = true;
  }

  ~RTLDeviceInfoTy() {
    DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
    if (!HSAInitSuccess()) {
      // Then none of these can have been set up and they can't be torn down
      return;
    }
    // Run destructors on types that use HSA before
    // impl_finalize removes access to it
    DeviceStateStore.clear();
    KernelArgPoolMap.clear();
    // Terminate hostrpc before finalizing hsa
    hostrpc_terminate();

    hsa_status_t Err;
    for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
      Err = hsa_executable_destroy(HSAExecutables[I]);
      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Destroying executable", get_error_string(Err));
      }
    }
  }
};

pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;

static RTLDeviceInfoTy DeviceInfo;

namespace {

int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                     __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not copying back to host from target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;
  hsa_status_t Err;
  DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);

  Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
                                           DeviceId);

  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);
  return OFFLOAD_SUCCESS;
}

int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
                   __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  hsa_status_t Err;
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not doing host to target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;

  DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)HstPtr,
     (long long unsigned)(Elf64_Addr)TgtPtr);
  Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
                                           DeviceId);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Async.
// The implementation was written with cuda streams in mind. The semantics of
// that are to execute kernels on a queue in order of insertion. A synchronise
// call then makes writes visible between host and device. This means a series
// of N data_submit_async calls are expected to execute serially. HSA offers
// various options to run the data copies concurrently. This may require
// changes to libomptarget.

// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate
// that there are no outstanding kernels that need to be synchronized. Any
// async call may be passed a Queue==0, at which point the cuda implementation
// will set it to non-null (see getStream). The cuda streams are per-device.
// Upstream may change this interface to explicitly initialize the
// AsyncInfo_pointer, but until then hsa lazily initializes it as well.

void initAsyncInfo(__tgt_async_info *AsyncInfo) {
  // set non-null while using async calls, return to null to indicate
  // completion
  assert(AsyncInfo);
  if (!AsyncInfo->Queue) {
    AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
  }
}
void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
  assert(AsyncInfo);
  assert(AsyncInfo->Queue);
  AsyncInfo->Queue = 0;
}

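// Illustrative sketch (not compiled): the call pattern libomptarget is
// expected to drive against the async entry points described above. The
// plugin treats the transfers as if they complete in submission order; the
// Queue field only records "work may be outstanding". The device pointers
// and size below are hypothetical.
#if 0
static void exampleAsyncSequence(int32_t DeviceId, void *TgtA, void *HstA,
                                 void *TgtB, void *HstB, int64_t Bytes) {
  __tgt_async_info AsyncInfo = {};
  initAsyncInfo(&AsyncInfo); // marks work as potentially outstanding
  dataSubmit(DeviceId, TgtA, HstA, Bytes, &AsyncInfo); // executes serially
  dataSubmit(DeviceId, TgtB, HstB, Bytes, &AsyncInfo); // with respect to above
  // ... a kernel launch and a synchronize call would follow here ...
  finiAsyncInfo(&AsyncInfo); // Queue back to null: nothing outstanding
}
#endif
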
// Determine launch values for kernel.
struct LaunchVals {
  int WorkgroupSize;
  int GridSize;
};
LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
                         int ConstWGSize,
                         llvm::omp::OMPTgtExecModeFlags ExecutionMode,
                         int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
                         int DeviceNumTeams) {

  int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
  int NumGroups = 0;

  int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
  if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
    MaxTeams = RTLDeviceInfoTy::HardTeamLimit;

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
    DP("Max_Teams: %d\n", MaxTeams);
    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
       RTLDeviceInfoTy::DefaultWgSize);
    DP("thread_limit: %d\n", ThreadLimit);
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
    DP("ConstWGSize: %d\n", ConstWGSize);
  }
  // check for thread_limit() clause
  if (ThreadLimit > 0) {
    ThreadsPerGroup = ThreadLimit;
    DP("Setting threads per block to requested %d\n", ThreadLimit);
    // Add master warp for GENERIC
    if (ExecutionMode ==
        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
      ThreadsPerGroup += WarpSize;
      DP("Adding master wavefront: +%d threads\n", WarpSize);
    }
    if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
      ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
      DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
    }
  }
  // check flat_max_work_group_size attr here
  if (ThreadsPerGroup > ConstWGSize) {
    ThreadsPerGroup = ConstWGSize;
    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
       ThreadsPerGroup);
  }
  if (print_kernel_trace & STARTUP_DETAILS)
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
  DP("Preparing %d threads\n", ThreadsPerGroup);

  // Set default num_groups (teams)
  if (Env.TeamLimit > 0)
    NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
  else
    NumGroups = MaxTeams;
  DP("Set default num of groups %d\n", NumGroups);

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("num_teams: %d\n", NumTeams);
  }

  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
  // This reduction is typical for the default case (no thread_limit clause),
  // or when the user requests an excessively large num_teams clause.
  // FIXME: We can't distinguish between a constant or variable thread limit,
  // so we only handle constant thread_limits.
  if (ThreadsPerGroup >
      RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to nearest WarpSize
    // here?
    NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;

  // check for num_teams() clause
  if (NumTeams > 0) {
    NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("Env.NumTeams %d\n", Env.NumTeams);
    DP("Env.TeamLimit %d\n", Env.TeamLimit);
  }

  if (Env.NumTeams > 0) {
    NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
  } else if (Env.TeamLimit > 0) {
    NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
  } else {
    if (NumTeams <= 0) {
      if (LoopTripcount > 0) {
        if (ExecutionMode ==
            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
          // round up to the nearest integer
          NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
        } else if (ExecutionMode ==
                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
          NumGroups = LoopTripcount;
        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
          // This is a generic kernel that was transformed to use SPMD-mode
          // execution but uses Generic-mode semantics for scheduling.
          NumGroups = LoopTripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           NumGroups, LoopTripcount, ThreadsPerGroup);
      }
    } else {
      NumGroups = NumTeams;
    }
    if (NumGroups > MaxTeams) {
      NumGroups = MaxTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
    }
    if (NumGroups > NumTeams && NumTeams > 0) {
      NumGroups = NumTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
           NumTeams);
    }
  }

  // num_teams clause always honored, no matter what, unless DEFAULT is active.
  if (NumTeams > 0) {
    NumGroups = NumTeams;
    // Cap num_groups to EnvMaxTeamsDefault if set.
    if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
      NumGroups = Env.MaxTeamsDefault;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
    DP("num_groups: %d\n", NumGroups);
    DP("loop_tripcount: %" PRIu64 "\n", LoopTripcount);
  }
  DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
     ThreadsPerGroup);

  LaunchVals Res;
  Res.WorkgroupSize = ThreadsPerGroup;
  Res.GridSize = ThreadsPerGroup * NumGroups;
  return Res;
}

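// Worked example of the heuristic above (illustrative numbers only): for an
// SPMD kernel with no thread_limit or num_teams clause, no team-related
// environment variables, ConstWGSize of 256 and a loop trip count of 10000,
// ThreadsPerGroup stays at RTLDeviceInfoTy::DefaultWgSize (256 with the
// current AMDGPU grid values) and NumGroups becomes
// ((10000 - 1) / 256) + 1 = 40, so the returned GridSize is
// 40 * 256 = 10240 work-items spread over 40 workgroups.
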
static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
  uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
  bool Full = true;
  while (Full) {
    Full =
        PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
  }
  return PacketId;
}

int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
                        ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
                        int32_t ThreadLimit, uint64_t LoopTripcount) {
  // Set the context we are using
  // update thread limit content in gpu memory if un-initialized or specified
  // from host

  DP("Run target team region thread_limit %d\n", ThreadLimit);

  // All args are references.
  std::vector<void *> Args(ArgNum);
  std::vector<void *> Ptrs(ArgNum);

  DP("Arg_num: %d\n", ArgNum);
  for (int32_t I = 0; I < ArgNum; ++I) {
    Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
    Args[I] = &Ptrs[I];
    DP("Offset base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
  }

  KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;

  std::string KernelName = std::string(KernelInfo->Name);
  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
  if (KernelInfoTable[DeviceId].find(KernelName) ==
      KernelInfoTable[DeviceId].end()) {
    DP("Kernel %s not found\n", KernelName.c_str());
    return OFFLOAD_FAIL;
  }

  const atl_kernel_info_t KernelInfoEntry =
      KernelInfoTable[DeviceId][KernelName];
  const uint32_t GroupSegmentSize =
      KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize;
  const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
  const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
  const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
  const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;

  assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);

  /*
   * Set limit based on ThreadsPerGroup and GroupsPerDevice
   */
  LaunchVals LV =
      getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env,
                    KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
                    NumTeams,      // From run_region arg
                    ThreadLimit,   // From run_region arg
                    LoopTripcount, // From run_region arg
                    DeviceInfo.NumTeams[KernelInfo->DeviceId]);
  const int GridSize = LV.GridSize;
  const int WorkgroupSize = LV.WorkgroupSize;

  if (print_kernel_trace >= LAUNCH) {
    int NumGroups = GridSize / WorkgroupSize;
    // enum modes are SPMD, GENERIC, NONE 0,1,2
    // if doing rtl timing, print to stderr, unless stdout requested.
    bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
    fprintf(TraceToStdout ? stdout : stderr,
            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
            "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
            "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
            DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
            ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
            GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
            VgprSpillCount, LoopTripcount, KernelInfo->Name);
  }

  // Run on the device.
  {
    hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next();
    if (!Queue) {
      return OFFLOAD_FAIL;
    }
    uint64_t PacketId = acquireAvailablePacketId(Queue);

    const uint32_t Mask = Queue->size - 1; // size is a power of 2
    hsa_kernel_dispatch_packet_t *Packet =
        (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);

    // packet->header is written last
    Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    Packet->workgroup_size_x = WorkgroupSize;
    Packet->workgroup_size_y = 1;
    Packet->workgroup_size_z = 1;
    Packet->reserved0 = 0;
    Packet->grid_size_x = GridSize;
    Packet->grid_size_y = 1;
    Packet->grid_size_z = 1;
    Packet->private_segment_size = KernelInfoEntry.private_segment_size;
    Packet->group_segment_size = GroupSegmentSize;
    Packet->kernel_object = KernelInfoEntry.kernel_object;
    Packet->kernarg_address = 0;     // use the block allocator
    Packet->reserved2 = 0;           // impl writes id_ here
    Packet->completion_signal = {0}; // may want a pool of signals

    KernelArgPool *ArgPool = nullptr;
    void *KernArg = nullptr;
    {
      auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
      if (It != KernelArgPoolMap.end()) {
        ArgPool = (It->second).get();
      }
    }
    if (!ArgPool) {
      DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
         DeviceId);
    }
    {
      if (ArgPool) {
        assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
        KernArg = ArgPool->allocate(ArgNum);
      }
      if (!KernArg) {
        DP("Allocate kernarg failed\n");
        return OFFLOAD_FAIL;
      }

      // Copy explicit arguments
      for (int I = 0; I < ArgNum; I++) {
        memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
      }

      // Initialize implicit arguments. TODO: Which of these can be dropped
      impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
          static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
      memset(ImplArgs, 0,
             sizeof(impl_implicit_args_t)); // may not be necessary
      ImplArgs->offset_x = 0;
      ImplArgs->offset_y = 0;
      ImplArgs->offset_z = 0;

      // assign a hostcall buffer for the selected Q
      if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) {
        // hostrpc_assign_buffer is not thread safe, and this function is
        // under a multiple reader lock, not a writer lock.
        static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_lock(&HostcallInitLock);
        uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId],
                                                Queue, DeviceId);
        pthread_mutex_unlock(&HostcallInitLock);
        if (!Buffer) {
          DP("hostrpc_assign_buffer failed, gpu would dereference null and "
             "error\n");
          return OFFLOAD_FAIL;
        }

        DP("Implicit argument count: %d\n",
           KernelInfoEntry.implicit_argument_count);
        if (KernelInfoEntry.implicit_argument_count >= 4) {
          // Initialise pointer for implicit_argument_count != 0 ABI
          // Guess that the right implicit argument is at offset 24 after
          // the explicit arguments. In the future, should be able to read
          // the offset from msgpack. Clang is not annotating it at present.
          uint64_t Offset =
              sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
          if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
            DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
               "args: %d\n",
               Offset + 8, ArgPool->kernargSizeIncludingImplicit());
          } else {
            memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
          }
        }

        // initialise pointer for implicit_argument_count == 0 ABI
        ImplArgs->hostcall_ptr = Buffer;
      }

      Packet->kernarg_address = KernArg;
    }

    hsa_signal_t S = DeviceInfo.FreeSignalPool.pop();
    if (S.handle == 0) {
      DP("Failed to get signal instance\n");
      return OFFLOAD_FAIL;
    }
    Packet->completion_signal = S;
    hsa_signal_store_relaxed(Packet->completion_signal, 1);

    // Publish the packet indicating it is ready to be processed
    core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
                             core::createHeader(), Packet->setup);

    // Since the packet is already published, its contents must not be
    // accessed any more
    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);

    while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                     HSA_WAIT_STATE_BLOCKED) != 0)
      ;

    assert(ArgPool);
    ArgPool->deallocate(KernArg);
    DeviceInfo.FreeSignalPool.push(S);
  }

  DP("Kernel completed\n");
  return OFFLOAD_SUCCESS;
}

bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
  const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
  int32_t R = elf_check_machine(Image, AmdgcnMachineID);
  if (!R) {
    DP("Supported machine ID not found\n");
  }
  return R;
}

uint32_t elfEFlags(__tgt_device_image *Image) {
  char *ImgBegin = (char *)Image->ImageStart;
  size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;

  Elf *E = elf_memory(ImgBegin, ImgSize);
  if (!E) {
    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
    return 0;
  }

  Elf64_Ehdr *Eh64 = elf64_getehdr(E);

  if (!Eh64) {
    DP("Unable to get machine ID from ELF file!\n");
    elf_end(E);
    return 0;
  }

  uint32_t Flags = Eh64->e_flags;

  elf_end(E);
  DP("ELF Flags: 0x%x\n", Flags);
  return Flags;
}

template <typename T> bool enforceUpperBound(T *Value, T Upper) {
  bool Changed = *Value > Upper;
  if (Changed) {
    *Value = Upper;
  }
  return Changed;
}

Elf64_Shdr *findOnlyShtHash(Elf *Elf) {
  size_t N;
  int Rc = elf_getshdrnum(Elf, &N);
  if (Rc != 0) {
    return nullptr;
  }

  Elf64_Shdr *Result = nullptr;
  for (size_t I = 0; I < N; I++) {
    Elf_Scn *Scn = elf_getscn(Elf, I);
    if (Scn) {
      Elf64_Shdr *Shdr = elf64_getshdr(Scn);
      if (Shdr) {
        if (Shdr->sh_type == SHT_HASH) {
          if (Result == nullptr) {
            Result = Shdr;
          } else {
            // multiple SHT_HASH sections not handled
            return nullptr;
          }
        }
      }
    }
  }
  return Result;
}

const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash,
                           const char *Symname) {

  assert(SectionHash);
  size_t SectionSymtabIndex = SectionHash->sh_link;
  Elf64_Shdr *SectionSymtab =
      elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex));
  size_t SectionStrtabIndex = SectionSymtab->sh_link;

  const Elf64_Sym *Symtab =
      reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset);

  const uint32_t *Hashtab =
      reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset);

  // Layout:
  //   nbucket
  //   nchain
  //   bucket[nbucket]
  //   chain[nchain]
  uint32_t Nbucket = Hashtab[0];
  const uint32_t *Bucket = &Hashtab[2];
  const uint32_t *Chain = &Hashtab[Nbucket + 2];

  const size_t Max = strlen(Symname) + 1;
  const uint32_t Hash = elf_hash(Symname);
  for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) {
    char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name);
    if (strncmp(Symname, N, Max) == 0) {
      return &Symtab[I];
    }
  }

  return nullptr;
}

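// Worked example of the SHT_HASH walk above (illustrative values only): with
// nbucket == 3, the section contents are laid out as
//   Hashtab[0] = nbucket (3), Hashtab[1] = nchain,
//   Hashtab[2..4] = bucket[0..2], Hashtab[5..] = chain[].
// For a symbol whose elf_hash() value is 7, the walk starts at
// Bucket[7 % 3] == Bucket[1] == Hashtab[3]; each candidate index I is compared
// by name through the strtab, and on a mismatch the next candidate is
// Chain[I], until index 0 (STN_UNDEF) terminates the search.
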
struct SymbolInfo {
  void *Addr = nullptr;
  uint32_t Size = UINT32_MAX;
  uint32_t ShType = SHT_NULL;
};

int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname,
                                SymbolInfo *Res) {
  if (elf_kind(Elf) != ELF_K_ELF) {
    return 1;
  }

  Elf64_Shdr *SectionHash = findOnlyShtHash(Elf);
  if (!SectionHash) {
    return 1;
  }

  const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname);
  if (!Sym) {
    return 1;
  }

  if (Sym->st_size > UINT32_MAX) {
    return 1;
  }

  if (Sym->st_shndx == SHN_UNDEF) {
    return 1;
  }

  Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx);
  if (!Section) {
    return 1;
  }

  Elf64_Shdr *Header = elf64_getshdr(Section);
  if (!Header) {
    return 1;
  }

  Res->Addr = Sym->st_value + Base;
  Res->Size = static_cast<uint32_t>(Sym->st_size);
  Res->ShType = Header->sh_type;
  return 0;
}

int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname,
                                SymbolInfo *Res) {
  Elf *Elf = elf_memory(Base, ImgSize);
  if (Elf) {
    int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res);
    elf_end(Elf);
    return Rc;
  }
  return 1;
}

hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
                                  const char *SymName, void **VarAddr,
                                  uint32_t *VarSize) {
  SymbolInfo SI;
  int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
  if (Rc == 0) {
    *VarAddr = SI.Addr;
    *VarSize = SI.Size;
    return HSA_STATUS_SUCCESS;
  }
  return HSA_STATUS_ERROR;
}

template <typename C>
hsa_status_t moduleRegisterFromMemoryToPlace(
    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
    void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
    std::vector<hsa_executable_t> &HSAExecutables) {
  auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(CbState);
    return (*Unwrapped)(Data, Size);
  };
  return core::RegisterModuleFromMemory(
      KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
      DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
      HSAExecutables);
}

std::vector<hsa_executable_t> &HSAExecutables) { 1740 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { 1741 C *Unwrapped = static_cast<C *>(CbState); 1742 return (*Unwrapped)(Data, Size); 1743 }; 1744 return core::RegisterModuleFromMemory( 1745 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, 1746 DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&Cb), 1747 HSAExecutables); 1748 } 1749 1750 uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { 1751 uint64_t DeviceStateBytes = 0; 1752 { 1753 // If this is the deviceRTL, get the state variable size 1754 SymbolInfo SizeSi; 1755 int Rc = getSymbolInfoWithoutLoading( 1756 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); 1757 1758 if (Rc == 0) { 1759 if (SizeSi.Size != sizeof(uint64_t)) { 1760 DP("Found device_State_size variable with wrong size\n"); 1761 return 0; 1762 } 1763 1764 // Read number of bytes directly from the elf 1765 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); 1766 } 1767 } 1768 return DeviceStateBytes; 1769 } 1770 1771 struct DeviceEnvironment { 1772 // initialise an DeviceEnvironmentTy in the deviceRTL 1773 // patches around differences in the deviceRTL between trunk, aomp, 1774 // rocmcc. Over time these differences will tend to zero and this class 1775 // simplified. 1776 // Symbol may be in .data or .bss, and may be missing fields, todo: 1777 // review aomp/trunk/rocm and simplify the following 1778 1779 // The symbol may also have been deadstripped because the device side 1780 // accessors were unused. 1781 1782 // If the symbol is in .data (aomp, rocm) it can be written directly. 1783 // If it is in .bss, we must wait for it to be allocated space on the 1784 // gpu (trunk) and initialize after loading. 1785 const char *sym() { return "omptarget_device_environment"; } 1786 1787 DeviceEnvironmentTy HostDeviceEnv; 1788 SymbolInfo SI; 1789 bool Valid = false; 1790 1791 __tgt_device_image *Image; 1792 const size_t ImgSize; 1793 1794 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, 1795 __tgt_device_image *Image, const size_t ImgSize) 1796 : Image(Image), ImgSize(ImgSize) { 1797 1798 HostDeviceEnv.NumDevices = NumberDevices; 1799 HostDeviceEnv.DeviceNum = DeviceId; 1800 HostDeviceEnv.DebugKind = 0; 1801 HostDeviceEnv.DynamicMemSize = DynamicMemSize; 1802 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 1803 HostDeviceEnv.DebugKind = std::stoi(EnvStr); 1804 1805 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, 1806 sym(), &SI); 1807 if (Rc != 0) { 1808 DP("Finding global device environment '%s' - symbol missing.\n", sym()); 1809 return; 1810 } 1811 1812 if (SI.Size > sizeof(HostDeviceEnv)) { 1813 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, 1814 sizeof(HostDeviceEnv)); 1815 return; 1816 } 1817 1818 Valid = true; 1819 } 1820 1821 bool inImage() { return SI.ShType != SHT_NOBITS; } 1822 1823 hsa_status_t beforeLoading(void *Data, size_t Size) { 1824 if (Valid) { 1825 if (inImage()) { 1826 DP("Setting global device environment before load (%u bytes)\n", 1827 SI.Size); 1828 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; 1829 void *Pos = (char *)Data + Offset; 1830 memcpy(Pos, &HostDeviceEnv, SI.Size); 1831 } 1832 } 1833 return HSA_STATUS_SUCCESS; 1834 } 1835 1836 hsa_status_t afterLoading() { 1837 if (Valid) { 1838 if (!inImage()) { 1839 DP("Setting global device environment after load (%u bytes)\n", 1840 SI.Size); 1841 int DeviceId = HostDeviceEnv.DeviceNum; 
1842 auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; 1843 void *StatePtr; 1844 uint32_t StatePtrSize; 1845 hsa_status_t Err = interop_hsa_get_symbol_info( 1846 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); 1847 if (Err != HSA_STATUS_SUCCESS) { 1848 DP("failed to find %s in loaded image\n", sym()); 1849 return Err; 1850 } 1851 1852 if (StatePtrSize != SI.Size) { 1853 DP("Symbol had size %u before loading, %u after\n", StatePtrSize, 1854 SI.Size); 1855 return HSA_STATUS_ERROR; 1856 } 1857 1858 return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, 1859 StatePtrSize, DeviceId); 1860 } 1861 } 1862 return HSA_STATUS_SUCCESS; 1863 } 1864 }; 1865 1866 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { 1867 uint64_t Rounded = 4 * ((Size + 3) / 4); 1868 void *Ptr; 1869 hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); 1870 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); 1871 if (Err != HSA_STATUS_SUCCESS) { 1872 return Err; 1873 } 1874 1875 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); 1876 if (Rc != HSA_STATUS_SUCCESS) { 1877 DP("zero fill device_state failed with %u\n", Rc); 1878 core::Runtime::Memfree(Ptr); 1879 return HSA_STATUS_ERROR; 1880 } 1881 1882 *RetPtr = Ptr; 1883 return HSA_STATUS_SUCCESS; 1884 } 1885 1886 bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { 1887 SymbolInfo SI; 1888 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); 1889 return (Rc == 0) && (SI.Addr != nullptr); 1890 } 1891 1892 } // namespace 1893 1894 namespace core { 1895 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { 1896 return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(), 1897 &DeviceInfo.HSAAgents[0], NULL, Ptr); 1898 } 1899 } // namespace core 1900 1901 static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { 1902 hsa_status_t err; 1903 uint32_t name_len; 1904 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); 1905 if (err != HSA_STATUS_SUCCESS) { 1906 DP("Error getting ISA info length\n"); 1907 return err; 1908 } 1909 1910 char TargetID[name_len]; 1911 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); 1912 if (err != HSA_STATUS_SUCCESS) { 1913 DP("Error getting ISA info name\n"); 1914 return err; 1915 } 1916 1917 auto TripleTargetID = llvm::StringRef(TargetID); 1918 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { 1919 DeviceInfo.TargetID.push_back(TripleTargetID.ltrim('-').str()); 1920 } 1921 return HSA_STATUS_SUCCESS; 1922 } 1923 1924 /// Parse a TargetID to get processor arch and feature map. 1925 /// Returns processor subarch. 1926 /// Returns TargetID features in \p FeatureMap argument. 1927 /// If the \p TargetID contains feature+, FeatureMap it to true. 1928 /// If the \p TargetID contains feature-, FeatureMap it to false. 1929 /// If the \p TargetID does not contain a feature (default), do not map it. 
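///
/// Example (illustrative TargetID; the behaviour follows directly from the
/// parsing below):
/// \code
///   llvm::StringMap<bool> Features;
///   llvm::StringRef Arch = parseTargetID("gfx90a:sramecc+:xnack-", Features);
///   // Arch == "gfx90a"; Features maps "sramecc" -> true, "xnack" -> false.
/// \endcode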
1930 StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
1931   if (TargetID.empty())
1932     return llvm::StringRef();
1933 
1934   auto ArchFeature = TargetID.split(":");
1935   auto Arch = ArchFeature.first;
1936   auto Features = ArchFeature.second;
1937   if (Features.empty())
1938     return Arch;
1939 
1940   if (Features.contains("sramecc+")) {
1941     FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
1942   } else if (Features.contains("sramecc-")) {
1943     FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
1944   }
1945   if (Features.contains("xnack+")) {
1946     FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
1947   } else if (Features.contains("xnack-")) {
1948     FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
1949   }
1950 
1951   return Arch;
1952 }
1953 
1954 /// Checks if an image \p ImgInfo is compatible with the current
1955 /// system's environment \p EnvInfo.
1956 bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
1957   llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
1958 
1959   // Compatible in case of exact match.
1960   if (ImgTID == EnvTID) {
1961     DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",
1962        ImgTID.data(), EnvTID.data());
1963     return true;
1964   }
1965 
1966   // Incompatible if Archs mismatch.
1967   StringMap<bool> ImgMap, EnvMap;
1968   StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
1969   StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
1970 
1971   // Both EnvArch and ImgArch can't be empty here.
1972   if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
1973     DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",
1974        ImgTID.data(), EnvTID.data());
1975     return false;
1976   }
1977 
1978   // Incompatible if the image has more features than the environment,
1979   // irrespective of type or sign of features.
1980   if (ImgMap.size() > EnvMap.size()) {
1981     DP("Incompatible: Image has more features than the environment \t[Image: "
1982        "%s]\t:\t[Environment: %s]\n",
1983        ImgTID.data(), EnvTID.data());
1984     return false;
1985   }
1986 
1987   // Compatible if each target feature specified by the environment is
1988   // compatible with the target feature of the image. The target feature is
1989   // compatible if the image does not specify it (meaning Any), or if it
1990   // specifies it with the same value (meaning On or Off).
1991   for (const auto &ImgFeature : ImgMap) {
1992     auto EnvFeature = EnvMap.find(ImgFeature.first());
1993     if (EnvFeature == EnvMap.end()) {
1994       DP("Incompatible: Value of Image's non-ANY feature does not match "
1995          "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "
1996          "%s]\n",
1997          ImgTID.data(), EnvTID.data());
1998       return false;
1999     } else if (EnvFeature->first() == ImgFeature.first() &&
2000                EnvFeature->second != ImgFeature.second) {
2001       DP("Incompatible: Value of Image's non-ANY feature does not match "
2002          "the Environment feature's non-ANY value \t[Image: "
2003          "%s]\t:\t[Environment: %s]\n",
2004          ImgTID.data(), EnvTID.data());
2005       return false;
2006     }
2007   }
2008 
2009   // Image is compatible if all features of Environment are:
2010   //   - either, present in the Image's features map with the same sign,
2011   //   - or, the feature is missing from Image's features map i.e. it is
2012   //     set to ANY
2013   DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "
2014      "%s]\n",
2015      ImgTID.data(), EnvTID.data());
2016   return true;
2017 }
2018 
2019 extern "C" {
2020 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
2021   return elfMachineIdIsAmdgcn(Image);
2022 }
2023 
2024 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
2025                                        __tgt_image_info *info) {
2026   if (!__tgt_rtl_is_valid_binary(image))
2027     return false;
2028 
2029   // A subarchitecture was not specified. Assume it is compatible.
2030   if (!info->Arch)
2031     return true;
2032 
2033   int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
2034 
2035   for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
2036     __tgt_rtl_init_device(DeviceId);
2037     hsa_agent_t agent = DeviceInfo.HSAAgents[DeviceId];
2038     hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
2039     if (err != HSA_STATUS_SUCCESS) {
2040       DP("Error iterating ISAs\n");
2041       return false;
2042     }
2043     if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo.TargetID[DeviceId]))
2044       return false;
2045   }
2046   DP("Image has Target ID compatible with the current environment: %s\n",
2047      info->Arch);
2048   return true;
2049 }
2050 
2051 int __tgt_rtl_number_of_devices() {
2052   // If the construction failed, no methods are safe to call
2053   if (DeviceInfo.ConstructionSucceeded) {
2054     return DeviceInfo.NumberOfDevices;
2055   }
2056   DP("AMDGPU plugin construction failed. Zero devices available\n");
2057   return 0;
2058 }
2059 
2060 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
2061   DP("Init requires flags to %ld\n", RequiresFlags);
2062   DeviceInfo.RequiresFlags = RequiresFlags;
2063   return RequiresFlags;
2064 }
2065 
2066 int32_t __tgt_rtl_init_device(int DeviceId) {
2067   hsa_status_t Err = hsa_init();
2068   if (Err != HSA_STATUS_SUCCESS) {
2069     DP("HSA Initialization Failed.\n");
2070     return HSA_STATUS_ERROR;
2071   }
2072   // This is per-device-id initialization.
2073   DP("Initialize the device id: %d\n", DeviceId);
2074 
2075   hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId];
2076 
2077   // Get the number of compute units.
2078   uint32_t ComputeUnits = 0;
2079   Err = hsa_agent_get_info(
2080       Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2081       &ComputeUnits);
2082   if (Err != HSA_STATUS_SUCCESS) {
2083     DeviceInfo.ComputeUnits[DeviceId] = 1;
2084     DP("Error getting compute units: setting to 1\n");
2085   } else {
2086     DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits;
2087     DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[DeviceId]);
2088   }
2089 
2090   char GetInfoName[64]; // 64 is the max size returned by get info
2091   Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2092                            (void *)GetInfoName);
2093   if (Err)
2094     DeviceInfo.GPUName[DeviceId] = "--unknown gpu--";
2095   else {
2096     DeviceInfo.GPUName[DeviceId] = GetInfoName;
2097   }
2098 
2099   if (print_kernel_trace & STARTUP_DETAILS)
2100     DP("Device#%-2d CU's: %2d %s\n", DeviceId,
2101        DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str());
2102 
2103   // Query attributes to determine number of threads/block and blocks/grid.
2104 uint16_t WorkgroupMaxDim[3]; 2105 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 2106 &WorkgroupMaxDim); 2107 if (Err != HSA_STATUS_SUCCESS) { 2108 DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; 2109 DP("Error getting grid dims: num groups : %d\n", 2110 RTLDeviceInfoTy::DefaultNumTeams); 2111 } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { 2112 DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; 2113 DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]); 2114 } else { 2115 DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; 2116 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " 2117 "at the hard limit\n", 2118 WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit); 2119 } 2120 2121 // Get thread limit 2122 hsa_dim3_t GridMaxDim; 2123 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); 2124 if (Err == HSA_STATUS_SUCCESS) { 2125 DeviceInfo.ThreadsPerGroup[DeviceId] = 2126 reinterpret_cast<uint32_t *>(&GridMaxDim)[0] / 2127 DeviceInfo.GroupsPerDevice[DeviceId]; 2128 2129 if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) { 2130 DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; 2131 DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); 2132 } else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId], 2133 RTLDeviceInfoTy::MaxWgSize)) { 2134 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); 2135 } else { 2136 DP("Using ROCm Queried thread limit: %d\n", 2137 DeviceInfo.ThreadsPerGroup[DeviceId]); 2138 } 2139 } else { 2140 DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; 2141 DP("Error getting max block dimension, use default:%d \n", 2142 RTLDeviceInfoTy::MaxWgSize); 2143 } 2144 2145 // Get wavefront size 2146 uint32_t WavefrontSize = 0; 2147 Err = 2148 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); 2149 if (Err == HSA_STATUS_SUCCESS) { 2150 DP("Queried wavefront size: %d\n", WavefrontSize); 2151 DeviceInfo.WarpSize[DeviceId] = WavefrontSize; 2152 } else { 2153 // TODO: Burn the wavefront size into the code object 2154 DP("Warning: Unknown wavefront size, assuming 64\n"); 2155 DeviceInfo.WarpSize[DeviceId] = 64; 2156 } 2157 2158 // Adjust teams to the env variables 2159 2160 if (DeviceInfo.Env.TeamLimit > 0 && 2161 (enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId], 2162 DeviceInfo.Env.TeamLimit))) { 2163 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", 2164 DeviceInfo.Env.TeamLimit); 2165 } 2166 2167 // Set default number of teams 2168 if (DeviceInfo.Env.NumTeams > 0) { 2169 DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams; 2170 DP("Default number of teams set according to environment %d\n", 2171 DeviceInfo.Env.NumTeams); 2172 } else { 2173 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); 2174 int TeamsPerCU = DefaultTeamsPerCU; 2175 if (TeamsPerCUEnvStr) { 2176 TeamsPerCU = std::stoi(TeamsPerCUEnvStr); 2177 } 2178 2179 DeviceInfo.NumTeams[DeviceId] = 2180 TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId]; 2181 DP("Default number of teams = %d * number of compute units %d\n", 2182 TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]); 2183 } 2184 2185 if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId], 2186 DeviceInfo.GroupsPerDevice[DeviceId])) { 2187 DP("Default number of teams exceeds device limit, capping at %d\n", 2188 DeviceInfo.GroupsPerDevice[DeviceId]); 2189 } 2190 2191 // Adjust threads to the env variables 2192 if 
(DeviceInfo.Env.TeamThreadLimit > 0 && 2193 (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], 2194 DeviceInfo.Env.TeamThreadLimit))) { 2195 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", 2196 DeviceInfo.Env.TeamThreadLimit); 2197 } 2198 2199 // Set default number of threads 2200 DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; 2201 DP("Default number of threads set according to library's default %d\n", 2202 RTLDeviceInfoTy::DefaultWgSize); 2203 if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], 2204 DeviceInfo.ThreadsPerGroup[DeviceId])) { 2205 DP("Default number of threads exceeds device limit, capping at %d\n", 2206 DeviceInfo.ThreadsPerGroup[DeviceId]); 2207 } 2208 2209 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", 2210 DeviceId, DeviceInfo.GroupsPerDevice[DeviceId], 2211 DeviceInfo.ThreadsPerGroup[DeviceId]); 2212 2213 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, 2214 DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId], 2215 DeviceInfo.GroupsPerDevice[DeviceId], 2216 DeviceInfo.GroupsPerDevice[DeviceId] * 2217 DeviceInfo.ThreadsPerGroup[DeviceId]); 2218 2219 return OFFLOAD_SUCCESS; 2220 } 2221 2222 static __tgt_target_table * 2223 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); 2224 2225 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, 2226 __tgt_device_image *Image) { 2227 DeviceInfo.LoadRunLock.lock(); 2228 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); 2229 DeviceInfo.LoadRunLock.unlock(); 2230 return Res; 2231 } 2232 2233 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, 2234 __tgt_device_image *Image) { 2235 // This function loads the device image onto gpu[DeviceId] and does other 2236 // per-image initialization work. Specifically: 2237 // 2238 // - Initialize an DeviceEnvironmentTy instance embedded in the 2239 // image at the symbol "omptarget_device_environment" 2240 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. 2241 // 2242 // - Allocate a large array per-gpu (could be moved to init_device) 2243 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size 2244 // - Allocate at least that many bytes of gpu memory 2245 // - Zero initialize it 2246 // - Write the pointer to the symbol omptarget_nvptx_device_State 2247 // 2248 // - Pulls some per-kernel information together from various sources and 2249 // records it in the KernelsList for quicker access later 2250 // 2251 // The initialization can be done before or after loading the image onto the 2252 // gpu. This function presently does a mixture. Using the hsa api to get/set 2253 // the information is simpler to implement, in exchange for more complicated 2254 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes 2255 // back from the gpu vs a hashtable lookup on the host. 
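  // A condensed, illustrative sketch of the device_State setup described
  // above, written against the helpers defined earlier in this file. It is
  // not compiled (the real code below additionally caches the allocation in
  // DeviceInfo.DeviceStateStore and handles the in-bss case):
#if 0
  void *StatePtr;       // device address of omptarget_nvptx_device_State
  uint32_t StatePtrSize;
  if (interop_hsa_get_symbol_info(DeviceInfo.SymbolInfoTable[DeviceId],
                                  DeviceId, "omptarget_nvptx_device_State",
                                  &StatePtr, &StatePtrSize) ==
          HSA_STATUS_SUCCESS &&
      StatePtrSize == sizeof(void *)) {
    // Size of the array is recorded in the image itself.
    uint64_t Bytes = getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
    void *Array = nullptr;
    if (Bytes && implCalloc(&Array, Bytes, DeviceId) == HSA_STATUS_SUCCESS)
      // Publish the zero-initialized array to the device-side pointer.
      DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Array, sizeof(void *),
                                         DeviceId);
  }
#endif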
2256 2257 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; 2258 2259 DeviceInfo.clearOffloadEntriesTable(DeviceId); 2260 2261 // We do not need to set the ELF version because the caller of this function 2262 // had to do that to decide the right runtime to use 2263 2264 if (!elfMachineIdIsAmdgcn(Image)) 2265 return NULL; 2266 2267 { 2268 auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices, 2269 DeviceInfo.Env.DynamicMemSize, Image, ImgSize); 2270 2271 auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId]; 2272 auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; 2273 hsa_status_t Err = moduleRegisterFromMemoryToPlace( 2274 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, 2275 [&](void *Data, size_t Size) { 2276 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { 2277 __atomic_store_n(&DeviceInfo.HostcallRequired, true, 2278 __ATOMIC_RELEASE); 2279 } 2280 return Env.beforeLoading(Data, Size); 2281 }, 2282 DeviceInfo.HSAExecutables); 2283 2284 check("Module registering", Err); 2285 if (Err != HSA_STATUS_SUCCESS) { 2286 const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str(); 2287 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); 2288 2289 if (strcmp(DeviceName, ElfName) != 0) { 2290 DP("Possible gpu arch mismatch: device:%s, image:%s please check" 2291 " compiler flag: -march=<gpu>\n", 2292 DeviceName, ElfName); 2293 } else { 2294 DP("Error loading image onto GPU: %s\n", get_error_string(Err)); 2295 } 2296 2297 return NULL; 2298 } 2299 2300 Err = Env.afterLoading(); 2301 if (Err != HSA_STATUS_SUCCESS) { 2302 return NULL; 2303 } 2304 } 2305 2306 DP("AMDGPU module successfully loaded!\n"); 2307 2308 { 2309 // the device_State array is either large value in bss or a void* that 2310 // needs to be assigned to a pointer to an array of size device_state_bytes 2311 // If absent, it has been deadstripped and needs no setup. 2312 2313 void *StatePtr; 2314 uint32_t StatePtrSize; 2315 auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId]; 2316 hsa_status_t Err = interop_hsa_get_symbol_info( 2317 SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, 2318 &StatePtrSize); 2319 2320 if (Err != HSA_STATUS_SUCCESS) { 2321 DP("No device_state symbol found, skipping initialization\n"); 2322 } else { 2323 if (StatePtrSize < sizeof(void *)) { 2324 DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize, 2325 sizeof(void *)); 2326 return NULL; 2327 } 2328 2329 // if it's larger than a void*, assume it's a bss array and no further 2330 // initialization is required. 
Only try to set up a pointer for
2331       // sizeof(void*)
2332       if (StatePtrSize == sizeof(void *)) {
2333         uint64_t DeviceStateBytes =
2334             getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2335         if (DeviceStateBytes == 0) {
2336           DP("Can't initialize device_State, missing size information\n");
2337           return NULL;
2338         }
2339 
2340         auto &DSS = DeviceInfo.DeviceStateStore[DeviceId];
2341         if (DSS.first.get() == nullptr) {
2342           assert(DSS.second == 0);
2343           void *Ptr = NULL;
2344           hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2345           if (Err != HSA_STATUS_SUCCESS) {
2346             DP("Failed to allocate device_state array\n");
2347             return NULL;
2348           }
2349           DSS = {
2350               std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2351               DeviceStateBytes,
2352           };
2353         }
2354 
2355         void *Ptr = DSS.first.get();
2356         if (DeviceStateBytes != DSS.second) {
2357           DP("Inconsistent sizes of device_State unsupported\n");
2358           return NULL;
2359         }
2360 
2361         // write ptr to device memory so it can be used by later kernels
2362         Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
2363                                                  DeviceId);
2364         if (Err != HSA_STATUS_SUCCESS) {
2365           DP("memcpy install of state_ptr failed\n");
2366           return NULL;
2367         }
2368       }
2369     }
2370   }
2371 
2372   // Here, we take advantage of the data that is appended after img_end to get
2373   // the symbols' name we need to load. This data consists of the host entries
2374   // begin and end as well as the target name (see the offloading linker script
2375   // creation in the clang compiler).
2376 
2377   // Find the symbols in the module by name. The name can be obtained by
2378   // concatenating the host entry name with the target name.
2379 
2380   __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2381   __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2382 
2383   for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2384 
2385     if (!E->addr) {
2386       // The host should always have something in the address to
2387       // uniquely identify the target region.
2388       DP("Analyzing host entry '<null>' (size = %lld)...\n",
2389          (unsigned long long)E->size);
2390       return NULL;
2391     }
2392 
2393     if (E->size) {
2394       __tgt_offload_entry Entry = *E;
2395 
2396       void *Varptr;
2397       uint32_t Varsize;
2398 
2399       auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId];
2400       hsa_status_t Err = interop_hsa_get_symbol_info(
2401           SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2402 
2403       if (Err != HSA_STATUS_SUCCESS) {
2404         // Inform the user what symbol prevented offloading
2405         DP("Loading global '%s' (Failed)\n", E->name);
2406         return NULL;
2407       }
2408 
2409       if (Varsize != E->size) {
2410         DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
2411            Varsize, E->size);
2412         return NULL;
2413       }
2414 
2415       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
2416          DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
2417       Entry.addr = (void *)Varptr;
2418 
2419       DeviceInfo.addOffloadEntry(DeviceId, Entry);
2420 
2421       if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2422           E->flags & OMP_DECLARE_TARGET_LINK) {
2423         // If unified memory is present, any target link variables
2424         // can access host addresses directly. There is no longer a
2425         // need for device copies.
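        //
        // Illustrative user code that reaches this path (a sketch only; the
        // variable name is hypothetical and this is not part of the plugin):
        //
        //   #pragma omp requires unified_shared_memory
        //   int GlobalData[128];
        //   #pragma omp declare target link(GlobalData)
        //
        // The 'requires' directive is what sets OMP_REQ_UNIFIED_SHARED_MEMORY
        // in RequiresFlags (see __tgt_rtl_init_requires), and 'declare target
        // link' marks the entry with OMP_DECLARE_TARGET_LINK, so the copy
        // below publishes the host-side address to the device link pointer
        // instead of allocating a separate device copy.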
2426         Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr,
2427                                                  sizeof(void *), DeviceId);
2428         if (Err != HSA_STATUS_SUCCESS)
2429           DP("Error when copying USM\n");
2430         DP("Copy linked variable host address (" DPxMOD ") "
2431            "to device address (" DPxMOD ")\n",
2432            DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
2433       }
2434 
2435       continue;
2436     }
2437 
2438     DP("Looking up kernel '%s' (name length: %lu)\n", E->name, strlen(E->name));
2439 
2440     // errors in kernarg_segment_size previously treated as = 0 (or as undef)
2441     uint32_t KernargSegmentSize = 0;
2442     auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId];
2443     hsa_status_t Err = HSA_STATUS_SUCCESS;
2444     if (!E->name) {
2445       Err = HSA_STATUS_ERROR;
2446     } else {
2447       std::string KernelStr = std::string(E->name);
2448       auto It = KernelInfoMap.find(KernelStr);
2449       if (It != KernelInfoMap.end()) {
2450         atl_kernel_info_t Info = It->second;
2451         KernargSegmentSize = Info.kernel_segment_size;
2452       } else {
2453         Err = HSA_STATUS_ERROR;
2454       }
2455     }
2456 
2457     // default value GENERIC (in case the symbol is missing from the code object)
2458     llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2459         llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2460 
2461     // get flat group size if present, else DefaultWgSize
2462     int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2463 
2464     // get Kernel Descriptor if present.
2465     // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2466     struct KernDescValType {
2467       uint16_t Version;
2468       uint16_t TSize;
2469       uint16_t WGSize;
2470     };
2471     struct KernDescValType KernDescVal;
2472     std::string KernDescNameStr(E->name);
2473     KernDescNameStr += "_kern_desc";
2474     const char *KernDescName = KernDescNameStr.c_str();
2475 
2476     void *KernDescPtr;
2477     uint32_t KernDescSize;
2478     void *CallStackAddr = nullptr;
2479     Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2480                                &KernDescPtr, &KernDescSize);
2481 
2482     if (Err == HSA_STATUS_SUCCESS) {
2483       if ((size_t)KernDescSize != sizeof(KernDescVal))
2484         DP("Loading global computation properties '%s' - size mismatch (%u != "
2485            "%lu)\n",
2486            KernDescName, KernDescSize, sizeof(KernDescVal));
2487 
2488       memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2489 
2490       // Check structure size against recorded size.
2491 if ((size_t)KernDescSize != KernDescVal.TSize) 2492 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", 2493 sizeof(KernDescVal), KernDescVal.TSize, KernDescName); 2494 2495 DP("After loading global for %s KernDesc \n", KernDescName); 2496 DP("KernDesc: Version: %d\n", KernDescVal.Version); 2497 DP("KernDesc: TSize: %d\n", KernDescVal.TSize); 2498 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); 2499 2500 if (KernDescVal.WGSize == 0) { 2501 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; 2502 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); 2503 } 2504 WGSizeVal = KernDescVal.WGSize; 2505 DP("WGSizeVal %d\n", WGSizeVal); 2506 check("Loading KernDesc computation property", Err); 2507 } else { 2508 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); 2509 2510 // Flat group size 2511 std::string WGSizeNameStr(E->name); 2512 WGSizeNameStr += "_wg_size"; 2513 const char *WGSizeName = WGSizeNameStr.c_str(); 2514 2515 void *WGSizePtr; 2516 uint32_t WGSize; 2517 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, 2518 &WGSizePtr, &WGSize); 2519 2520 if (Err == HSA_STATUS_SUCCESS) { 2521 if ((size_t)WGSize != sizeof(int16_t)) { 2522 DP("Loading global computation properties '%s' - size mismatch (%u " 2523 "!= " 2524 "%lu)\n", 2525 WGSizeName, WGSize, sizeof(int16_t)); 2526 return NULL; 2527 } 2528 2529 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); 2530 2531 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); 2532 2533 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || 2534 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { 2535 DP("Error wrong WGSize value specified in HSA code object file: " 2536 "%d\n", 2537 WGSizeVal); 2538 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2539 } 2540 } else { 2541 DP("Warning: Loading WGSize '%s' - symbol not found, " 2542 "using default value %d\n", 2543 WGSizeName, WGSizeVal); 2544 } 2545 2546 check("Loading WGSize computation property", Err); 2547 } 2548 2549 // Read execution mode from global in binary 2550 std::string ExecModeNameStr(E->name); 2551 ExecModeNameStr += "_exec_mode"; 2552 const char *ExecModeName = ExecModeNameStr.c_str(); 2553 2554 void *ExecModePtr; 2555 uint32_t VarSize; 2556 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, 2557 &ExecModePtr, &VarSize); 2558 2559 if (Err == HSA_STATUS_SUCCESS) { 2560 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 2561 DP("Loading global computation properties '%s' - size mismatch(%u != " 2562 "%lu)\n", 2563 ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); 2564 return NULL; 2565 } 2566 2567 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); 2568 2569 DP("After loading global for %s ExecMode = %d\n", ExecModeName, 2570 ExecModeVal); 2571 2572 if (ExecModeVal < 0 || 2573 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { 2574 DP("Error wrong exec_mode value specified in HSA code object file: " 2575 "%d\n", 2576 ExecModeVal); 2577 return NULL; 2578 } 2579 } else { 2580 DP("Loading global exec_mode '%s' - symbol missing, using default " 2581 "value " 2582 "GENERIC (1)\n", 2583 ExecModeName); 2584 } 2585 check("Loading computation property", Err); 2586 2587 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, 2588 CallStackAddr, E->name, KernargSegmentSize, 2589 DeviceInfo.KernArgPool)); 2590 __tgt_offload_entry Entry = *E; 2591 Entry.addr = (void *)&KernelsList.back(); 2592 DeviceInfo.addOffloadEntry(DeviceId, Entry); 2593 
DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); 2594 } 2595 2596 return DeviceInfo.getOffloadEntriesTable(DeviceId); 2597 } 2598 2599 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { 2600 void *Ptr = NULL; 2601 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2602 2603 if (Kind != TARGET_ALLOC_DEFAULT) { 2604 REPORT("Invalid target data allocation kind or requested allocator not " 2605 "implemented yet\n"); 2606 return NULL; 2607 } 2608 2609 hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); 2610 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); 2611 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, 2612 (long long unsigned)(Elf64_Addr)Ptr); 2613 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; 2614 return Ptr; 2615 } 2616 2617 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, 2618 int64_t Size) { 2619 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2620 __tgt_async_info AsyncInfo; 2621 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); 2622 if (Rc != OFFLOAD_SUCCESS) 2623 return OFFLOAD_FAIL; 2624 2625 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2626 } 2627 2628 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, 2629 int64_t Size, __tgt_async_info *AsyncInfo) { 2630 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2631 if (AsyncInfo) { 2632 initAsyncInfo(AsyncInfo); 2633 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); 2634 } 2635 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); 2636 } 2637 2638 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, 2639 int64_t Size) { 2640 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2641 __tgt_async_info AsyncInfo; 2642 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); 2643 if (Rc != OFFLOAD_SUCCESS) 2644 return OFFLOAD_FAIL; 2645 2646 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2647 } 2648 2649 int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, 2650 int64_t Size, 2651 __tgt_async_info *AsyncInfo) { 2652 assert(AsyncInfo && "AsyncInfo is nullptr"); 2653 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2654 initAsyncInfo(AsyncInfo); 2655 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); 2656 } 2657 2658 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) { 2659 assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); 2660 hsa_status_t Err; 2661 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); 2662 Err = core::Runtime::Memfree(TgtPtr); 2663 if (Err != HSA_STATUS_SUCCESS) { 2664 DP("Error when freeing CUDA memory\n"); 2665 return OFFLOAD_FAIL; 2666 } 2667 return OFFLOAD_SUCCESS; 2668 } 2669 2670 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, 2671 void **TgtArgs, ptrdiff_t *TgtOffsets, 2672 int32_t ArgNum, int32_t NumTeams, 2673 int32_t ThreadLimit, 2674 uint64_t LoopTripcount) { 2675 2676 DeviceInfo.LoadRunLock.lock_shared(); 2677 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, 2678 ArgNum, NumTeams, ThreadLimit, LoopTripcount); 2679 2680 DeviceInfo.LoadRunLock.unlock_shared(); 2681 return Res; 2682 } 2683 2684 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, 2685 void **TgtArgs, ptrdiff_t *TgtOffsets, 2686 int32_t ArgNum) { 
2687 // use one team and one thread 2688 // fix thread num 2689 int32_t TeamNum = 1; 2690 int32_t ThreadLimit = 0; // use default 2691 return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs, 2692 TgtOffsets, ArgNum, TeamNum, 2693 ThreadLimit, 0); 2694 } 2695 2696 int32_t __tgt_rtl_run_target_team_region_async( 2697 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, 2698 int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, 2699 uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) { 2700 assert(AsyncInfo && "AsyncInfo is nullptr"); 2701 initAsyncInfo(AsyncInfo); 2702 2703 DeviceInfo.LoadRunLock.lock_shared(); 2704 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, 2705 ArgNum, NumTeams, ThreadLimit, LoopTripcount); 2706 2707 DeviceInfo.LoadRunLock.unlock_shared(); 2708 return Res; 2709 } 2710 2711 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, 2712 void **TgtArgs, ptrdiff_t *TgtOffsets, 2713 int32_t ArgNum, 2714 __tgt_async_info *AsyncInfo) { 2715 // use one team and one thread 2716 // fix thread num 2717 int32_t TeamNum = 1; 2718 int32_t ThreadLimit = 0; // use default 2719 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs, 2720 TgtOffsets, ArgNum, TeamNum, 2721 ThreadLimit, 0, AsyncInfo); 2722 } 2723 2724 int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { 2725 assert(AsyncInfo && "AsyncInfo is nullptr"); 2726 2727 // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant 2728 // is not ensured by devices.cpp for amdgcn 2729 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); 2730 if (AsyncInfo->Queue) { 2731 finiAsyncInfo(AsyncInfo); 2732 } 2733 return OFFLOAD_SUCCESS; 2734 } 2735 2736 void __tgt_rtl_print_device_info(int32_t DeviceId) { 2737 // TODO: Assertion to see if DeviceId is correct 2738 // NOTE: We don't need to set context for print device info. 2739 2740 DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]); 2741 } 2742 2743 } // extern "C" 2744
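// An illustrative sketch of the call sequence libomptarget is expected to
// drive against the entry points above (not compiled with the plugin; the
// function name, buffers, and the assumption that the first table entry is a
// kernel are placeholders for the sake of the example):
#if 0
static void exampleOffload(__tgt_device_image *Image, void *HostBuf,
                           int64_t Size) {
  if (__tgt_rtl_number_of_devices() < 1 || !__tgt_rtl_is_valid_binary(Image))
    return;

  const int32_t DeviceId = 0;
  __tgt_rtl_init_device(DeviceId);
  __tgt_target_table *Table = __tgt_rtl_load_binary(DeviceId, Image);
  if (!Table || Table->EntriesBegin == Table->EntriesEnd)
    return;

  // Move data to the device, run the first (assumed) kernel entry on a
  // single team, then copy the result back and release the allocation.
  void *TgtPtr =
      __tgt_rtl_data_alloc(DeviceId, Size, HostBuf, TARGET_ALLOC_DEFAULT);
  __tgt_rtl_data_submit(DeviceId, TgtPtr, HostBuf, Size);

  void *TgtEntry = Table->EntriesBegin[0].addr;
  ptrdiff_t Offset = 0;
  __tgt_rtl_run_target_team_region(DeviceId, TgtEntry, &TgtPtr, &Offset,
                                   /*ArgNum=*/1, /*NumTeams=*/1,
                                   /*ThreadLimit=*/0, /*LoopTripcount=*/0);

  __tgt_rtl_data_retrieve(DeviceId, HostBuf, TgtPtr, Size);
  __tgt_rtl_data_delete(DeviceId, TgtPtr);
}
#endif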