//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for AMD HSA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <libelf.h>
#include <list>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

#include "impl_runtime.h"
#include "interop_hsa.h"

#include "internal.h"
#include "rt.h"

#include "DeviceEnvironment.h"
#include "get_elf_mach_gfx_name.h"
#include "omptargetplugin.h"
#include "print_tracing.h"

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"

using namespace llvm;

// hostrpc interface. FIXME: consider moving to its own include. These symbols
// are statically linked into the amdgpu plugin, if present, from
// hostrpc_services.a, linked as --whole-archive to override the weak symbols
// that implement a fallback for toolchains that do not yet have a hostrpc
// library.
extern "C" {
uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
                               uint32_t DeviceId);
hsa_status_t hostrpc_init();
hsa_status_t hostrpc_terminate();

__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
  return HSA_STATUS_SUCCESS;
}
__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
                                                     uint32_t DeviceId) {
  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
     "missing\n",
     DeviceId);
  return 0;
}
}

// Heuristic parameters used for kernel launch
// Number of teams per CU to allow scheduling flexibility
static const unsigned DefaultTeamsPerCU = 4;

int print_kernel_trace;

#ifdef OMPTARGET_DEBUG
#define check(msg, status)                                                     \
  if (status != HSA_STATUS_SUCCESS) {                                          \
    DP(#msg " failed\n");                                                      \
  } else {                                                                     \
    DP(#msg " succeeded\n");                                                   \
  }
#else
#define check(msg, status)                                                     \
  {}
#endif

#include "elf_common.h"

namespace hsa {
template <typename C> hsa_status_t iterate_agents(C Cb) {
  auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(Agent);
  };
  return hsa_iterate_agents(L, static_cast<void *>(&Cb));
}

template <typename C>
hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
  auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(MemoryPool);
  };

  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
}

} // namespace hsa

/// Keep entries table per device
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

struct KernelArgPool {
private:
  static pthread_mutex_t Mutex;
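  // Illustrative round trip through this pool (a sketch of what
  // runRegionLocked later in this file does; names match that code):
  //   auto It = KernelArgPoolMap.find(KernelName);
  //   KernelArgPool *ArgPool = It->second.get();
  //   void *KernArg = ArgPool->allocate(ArgNum); // nullptr if all slots busy
  //   ... copy ArgNum pointer-sized args, dispatch, wait on the signal ...
  //   ArgPool->deallocate(KernArg);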
public:
  uint32_t KernargSegmentSize;
  void *KernargRegion = nullptr;
  std::queue<int> FreeKernargSegments;

  uint32_t kernargSizeIncludingImplicit() {
    return KernargSegmentSize + sizeof(impl_implicit_args_t);
  }

  ~KernelArgPool() {
    if (KernargRegion) {
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
      : KernargSegmentSize(KernargSegmentSize) {

    // impl uses one pool per kernel for all GPUs, with a fixed upper size.
    // That exact scheme is preserved here, including the queue<int>.

    hsa_status_t Err = hsa_amd_memory_pool_allocate(
        MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
        &KernargRegion);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
      KernargRegion = nullptr; // paranoid
      return;
    }

    Err = core::allow_access_to_all_gpu_agents(KernargRegion);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(Err));
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(R));
      }
      KernargRegion = nullptr;
      return;
    }

    for (int I = 0; I < MAX_NUM_KERNELS; I++) {
      FreeKernargSegments.push(I);
    }
  }

  void *allocate(uint64_t ArgNum) {
    assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
    Lock L(&Mutex);
    void *Res = nullptr;
    if (!FreeKernargSegments.empty()) {

      int FreeIdx = FreeKernargSegments.front();
      Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
                                (FreeIdx * kernargSizeIncludingImplicit()));
      assert(FreeIdx == pointerToIndex(Res));
      FreeKernargSegments.pop();
    }
    return Res;
  }

  void deallocate(void *Ptr) {
    Lock L(&Mutex);
    int Idx = pointerToIndex(Ptr);
    FreeKernargSegments.push(Idx);
  }

private:
  int pointerToIndex(void *Ptr) {
    ptrdiff_t Bytes =
        static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
    assert(Bytes >= 0);
    assert(Bytes % kernargSizeIncludingImplicit() == 0);
    return Bytes / kernargSizeIncludingImplicit();
  }
  struct Lock {
    Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
    ~Lock() { pthread_mutex_unlock(M); }
    pthread_mutex_t *M;
  };
};
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
  int16_t ConstWGSize;
  int32_t DeviceId;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
           int32_t DeviceId, void *CallStackAddr, const char *Name,
           uint32_t KernargSegmentSize,
           hsa_amd_memory_pool_t &KernArgMemoryPool)
      : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
        DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
    DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);

    std::string N(Name);
    if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
      KernelArgPoolMap.insert(
          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
                                KernargSegmentSize, KernArgMemoryPool))));
    }
  }
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

template <typename Callback> static hsa_status_t findAgents(Callback CB) {

  hsa_status_t Err =
      hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
        hsa_device_type_t DeviceType;
        // get_info fails iff HSA runtime not yet initialized
        hsa_status_t Err =
            hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);

        if (Err != HSA_STATUS_SUCCESS) {
          if (print_kernel_trace > 0)
            DP("rtl.cpp: err %s\n", get_error_string(Err));

          return Err;
        }

        CB(DeviceType, Agent);
        return HSA_STATUS_SUCCESS;
      });

  // iterate_agents fails iff HSA runtime not yet initialized
  if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
    DP("rtl.cpp: err %s\n", get_error_string(Err));
  }

  return Err;
}

static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
                          void *Data) {
  if (Status != HSA_STATUS_SUCCESS) {
    const char *StatusString;
    if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
      StatusString = "unavailable";
    }
    DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
       Status, StatusString);
    abort();
  }
}

namespace core {
namespace {

bool checkResult(hsa_status_t Err, const char *ErrMsg) {
  if (Err == HSA_STATUS_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  REPORT("%s", get_error_string(Err));
  return false;
}

void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
  __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
}

uint16_t createHeader() {
  uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
  return Header;
}

hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
  bool AllocAllowed = false;
  hsa_status_t Err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &AllocAllowed);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Alloc allowed in memory pool check failed: %s\n",
       get_error_string(Err));
    return Err;
  }

  size_t Size = 0;
  Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                     &Size);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Get memory pool size failed: %s\n", get_error_string(Err));
    return Err;
  }

  return (AllocAllowed && Size > 0) ?
             HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
}

hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
  std::vector<hsa_amd_memory_pool_t> *Result =
      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);

  hsa_status_t Err;
  if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
    return Err;
  }

  Result->push_back(MemoryPool);
  return HSA_STATUS_SUCCESS;
}

} // namespace
} // namespace core

struct EnvironmentVariables {
  int NumTeams;
  int TeamLimit;
  int TeamThreadLimit;
  int MaxTeamsDefault;
  int DynamicMemSize;
};

template <uint32_t wavesize>
static constexpr const llvm::omp::GV &getGridValue() {
  return llvm::omp::getAMDGPUGridValues<wavesize>();
}

struct HSALifetime {
  // Wrapper around HSA used to ensure it is constructed before other types
  // and destructed after, which means said other types can use RAII for
  // cleanup without risking running outside of the lifetime of HSA.
  const hsa_status_t S;

  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
  HSALifetime() : S(hsa_init()) {}

  ~HSALifetime() {
    if (S == HSA_STATUS_SUCCESS) {
      hsa_status_t Err = hsa_shut_down();
      if (Err != HSA_STATUS_SUCCESS) {
        // Can't call into HSA to get a string from the integer
        DP("Shutting down HSA failed: %d\n", Err);
      }
    }
  }
};

// Handle scheduling of multiple hsa_queues per device to
// multiple threads (one scheduler per device)
class HSAQueueScheduler {
public:
  HSAQueueScheduler() : Current(0) {}

  HSAQueueScheduler(const HSAQueueScheduler &) = delete;

  HSAQueueScheduler(HSAQueueScheduler &&Q) {
    Current = Q.Current.load();
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      HSAQueues[I] = Q.HSAQueues[I];
      Q.HSAQueues[I] = nullptr;
    }
  }

  // \return false if any HSA queue creation fails
  bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      hsa_queue_t *Q = nullptr;
      hsa_status_t Rc =
          hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
      if (Rc != HSA_STATUS_SUCCESS) {
        DP("Failed to create HSA queue %d\n", I);
        return false;
      }
      HSAQueues[I] = Q;
    }
    return true;
  }

  ~HSAQueueScheduler() {
    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
      if (HSAQueues[I]) {
        hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
        if (Err != HSA_STATUS_SUCCESS)
          DP("Error destroying HSA queue\n");
      }
    }
  }

  // \return next queue to use for device
  hsa_queue_t *next() {
    return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
                     NUM_QUEUES_PER_DEVICE];
  }

private:
  // Number of queues per device
  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
  std::atomic<uint8_t> Current;
};
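// Illustrative use of the scheduler above (a sketch of what runRegionLocked
// later in this file does when it dispatches a kernel):
//   hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
// Successive calls rotate through NUM_QUEUES_PER_DEVICE queues, so launches
// issued from different host threads can land on different hardware queues.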
/// Class containing all the device information
class RTLDeviceInfoTy : HSALifetime {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

  struct QueueDeleter {
    void operator()(hsa_queue_t *Q) {
      if (Q) {
        hsa_status_t Err = hsa_queue_destroy(Q);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("Error destroying hsa queue: %s\n", get_error_string(Err));
        }
      }
    }
  };

public:
  bool ConstructionSucceeded = false;

  // Load binary populates symbol tables and mutates various global state;
  // run uses those symbol tables.
  std::shared_timed_mutex LoadRunLock;

  int NumberOfDevices = 0;

  // GPU devices
  std::vector<hsa_agent_t> HSAAgents;
  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu

  // CPUs
  std::vector<hsa_agent_t> CPUAgents;

  // Device properties
  std::vector<int> ComputeUnits;
  std::vector<int> GroupsPerDevice;
  std::vector<int> ThreadsPerGroup;
  std::vector<int> WarpSize;
  std::vector<std::string> GPUName;
  std::vector<std::string> TargetID;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  EnvironmentVariables Env;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  // Resource pools
  SignalPoolT FreeSignalPool;

  bool HostcallRequired = false;

  std::vector<hsa_executable_t> HSAExecutables;

  std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
  std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

  hsa_amd_memory_pool_t KernArgPool;

  // fine grained memory pool for host allocations
  hsa_amd_memory_pool_t HostFineGrainedMemoryPool;

  // fine and coarse-grained memory pools per offloading device
  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;

  struct ImplFreePtrDeletor {
    void operator()(void *P) {
      core::Runtime::Memfree(P); // ignore failure to free
    }
  };

  // device_State shared across loaded binaries, error if inconsistent size
  std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
      DeviceStateStore;

  static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K - 1, so the value fits in a uint16_t grid dimension
  static const int DefaultNumTeams = 128;

  // These need to be per-device since different devices can have different
  // wave sizes, but they are currently the same number for each device, so
  // that refactor can be postponed.
  static_assert(getGridValue<32>().GV_Max_Teams ==
                    getGridValue<64>().GV_Max_Teams,
                "");
  static const int MaxTeams = getGridValue<64>().GV_Max_Teams;

  static_assert(getGridValue<32>().GV_Max_WG_Size ==
                    getGridValue<64>().GV_Max_WG_Size,
                "");
  static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;

  static_assert(getGridValue<32>().GV_Default_WG_Size ==
                    getGridValue<64>().GV_Default_WG_Size,
                "");
  static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;

  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
                                      hsa_agent_t, hsa_amd_memory_pool_t);
  hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
                                    MemcpyFunc Func, int32_t DeviceId) {
    hsa_agent_t Agent = HSAAgents[DeviceId];
    hsa_signal_t S = FreeSignalPool.pop();
    if (S.handle == 0) {
      return HSA_STATUS_ERROR;
    }
    hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
    FreeSignalPool.push(S);
    return R;
  }

  hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
                                       int32_t DeviceId) {
    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
  }

  hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
                                       int32_t DeviceId) {
    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
  }

  static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
    char TmpChar[1000];
    uint16_t Major, Minor;
    uint32_t TmpUInt;
    uint32_t TmpUInt2;
    uint32_t CacheSize[4];
    bool TmpBool;
    uint16_t WorkgroupMaxDim[3];
    hsa_dim3_t GridMaxDim;

    // Getting basic information about HSA and Device
    core::checkResult(
        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
        "Error from hsa_system_get_info when obtaining "
        "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
    core::checkResult(
        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
        "Error from hsa_system_get_info when obtaining "
        "HSA_SYSTEM_INFO_VERSION_MINOR\n");
    printf("    HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
    printf("    HSA OpenMP Device Number: \t\t%d \n", DeviceId);
    core::checkResult(
        hsa_agent_get_info(
            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
    printf("    Product Name: \t\t\t%s \n", TmpChar);
    core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AGENT_INFO_NAME\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_VENDOR_NAME\n");
    printf("    Vendor Name: \t\t\t%s \n", TmpChar);
    hsa_device_type_t DevType;
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_DEVICE\n");
    printf("    Device Type: \t\t\t%s \n",
           DevType == HSA_DEVICE_TYPE_CPU
               ? "CPU"
               : (DevType == HSA_DEVICE_TYPE_GPU
                      ? "GPU"
                      : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_QUEUES_MAX\n");
    printf("    Max Queues: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
    printf("    Queue Min Size: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
    printf("    Queue Max Size: \t\t\t%u \n", TmpUInt);

    // Getting cache information
    printf("    Cache:\n");

    // FIXME: This is deprecated according to HSA documentation. But using
    // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
    // runtime.
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_CACHE_SIZE\n");

    for (int I = 0; I < 4; I++) {
      if (CacheSize[I]) {
        printf("      L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
      }
    }

    core::checkResult(
        hsa_agent_get_info(Agent,
                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
                           &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
    printf("    Cacheline Size: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(
            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
            &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
    printf("    Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(
            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
            &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
    printf("    Compute Units: \t\t\t%u \n", TmpUInt);
    core::checkResult(hsa_agent_get_info(
                          Agent,
                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
                          &TmpUInt),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
    printf("    SIMD per CU: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_FAST_F16_OPERATION\n");
    printf("    Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
    printf("    Wavefront Size: \t\t\t%u \n", TmpUInt2);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
    printf("    Workgroup Max Size: \t\t%u \n", TmpUInt);
    core::checkResult(hsa_agent_get_info(Agent,
                                         HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                                         WorkgroupMaxDim),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
    printf("    Workgroup Max Size per Dimension:\n");
    printf("      x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
    printf("      y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
    printf("      z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
    core::checkResult(hsa_agent_get_info(
                          Agent,
                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
                          &TmpUInt),
                      "Error returned from hsa_agent_get_info when obtaining "
                      "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
    printf("    Max Waves Per CU: \t\t\t%u \n", TmpUInt);
    printf("    Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
    printf("    Grid Max Size: \t\t\t%u \n", TmpUInt);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_GRID_MAX_DIM\n");
    printf("    Grid Max Size per Dimension: \t\t\n");
    printf("      x: \t\t\t\t%u\n", GridMaxDim.x);
    printf("      y: \t\t\t\t%u\n", GridMaxDim.y);
    printf("      z: \t\t\t\t%u\n", GridMaxDim.z);
    core::checkResult(
        hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
        "Error returned from hsa_agent_get_info when obtaining "
        "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
    printf("    Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);

    printf("    Memory Pools:\n");
    auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
      std::string TmpStr;
      size_t Size;
      bool Alloc, Access;
      hsa_amd_segment_t Segment;
      hsa_amd_memory_pool_global_flag_t GlobalFlags;
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
      core::checkResult(hsa_amd_memory_pool_get_info(
                            Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
                        "Error returned from hsa_amd_memory_pool_get_info when "
                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");

      switch (Segment) {
      case HSA_AMD_SEGMENT_GLOBAL:
        TmpStr = "GLOBAL; FLAGS: ";
        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
          TmpStr += "KERNARG, ";
        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
          TmpStr += "FINE GRAINED, ";
        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
          TmpStr += "COARSE GRAINED, ";
        break;
      case HSA_AMD_SEGMENT_READONLY:
        TmpStr = "READONLY";
        break;
      case HSA_AMD_SEGMENT_PRIVATE:
        TmpStr = "PRIVATE";
        break;
      case HSA_AMD_SEGMENT_GROUP:
        TmpStr = "GROUP";
        break;
      }
      printf("      Pool %s: \n", TmpStr.c_str());

      core::checkResult(hsa_amd_memory_pool_get_info(
                            Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
                        "Error returned from hsa_amd_memory_pool_get_info when "
                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
      printf("        Size: \t\t\t\t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
      printf("        Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
      printf("        Runtime Alloc Granule: \t\t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
      printf("        Runtime Alloc alignment: \t %zu bytes\n", Size);
      core::checkResult(
          hsa_amd_memory_pool_get_info(
              Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
          "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
      printf("        Accessible by all: \t\t %s\n",
             (Access ? "TRUE" : "FALSE"));

      return HSA_STATUS_SUCCESS;
    };
    // Iterate over all the memory regions for this agent. Get the memory
    // region type and size.
    hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);

    printf("    ISAs:\n");
    auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
      char TmpChar[1000];
      core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
                        "Error returned from hsa_isa_get_info_alt when "
                        "obtaining HSA_ISA_INFO_NAME\n");
      printf("        Name: \t\t\t\t %s\n", TmpChar);

      return HSA_STATUS_SUCCESS;
    };
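    // The reported ISA names are full target strings plus processor, e.g.
    // something like "amdgcn-amd-amdhsa--gfx90a" (illustrative value; the
    // exact form depends on the agent).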
    // Iterate over all the ISAs supported by this agent and print their names.
    hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
  }

  // Record entry point associated with device
  void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    E.Entries.push_back(Entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    for (auto &It : E.Entries) {
      if (It.addr == Addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    int32_t Size = E.Entries.size();

    // Table is empty
    if (!Size)
      return 0;

    __tgt_offload_entry *Begin = &E.Entries[0];
    __tgt_offload_entry *End = &E.Entries[Size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = Begin;
    E.Table.EntriesEnd = ++End;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[DeviceId].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
    // KernelArgPoolMap.clear();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                                   unsigned int DeviceId) {
    assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
    uint32_t GlobalFlags = 0;
    hsa_status_t Err = hsa_amd_memory_pool_get_info(
        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

    if (Err != HSA_STATUS_SUCCESS) {
      return Err;
    }

    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
    }

    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
    for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
      hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
          Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
            hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
            if (ValidStatus != HSA_STATUS_SUCCESS) {
              DP("Alloc allowed in memory pool check failed: %s\n",
                 get_error_string(ValidStatus));
              return HSA_STATUS_SUCCESS;
            }
            return addDeviceMemoryPool(MemoryPool, DeviceId);
          });

      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Iterate all memory pools", get_error_string(Err));
        return Err;
      }
    }
    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
    std::vector<hsa_amd_memory_pool_t> HostPools;

    // Collect all the "valid" pools for all the given agents.
    for (const auto &Agent : Agents) {
      hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
          Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
      if (Err != HSA_STATUS_SUCCESS) {
        DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
      }
    }

    // We need two fine-grained pools:
    //  1. One with the kernarg flag set, for storing kernel arguments
    //  2. A second one for host allocations
    bool FineGrainedMemoryPoolSet = false;
    bool KernArgPoolSet = false;
    for (const auto &MemoryPool : HostPools) {
      hsa_status_t Err = HSA_STATUS_SUCCESS;
      uint32_t GlobalFlags = 0;
      Err = hsa_amd_memory_pool_get_info(
          MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
      if (Err != HSA_STATUS_SUCCESS) {
        DP("Get memory pool info failed: %s\n", get_error_string(Err));
        return Err;
      }

      if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
        if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
          KernArgPool = MemoryPool;
          KernArgPoolSet = true;
        }
        HostFineGrainedMemoryPool = MemoryPool;
        FineGrainedMemoryPoolSet = true;
      }
    }

    if (FineGrainedMemoryPoolSet && KernArgPoolSet)
      return HSA_STATUS_SUCCESS;

    return HSA_STATUS_ERROR;
  }

  hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
    assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
           "Invalid device Id");
    return DeviceCoarseGrainedMemoryPools[DeviceId];
  }

  hsa_amd_memory_pool_t getHostMemoryPool() {
    return HostFineGrainedMemoryPool;
  }

  static int readEnv(const char *Env, int Default = -1) {
    const char *EnvStr = getenv(Env);
    int Res = Default;
    if (EnvStr) {
      Res = std::stoi(EnvStr);
      DP("Parsed %s=%d\n", Env, Res);
    }
    return Res;
  }

  RTLDeviceInfoTy() {
    DP("Start initializing " GETNAME(TARGET_NAME) "\n");

    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at
    // any time; a debug build of the library is not required.
    //   0 => no tracing
    //   1 => tracing dispatch only
    //  >1 => increased verbosity

    if (!HSAInitSuccess()) {
      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
      return;
    }

    if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
      print_kernel_trace = atoi(EnvStr);
    else
      print_kernel_trace = 0;

    hsa_status_t Err = core::atl_init_gpu_context();
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
      return;
    }

    // Init hostcall soon after initializing hsa
    hostrpc_init();

    Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
      if (DeviceType == HSA_DEVICE_TYPE_CPU) {
        CPUAgents.push_back(Agent);
      } else {
        HSAAgents.push_back(Agent);
      }
    });
    if (Err != HSA_STATUS_SUCCESS)
      return;

    NumberOfDevices = (int)HSAAgents.size();

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting HSA.\n");
      return;
    }
    DP("There are %d devices supporting HSA.\n", NumberOfDevices);

    // Init the device info
    HSAQueueSchedulers.reserve(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
    GPUName.resize(NumberOfDevices);
    GroupsPerDevice.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);
    DeviceStateStore.resize(NumberOfDevices);
    KernelInfoTable.resize(NumberOfDevices);
    SymbolInfoTable.resize(NumberOfDevices);
    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);

    Err = setupDevicePools(HSAAgents);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Setup for Device Memory Pools failed\n");
      return;
    }

    Err = setupHostMemoryPools(CPUAgents);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("Setup for Host Memory Pools failed\n");
      return;
    }

    for (int I = 0; I < NumberOfDevices; I++) {
      uint32_t QueueSize = 0;
      {
        hsa_status_t Err = hsa_agent_get_info(
            HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
          return;
        }
        enum { MaxQueueSize = 4096 };
        if (QueueSize > MaxQueueSize) {
          QueueSize = MaxQueueSize;
        }
      }

      {
        HSAQueueScheduler QSched;
        if (!QSched.createQueues(HSAAgents[I], QueueSize))
          return;
        HSAQueueSchedulers.emplace_back(std::move(QSched));
      }

      DeviceStateStore[I] = {nullptr, 0};
    }

    for (int I = 0; I < NumberOfDevices; I++) {
      ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
      GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
      ComputeUnits[I] = 1;
      DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
         GroupsPerDevice[I], ThreadsPerGroup[I]);
    }

    // Get environment variables regarding teams
    Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
    Env.NumTeams = readEnv("OMP_NUM_TEAMS");
    Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
    Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
    Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;

    ConstructionSucceeded = true;
  }

  ~RTLDeviceInfoTy() {
    DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
    if (!HSAInitSuccess()) {
      // Then none of these can have been set up and they can't be torn down
      return;
    }
    // Run destructors on types that use HSA before
    // impl_finalize removes access to it
    DeviceStateStore.clear();
    KernelArgPoolMap.clear();
    // Terminate hostrpc before finalizing hsa
    hostrpc_terminate();

    hsa_status_t Err;
    for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
      Err = hsa_executable_destroy(HSAExecutables[I]);
      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Destroying executable", get_error_string(Err));
      }
    }
  }
};

pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;

static RTLDeviceInfoTy *DeviceInfoState = nullptr;
static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; }

int32_t __tgt_rtl_init_plugin() {
  DeviceInfoState = new RTLDeviceInfoTy;
  return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
             ? OFFLOAD_SUCCESS
             : OFFLOAD_FAIL;
}

int32_t __tgt_rtl_deinit_plugin() {
  if (DeviceInfoState)
    delete DeviceInfoState;
  return OFFLOAD_SUCCESS;
}

namespace {

int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                     __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  // Return success if we are not copying back to host from target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;
  hsa_status_t Err;
  DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);

  Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
                                             DeviceId);

  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);
  return OFFLOAD_SUCCESS;
}

int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
                   __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  hsa_status_t Err;
  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  // Return success if we are not doing host to target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;

  DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)HstPtr,
     (long long unsigned)(Elf64_Addr)TgtPtr);
  Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
                                             DeviceId);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Async.
//
// The implementation was written with CUDA streams in mind. The semantics of
// those are to execute kernels on a queue in order of insertion. A synchronise
// call then makes writes visible between host and device. This means a series
// of N data_submit_async calls are expected to execute serially. HSA offers
// various options to run the data copies concurrently. This may require
// changes to libomptarget.
//
// __tgt_async_info* contains a void *Queue. Queue = 0 is used to indicate that
// there are no outstanding kernels that need to be synchronized. Any async
// call may be passed a Queue == 0, at which point the CUDA implementation will
// set it to non-null (see getStream). The CUDA streams are per-device.
// Upstream may change this interface to explicitly initialize the AsyncInfo
// pointer, but until then HSA lazily initializes it as well.

void initAsyncInfo(__tgt_async_info *AsyncInfo) {
  // Set non-null while using async calls, return to null to indicate
  // completion.
  assert(AsyncInfo);
  if (!AsyncInfo->Queue) {
    AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
  }
}
void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
  assert(AsyncInfo);
  assert(AsyncInfo->Queue);
  AsyncInfo->Queue = 0;
}
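// Illustrative lifecycle of the Queue handle described above (a sketch; in
// practice these helpers are invoked from the async data-transfer and
// synchronize entry points rather than called directly):
//   __tgt_async_info AsyncInfo; // Queue starts out null
//   initAsyncInfo(&AsyncInfo);  // mark work as outstanding
//   ... asynchronous data transfers / kernel launch ...
//   finiAsyncInfo(&AsyncInfo);  // Queue back to null: nothing left to sync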
// Determine launch values for kernel.
struct LaunchVals {
  int WorkgroupSize;
  int GridSize;
};
LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
                         int ConstWGSize,
                         llvm::omp::OMPTgtExecModeFlags ExecutionMode,
                         int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
                         int DeviceNumTeams) {

  int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
  int NumGroups = 0;

  int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
  if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
    MaxTeams = RTLDeviceInfoTy::HardTeamLimit;

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
    DP("Max_Teams: %d\n", MaxTeams);
    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
       RTLDeviceInfoTy::DefaultWgSize);
    DP("thread_limit: %d\n", ThreadLimit);
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
    DP("ConstWGSize: %d\n", ConstWGSize);
  }
  // Check for the thread_limit() clause.
  if (ThreadLimit > 0) {
    ThreadsPerGroup = ThreadLimit;
    DP("Setting threads per block to requested %d\n", ThreadLimit);
    // Add master warp for GENERIC
    if (ExecutionMode ==
        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
      ThreadsPerGroup += WarpSize;
      DP("Adding master wavefront: +%d threads\n", WarpSize);
    }
    if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
      ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
      DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
    }
  }
  // Check the flat_max_work_group_size attribute here.
  if (ThreadsPerGroup > ConstWGSize) {
    ThreadsPerGroup = ConstWGSize;
    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
       ThreadsPerGroup);
  }
  if (print_kernel_trace & STARTUP_DETAILS)
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
  DP("Preparing %d threads\n", ThreadsPerGroup);

  // Set default num_groups (teams)
  if (Env.TeamLimit > 0)
    NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
  else
    NumGroups = MaxTeams;
  DP("Set default num of groups %d\n", NumGroups);

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("num_teams: %d\n", NumTeams);
  }

  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
  // This reduction is typical for the default case (no thread_limit clause),
  // or when the user requests a very large num_teams.
  // FIXME: We can't distinguish between a constant or variable thread limit,
  // so we only handle constant thread_limits.
  if (ThreadsPerGroup >
      RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to nearest WarpSize
    // here?
    NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;

  // Check for the num_teams() clause.
  if (NumTeams > 0) {
    NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("Env.NumTeams %d\n", Env.NumTeams);
    DP("Env.TeamLimit %d\n", Env.TeamLimit);
  }

  if (Env.NumTeams > 0) {
    NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
  } else if (Env.TeamLimit > 0) {
    NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
  } else {
    if (NumTeams <= 0) {
      if (LoopTripcount > 0) {
        if (ExecutionMode ==
            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
          // Round up to the nearest integer.
          NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
        } else if (ExecutionMode ==
                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
          NumGroups = LoopTripcount;
        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
          // This is a generic kernel that was transformed to use SPMD-mode
          // execution but uses Generic-mode semantics for scheduling.
          NumGroups = LoopTripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           NumGroups, LoopTripcount, ThreadsPerGroup);
      }
    } else {
      NumGroups = NumTeams;
    }
    if (NumGroups > MaxTeams) {
      NumGroups = MaxTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
    }
    if (NumGroups > NumTeams && NumTeams > 0) {
      NumGroups = NumTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
           NumTeams);
    }
  }

  // The num_teams clause is always honored, no matter what, unless DEFAULT is
  // active.
  if (NumTeams > 0) {
    NumGroups = NumTeams;
    // Cap num_groups to EnvMaxTeamsDefault if set.
    if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
      NumGroups = Env.MaxTeamsDefault;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
    DP("num_groups: %d\n", NumGroups);
    DP("loop_tripcount: %lu\n", LoopTripcount);
  }
  DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
     ThreadsPerGroup);

  LaunchVals Res;
  Res.WorkgroupSize = ThreadsPerGroup;
  Res.GridSize = ThreadsPerGroup * NumGroups;
  return Res;
}
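// Illustrative walk-through of getLaunchVals above (hypothetical numbers): for
// an SPMD kernel with ConstWGSize 256, no thread_limit or num_teams clause,
// none of the team-related environment variables set, and a loop trip count of
// 10000, ThreadsPerGroup stays at DefaultWgSize (256) and NumGroups becomes
// ((10000 - 1) / 256) + 1 = 40, so the dispatch below ends up with
// workgroup_size_x = 256 and grid_size_x = 256 * 40 = 10240.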
Env.TeamLimit : NumGroups; 1304 DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); 1305 } else { 1306 if (NumTeams <= 0) { 1307 if (LoopTripcount > 0) { 1308 if (ExecutionMode == 1309 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { 1310 // round up to the nearest integer 1311 NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; 1312 } else if (ExecutionMode == 1313 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { 1314 NumGroups = LoopTripcount; 1315 } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { 1316 // This is a generic kernel that was transformed to use SPMD-mode 1317 // execution but uses Generic-mode semantics for scheduling. 1318 NumGroups = LoopTripcount; 1319 } 1320 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 1321 "threads per block %d\n", 1322 NumGroups, LoopTripcount, ThreadsPerGroup); 1323 } 1324 } else { 1325 NumGroups = NumTeams; 1326 } 1327 if (NumGroups > MaxTeams) { 1328 NumGroups = MaxTeams; 1329 if (print_kernel_trace & STARTUP_DETAILS) 1330 DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams); 1331 } 1332 if (NumGroups > NumTeams && NumTeams > 0) { 1333 NumGroups = NumTeams; 1334 if (print_kernel_trace & STARTUP_DETAILS) 1335 DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups, 1336 NumTeams); 1337 } 1338 } 1339 1340 // num_teams clause always honored, no matter what, unless DEFAULT is active. 1341 if (NumTeams > 0) { 1342 NumGroups = NumTeams; 1343 // Cap num_groups to EnvMaxTeamsDefault if set. 1344 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) 1345 NumGroups = Env.MaxTeamsDefault; 1346 } 1347 if (print_kernel_trace & STARTUP_DETAILS) { 1348 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1349 DP("num_groups: %d\n", NumGroups); 1350 DP("loop_tripcount: %ld\n", LoopTripcount); 1351 } 1352 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, 1353 ThreadsPerGroup); 1354 1355 LaunchVals Res; 1356 Res.WorkgroupSize = ThreadsPerGroup; 1357 Res.GridSize = ThreadsPerGroup * NumGroups; 1358 return Res; 1359 } 1360 1361 static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { 1362 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); 1363 bool Full = true; 1364 while (Full) { 1365 Full = 1366 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); 1367 } 1368 return PacketId; 1369 } 1370 1371 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, 1372 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, 1373 int32_t ThreadLimit, uint64_t LoopTripcount) { 1374 // Set the context we are using 1375 // update thread limit content in gpu memory if un-initialized or specified 1376 // from host 1377 1378 DP("Run target team region thread_limit %d\n", ThreadLimit); 1379 1380 // All args are references. 
1381 std::vector<void *> Args(ArgNum); 1382 std::vector<void *> Ptrs(ArgNum); 1383 1384 DP("Arg_num: %d\n", ArgNum); 1385 for (int32_t I = 0; I < ArgNum; ++I) { 1386 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1387 Args[I] = &Ptrs[I]; 1388 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); 1389 } 1390 1391 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; 1392 1393 std::string KernelName = std::string(KernelInfo->Name); 1394 auto &KernelInfoTable = DeviceInfo().KernelInfoTable; 1395 if (KernelInfoTable[DeviceId].find(KernelName) == 1396 KernelInfoTable[DeviceId].end()) { 1397 DP("Kernel %s not found\n", KernelName.c_str()); 1398 return OFFLOAD_FAIL; 1399 } 1400 1401 const atl_kernel_info_t KernelInfoEntry = 1402 KernelInfoTable[DeviceId][KernelName]; 1403 const uint32_t GroupSegmentSize = 1404 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; 1405 const uint32_t SgprCount = KernelInfoEntry.sgpr_count; 1406 const uint32_t VgprCount = KernelInfoEntry.vgpr_count; 1407 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; 1408 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; 1409 1410 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); 1411 1412 /* 1413 * Set limit based on ThreadsPerGroup and GroupsPerDevice 1414 */ 1415 LaunchVals LV = 1416 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, 1417 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, 1418 NumTeams, // From run_region arg 1419 ThreadLimit, // From run_region arg 1420 LoopTripcount, // From run_region arg 1421 DeviceInfo().NumTeams[KernelInfo->DeviceId]); 1422 const int GridSize = LV.GridSize; 1423 const int WorkgroupSize = LV.WorkgroupSize; 1424 1425 if (print_kernel_trace >= LAUNCH) { 1426 int NumGroups = GridSize / WorkgroupSize; 1427 // enum modes are SPMD, GENERIC, NONE 0,1,2 1428 // if doing rtl timing, print to stderr, unless stdout requested. 1429 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); 1430 fprintf(TraceToStdout ? stdout : stderr, 1431 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " 1432 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " 1433 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", 1434 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, 1435 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, 1436 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, 1437 VgprSpillCount, LoopTripcount, KernelInfo->Name); 1438 } 1439 1440 // Run on the device. 
1441 { 1442 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); 1443 if (!Queue) { 1444 return OFFLOAD_FAIL; 1445 } 1446 uint64_t PacketId = acquireAvailablePacketId(Queue); 1447 1448 const uint32_t Mask = Queue->size - 1; // size is a power of 2 1449 hsa_kernel_dispatch_packet_t *Packet = 1450 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); 1451 1452 // packet->header is written last 1453 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; 1454 Packet->workgroup_size_x = WorkgroupSize; 1455 Packet->workgroup_size_y = 1; 1456 Packet->workgroup_size_z = 1; 1457 Packet->reserved0 = 0; 1458 Packet->grid_size_x = GridSize; 1459 Packet->grid_size_y = 1; 1460 Packet->grid_size_z = 1; 1461 Packet->private_segment_size = KernelInfoEntry.private_segment_size; 1462 Packet->group_segment_size = GroupSegmentSize; 1463 Packet->kernel_object = KernelInfoEntry.kernel_object; 1464 Packet->kernarg_address = 0; // use the block allocator 1465 Packet->reserved2 = 0; // impl writes id_ here 1466 Packet->completion_signal = {0}; // may want a pool of signals 1467 1468 KernelArgPool *ArgPool = nullptr; 1469 void *KernArg = nullptr; 1470 { 1471 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); 1472 if (It != KernelArgPoolMap.end()) { 1473 ArgPool = (It->second).get(); 1474 } 1475 } 1476 if (!ArgPool) { 1477 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, 1478 DeviceId); 1479 } 1480 { 1481 if (ArgPool) { 1482 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); 1483 KernArg = ArgPool->allocate(ArgNum); 1484 } 1485 if (!KernArg) { 1486 DP("Allocate kernarg failed\n"); 1487 return OFFLOAD_FAIL; 1488 } 1489 1490 // Copy explicit arguments 1491 for (int I = 0; I < ArgNum; I++) { 1492 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); 1493 } 1494 1495 // Initialize implicit arguments. TODO: Which of these can be dropped 1496 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>( 1497 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize); 1498 memset(ImplArgs, 0, 1499 sizeof(impl_implicit_args_t)); // may not be necessary 1500 ImplArgs->offset_x = 0; 1501 ImplArgs->offset_y = 0; 1502 ImplArgs->offset_z = 0; 1503 1504 // assign a hostcall buffer for the selected Q 1505 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) { 1506 // hostrpc_assign_buffer is not thread safe, and this function is 1507 // under a multiple reader lock, not a writer lock. 1508 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; 1509 pthread_mutex_lock(&HostcallInitLock); 1510 uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId], 1511 Queue, DeviceId); 1512 pthread_mutex_unlock(&HostcallInitLock); 1513 if (!Buffer) { 1514 DP("hostrpc_assign_buffer failed, gpu would dereference null and " 1515 "error\n"); 1516 return OFFLOAD_FAIL; 1517 } 1518 1519 DP("Implicit argument count: %d\n", 1520 KernelInfoEntry.implicit_argument_count); 1521 if (KernelInfoEntry.implicit_argument_count >= 4) { 1522 // Initialise pointer for implicit_argument_count != 0 ABI 1523 // Guess that the right implicit argument is at offset 24 after 1524 // the explicit arguments. In the future, should be able to read 1525 // the offset from msgpack. Clang is not annotating it at present. 
1526 uint64_t Offset = 1527 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); 1528 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { 1529 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " 1530 "args: %d\n", 1531 Offset + 8, ArgPool->kernargSizeIncludingImplicit()); 1532 } else { 1533 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); 1534 } 1535 } 1536 1537 // initialise pointer for implicit_argument_count == 0 ABI 1538 ImplArgs->hostcall_ptr = Buffer; 1539 } 1540 1541 Packet->kernarg_address = KernArg; 1542 } 1543 1544 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); 1545 if (S.handle == 0) { 1546 DP("Failed to get signal instance\n"); 1547 return OFFLOAD_FAIL; 1548 } 1549 Packet->completion_signal = S; 1550 hsa_signal_store_relaxed(Packet->completion_signal, 1); 1551 1552 // Publish the packet indicating it is ready to be processed 1553 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet), 1554 core::createHeader(), Packet->setup); 1555 1556 // Since the packet is already published, its contents must not be 1557 // accessed any more 1558 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); 1559 1560 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, 1561 HSA_WAIT_STATE_BLOCKED) != 0) 1562 ; 1563 1564 assert(ArgPool); 1565 ArgPool->deallocate(KernArg); 1566 DeviceInfo().FreeSignalPool.push(S); 1567 } 1568 1569 DP("Kernel completed\n"); 1570 return OFFLOAD_SUCCESS; 1571 } 1572 1573 bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { 1574 const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h 1575 int32_t R = elf_check_machine(Image, AmdgcnMachineID); 1576 if (!R) { 1577 DP("Supported machine ID not found\n"); 1578 } 1579 return R; 1580 } 1581 1582 uint32_t elfEFlags(__tgt_device_image *Image) { 1583 char *ImgBegin = (char *)Image->ImageStart; 1584 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; 1585 1586 Elf *E = elf_memory(ImgBegin, ImgSize); 1587 if (!E) { 1588 DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); 1589 return 0; 1590 } 1591 1592 Elf64_Ehdr *Eh64 = elf64_getehdr(E); 1593 1594 if (!Eh64) { 1595 DP("Unable to get machine ID from ELF file!\n"); 1596 elf_end(E); 1597 return 0; 1598 } 1599 1600 uint32_t Flags = Eh64->e_flags; 1601 1602 elf_end(E); 1603 DP("ELF Flags: 0x%x\n", Flags); 1604 return Flags; 1605 } 1606 1607 template <typename T> bool enforceUpperBound(T *Value, T Upper) { 1608 bool Changed = *Value > Upper; 1609 if (Changed) { 1610 *Value = Upper; 1611 } 1612 return Changed; 1613 } 1614 1615 Elf64_Shdr *findOnlyShtHash(Elf *Elf) { 1616 size_t N; 1617 int Rc = elf_getshdrnum(Elf, &N); 1618 if (Rc != 0) { 1619 return nullptr; 1620 } 1621 1622 Elf64_Shdr *Result = nullptr; 1623 for (size_t I = 0; I < N; I++) { 1624 Elf_Scn *Scn = elf_getscn(Elf, I); 1625 if (Scn) { 1626 Elf64_Shdr *Shdr = elf64_getshdr(Scn); 1627 if (Shdr) { 1628 if (Shdr->sh_type == SHT_HASH) { 1629 if (Result == nullptr) { 1630 Result = Shdr; 1631 } else { 1632 // multiple SHT_HASH sections not handled 1633 return nullptr; 1634 } 1635 } 1636 } 1637 } 1638 } 1639 return Result; 1640 } 1641 1642 const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash, 1643 const char *Symname) { 1644 1645 assert(SectionHash); 1646 size_t SectionSymtabIndex = SectionHash->sh_link; 1647 Elf64_Shdr *SectionSymtab = 1648 elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex)); 1649 size_t SectionStrtabIndex = SectionSymtab->sh_link; 1650 1651 const Elf64_Sym *Symtab = 1652 
reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset); 1653 1654 const uint32_t *Hashtab = 1655 reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset); 1656 1657 // Layout: 1658 // nbucket 1659 // nchain 1660 // bucket[nbucket] 1661 // chain[nchain] 1662 uint32_t Nbucket = Hashtab[0]; 1663 const uint32_t *Bucket = &Hashtab[2]; 1664 const uint32_t *Chain = &Hashtab[Nbucket + 2]; 1665 1666 const size_t Max = strlen(Symname) + 1; 1667 const uint32_t Hash = elf_hash(Symname); 1668 for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) { 1669 char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name); 1670 if (strncmp(Symname, N, Max) == 0) { 1671 return &Symtab[I]; 1672 } 1673 } 1674 1675 return nullptr; 1676 } 1677 1678 struct SymbolInfo { 1679 void *Addr = nullptr; 1680 uint32_t Size = UINT32_MAX; 1681 uint32_t ShType = SHT_NULL; 1682 }; 1683 1684 int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname, 1685 SymbolInfo *Res) { 1686 if (elf_kind(Elf) != ELF_K_ELF) { 1687 return 1; 1688 } 1689 1690 Elf64_Shdr *SectionHash = findOnlyShtHash(Elf); 1691 if (!SectionHash) { 1692 return 1; 1693 } 1694 1695 const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname); 1696 if (!Sym) { 1697 return 1; 1698 } 1699 1700 if (Sym->st_size > UINT32_MAX) { 1701 return 1; 1702 } 1703 1704 if (Sym->st_shndx == SHN_UNDEF) { 1705 return 1; 1706 } 1707 1708 Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx); 1709 if (!Section) { 1710 return 1; 1711 } 1712 1713 Elf64_Shdr *Header = elf64_getshdr(Section); 1714 if (!Header) { 1715 return 1; 1716 } 1717 1718 Res->Addr = Sym->st_value + Base; 1719 Res->Size = static_cast<uint32_t>(Sym->st_size); 1720 Res->ShType = Header->sh_type; 1721 return 0; 1722 } 1723 1724 int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname, 1725 SymbolInfo *Res) { 1726 Elf *Elf = elf_memory(Base, ImgSize); 1727 if (Elf) { 1728 int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res); 1729 elf_end(Elf); 1730 return Rc; 1731 } 1732 return 1; 1733 } 1734 1735 hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, 1736 const char *SymName, void **VarAddr, 1737 uint32_t *VarSize) { 1738 SymbolInfo SI; 1739 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); 1740 if (Rc == 0) { 1741 *VarAddr = SI.Addr; 1742 *VarSize = SI.Size; 1743 return HSA_STATUS_SUCCESS; 1744 } 1745 return HSA_STATUS_ERROR; 1746 } 1747 1748 template <typename C> 1749 hsa_status_t moduleRegisterFromMemoryToPlace( 1750 std::map<std::string, atl_kernel_info_t> &KernelInfoTable, 1751 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, 1752 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, 1753 std::vector<hsa_executable_t> &HSAExecutables) { 1754 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { 1755 C *Unwrapped = static_cast<C *>(CbState); 1756 return (*Unwrapped)(Data, Size); 1757 }; 1758 return core::RegisterModuleFromMemory( 1759 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, 1760 DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb), 1761 HSAExecutables); 1762 } 1763 1764 uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { 1765 uint64_t DeviceStateBytes = 0; 1766 { 1767 // If this is the deviceRTL, get the state variable size 1768 SymbolInfo SizeSi; 1769 int Rc = getSymbolInfoWithoutLoading( 1770 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); 1771 1772 if (Rc == 0) { 1773 if (SizeSi.Size != sizeof(uint64_t)) { 1774 
DP("Found device_State_size variable with wrong size\n"); 1775 return 0; 1776 } 1777 1778 // Read number of bytes directly from the elf 1779 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); 1780 } 1781 } 1782 return DeviceStateBytes; 1783 } 1784 1785 struct DeviceEnvironment { 1786 // initialise an DeviceEnvironmentTy in the deviceRTL 1787 // patches around differences in the deviceRTL between trunk, aomp, 1788 // rocmcc. Over time these differences will tend to zero and this class 1789 // simplified. 1790 // Symbol may be in .data or .bss, and may be missing fields, todo: 1791 // review aomp/trunk/rocm and simplify the following 1792 1793 // The symbol may also have been deadstripped because the device side 1794 // accessors were unused. 1795 1796 // If the symbol is in .data (aomp, rocm) it can be written directly. 1797 // If it is in .bss, we must wait for it to be allocated space on the 1798 // gpu (trunk) and initialize after loading. 1799 const char *sym() { return "omptarget_device_environment"; } 1800 1801 DeviceEnvironmentTy HostDeviceEnv; 1802 SymbolInfo SI; 1803 bool Valid = false; 1804 1805 __tgt_device_image *Image; 1806 const size_t ImgSize; 1807 1808 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, 1809 __tgt_device_image *Image, const size_t ImgSize) 1810 : Image(Image), ImgSize(ImgSize) { 1811 1812 HostDeviceEnv.NumDevices = NumberDevices; 1813 HostDeviceEnv.DeviceNum = DeviceId; 1814 HostDeviceEnv.DebugKind = 0; 1815 HostDeviceEnv.DynamicMemSize = DynamicMemSize; 1816 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 1817 HostDeviceEnv.DebugKind = std::stoi(EnvStr); 1818 1819 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, 1820 sym(), &SI); 1821 if (Rc != 0) { 1822 DP("Finding global device environment '%s' - symbol missing.\n", sym()); 1823 return; 1824 } 1825 1826 if (SI.Size > sizeof(HostDeviceEnv)) { 1827 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, 1828 sizeof(HostDeviceEnv)); 1829 return; 1830 } 1831 1832 Valid = true; 1833 } 1834 1835 bool inImage() { return SI.ShType != SHT_NOBITS; } 1836 1837 hsa_status_t beforeLoading(void *Data, size_t Size) { 1838 if (Valid) { 1839 if (inImage()) { 1840 DP("Setting global device environment before load (%u bytes)\n", 1841 SI.Size); 1842 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; 1843 void *Pos = (char *)Data + Offset; 1844 memcpy(Pos, &HostDeviceEnv, SI.Size); 1845 } 1846 } 1847 return HSA_STATUS_SUCCESS; 1848 } 1849 1850 hsa_status_t afterLoading() { 1851 if (Valid) { 1852 if (!inImage()) { 1853 DP("Setting global device environment after load (%u bytes)\n", 1854 SI.Size); 1855 int DeviceId = HostDeviceEnv.DeviceNum; 1856 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; 1857 void *StatePtr; 1858 uint32_t StatePtrSize; 1859 hsa_status_t Err = interop_hsa_get_symbol_info( 1860 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); 1861 if (Err != HSA_STATUS_SUCCESS) { 1862 DP("failed to find %s in loaded image\n", sym()); 1863 return Err; 1864 } 1865 1866 if (StatePtrSize != SI.Size) { 1867 DP("Symbol had size %u before loading, %u after\n", StatePtrSize, 1868 SI.Size); 1869 return HSA_STATUS_ERROR; 1870 } 1871 1872 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, 1873 StatePtrSize, DeviceId); 1874 } 1875 } 1876 return HSA_STATUS_SUCCESS; 1877 } 1878 }; 1879 1880 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { 1881 uint64_t Rounded = 4 * ((Size + 3) / 4); 1882 void 
*Ptr; 1883 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); 1884 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); 1885 if (Err != HSA_STATUS_SUCCESS) { 1886 return Err; 1887 } 1888 1889 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); 1890 if (Rc != HSA_STATUS_SUCCESS) { 1891 DP("zero fill device_state failed with %u\n", Rc); 1892 core::Runtime::Memfree(Ptr); 1893 return HSA_STATUS_ERROR; 1894 } 1895 1896 *RetPtr = Ptr; 1897 return HSA_STATUS_SUCCESS; 1898 } 1899 1900 bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { 1901 SymbolInfo SI; 1902 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); 1903 return (Rc == 0) && (SI.Addr != nullptr); 1904 } 1905 1906 } // namespace 1907 1908 namespace core { 1909 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { 1910 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), 1911 &DeviceInfo().HSAAgents[0], NULL, Ptr); 1912 } 1913 } // namespace core 1914 1915 static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { 1916 hsa_status_t err; 1917 uint32_t name_len; 1918 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); 1919 if (err != HSA_STATUS_SUCCESS) { 1920 DP("Error getting ISA info length\n"); 1921 return err; 1922 } 1923 1924 char TargetID[name_len]; 1925 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); 1926 if (err != HSA_STATUS_SUCCESS) { 1927 DP("Error getting ISA info name\n"); 1928 return err; 1929 } 1930 1931 auto TripleTargetID = llvm::StringRef(TargetID); 1932 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { 1933 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); 1934 } 1935 return HSA_STATUS_SUCCESS; 1936 } 1937 1938 /// Parse a TargetID to get processor arch and feature map. 1939 /// Returns processor subarch. 1940 /// Returns TargetID features in \p FeatureMap argument. 1941 /// If the \p TargetID contains feature+, FeatureMap it to true. 1942 /// If the \p TargetID contains feature-, FeatureMap it to false. 1943 /// If the \p TargetID does not contain a feature (default), do not map it. 1944 StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) { 1945 if (TargetID.empty()) 1946 return llvm::StringRef(); 1947 1948 auto ArchFeature = TargetID.split(":"); 1949 auto Arch = ArchFeature.first; 1950 auto Features = ArchFeature.second; 1951 if (Features.empty()) 1952 return Arch; 1953 1954 if (Features.contains("sramecc+")) { 1955 FeatureMap.insert(std::pair<std::string, bool>("sramecc", true)); 1956 } else if (Features.contains("sramecc-")) { 1957 FeatureMap.insert(std::pair<std::string, bool>("sramecc", false)); 1958 } 1959 if (Features.contains("xnack+")) { 1960 FeatureMap.insert(std::pair<std::string, bool>("xnack", true)); 1961 } else if (Features.contains("xnack-")) { 1962 FeatureMap.insert(std::pair<std::string, bool>("xnack", false)); 1963 } 1964 1965 return Arch; 1966 } 1967 1968 /// Checks if an image \p ImgInfo is compatible with current 1969 /// system's environment \p EnvInfo 1970 bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) { 1971 llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo); 1972 1973 // Compatible in case of exact match 1974 if (ImgTID == EnvTID) { 1975 DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n", 1976 ImgTID.data(), EnvTID.data()); 1977 return true; 1978 } 1979 1980 // Incompatible if Archs mismatch. 
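// Worked example (illustrative target IDs): an image "gfx90a:xnack+" checked against an environment "gfx90a:xnack+:sramecc-" passes all of the tests below (same arch, fewer image features, and the one feature the image does set agrees), whereas "gfx90a:xnack-" against the same environment is rejected in the feature loop.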
1981 StringMap<bool> ImgMap, EnvMap; 1982 StringRef ImgArch = parseTargetID(ImgTID, ImgMap); 1983 StringRef EnvArch = parseTargetID(EnvTID, EnvMap); 1984 1985 // Both EnvArch and ImgArch can't be empty here. 1986 if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) { 1987 DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n", 1988 ImgTID.data(), EnvTID.data()); 1989 return false; 1990 } 1991 1992 // Incompatible if image has more features than the environment, irrespective 1993 // of type or sign of features. 1994 if (ImgMap.size() > EnvMap.size()) { 1995 DP("Incompatible: Image has more features than the environment \t[Image: " 1996 "%s]\t:\t[Environment: %s]\n", 1997 ImgTID.data(), EnvTID.data()); 1998 return false; 1999 } 2000 2001 // Compatible if each target feature specified by the environment is 2002 // compatible with a target feature of the image. The target feature is 2003 // compatible if the image does not specify it (meaning Any), or if it 2004 // specifies it with the same value (meaning On or Off). 2005 for (const auto &ImgFeature : ImgMap) { 2006 auto EnvFeature = EnvMap.find(ImgFeature.first()); 2007 if (EnvFeature == EnvMap.end()) { 2008 DP("Incompatible: Value of Image's non-ANY feature is not matching with " 2009 "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: " 2010 "%s]\n", 2011 ImgTID.data(), EnvTID.data()); 2012 return false; 2013 } else if (EnvFeature->first() == ImgFeature.first() && 2014 EnvFeature->second != ImgFeature.second) { 2015 DP("Incompatible: Value of Image's non-ANY feature is not matching with " 2016 "the Environment feature's non-ANY value \t[Image: " 2017 "%s]\t:\t[Environment: %s]\n", 2018 ImgTID.data(), EnvTID.data()); 2019 return false; 2020 } 2021 } 2022 2023 // Image is compatible if all features of Environment are: 2024 // - either, present in the Image's features map with the same sign, 2025 // - or, the feature is missing from Image's features map i.e. it is 2026 // set to ANY 2027 DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: " 2028 "%s]\n", 2029 ImgTID.data(), EnvTID.data()); 2030 return true; 2031 } 2032 2033 extern "C" { 2034 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { 2035 return elfMachineIdIsAmdgcn(Image); 2036 } 2037 2038 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, 2039 __tgt_image_info *info) { 2040 if (!__tgt_rtl_is_valid_binary(image)) 2041 return false; 2042 2043 // A subarchitecture was not specified. Assume it is compatible. 2044 if (!info->Arch) 2045 return true; 2046 2047 int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); 2048 2049 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { 2050 __tgt_rtl_init_device(DeviceId); 2051 hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId]; 2052 hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); 2053 if (err != HSA_STATUS_SUCCESS) { 2054 DP("Error iterating ISAs\n"); 2055 return false; 2056 } 2057 if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId])) 2058 return false; 2059 } 2060 DP("Image has Target ID compatible with the current environment: %s\n", 2061 info->Arch); 2062 return true; 2063 } 2064 2065 int __tgt_rtl_number_of_devices() { 2066 // If the construction failed, no methods are safe to call 2067 if (DeviceInfo().ConstructionSucceeded) { 2068 return DeviceInfo().NumberOfDevices; 2069 } 2070 DP("AMDGPU plugin construction failed.
Zero devices available\n"); 2071 return 0; 2072 } 2073 2074 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 2075 DP("Init requires flags to %ld\n", RequiresFlags); 2076 DeviceInfo().RequiresFlags = RequiresFlags; 2077 return RequiresFlags; 2078 } 2079 2080 int32_t __tgt_rtl_init_device(int DeviceId) { 2081 hsa_status_t Err = hsa_init(); 2082 if (Err != HSA_STATUS_SUCCESS) { 2083 DP("HSA Initialization Failed.\n"); 2084 return HSA_STATUS_ERROR; 2085 } 2086 // this is per device id init 2087 DP("Initialize the device id: %d\n", DeviceId); 2088 2089 hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; 2090 2091 // Get number of Compute Units 2092 uint32_t ComputeUnits = 0; 2093 Err = hsa_agent_get_info( 2094 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, 2095 &ComputeUnits); 2096 if (Err != HSA_STATUS_SUCCESS) { 2097 DeviceInfo().ComputeUnits[DeviceId] = 1; 2098 DP("Error getting compute units : setting to 1\n"); 2099 } else { 2100 DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits; 2101 DP("Using %d compute units per grid\n", DeviceInfo().ComputeUnits[DeviceId]); 2102 } 2103 2104 char GetInfoName[64]; // 64 max size returned by get info 2105 Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, 2106 (void *)GetInfoName); 2107 if (Err) 2108 DeviceInfo().GPUName[DeviceId] = "--unknown gpu--"; 2109 else { 2110 DeviceInfo().GPUName[DeviceId] = GetInfoName; 2111 } 2112 2113 if (print_kernel_trace & STARTUP_DETAILS) 2114 DP("Device#%-2d CU's: %2d %s\n", DeviceId, 2115 DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str()); 2116 2117 // Query attributes to determine number of threads/block and blocks/grid. 2118 uint16_t WorkgroupMaxDim[3]; 2119 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 2120 &WorkgroupMaxDim); 2121 if (Err != HSA_STATUS_SUCCESS) { 2122 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; 2123 DP("Error getting grid dims: num groups : %d\n", 2124 RTLDeviceInfoTy::DefaultNumTeams); 2125 } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { 2126 DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; 2127 DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]); 2128 } else { 2129 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; 2130 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " 2131 "at the hard limit\n", 2132 WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit); 2133 } 2134 2135 // Get thread limit 2136 hsa_dim3_t GridMaxDim; 2137 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); 2138 if (Err == HSA_STATUS_SUCCESS) { 2139 DeviceInfo().ThreadsPerGroup[DeviceId] = 2140 reinterpret_cast<uint32_t *>(&GridMaxDim)[0] / 2141 DeviceInfo().GroupsPerDevice[DeviceId]; 2142 2143 if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) { 2144 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; 2145 DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); 2146 } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId], 2147 RTLDeviceInfoTy::MaxWgSize)) { 2148 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); 2149 } else { 2150 DP("Using ROCm Queried thread limit: %d\n", 2151 DeviceInfo().ThreadsPerGroup[DeviceId]); 2152 } 2153 } else { 2154 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; 2155 DP("Error getting max block dimension, use default:%d \n", 2156 RTLDeviceInfoTy::MaxWgSize); 2157 } 2158 2159 // Get wavefront
size 2160 uint32_t WavefrontSize = 0; 2161 Err = 2162 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); 2163 if (Err == HSA_STATUS_SUCCESS) { 2164 DP("Queried wavefront size: %d\n", WavefrontSize); 2165 DeviceInfo().WarpSize[DeviceId] = WavefrontSize; 2166 } else { 2167 // TODO: Burn the wavefront size into the code object 2168 DP("Warning: Unknown wavefront size, assuming 64\n"); 2169 DeviceInfo().WarpSize[DeviceId] = 64; 2170 } 2171 2172 // Adjust teams to the env variables 2173 2174 if (DeviceInfo().Env.TeamLimit > 0 && 2175 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], 2176 DeviceInfo().Env.TeamLimit))) { 2177 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", 2178 DeviceInfo().Env.TeamLimit); 2179 } 2180 2181 // Set default number of teams 2182 if (DeviceInfo().Env.NumTeams > 0) { 2183 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; 2184 DP("Default number of teams set according to environment %d\n", 2185 DeviceInfo().Env.NumTeams); 2186 } else { 2187 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); 2188 int TeamsPerCU = DefaultTeamsPerCU; 2189 if (TeamsPerCUEnvStr) { 2190 TeamsPerCU = std::stoi(TeamsPerCUEnvStr); 2191 } 2192 2193 DeviceInfo().NumTeams[DeviceId] = 2194 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; 2195 DP("Default number of teams = %d * number of compute units %d\n", 2196 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]); 2197 } 2198 2199 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], 2200 DeviceInfo().GroupsPerDevice[DeviceId])) { 2201 DP("Default number of teams exceeds device limit, capping at %d\n", 2202 DeviceInfo().GroupsPerDevice[DeviceId]); 2203 } 2204 2205 // Adjust threads to the env variables 2206 if (DeviceInfo().Env.TeamThreadLimit > 0 && 2207 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], 2208 DeviceInfo().Env.TeamThreadLimit))) { 2209 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", 2210 DeviceInfo().Env.TeamThreadLimit); 2211 } 2212 2213 // Set default number of threads 2214 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; 2215 DP("Default number of threads set according to library's default %d\n", 2216 RTLDeviceInfoTy::DefaultWgSize); 2217 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], 2218 DeviceInfo().ThreadsPerGroup[DeviceId])) { 2219 DP("Default number of threads exceeds device limit, capping at %d\n", 2220 DeviceInfo().ThreadsPerGroup[DeviceId]); 2221 } 2222 2223 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", 2224 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId], 2225 DeviceInfo().ThreadsPerGroup[DeviceId]); 2226 2227 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, 2228 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId], 2229 DeviceInfo().GroupsPerDevice[DeviceId], 2230 DeviceInfo().GroupsPerDevice[DeviceId] * 2231 DeviceInfo().ThreadsPerGroup[DeviceId]); 2232 2233 return OFFLOAD_SUCCESS; 2234 } 2235 2236 static __tgt_target_table * 2237 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); 2238 2239 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, 2240 __tgt_device_image *Image) { 2241 DeviceInfo().LoadRunLock.lock(); 2242 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); 2243 DeviceInfo().LoadRunLock.unlock(); 2244 return Res; 2245 } 2246 2247 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, 2248 __tgt_device_image *Image) { 2249 // This function loads the 
device image onto gpu[DeviceId] and does other 2250 // per-image initialization work. Specifically: 2251 // 2252 // - Initialize a DeviceEnvironmentTy instance embedded in the 2253 // image at the symbol "omptarget_device_environment" 2254 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. 2255 // 2256 // - Allocate a large array per-gpu (could be moved to init_device) 2257 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size 2258 // - Allocate at least that many bytes of gpu memory 2259 // - Zero initialize it 2260 // - Write the pointer to the symbol omptarget_nvptx_device_State 2261 // 2262 // - Pull some per-kernel information together from various sources and 2263 // record it in the KernelsList for quicker access later 2264 // 2265 // The initialization can be done before or after loading the image onto the 2266 // gpu. This function presently does a mixture. Using the hsa api to get/set 2267 // the information is simpler to implement, in exchange for more complicated 2268 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes 2269 // back from the gpu vs a hashtable lookup on the host. 2270 2271 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; 2272 2273 DeviceInfo().clearOffloadEntriesTable(DeviceId); 2274 2275 // We do not need to set the ELF version because the caller of this function 2276 // had to do that to decide the right runtime to use 2277 2278 if (!elfMachineIdIsAmdgcn(Image)) 2279 return NULL; 2280 2281 { 2282 auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, 2283 DeviceInfo().Env.DynamicMemSize, Image, ImgSize); 2284 2285 auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; 2286 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; 2287 hsa_status_t Err = moduleRegisterFromMemoryToPlace( 2288 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, 2289 [&](void *Data, size_t Size) { 2290 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { 2291 __atomic_store_n(&DeviceInfo().HostcallRequired, true, 2292 __ATOMIC_RELEASE); 2293 } 2294 return Env.beforeLoading(Data, Size); 2295 }, 2296 DeviceInfo().HSAExecutables); 2297 2298 check("Module registering", Err); 2299 if (Err != HSA_STATUS_SUCCESS) { 2300 const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str(); 2301 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); 2302 2303 if (strcmp(DeviceName, ElfName) != 0) { 2304 DP("Possible gpu arch mismatch: device:%s, image:%s please check" 2305 " compiler flag: -march=<gpu>\n", 2306 DeviceName, ElfName); 2307 } else { 2308 DP("Error loading image onto GPU: %s\n", get_error_string(Err)); 2309 } 2310 2311 return NULL; 2312 } 2313 2314 Err = Env.afterLoading(); 2315 if (Err != HSA_STATUS_SUCCESS) { 2316 return NULL; 2317 } 2318 } 2319 2320 DP("AMDGPU module successfully loaded!\n"); 2321 2322 { 2323 // the device_State array is either a large value in bss or a void* that 2324 // needs to be assigned to a pointer to an array of size device_state_bytes 2325 // If absent, it has been deadstripped and needs no setup.
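// Illustrative sketch (the declarations here are examples, not taken from any particular deviceRTL): the image may provide either "char omptarget_nvptx_device_State[<large>];" (bss array, no setup needed) or "void *omptarget_nvptx_device_State;" (an 8-byte pointer). In the pointer case the code below reads the byte count from omptarget_nvptx_device_State_size, zero-allocates that much device memory via implCalloc, and writes the resulting device pointer into the symbol with freesignalpoolMemcpyH2D.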
2326 2327 void *StatePtr; 2328 uint32_t StatePtrSize; 2329 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; 2330 hsa_status_t Err = interop_hsa_get_symbol_info( 2331 SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, 2332 &StatePtrSize); 2333 2334 if (Err != HSA_STATUS_SUCCESS) { 2335 DP("No device_state symbol found, skipping initialization\n"); 2336 } else { 2337 if (StatePtrSize < sizeof(void *)) { 2338 DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize, 2339 sizeof(void *)); 2340 return NULL; 2341 } 2342 2343 // if it's larger than a void*, assume it's a bss array and no further 2344 // initialization is required. Only try to set up a pointer for 2345 // sizeof(void*) 2346 if (StatePtrSize == sizeof(void *)) { 2347 uint64_t DeviceStateBytes = 2348 getDeviceStateBytes((char *)Image->ImageStart, ImgSize); 2349 if (DeviceStateBytes == 0) { 2350 DP("Can't initialize device_State, missing size information\n"); 2351 return NULL; 2352 } 2353 2354 auto &DSS = DeviceInfo().DeviceStateStore[DeviceId]; 2355 if (DSS.first.get() == nullptr) { 2356 assert(DSS.second == 0); 2357 void *Ptr = NULL; 2358 hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); 2359 if (Err != HSA_STATUS_SUCCESS) { 2360 DP("Failed to allocate device_state array\n"); 2361 return NULL; 2362 } 2363 DSS = { 2364 std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr}, 2365 DeviceStateBytes, 2366 }; 2367 } 2368 2369 void *Ptr = DSS.first.get(); 2370 if (DeviceStateBytes != DSS.second) { 2371 DP("Inconsistent sizes of device_State unsupported\n"); 2372 return NULL; 2373 } 2374 2375 // write ptr to device memory so it can be used by later kernels 2376 Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *), 2377 DeviceId); 2378 if (Err != HSA_STATUS_SUCCESS) { 2379 DP("memcpy install of state_ptr failed\n"); 2380 return NULL; 2381 } 2382 } 2383 } 2384 } 2385 2386 // Here, we take advantage of the data that is appended after img_end to get 2387 // the symbols' names we need to load. This data consists of the host entries 2388 // begin and end as well as the target name (see the offloading linker script 2389 // creation in the clang compiler). 2390 2391 // Find the symbols in the module by name. The name can be obtained by 2392 // concatenating the host entry name with the target name 2393 2394 __tgt_offload_entry *HostBegin = Image->EntriesBegin; 2395 __tgt_offload_entry *HostEnd = Image->EntriesEnd; 2396 2397 for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 2398 2399 if (!E->addr) { 2400 // The host should always have something in the address to 2401 // uniquely identify the target region.
2402 DP("Analyzing host entry '<null>' (size = %lld)...\n", 2403 (unsigned long long)E->size); 2404 return NULL; 2405 } 2406 2407 if (E->size) { 2408 __tgt_offload_entry Entry = *E; 2409 2410 void *Varptr; 2411 uint32_t Varsize; 2412 2413 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; 2414 hsa_status_t Err = interop_hsa_get_symbol_info( 2415 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); 2416 2417 if (Err != HSA_STATUS_SUCCESS) { 2418 // Inform the user what symbol prevented offloading 2419 DP("Loading global '%s' (Failed)\n", E->name); 2420 return NULL; 2421 } 2422 2423 if (Varsize != E->size) { 2424 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, 2425 Varsize, E->size); 2426 return NULL; 2427 } 2428 2429 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 2430 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); 2431 Entry.addr = (void *)Varptr; 2432 2433 DeviceInfo().addOffloadEntry(DeviceId, Entry); 2434 2435 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 2436 E->flags & OMP_DECLARE_TARGET_LINK) { 2437 // If unified memory is present any target link variables 2438 // can access host addresses directly. There is no longer a 2439 // need for device copies. 2440 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, 2441 sizeof(void *), DeviceId); 2442 if (Err != HSA_STATUS_SUCCESS) 2443 DP("Error when copying USM\n"); 2444 DP("Copy linked variable host address (" DPxMOD ")" 2445 "to device address (" DPxMOD ")\n", 2446 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); 2447 } 2448 2449 continue; 2450 } 2451 2452 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); 2453 2454 // errors in kernarg_segment_size previously treated as = 0 (or as undef) 2455 uint32_t KernargSegmentSize = 0; 2456 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; 2457 hsa_status_t Err = HSA_STATUS_SUCCESS; 2458 if (!E->name) { 2459 Err = HSA_STATUS_ERROR; 2460 } else { 2461 std::string KernelStr = std::string(E->name); 2462 auto It = KernelInfoMap.find(KernelStr); 2463 if (It != KernelInfoMap.end()) { 2464 atl_kernel_info_t Info = It->second; 2465 KernargSegmentSize = Info.kernel_segment_size; 2466 } else { 2467 Err = HSA_STATUS_ERROR; 2468 } 2469 } 2470 2471 // default value GENERIC (in case symbol is missing from cubin file) 2472 llvm::omp::OMPTgtExecModeFlags ExecModeVal = 2473 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; 2474 2475 // get flat group size if present, else Default_WG_Size 2476 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2477 2478 // get Kernel Descriptor if present. 
2479 // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp 2480 struct KernDescValType { 2481 uint16_t Version; 2482 uint16_t TSize; 2483 uint16_t WGSize; 2484 }; 2485 struct KernDescValType KernDescVal; 2486 std::string KernDescNameStr(E->name); 2487 KernDescNameStr += "_kern_desc"; 2488 const char *KernDescName = KernDescNameStr.c_str(); 2489 2490 void *KernDescPtr; 2491 uint32_t KernDescSize; 2492 void *CallStackAddr = nullptr; 2493 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, 2494 &KernDescPtr, &KernDescSize); 2495 2496 if (Err == HSA_STATUS_SUCCESS) { 2497 if ((size_t)KernDescSize != sizeof(KernDescVal)) 2498 DP("Loading global computation properties '%s' - size mismatch (%u != " 2499 "%lu)\n", 2500 KernDescName, KernDescSize, sizeof(KernDescVal)); 2501 2502 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); 2503 2504 // Check structure size against recorded size. 2505 if ((size_t)KernDescSize != KernDescVal.TSize) 2506 DP("KernDescVal size %lu does not match advertised size %d for '%s'\n", 2507 sizeof(KernDescVal), KernDescVal.TSize, KernDescName); 2508 2509 DP("After loading global for %s KernDesc \n", KernDescName); 2510 DP("KernDesc: Version: %d\n", KernDescVal.Version); 2511 DP("KernDesc: TSize: %d\n", KernDescVal.TSize); 2512 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); 2513 2514 if (KernDescVal.WGSize == 0) { 2515 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; 2516 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); 2517 } 2518 WGSizeVal = KernDescVal.WGSize; 2519 DP("WGSizeVal %d\n", WGSizeVal); 2520 check("Loading KernDesc computation property", Err); 2521 } else { 2522 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); 2523 2524 // Flat group size 2525 std::string WGSizeNameStr(E->name); 2526 WGSizeNameStr += "_wg_size"; 2527 const char *WGSizeName = WGSizeNameStr.c_str(); 2528 2529 void *WGSizePtr; 2530 uint32_t WGSize; 2531 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, 2532 &WGSizePtr, &WGSize); 2533 2534 if (Err == HSA_STATUS_SUCCESS) { 2535 if ((size_t)WGSize != sizeof(int16_t)) { 2536 DP("Loading global computation properties '%s' - size mismatch (%u " 2537 "!= " 2538 "%lu)\n", 2539 WGSizeName, WGSize, sizeof(int16_t)); 2540 return NULL; 2541 } 2542 2543 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); 2544 2545 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); 2546 2547 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || 2548 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { 2549 DP("Error wrong WGSize value specified in HSA code object file: " 2550 "%d\n", 2551 WGSizeVal); 2552 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2553 } 2554 } else { 2555 DP("Warning: Loading WGSize '%s' - symbol not found, " 2556 "using default value %d\n", 2557 WGSizeName, WGSizeVal); 2558 } 2559 2560 check("Loading WGSize computation property", Err); 2561 } 2562 2563 // Read execution mode from global in binary 2564 std::string ExecModeNameStr(E->name); 2565 ExecModeNameStr += "_exec_mode"; 2566 const char *ExecModeName = ExecModeNameStr.c_str(); 2567 2568 void *ExecModePtr; 2569 uint32_t VarSize; 2570 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, 2571 &ExecModePtr, &VarSize); 2572 2573 if (Err == HSA_STATUS_SUCCESS) { 2574 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 2575 DP("Loading global computation properties '%s' - size mismatch(%u != " 2576 "%lu)\n", 2577 ExecModeName, VarSize,
sizeof(llvm::omp::OMPTgtExecModeFlags)); 2578 return NULL; 2579 } 2580 2581 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); 2582 2583 DP("After loading global for %s ExecMode = %d\n", ExecModeName, 2584 ExecModeVal); 2585 2586 if (ExecModeVal < 0 || 2587 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { 2588 DP("Error wrong exec_mode value specified in HSA code object file: " 2589 "%d\n", 2590 ExecModeVal); 2591 return NULL; 2592 } 2593 } else { 2594 DP("Loading global exec_mode '%s' - symbol missing, using default " 2595 "value " 2596 "GENERIC (1)\n", 2597 ExecModeName); 2598 } 2599 check("Loading computation property", Err); 2600 2601 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, 2602 CallStackAddr, E->name, KernargSegmentSize, 2603 DeviceInfo().KernArgPool)); 2604 __tgt_offload_entry Entry = *E; 2605 Entry.addr = (void *)&KernelsList.back(); 2606 DeviceInfo().addOffloadEntry(DeviceId, Entry); 2607 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); 2608 } 2609 2610 return DeviceInfo().getOffloadEntriesTable(DeviceId); 2611 } 2612 2613 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { 2614 void *Ptr = NULL; 2615 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2616 2617 if (Kind != TARGET_ALLOC_DEFAULT) { 2618 REPORT("Invalid target data allocation kind or requested allocator not " 2619 "implemented yet\n"); 2620 return NULL; 2621 } 2622 2623 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); 2624 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); 2625 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, 2626 (long long unsigned)(Elf64_Addr)Ptr); 2627 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; 2628 return Ptr; 2629 } 2630 2631 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, 2632 int64_t Size) { 2633 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2634 __tgt_async_info AsyncInfo; 2635 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); 2636 if (Rc != OFFLOAD_SUCCESS) 2637 return OFFLOAD_FAIL; 2638 2639 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2640 } 2641 2642 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, 2643 int64_t Size, __tgt_async_info *AsyncInfo) { 2644 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2645 if (AsyncInfo) { 2646 initAsyncInfo(AsyncInfo); 2647 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); 2648 } 2649 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); 2650 } 2651 2652 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, 2653 int64_t Size) { 2654 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2655 __tgt_async_info AsyncInfo; 2656 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); 2657 if (Rc != OFFLOAD_SUCCESS) 2658 return OFFLOAD_FAIL; 2659 2660 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2661 } 2662 2663 int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, 2664 int64_t Size, 2665 __tgt_async_info *AsyncInfo) { 2666 assert(AsyncInfo && "AsyncInfo is nullptr"); 2667 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2668 initAsyncInfo(AsyncInfo); 2669 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); 2670 } 2671 2672 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) { 2673 assert(DeviceId < 
DeviceInfo().NumberOfDevices && "Device ID too large"); 2674 hsa_status_t Err; 2675 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); 2676 Err = core::Runtime::Memfree(TgtPtr); 2677 if (Err != HSA_STATUS_SUCCESS) { 2678 DP("Error when freeing device memory\n"); 2679 return OFFLOAD_FAIL; 2680 } 2681 return OFFLOAD_SUCCESS; 2682 } 2683 2684 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, 2685 void **TgtArgs, ptrdiff_t *TgtOffsets, 2686 int32_t ArgNum, int32_t NumTeams, 2687 int32_t ThreadLimit, 2688 uint64_t LoopTripcount) { 2689 2690 DeviceInfo().LoadRunLock.lock_shared(); 2691 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, 2692 ArgNum, NumTeams, ThreadLimit, LoopTripcount); 2693 2694 DeviceInfo().LoadRunLock.unlock_shared(); 2695 return Res; 2696 } 2697 2698 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, 2699 void **TgtArgs, ptrdiff_t *TgtOffsets, 2700 int32_t ArgNum) { 2701 // use one team and one thread 2702 // fix thread num 2703 int32_t TeamNum = 1; 2704 int32_t ThreadLimit = 0; // use default 2705 return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs, 2706 TgtOffsets, ArgNum, TeamNum, 2707 ThreadLimit, 0); 2708 } 2709 2710 int32_t __tgt_rtl_run_target_team_region_async( 2711 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, 2712 int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, 2713 uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) { 2714 assert(AsyncInfo && "AsyncInfo is nullptr"); 2715 initAsyncInfo(AsyncInfo); 2716 2717 DeviceInfo().LoadRunLock.lock_shared(); 2718 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, 2719 ArgNum, NumTeams, ThreadLimit, LoopTripcount); 2720 2721 DeviceInfo().LoadRunLock.unlock_shared(); 2722 return Res; 2723 } 2724 2725 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, 2726 void **TgtArgs, ptrdiff_t *TgtOffsets, 2727 int32_t ArgNum, 2728 __tgt_async_info *AsyncInfo) { 2729 // use one team and one thread 2730 // fix thread num 2731 int32_t TeamNum = 1; 2732 int32_t ThreadLimit = 0; // use default 2733 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs, 2734 TgtOffsets, ArgNum, TeamNum, 2735 ThreadLimit, 0, AsyncInfo); 2736 } 2737 2738 int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { 2739 assert(AsyncInfo && "AsyncInfo is nullptr"); 2740 2741 // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this invariant 2742 // is not ensured by devices.cpp for amdgcn 2743 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); 2744 if (AsyncInfo->Queue) { 2745 finiAsyncInfo(AsyncInfo); 2746 } 2747 return OFFLOAD_SUCCESS; 2748 } 2749 2750 void __tgt_rtl_print_device_info(int32_t DeviceId) { 2751 // TODO: Assertion to see if DeviceId is correct 2752 // NOTE: We don't need to set context for print device info. 2753 2754 DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); 2755 } 2756 2757 } // extern "C" 2758
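// The sketch below is illustrative only: a rough outline of how libomptarget
// is expected to drive the entry points above for a single device. The image,
// entry table contents and data sizes are placeholders rather than real
// values, so the block is kept out of the build.
#if 0
static void exampleOffloadSequence(__tgt_device_image *Image) {
  if (!__tgt_rtl_is_valid_binary(Image) || __tgt_rtl_number_of_devices() == 0)
    return;
  if (__tgt_rtl_init_device(/*DeviceId=*/0) != OFFLOAD_SUCCESS)
    return;
  __tgt_target_table *Table = __tgt_rtl_load_binary(/*DeviceId=*/0, Image);
  if (!Table)
    return;

  // Move one int64_t to the device, launch the first entry (assumed here to
  // be a kernel taking a single pointer argument) with the default
  // team/thread configuration, then copy the value back and release the
  // device allocation.
  int64_t HostVal = 42, HostOut = 0;
  void *TgtPtr = __tgt_rtl_data_alloc(0, sizeof(HostVal), nullptr,
                                      TARGET_ALLOC_DEFAULT);
  __tgt_rtl_data_submit(0, TgtPtr, &HostVal, sizeof(HostVal));

  void *Args[] = {TgtPtr};
  ptrdiff_t Offsets[] = {0};
  __tgt_rtl_run_target_region(0, Table->EntriesBegin->addr, Args, Offsets,
                              /*ArgNum=*/1);

  __tgt_rtl_data_retrieve(0, &HostOut, TgtPtr, sizeof(HostOut));
  __tgt_rtl_data_delete(0, TgtPtr);
}
#endif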