//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for AMD HSA machines
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <libelf.h>
#include <list>
#include <memory>
#include <mutex>
#include <queue>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

#include "impl_runtime.h"
#include "interop_hsa.h"

#include "internal.h"
#include "rt.h"

#include "DeviceEnvironment.h"
#include "get_elf_mach_gfx_name.h"
#include "omptargetplugin.h"
#include "print_tracing.h"

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"

using namespace llvm;

// hostrpc interface. FIXME: consider moving it to its own include. These
// symbols are statically linked into the amdgpu plugin, if present, from
// hostrpc_services.a, linked as --whole-archive to override the weak symbols
// that implement a fallback for toolchains that do not yet have a hostrpc
// library.
extern "C" {
uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
                               uint32_t DeviceId);
hsa_status_t hostrpc_init();
hsa_status_t hostrpc_terminate();

__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
  return HSA_STATUS_SUCCESS;
}
__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
                                                     uint32_t DeviceId) {
  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
     "missing\n",
     DeviceId);
  return 0;
}
}

// Heuristic parameters used for kernel launch
// Number of teams per CU to allow scheduling flexibility
static const unsigned DefaultTeamsPerCU = 4;

int print_kernel_trace;

#ifdef OMPTARGET_DEBUG
#define check(msg, status)                                                     \
  if (status != HSA_STATUS_SUCCESS) {                                          \
    DP(#msg " failed\n");                                                      \
  } else {                                                                     \
    DP(#msg " succeeded\n");                                                   \
  }
#else
#define check(msg, status)                                                     \
  {}
#endif

#include "elf_common.h"

namespace hsa {
template <typename C> hsa_status_t iterate_agents(C Cb) {
  auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(Agent);
  };
  return hsa_iterate_agents(L, static_cast<void *>(&Cb));
}

template <typename C>
hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
  auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(Data);
    return (*Unwrapped)(MemoryPool);
  };

  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
}

} // namespace hsa

/// Keep entries table per device
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

struct KernelArgPool {
private:
  static pthread_mutex_t Mutex;

public:
  uint32_t KernargSegmentSize;
  void *KernargRegion = nullptr;
  std::queue<int> FreeKernargSegments;

  uint32_t kernargSizeIncludingImplicit() {
    return KernargSegmentSize + sizeof(impl_implicit_args_t);
  }

  ~KernelArgPool() {
    if (KernargRegion) {
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
      : KernargSegmentSize(KernargSegmentSize) {

    // impl uses one pool per kernel for all GPUs, with a fixed upper size.
    // That exact scheme is preserved here, including the queue<int>.

    hsa_status_t Err = hsa_amd_memory_pool_allocate(
        MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
        &KernargRegion);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
      KernargRegion = nullptr; // paranoid
      return;
    }

    Err = core::allow_access_to_all_gpu_agents(KernargRegion);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(Err));
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(R));
      }
      KernargRegion = nullptr;
      return;
    }

    for (int I = 0; I < MAX_NUM_KERNELS; I++) {
      FreeKernargSegments.push(I);
    }
  }

  void *allocate(uint64_t ArgNum) {
    assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
    Lock L(&Mutex);
    void *Res = nullptr;
    if (!FreeKernargSegments.empty()) {

      int FreeIdx = FreeKernargSegments.front();
      Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
                                (FreeIdx * kernargSizeIncludingImplicit()));
      assert(FreeIdx == pointerToIndex(Res));
      FreeKernargSegments.pop();
    }
    return Res;
  }

  void deallocate(void *Ptr) {
    Lock L(&Mutex);
    int Idx = pointerToIndex(Ptr);
    FreeKernargSegments.push(Idx);
  }

private:
  int pointerToIndex(void *Ptr) {
    ptrdiff_t Bytes =
        static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
    assert(Bytes >= 0);
    assert(Bytes % kernargSizeIncludingImplicit() == 0);
    return Bytes / kernargSizeIncludingImplicit();
  }
  struct Lock {
    Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
    ~Lock() { pthread_mutex_unlock(M); }
    pthread_mutex_t *M;
  };
};
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
  int16_t ConstWGSize;
  int32_t DeviceId;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
           int32_t DeviceId, void *CallStackAddr, const char *Name,
           uint32_t KernargSegmentSize,
           hsa_amd_memory_pool_t &KernArgMemoryPool)
      : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { 230 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); 231 232 std::string N(Name); 233 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { 234 KernelArgPoolMap.insert( 235 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( 236 KernargSegmentSize, KernArgMemoryPool)))); 237 } 238 } 239 }; 240 241 /// List that contains all the kernels. 242 /// FIXME: we may need this to be per device and per library. 243 std::list<KernelTy> KernelsList; 244 245 template <typename Callback> static hsa_status_t findAgents(Callback CB) { 246 247 hsa_status_t Err = 248 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { 249 hsa_device_type_t DeviceType; 250 // get_info fails iff HSA runtime not yet initialized 251 hsa_status_t Err = 252 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); 253 254 if (Err != HSA_STATUS_SUCCESS) { 255 if (print_kernel_trace > 0) 256 DP("rtl.cpp: err %s\n", get_error_string(Err)); 257 258 return Err; 259 } 260 261 CB(DeviceType, Agent); 262 return HSA_STATUS_SUCCESS; 263 }); 264 265 // iterate_agents fails iff HSA runtime not yet initialized 266 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { 267 DP("rtl.cpp: err %s\n", get_error_string(Err)); 268 } 269 270 return Err; 271 } 272 273 static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, 274 void *Data) { 275 if (Status != HSA_STATUS_SUCCESS) { 276 const char *StatusString; 277 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { 278 StatusString = "unavailable"; 279 } 280 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, 281 Status, StatusString); 282 abort(); 283 } 284 } 285 286 namespace core { 287 namespace { 288 289 bool checkResult(hsa_status_t Err, const char *ErrMsg) { 290 if (Err == HSA_STATUS_SUCCESS) 291 return true; 292 293 REPORT("%s", ErrMsg); 294 REPORT("%s", get_error_string(Err)); 295 return false; 296 } 297 298 void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { 299 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); 300 } 301 302 uint16_t createHeader() { 303 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; 304 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; 305 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; 306 return Header; 307 } 308 309 hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { 310 bool AllocAllowed = false; 311 hsa_status_t Err = hsa_amd_memory_pool_get_info( 312 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, 313 &AllocAllowed); 314 if (Err != HSA_STATUS_SUCCESS) { 315 DP("Alloc allowed in memory pool check failed: %s\n", 316 get_error_string(Err)); 317 return Err; 318 } 319 320 size_t Size = 0; 321 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, 322 &Size); 323 if (Err != HSA_STATUS_SUCCESS) { 324 DP("Get memory pool size failed: %s\n", get_error_string(Err)); 325 return Err; 326 } 327 328 return (AllocAllowed && Size > 0) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; 329 } 330 331 hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { 332 std::vector<hsa_amd_memory_pool_t> *Result = 333 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); 334 335 hsa_status_t Err; 336 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { 337 return Err; 338 } 339 340 Result->push_back(MemoryPool); 341 return HSA_STATUS_SUCCESS; 342 } 343 344 } // namespace 345 } // namespace core 346 347 struct EnvironmentVariables { 348 int NumTeams; 349 int TeamLimit; 350 int TeamThreadLimit; 351 int MaxTeamsDefault; 352 int DynamicMemSize; 353 }; 354 355 template <uint32_t wavesize> 356 static constexpr const llvm::omp::GV &getGridValue() { 357 return llvm::omp::getAMDGPUGridValues<wavesize>(); 358 } 359 360 struct HSALifetime { 361 // Wrapper around HSA used to ensure it is constructed before other types 362 // and destructed after, which means said other types can use raii for 363 // cleanup without risking running outside of the lifetime of HSA 364 const hsa_status_t S; 365 366 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } 367 HSALifetime() : S(hsa_init()) {} 368 369 ~HSALifetime() { 370 if (S == HSA_STATUS_SUCCESS) { 371 hsa_status_t Err = hsa_shut_down(); 372 if (Err != HSA_STATUS_SUCCESS) { 373 // Can't call into HSA to get a string from the integer 374 DP("Shutting down HSA failed: %d\n", Err); 375 } 376 } 377 } 378 }; 379 380 // Handle scheduling of multiple hsa_queue's per device to 381 // multiple threads (one scheduler per device) 382 class HSAQueueScheduler { 383 public: 384 HSAQueueScheduler() : Current(0) {} 385 386 HSAQueueScheduler(const HSAQueueScheduler &) = delete; 387 388 HSAQueueScheduler(HSAQueueScheduler &&Q) { 389 Current = Q.Current.load(); 390 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 391 HSAQueues[I] = Q.HSAQueues[I]; 392 Q.HSAQueues[I] = nullptr; 393 } 394 } 395 396 // \return false if any HSA queue creation fails 397 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { 398 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 399 hsa_queue_t *Q = nullptr; 400 hsa_status_t Rc = 401 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, 402 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); 403 if (Rc != HSA_STATUS_SUCCESS) { 404 DP("Failed to create HSA queue %d\n", I); 405 return false; 406 } 407 HSAQueues[I] = Q; 408 } 409 return true; 410 } 411 412 ~HSAQueueScheduler() { 413 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 414 if (HSAQueues[I]) { 415 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); 416 if (Err != HSA_STATUS_SUCCESS) 417 DP("Error destroying HSA queue"); 418 } 419 } 420 } 421 422 // \return next queue to use for device 423 hsa_queue_t *next() { 424 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % 425 NUM_QUEUES_PER_DEVICE]; 426 } 427 428 private: 429 // Number of queues per device 430 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; 431 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; 432 std::atomic<uint8_t> Current; 433 }; 434 435 /// Class containing all the device information 436 class RTLDeviceInfoTy : HSALifetime { 437 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; 438 439 struct QueueDeleter { 440 void operator()(hsa_queue_t *Q) { 441 if (Q) { 442 hsa_status_t Err = hsa_queue_destroy(Q); 443 if (Err != HSA_STATUS_SUCCESS) { 444 DP("Error destroying hsa queue: %s\n", get_error_string(Err)); 445 } 446 } 447 } 448 }; 449 450 public: 451 bool ConstructionSucceeded = false; 452 453 // 
load binary populates symbol tables and mutates various global state 454 // run uses those symbol tables 455 std::shared_timed_mutex LoadRunLock; 456 457 int NumberOfDevices = 0; 458 459 // GPU devices 460 std::vector<hsa_agent_t> HSAAgents; 461 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu 462 463 // CPUs 464 std::vector<hsa_agent_t> CPUAgents; 465 466 // Device properties 467 std::vector<int> ComputeUnits; 468 std::vector<int> GroupsPerDevice; 469 std::vector<int> ThreadsPerGroup; 470 std::vector<int> WarpSize; 471 std::vector<std::string> GPUName; 472 std::vector<std::string> TargetID; 473 474 // OpenMP properties 475 std::vector<int> NumTeams; 476 std::vector<int> NumThreads; 477 478 // OpenMP Environment properties 479 EnvironmentVariables Env; 480 481 // OpenMP Requires Flags 482 int64_t RequiresFlags; 483 484 // Resource pools 485 SignalPoolT FreeSignalPool; 486 487 bool HostcallRequired = false; 488 489 std::vector<hsa_executable_t> HSAExecutables; 490 491 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; 492 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; 493 494 hsa_amd_memory_pool_t KernArgPool; 495 496 // fine grained memory pool for host allocations 497 hsa_amd_memory_pool_t HostFineGrainedMemoryPool; 498 499 // fine and coarse-grained memory pools per offloading device 500 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools; 501 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools; 502 503 struct ImplFreePtrDeletor { 504 void operator()(void *P) { 505 core::Runtime::Memfree(P); // ignore failure to free 506 } 507 }; 508 509 // device_State shared across loaded binaries, error if inconsistent size 510 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>> 511 DeviceStateStore; 512 513 static const unsigned HardTeamLimit = 514 (1 << 16) - 1; // 64K needed to fit in uint16 515 static const int DefaultNumTeams = 128; 516 517 // These need to be per-device since different devices can have different 518 // wave sizes, but are currently the same number for each so that refactor 519 // can be postponed. 
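  // Illustrative sketch only (not part of the current implementation): if the
  // per-device refactor lands, the wave size would be queried from each agent
  // rather than taken from the shared grid-value constants, assuming a valid
  // hsa_agent_t Agent:
  //   uint32_t WavefrontSize = 0;
  //   hsa_status_t Err = hsa_agent_get_info(
  //       Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
  // The same query is already used by printDeviceInfo() below.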
520 static_assert(getGridValue<32>().GV_Max_Teams == 521 getGridValue<64>().GV_Max_Teams, 522 ""); 523 static const int MaxTeams = getGridValue<64>().GV_Max_Teams; 524 525 static_assert(getGridValue<32>().GV_Max_WG_Size == 526 getGridValue<64>().GV_Max_WG_Size, 527 ""); 528 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; 529 530 static_assert(getGridValue<32>().GV_Default_WG_Size == 531 getGridValue<64>().GV_Default_WG_Size, 532 ""); 533 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; 534 535 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, 536 hsa_agent_t, hsa_amd_memory_pool_t); 537 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, 538 MemcpyFunc Func, int32_t DeviceId) { 539 hsa_agent_t Agent = HSAAgents[DeviceId]; 540 hsa_signal_t S = FreeSignalPool.pop(); 541 if (S.handle == 0) { 542 return HSA_STATUS_ERROR; 543 } 544 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); 545 FreeSignalPool.push(S); 546 return R; 547 } 548 549 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, 550 int32_t DeviceId) { 551 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); 552 } 553 554 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, 555 int32_t DeviceId) { 556 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); 557 } 558 559 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { 560 char TmpChar[1000]; 561 uint16_t Major, Minor; 562 uint32_t TmpUInt; 563 uint32_t TmpUInt2; 564 uint32_t CacheSize[4]; 565 bool TmpBool; 566 uint16_t WorkgroupMaxDim[3]; 567 hsa_dim3_t GridMaxDim; 568 569 // Getting basic information about HSA and Device 570 core::checkResult( 571 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), 572 "Error from hsa_system_get_info when obtaining " 573 "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); 574 core::checkResult( 575 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), 576 "Error from hsa_system_get_info when obtaining " 577 "HSA_SYSTEM_INFO_VERSION_MINOR\n"); 578 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); 579 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); 580 core::checkResult( 581 hsa_agent_get_info( 582 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), 583 "Error returned from hsa_agent_get_info when obtaining " 584 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); 585 printf(" Product Name: \t\t\t%s \n", TmpChar); 586 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), 587 "Error returned from hsa_agent_get_info when obtaining " 588 "HSA_AGENT_INFO_NAME\n"); 589 printf(" Device Name: \t\t\t%s \n", TmpChar); 590 core::checkResult( 591 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), 592 "Error returned from hsa_agent_get_info when obtaining " 593 "HSA_AGENT_INFO_NAME\n"); 594 printf(" Vendor Name: \t\t\t%s \n", TmpChar); 595 hsa_device_type_t DevType; 596 core::checkResult( 597 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), 598 "Error returned from hsa_agent_get_info when obtaining " 599 "HSA_AGENT_INFO_DEVICE\n"); 600 printf(" Device Type: \t\t\t%s \n", 601 DevType == HSA_DEVICE_TYPE_CPU 602 ? "CPU" 603 : (DevType == HSA_DEVICE_TYPE_GPU 604 ? "GPU" 605 : (DevType == HSA_DEVICE_TYPE_DSP ? 
"DSP" : "UNKNOWN"))); 606 core::checkResult( 607 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), 608 "Error returned from hsa_agent_get_info when obtaining " 609 "HSA_AGENT_INFO_QUEUES_MAX\n"); 610 printf(" Max Queues: \t\t\t%u \n", TmpUInt); 611 core::checkResult( 612 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), 613 "Error returned from hsa_agent_get_info when obtaining " 614 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); 615 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); 616 core::checkResult( 617 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), 618 "Error returned from hsa_agent_get_info when obtaining " 619 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); 620 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); 621 622 // Getting cache information 623 printf(" Cache:\n"); 624 625 // FIXME: This is deprecated according to HSA documentation. But using 626 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during 627 // runtime. 628 core::checkResult( 629 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), 630 "Error returned from hsa_agent_get_info when obtaining " 631 "HSA_AGENT_INFO_CACHE_SIZE\n"); 632 633 for (int I = 0; I < 4; I++) { 634 if (CacheSize[I]) { 635 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); 636 } 637 } 638 639 core::checkResult( 640 hsa_agent_get_info(Agent, 641 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, 642 &TmpUInt), 643 "Error returned from hsa_agent_get_info when obtaining " 644 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); 645 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); 646 core::checkResult( 647 hsa_agent_get_info( 648 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, 649 &TmpUInt), 650 "Error returned from hsa_agent_get_info when obtaining " 651 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); 652 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); 653 core::checkResult( 654 hsa_agent_get_info( 655 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, 656 &TmpUInt), 657 "Error returned from hsa_agent_get_info when obtaining " 658 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); 659 printf(" Compute Units: \t\t\t%u \n", TmpUInt); 660 core::checkResult(hsa_agent_get_info( 661 Agent, 662 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, 663 &TmpUInt), 664 "Error returned from hsa_agent_get_info when obtaining " 665 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 666 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); 667 core::checkResult( 668 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), 669 "Error returned from hsa_agent_get_info when obtaining " 670 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 671 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? 
"TRUE" : "FALSE")); 672 core::checkResult( 673 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), 674 "Error returned from hsa_agent_get_info when obtaining " 675 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); 676 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); 677 core::checkResult( 678 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), 679 "Error returned from hsa_agent_get_info when obtaining " 680 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); 681 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); 682 core::checkResult(hsa_agent_get_info(Agent, 683 HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 684 WorkgroupMaxDim), 685 "Error returned from hsa_agent_get_info when obtaining " 686 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); 687 printf(" Workgroup Max Size per Dimension:\n"); 688 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); 689 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); 690 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); 691 core::checkResult(hsa_agent_get_info( 692 Agent, 693 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, 694 &TmpUInt), 695 "Error returned from hsa_agent_get_info when obtaining " 696 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); 697 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); 698 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); 699 core::checkResult( 700 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), 701 "Error returned from hsa_agent_get_info when obtaining " 702 "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); 703 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); 704 core::checkResult( 705 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), 706 "Error returned from hsa_agent_get_info when obtaining " 707 "HSA_AGENT_INFO_GRID_MAX_DIM\n"); 708 printf(" Grid Max Size per Dimension: \t\t\n"); 709 printf(" x: \t\t\t\t%u\n", GridMaxDim.x); 710 printf(" y: \t\t\t\t%u\n", GridMaxDim.y); 711 printf(" z: \t\t\t\t%u\n", GridMaxDim.z); 712 core::checkResult( 713 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), 714 "Error returned from hsa_agent_get_info when obtaining " 715 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); 716 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); 717 718 printf(" Memory Pools:\n"); 719 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { 720 std::string TmpStr; 721 size_t Size; 722 bool Alloc, Access; 723 hsa_amd_segment_t Segment; 724 hsa_amd_memory_pool_global_flag_t GlobalFlags; 725 core::checkResult( 726 hsa_amd_memory_pool_get_info( 727 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), 728 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 729 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); 730 core::checkResult(hsa_amd_memory_pool_get_info( 731 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), 732 "Error returned from hsa_amd_memory_pool_get_info when " 733 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); 734 735 switch (Segment) { 736 case HSA_AMD_SEGMENT_GLOBAL: 737 TmpStr = "GLOBAL; FLAGS: "; 738 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) 739 TmpStr += "KERNARG, "; 740 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) 741 TmpStr += "FINE GRAINED, "; 742 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) 743 TmpStr += "COARSE GRAINED, "; 744 break; 745 case HSA_AMD_SEGMENT_READONLY: 746 TmpStr = "READONLY"; 747 break; 748 case HSA_AMD_SEGMENT_PRIVATE: 749 TmpStr = "PRIVATE"; 750 break; 751 case HSA_AMD_SEGMENT_GROUP: 752 TmpStr = "GROUP"; 753 break; 754 } 755 printf(" 
Pool %s: \n", TmpStr.c_str()); 756 757 core::checkResult(hsa_amd_memory_pool_get_info( 758 Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), 759 "Error returned from hsa_amd_memory_pool_get_info when " 760 "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); 761 printf(" Size: \t\t\t\t %zu bytes\n", Size); 762 core::checkResult( 763 hsa_amd_memory_pool_get_info( 764 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), 765 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 766 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); 767 printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); 768 core::checkResult( 769 hsa_amd_memory_pool_get_info( 770 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), 771 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 772 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); 773 printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); 774 core::checkResult( 775 hsa_amd_memory_pool_get_info( 776 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), 777 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 778 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); 779 printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); 780 core::checkResult( 781 hsa_amd_memory_pool_get_info( 782 Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), 783 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 784 "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); 785 printf(" Accessable by all: \t\t %s\n", 786 (Access ? "TRUE" : "FALSE")); 787 788 return HSA_STATUS_SUCCESS; 789 }; 790 // Iterate over all the memory regions for this agent. Get the memory region 791 // type and size 792 hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); 793 794 printf(" ISAs:\n"); 795 auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { 796 char TmpChar[1000]; 797 core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), 798 "Error returned from hsa_isa_get_info_alt when " 799 "obtaining HSA_ISA_INFO_NAME\n"); 800 printf(" Name: \t\t\t\t %s\n", TmpChar); 801 802 return HSA_STATUS_SUCCESS; 803 }; 804 // Iterate over all the memory regions for this agent. 
    hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
  }

  // Record entry point associated with device
  void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    E.Entries.push_back(Entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    for (auto &It : E.Entries) {
      if (It.addr == Addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();

    int32_t Size = E.Entries.size();

    // Table is empty
    if (!Size)
      return 0;

    __tgt_offload_entry *Begin = &E.Entries[0];
    __tgt_offload_entry *End = &E.Entries[Size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = Begin;
    E.Table.EntriesEnd = ++End;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int DeviceId) {
    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[DeviceId].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
    // KernelArgPoolMap.clear();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                                   unsigned int DeviceId) {
    assert(DeviceId < DeviceFineGrainedMemoryPools.size() &&
           "Unexpected device id!");
    uint32_t GlobalFlags = 0;
    hsa_status_t Err = hsa_amd_memory_pool_get_info(
        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

    if (Err != HSA_STATUS_SUCCESS) {
      return Err;
    }

    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
    }

    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
    for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
      hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
          Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
            hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
            if (ValidStatus != HSA_STATUS_SUCCESS) {
              DP("Alloc allowed in memory pool check failed: %s\n",
                 get_error_string(ValidStatus));
              return HSA_STATUS_SUCCESS;
            }
            return addDeviceMemoryPool(MemoryPool, DeviceId);
          });

      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Iterate all memory pools", get_error_string(Err));
        return Err;
      }
    }
    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
    std::vector<hsa_amd_memory_pool_t> HostPools;

    // Collect all the "valid" pools for all the given agents.
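    // Note: core::addMemoryPool (defined above) appends a pool to HostPools
    // only when core::isValidMemoryPool accepts it; a non-success return from
    // the callback ends that agent's iteration early, which is logged and
    // tolerated below.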
911 for (const auto &Agent : Agents) { 912 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( 913 Agent, core::addMemoryPool, static_cast<void *>(&HostPools)); 914 if (Err != HSA_STATUS_SUCCESS) { 915 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); 916 } 917 } 918 919 // We need two fine-grained pools. 920 // 1. One with kernarg flag set for storing kernel arguments 921 // 2. Second for host allocations 922 bool FineGrainedMemoryPoolSet = false; 923 bool KernArgPoolSet = false; 924 for (const auto &MemoryPool : HostPools) { 925 hsa_status_t Err = HSA_STATUS_SUCCESS; 926 uint32_t GlobalFlags = 0; 927 Err = hsa_amd_memory_pool_get_info( 928 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); 929 if (Err != HSA_STATUS_SUCCESS) { 930 DP("Get memory pool info failed: %s\n", get_error_string(Err)); 931 return Err; 932 } 933 934 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { 935 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { 936 KernArgPool = MemoryPool; 937 KernArgPoolSet = true; 938 } 939 HostFineGrainedMemoryPool = MemoryPool; 940 FineGrainedMemoryPoolSet = true; 941 } 942 } 943 944 if (FineGrainedMemoryPoolSet && KernArgPoolSet) 945 return HSA_STATUS_SUCCESS; 946 947 return HSA_STATUS_ERROR; 948 } 949 950 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { 951 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && 952 "Invalid device Id"); 953 return DeviceCoarseGrainedMemoryPools[DeviceId]; 954 } 955 956 hsa_amd_memory_pool_t getHostMemoryPool() { 957 return HostFineGrainedMemoryPool; 958 } 959 960 static int readEnv(const char *Env, int Default = -1) { 961 const char *EnvStr = getenv(Env); 962 int Res = Default; 963 if (EnvStr) { 964 Res = std::stoi(EnvStr); 965 DP("Parsed %s=%d\n", Env, Res); 966 } 967 return Res; 968 } 969 970 RTLDeviceInfoTy() { 971 DP("Start initializing " GETNAME(TARGET_NAME) "\n"); 972 973 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr 974 // anytime. You do not need a debug library build. 
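    // For example, running an offloading application (hypothetical binary
    // name) as
    //   LIBOMPTARGET_KERNEL_TRACE=1 ./app
    // prints a one-line trace for each kernel dispatch. The accepted values
    // are: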
975 // 0 => no tracing 976 // 1 => tracing dispatch only 977 // >1 => verbosity increase 978 979 if (!HSAInitSuccess()) { 980 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); 981 return; 982 } 983 984 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) 985 print_kernel_trace = atoi(EnvStr); 986 else 987 print_kernel_trace = 0; 988 989 hsa_status_t Err = core::atl_init_gpu_context(); 990 if (Err != HSA_STATUS_SUCCESS) { 991 DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); 992 return; 993 } 994 995 // Init hostcall soon after initializing hsa 996 hostrpc_init(); 997 998 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { 999 if (DeviceType == HSA_DEVICE_TYPE_CPU) { 1000 CPUAgents.push_back(Agent); 1001 } else { 1002 HSAAgents.push_back(Agent); 1003 } 1004 }); 1005 if (Err != HSA_STATUS_SUCCESS) 1006 return; 1007 1008 NumberOfDevices = (int)HSAAgents.size(); 1009 1010 if (NumberOfDevices == 0) { 1011 DP("There are no devices supporting HSA.\n"); 1012 return; 1013 } 1014 DP("There are %d devices supporting HSA.\n", NumberOfDevices); 1015 1016 // Init the device info 1017 HSAQueueSchedulers.reserve(NumberOfDevices); 1018 FuncGblEntries.resize(NumberOfDevices); 1019 ThreadsPerGroup.resize(NumberOfDevices); 1020 ComputeUnits.resize(NumberOfDevices); 1021 GPUName.resize(NumberOfDevices); 1022 GroupsPerDevice.resize(NumberOfDevices); 1023 WarpSize.resize(NumberOfDevices); 1024 NumTeams.resize(NumberOfDevices); 1025 NumThreads.resize(NumberOfDevices); 1026 DeviceStateStore.resize(NumberOfDevices); 1027 KernelInfoTable.resize(NumberOfDevices); 1028 SymbolInfoTable.resize(NumberOfDevices); 1029 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); 1030 DeviceFineGrainedMemoryPools.resize(NumberOfDevices); 1031 1032 Err = setupDevicePools(HSAAgents); 1033 if (Err != HSA_STATUS_SUCCESS) { 1034 DP("Setup for Device Memory Pools failed\n"); 1035 return; 1036 } 1037 1038 Err = setupHostMemoryPools(CPUAgents); 1039 if (Err != HSA_STATUS_SUCCESS) { 1040 DP("Setup for Host Memory Pools failed\n"); 1041 return; 1042 } 1043 1044 for (int I = 0; I < NumberOfDevices; I++) { 1045 uint32_t QueueSize = 0; 1046 { 1047 hsa_status_t Err = hsa_agent_get_info( 1048 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); 1049 if (Err != HSA_STATUS_SUCCESS) { 1050 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); 1051 return; 1052 } 1053 enum { MaxQueueSize = 4096 }; 1054 if (QueueSize > MaxQueueSize) { 1055 QueueSize = MaxQueueSize; 1056 } 1057 } 1058 1059 { 1060 HSAQueueScheduler QSched; 1061 if (!QSched.createQueues(HSAAgents[I], QueueSize)) 1062 return; 1063 HSAQueueSchedulers.emplace_back(std::move(QSched)); 1064 } 1065 1066 DeviceStateStore[I] = {nullptr, 0}; 1067 } 1068 1069 for (int I = 0; I < NumberOfDevices; I++) { 1070 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; 1071 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; 1072 ComputeUnits[I] = 1; 1073 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, 1074 GroupsPerDevice[I], ThreadsPerGroup[I]); 1075 } 1076 1077 // Get environment variables regarding teams 1078 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); 1079 Env.NumTeams = readEnv("OMP_NUM_TEAMS"); 1080 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); 1081 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); 1082 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); 1083 1084 // Default state. 
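    // This is only the initial value; it is expected to be overwritten once
    // libomptarget reports the program's `omp requires` clauses to the plugin
    // (via __tgt_rtl_init_requires).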
1085 RequiresFlags = OMP_REQ_UNDEFINED; 1086 1087 ConstructionSucceeded = true; 1088 } 1089 1090 ~RTLDeviceInfoTy() { 1091 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); 1092 if (!HSAInitSuccess()) { 1093 // Then none of these can have been set up and they can't be torn down 1094 return; 1095 } 1096 // Run destructors on types that use HSA before 1097 // impl_finalize removes access to it 1098 DeviceStateStore.clear(); 1099 KernelArgPoolMap.clear(); 1100 // Terminate hostrpc before finalizing hsa 1101 hostrpc_terminate(); 1102 1103 hsa_status_t Err; 1104 for (uint32_t I = 0; I < HSAExecutables.size(); I++) { 1105 Err = hsa_executable_destroy(HSAExecutables[I]); 1106 if (Err != HSA_STATUS_SUCCESS) { 1107 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 1108 "Destroying executable", get_error_string(Err)); 1109 } 1110 } 1111 } 1112 }; 1113 1114 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; 1115 1116 // Putting accesses to DeviceInfo global behind a function call prior 1117 // to changing to use init_plugin/deinit_plugin calls 1118 static RTLDeviceInfoTy DeviceInfoState; 1119 static RTLDeviceInfoTy& DeviceInfo() { return DeviceInfoState; } 1120 1121 namespace { 1122 1123 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, 1124 __tgt_async_info *AsyncInfo) { 1125 assert(AsyncInfo && "AsyncInfo is nullptr"); 1126 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 1127 // Return success if we are not copying back to host from target. 1128 if (!HstPtr) 1129 return OFFLOAD_SUCCESS; 1130 hsa_status_t Err; 1131 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 1132 (long long unsigned)(Elf64_Addr)TgtPtr, 1133 (long long unsigned)(Elf64_Addr)HstPtr); 1134 1135 Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, 1136 DeviceId); 1137 1138 if (Err != HSA_STATUS_SUCCESS) { 1139 DP("Error when copying data from device to host. Pointers: " 1140 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 1141 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 1142 return OFFLOAD_FAIL; 1143 } 1144 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 1145 (long long unsigned)(Elf64_Addr)TgtPtr, 1146 (long long unsigned)(Elf64_Addr)HstPtr); 1147 return OFFLOAD_SUCCESS; 1148 } 1149 1150 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, 1151 __tgt_async_info *AsyncInfo) { 1152 assert(AsyncInfo && "AsyncInfo is nullptr"); 1153 hsa_status_t Err; 1154 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 1155 // Return success if we are not doing host to target. 1156 if (!HstPtr) 1157 return OFFLOAD_SUCCESS; 1158 1159 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, 1160 (long long unsigned)(Elf64_Addr)HstPtr, 1161 (long long unsigned)(Elf64_Addr)TgtPtr); 1162 Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, 1163 DeviceId); 1164 if (Err != HSA_STATUS_SUCCESS) { 1165 DP("Error when copying data from host to device. Pointers: " 1166 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 1167 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 1168 return OFFLOAD_FAIL; 1169 } 1170 return OFFLOAD_SUCCESS; 1171 } 1172 1173 // Async. 1174 // The implementation was written with cuda streams in mind. The semantics of 1175 // that are to execute kernels on a queue in order of insertion. A synchronise 1176 // call then makes writes visible between host and device. 
This means a series 1177 // of N data_submit_async calls are expected to execute serially. HSA offers 1178 // various options to run the data copies concurrently. This may require changes 1179 // to libomptarget. 1180 1181 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that 1182 // there are no outstanding kernels that need to be synchronized. Any async call 1183 // may be passed a Queue==0, at which point the cuda implementation will set it 1184 // to non-null (see getStream). The cuda streams are per-device. Upstream may 1185 // change this interface to explicitly initialize the AsyncInfo_pointer, but 1186 // until then hsa lazily initializes it as well. 1187 1188 void initAsyncInfo(__tgt_async_info *AsyncInfo) { 1189 // set non-null while using async calls, return to null to indicate completion 1190 assert(AsyncInfo); 1191 if (!AsyncInfo->Queue) { 1192 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX); 1193 } 1194 } 1195 void finiAsyncInfo(__tgt_async_info *AsyncInfo) { 1196 assert(AsyncInfo); 1197 assert(AsyncInfo->Queue); 1198 AsyncInfo->Queue = 0; 1199 } 1200 1201 // Determine launch values for kernel. 1202 struct LaunchVals { 1203 int WorkgroupSize; 1204 int GridSize; 1205 }; 1206 LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, 1207 int ConstWGSize, 1208 llvm::omp::OMPTgtExecModeFlags ExecutionMode, 1209 int NumTeams, int ThreadLimit, uint64_t LoopTripcount, 1210 int DeviceNumTeams) { 1211 1212 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; 1213 int NumGroups = 0; 1214 1215 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; 1216 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit)) 1217 MaxTeams = RTLDeviceInfoTy::HardTeamLimit; 1218 1219 if (print_kernel_trace & STARTUP_DETAILS) { 1220 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); 1221 DP("Max_Teams: %d\n", MaxTeams); 1222 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); 1223 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); 1224 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", 1225 RTLDeviceInfoTy::DefaultWgSize); 1226 DP("thread_limit: %d\n", ThreadLimit); 1227 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1228 DP("ConstWGSize: %d\n", ConstWGSize); 1229 } 1230 // check for thread_limit() clause 1231 if (ThreadLimit > 0) { 1232 ThreadsPerGroup = ThreadLimit; 1233 DP("Setting threads per block to requested %d\n", ThreadLimit); 1234 // Add master warp for GENERIC 1235 if (ExecutionMode == 1236 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { 1237 ThreadsPerGroup += WarpSize; 1238 DP("Adding master wavefront: +%d threads\n", WarpSize); 1239 } 1240 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max 1241 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; 1242 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); 1243 } 1244 } 1245 // check flat_max_work_group_size attr here 1246 if (ThreadsPerGroup > ConstWGSize) { 1247 ThreadsPerGroup = ConstWGSize; 1248 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", 1249 ThreadsPerGroup); 1250 } 1251 if (print_kernel_trace & STARTUP_DETAILS) 1252 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1253 DP("Preparing %d threads\n", ThreadsPerGroup); 1254 1255 // Set default num_groups (teams) 1256 if (Env.TeamLimit > 0) 1257 NumGroups = (MaxTeams < Env.TeamLimit) ? 
MaxTeams : Env.TeamLimit;
  else
    NumGroups = MaxTeams;
  DP("Set default num of groups %d\n", NumGroups);

  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("num_teams: %d\n", NumTeams);
  }

  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::MaxWgSize.
  // This reduction is typical for the default case (no thread_limit clause),
  // or when the user requests a very large num_teams clause.
  // FIXME: We can't distinguish between a constant or variable thread limit,
  // so we only handle constant thread_limits.
  if (ThreadsPerGroup >
      RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to nearest WarpSize
    // here?
    NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;

  // check for num_teams() clause
  if (NumTeams > 0) {
    NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("num_groups: %d\n", NumGroups);
    DP("Env.NumTeams %d\n", Env.NumTeams);
    DP("Env.TeamLimit %d\n", Env.TeamLimit);
  }

  if (Env.NumTeams > 0) {
    NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
  } else if (Env.TeamLimit > 0) {
    NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
  } else {
    if (NumTeams <= 0) {
      if (LoopTripcount > 0) {
        if (ExecutionMode ==
            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
          // round up to the nearest integer
          NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
        } else if (ExecutionMode ==
                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
          NumGroups = LoopTripcount;
        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
          // This is a generic kernel that was transformed to use SPMD-mode
          // execution but uses Generic-mode semantics for scheduling.
          NumGroups = LoopTripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           NumGroups, LoopTripcount, ThreadsPerGroup);
      }
    } else {
      NumGroups = NumTeams;
    }
    if (NumGroups > MaxTeams) {
      NumGroups = MaxTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
    }
    if (NumGroups > NumTeams && NumTeams > 0) {
      NumGroups = NumTeams;
      if (print_kernel_trace & STARTUP_DETAILS)
        DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
           NumTeams);
    }
  }

  // num_teams clause always honored, no matter what, unless DEFAULT is active.
  if (NumTeams > 0) {
    NumGroups = NumTeams;
    // Cap num_groups to EnvMaxTeamsDefault if set.
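    // Worked example: num_teams(1024) on a run with OMP_MAX_TEAMS_DEFAULT=256
    // first sets NumGroups = 1024, then the cap below reduces it to 256.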
1333 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) 1334 NumGroups = Env.MaxTeamsDefault; 1335 } 1336 if (print_kernel_trace & STARTUP_DETAILS) { 1337 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1338 DP("num_groups: %d\n", NumGroups); 1339 DP("loop_tripcount: %ld\n", LoopTripcount); 1340 } 1341 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, 1342 ThreadsPerGroup); 1343 1344 LaunchVals Res; 1345 Res.WorkgroupSize = ThreadsPerGroup; 1346 Res.GridSize = ThreadsPerGroup * NumGroups; 1347 return Res; 1348 } 1349 1350 static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { 1351 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); 1352 bool Full = true; 1353 while (Full) { 1354 Full = 1355 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); 1356 } 1357 return PacketId; 1358 } 1359 1360 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, 1361 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, 1362 int32_t ThreadLimit, uint64_t LoopTripcount) { 1363 // Set the context we are using 1364 // update thread limit content in gpu memory if un-initialized or specified 1365 // from host 1366 1367 DP("Run target team region thread_limit %d\n", ThreadLimit); 1368 1369 // All args are references. 1370 std::vector<void *> Args(ArgNum); 1371 std::vector<void *> Ptrs(ArgNum); 1372 1373 DP("Arg_num: %d\n", ArgNum); 1374 for (int32_t I = 0; I < ArgNum; ++I) { 1375 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1376 Args[I] = &Ptrs[I]; 1377 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); 1378 } 1379 1380 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; 1381 1382 std::string KernelName = std::string(KernelInfo->Name); 1383 auto &KernelInfoTable = DeviceInfo().KernelInfoTable; 1384 if (KernelInfoTable[DeviceId].find(KernelName) == 1385 KernelInfoTable[DeviceId].end()) { 1386 DP("Kernel %s not found\n", KernelName.c_str()); 1387 return OFFLOAD_FAIL; 1388 } 1389 1390 const atl_kernel_info_t KernelInfoEntry = 1391 KernelInfoTable[DeviceId][KernelName]; 1392 const uint32_t GroupSegmentSize = 1393 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; 1394 const uint32_t SgprCount = KernelInfoEntry.sgpr_count; 1395 const uint32_t VgprCount = KernelInfoEntry.vgpr_count; 1396 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; 1397 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; 1398 1399 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); 1400 1401 /* 1402 * Set limit based on ThreadsPerGroup and GroupsPerDevice 1403 */ 1404 LaunchVals LV = 1405 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, 1406 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, 1407 NumTeams, // From run_region arg 1408 ThreadLimit, // From run_region arg 1409 LoopTripcount, // From run_region arg 1410 DeviceInfo().NumTeams[KernelInfo->DeviceId]); 1411 const int GridSize = LV.GridSize; 1412 const int WorkgroupSize = LV.WorkgroupSize; 1413 1414 if (print_kernel_trace >= LAUNCH) { 1415 int NumGroups = GridSize / WorkgroupSize; 1416 // enum modes are SPMD, GENERIC, NONE 0,1,2 1417 // if doing rtl timing, print to stderr, unless stdout requested. 1418 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); 1419 fprintf(TraceToStdout ? 
stdout : stderr, 1420 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " 1421 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " 1422 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", 1423 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, 1424 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, 1425 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, 1426 VgprSpillCount, LoopTripcount, KernelInfo->Name); 1427 } 1428 1429 // Run on the device. 1430 { 1431 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); 1432 if (!Queue) { 1433 return OFFLOAD_FAIL; 1434 } 1435 uint64_t PacketId = acquireAvailablePacketId(Queue); 1436 1437 const uint32_t Mask = Queue->size - 1; // size is a power of 2 1438 hsa_kernel_dispatch_packet_t *Packet = 1439 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); 1440 1441 // packet->header is written last 1442 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; 1443 Packet->workgroup_size_x = WorkgroupSize; 1444 Packet->workgroup_size_y = 1; 1445 Packet->workgroup_size_z = 1; 1446 Packet->reserved0 = 0; 1447 Packet->grid_size_x = GridSize; 1448 Packet->grid_size_y = 1; 1449 Packet->grid_size_z = 1; 1450 Packet->private_segment_size = KernelInfoEntry.private_segment_size; 1451 Packet->group_segment_size = GroupSegmentSize; 1452 Packet->kernel_object = KernelInfoEntry.kernel_object; 1453 Packet->kernarg_address = 0; // use the block allocator 1454 Packet->reserved2 = 0; // impl writes id_ here 1455 Packet->completion_signal = {0}; // may want a pool of signals 1456 1457 KernelArgPool *ArgPool = nullptr; 1458 void *KernArg = nullptr; 1459 { 1460 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); 1461 if (It != KernelArgPoolMap.end()) { 1462 ArgPool = (It->second).get(); 1463 } 1464 } 1465 if (!ArgPool) { 1466 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, 1467 DeviceId); 1468 } 1469 { 1470 if (ArgPool) { 1471 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); 1472 KernArg = ArgPool->allocate(ArgNum); 1473 } 1474 if (!KernArg) { 1475 DP("Allocate kernarg failed\n"); 1476 return OFFLOAD_FAIL; 1477 } 1478 1479 // Copy explicit arguments 1480 for (int I = 0; I < ArgNum; I++) { 1481 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); 1482 } 1483 1484 // Initialize implicit arguments. TODO: Which of these can be dropped 1485 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>( 1486 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize); 1487 memset(ImplArgs, 0, 1488 sizeof(impl_implicit_args_t)); // may not be necessary 1489 ImplArgs->offset_x = 0; 1490 ImplArgs->offset_y = 0; 1491 ImplArgs->offset_z = 0; 1492 1493 // assign a hostcall buffer for the selected Q 1494 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) { 1495 // hostrpc_assign_buffer is not thread safe, and this function is 1496 // under a multiple reader lock, not a writer lock. 
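        // (The multiple-reader lock referred to above is the shared
        // LoadRunLock declared earlier; kernel launches hold it as readers, so
        // several threads may reach this point at once, hence the
        // function-local mutex below.)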
1497 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; 1498 pthread_mutex_lock(&HostcallInitLock); 1499 uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId], 1500 Queue, DeviceId); 1501 pthread_mutex_unlock(&HostcallInitLock); 1502 if (!Buffer) { 1503 DP("hostrpc_assign_buffer failed, gpu would dereference null and " 1504 "error\n"); 1505 return OFFLOAD_FAIL; 1506 } 1507 1508 DP("Implicit argument count: %d\n", 1509 KernelInfoEntry.implicit_argument_count); 1510 if (KernelInfoEntry.implicit_argument_count >= 4) { 1511 // Initialise pointer for implicit_argument_count != 0 ABI 1512 // Guess that the right implicit argument is at offset 24 after 1513 // the explicit arguments. In the future, should be able to read 1514 // the offset from msgpack. Clang is not annotating it at present. 1515 uint64_t Offset = 1516 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); 1517 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { 1518 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " 1519 "args: %d\n", 1520 Offset + 8, ArgPool->kernargSizeIncludingImplicit()); 1521 } else { 1522 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); 1523 } 1524 } 1525 1526 // initialise pointer for implicit_argument_count == 0 ABI 1527 ImplArgs->hostcall_ptr = Buffer; 1528 } 1529 1530 Packet->kernarg_address = KernArg; 1531 } 1532 1533 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); 1534 if (S.handle == 0) { 1535 DP("Failed to get signal instance\n"); 1536 return OFFLOAD_FAIL; 1537 } 1538 Packet->completion_signal = S; 1539 hsa_signal_store_relaxed(Packet->completion_signal, 1); 1540 1541 // Publish the packet indicating it is ready to be processed 1542 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet), 1543 core::createHeader(), Packet->setup); 1544 1545 // Since the packet is already published, its contents must not be 1546 // accessed any more 1547 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); 1548 1549 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, 1550 HSA_WAIT_STATE_BLOCKED) != 0) 1551 ; 1552 1553 assert(ArgPool); 1554 ArgPool->deallocate(KernArg); 1555 DeviceInfo().FreeSignalPool.push(S); 1556 } 1557 1558 DP("Kernel completed\n"); 1559 return OFFLOAD_SUCCESS; 1560 } 1561 1562 bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { 1563 const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h 1564 int32_t R = elf_check_machine(Image, AmdgcnMachineID); 1565 if (!R) { 1566 DP("Supported machine ID not found\n"); 1567 } 1568 return R; 1569 } 1570 1571 uint32_t elfEFlags(__tgt_device_image *Image) { 1572 char *ImgBegin = (char *)Image->ImageStart; 1573 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; 1574 1575 Elf *E = elf_memory(ImgBegin, ImgSize); 1576 if (!E) { 1577 DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); 1578 return 0; 1579 } 1580 1581 Elf64_Ehdr *Eh64 = elf64_getehdr(E); 1582 1583 if (!Eh64) { 1584 DP("Unable to get machine ID from ELF file!\n"); 1585 elf_end(E); 1586 return 0; 1587 } 1588 1589 uint32_t Flags = Eh64->e_flags; 1590 1591 elf_end(E); 1592 DP("ELF Flags: 0x%x\n", Flags); 1593 return Flags; 1594 } 1595 1596 template <typename T> bool enforceUpperBound(T *Value, T Upper) { 1597 bool Changed = *Value > Upper; 1598 if (Changed) { 1599 *Value = Upper; 1600 } 1601 return Changed; 1602 } 1603 1604 Elf64_Shdr *findOnlyShtHash(Elf *Elf) { 1605 size_t N; 1606 int Rc = elf_getshdrnum(Elf, &N); 1607 if (Rc != 0) { 1608 return 
nullptr; 1609 } 1610 1611 Elf64_Shdr *Result = nullptr; 1612 for (size_t I = 0; I < N; I++) { 1613 Elf_Scn *Scn = elf_getscn(Elf, I); 1614 if (Scn) { 1615 Elf64_Shdr *Shdr = elf64_getshdr(Scn); 1616 if (Shdr) { 1617 if (Shdr->sh_type == SHT_HASH) { 1618 if (Result == nullptr) { 1619 Result = Shdr; 1620 } else { 1621 // multiple SHT_HASH sections not handled 1622 return nullptr; 1623 } 1624 } 1625 } 1626 } 1627 } 1628 return Result; 1629 } 1630 1631 const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash, 1632 const char *Symname) { 1633 1634 assert(SectionHash); 1635 size_t SectionSymtabIndex = SectionHash->sh_link; 1636 Elf64_Shdr *SectionSymtab = 1637 elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex)); 1638 size_t SectionStrtabIndex = SectionSymtab->sh_link; 1639 1640 const Elf64_Sym *Symtab = 1641 reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset); 1642 1643 const uint32_t *Hashtab = 1644 reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset); 1645 1646 // Layout: 1647 // nbucket 1648 // nchain 1649 // bucket[nbucket] 1650 // chain[nchain] 1651 uint32_t Nbucket = Hashtab[0]; 1652 const uint32_t *Bucket = &Hashtab[2]; 1653 const uint32_t *Chain = &Hashtab[Nbucket + 2]; 1654 1655 const size_t Max = strlen(Symname) + 1; 1656 const uint32_t Hash = elf_hash(Symname); 1657 for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) { 1658 char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name); 1659 if (strncmp(Symname, N, Max) == 0) { 1660 return &Symtab[I]; 1661 } 1662 } 1663 1664 return nullptr; 1665 } 1666 1667 struct SymbolInfo { 1668 void *Addr = nullptr; 1669 uint32_t Size = UINT32_MAX; 1670 uint32_t ShType = SHT_NULL; 1671 }; 1672 1673 int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname, 1674 SymbolInfo *Res) { 1675 if (elf_kind(Elf) != ELF_K_ELF) { 1676 return 1; 1677 } 1678 1679 Elf64_Shdr *SectionHash = findOnlyShtHash(Elf); 1680 if (!SectionHash) { 1681 return 1; 1682 } 1683 1684 const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname); 1685 if (!Sym) { 1686 return 1; 1687 } 1688 1689 if (Sym->st_size > UINT32_MAX) { 1690 return 1; 1691 } 1692 1693 if (Sym->st_shndx == SHN_UNDEF) { 1694 return 1; 1695 } 1696 1697 Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx); 1698 if (!Section) { 1699 return 1; 1700 } 1701 1702 Elf64_Shdr *Header = elf64_getshdr(Section); 1703 if (!Header) { 1704 return 1; 1705 } 1706 1707 Res->Addr = Sym->st_value + Base; 1708 Res->Size = static_cast<uint32_t>(Sym->st_size); 1709 Res->ShType = Header->sh_type; 1710 return 0; 1711 } 1712 1713 int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname, 1714 SymbolInfo *Res) { 1715 Elf *Elf = elf_memory(Base, ImgSize); 1716 if (Elf) { 1717 int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res); 1718 elf_end(Elf); 1719 return Rc; 1720 } 1721 return 1; 1722 } 1723 1724 hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, 1725 const char *SymName, void **VarAddr, 1726 uint32_t *VarSize) { 1727 SymbolInfo SI; 1728 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); 1729 if (Rc == 0) { 1730 *VarAddr = SI.Addr; 1731 *VarSize = SI.Size; 1732 return HSA_STATUS_SUCCESS; 1733 } 1734 return HSA_STATUS_ERROR; 1735 } 1736 1737 template <typename C> 1738 hsa_status_t moduleRegisterFromMemoryToPlace( 1739 std::map<std::string, atl_kernel_info_t> &KernelInfoTable, 1740 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, 1741 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, 
1742     std::vector<hsa_executable_t> &HSAExecutables) {
1743   auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1744     C *Unwrapped = static_cast<C *>(CbState);
1745     return (*Unwrapped)(Data, Size);
1746   };
1747   return core::RegisterModuleFromMemory(
1748       KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1749       DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
1750       HSAExecutables);
1751 }
1752
1753 uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1754   uint64_t DeviceStateBytes = 0;
1755   {
1756     // If this is the deviceRTL, get the state variable size
1757     SymbolInfo SizeSi;
1758     int Rc = getSymbolInfoWithoutLoading(
1759         ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
1760
1761     if (Rc == 0) {
1762       if (SizeSi.Size != sizeof(uint64_t)) {
1763         DP("Found device_State_size variable with wrong size\n");
1764         return 0;
1765       }
1766
1767       // Read the number of bytes directly from the elf
1768       memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
1769     }
1770   }
1771   return DeviceStateBytes;
1772 }
1773
1774 struct DeviceEnvironment {
1775   // Initializes a DeviceEnvironmentTy in the deviceRTL. This
1776   // patches around differences in the deviceRTL between trunk, aomp and
1777   // rocmcc. Over time these differences will tend to zero and this class
1778   // can be simplified.
1779   // The symbol may be in .data or .bss, and may be missing fields; todo:
1780   // review aomp/trunk/rocm and simplify the following.
1781
1782   // The symbol may also have been deadstripped because the device side
1783   // accessors were unused.
1784
1785   // If the symbol is in .data (aomp, rocm) it can be written directly.
1786   // If it is in .bss, we must wait for it to be allocated space on the
1787   // gpu (trunk) and initialize it after loading.
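  // Illustrative sketch only (it mirrors the members below and adds no new
  // logic): the choice between the two initialization paths reduces to the
  // section type recorded for the symbol, roughly
  //
  //   if (SI.ShType != SHT_NOBITS)   // .data: bytes are present in the image
  //     patch the image copy in beforeLoading();
  //   else                           // .bss: space exists only after loading
  //     look the symbol up in the loaded executable and copy it host-to-device
  //     in afterLoading();
  //
  // beforeLoading() and afterLoading() below implement exactly these two paths.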
1788 const char *sym() { return "omptarget_device_environment"; } 1789 1790 DeviceEnvironmentTy HostDeviceEnv; 1791 SymbolInfo SI; 1792 bool Valid = false; 1793 1794 __tgt_device_image *Image; 1795 const size_t ImgSize; 1796 1797 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, 1798 __tgt_device_image *Image, const size_t ImgSize) 1799 : Image(Image), ImgSize(ImgSize) { 1800 1801 HostDeviceEnv.NumDevices = NumberDevices; 1802 HostDeviceEnv.DeviceNum = DeviceId; 1803 HostDeviceEnv.DebugKind = 0; 1804 HostDeviceEnv.DynamicMemSize = DynamicMemSize; 1805 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 1806 HostDeviceEnv.DebugKind = std::stoi(EnvStr); 1807 1808 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, 1809 sym(), &SI); 1810 if (Rc != 0) { 1811 DP("Finding global device environment '%s' - symbol missing.\n", sym()); 1812 return; 1813 } 1814 1815 if (SI.Size > sizeof(HostDeviceEnv)) { 1816 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, 1817 sizeof(HostDeviceEnv)); 1818 return; 1819 } 1820 1821 Valid = true; 1822 } 1823 1824 bool inImage() { return SI.ShType != SHT_NOBITS; } 1825 1826 hsa_status_t beforeLoading(void *Data, size_t Size) { 1827 if (Valid) { 1828 if (inImage()) { 1829 DP("Setting global device environment before load (%u bytes)\n", 1830 SI.Size); 1831 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; 1832 void *Pos = (char *)Data + Offset; 1833 memcpy(Pos, &HostDeviceEnv, SI.Size); 1834 } 1835 } 1836 return HSA_STATUS_SUCCESS; 1837 } 1838 1839 hsa_status_t afterLoading() { 1840 if (Valid) { 1841 if (!inImage()) { 1842 DP("Setting global device environment after load (%u bytes)\n", 1843 SI.Size); 1844 int DeviceId = HostDeviceEnv.DeviceNum; 1845 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; 1846 void *StatePtr; 1847 uint32_t StatePtrSize; 1848 hsa_status_t Err = interop_hsa_get_symbol_info( 1849 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); 1850 if (Err != HSA_STATUS_SUCCESS) { 1851 DP("failed to find %s in loaded image\n", sym()); 1852 return Err; 1853 } 1854 1855 if (StatePtrSize != SI.Size) { 1856 DP("Symbol had size %u before loading, %u after\n", StatePtrSize, 1857 SI.Size); 1858 return HSA_STATUS_ERROR; 1859 } 1860 1861 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, 1862 StatePtrSize, DeviceId); 1863 } 1864 } 1865 return HSA_STATUS_SUCCESS; 1866 } 1867 }; 1868 1869 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { 1870 uint64_t Rounded = 4 * ((Size + 3) / 4); 1871 void *Ptr; 1872 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); 1873 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); 1874 if (Err != HSA_STATUS_SUCCESS) { 1875 return Err; 1876 } 1877 1878 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); 1879 if (Rc != HSA_STATUS_SUCCESS) { 1880 DP("zero fill device_state failed with %u\n", Rc); 1881 core::Runtime::Memfree(Ptr); 1882 return HSA_STATUS_ERROR; 1883 } 1884 1885 *RetPtr = Ptr; 1886 return HSA_STATUS_SUCCESS; 1887 } 1888 1889 bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { 1890 SymbolInfo SI; 1891 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); 1892 return (Rc == 0) && (SI.Addr != nullptr); 1893 } 1894 1895 } // namespace 1896 1897 namespace core { 1898 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { 1899 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), 1900 
&DeviceInfo().HSAAgents[0], NULL, Ptr); 1901 } 1902 } // namespace core 1903 1904 static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { 1905 hsa_status_t err; 1906 uint32_t name_len; 1907 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); 1908 if (err != HSA_STATUS_SUCCESS) { 1909 DP("Error getting ISA info length\n"); 1910 return err; 1911 } 1912 1913 char TargetID[name_len]; 1914 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); 1915 if (err != HSA_STATUS_SUCCESS) { 1916 DP("Error getting ISA info name\n"); 1917 return err; 1918 } 1919 1920 auto TripleTargetID = llvm::StringRef(TargetID); 1921 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { 1922 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); 1923 } 1924 return HSA_STATUS_SUCCESS; 1925 } 1926 1927 /// Parse a TargetID to get processor arch and feature map. 1928 /// Returns processor subarch. 1929 /// Returns TargetID features in \p FeatureMap argument. 1930 /// If the \p TargetID contains feature+, FeatureMap it to true. 1931 /// If the \p TargetID contains feature-, FeatureMap it to false. 1932 /// If the \p TargetID does not contain a feature (default), do not map it. 1933 StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) { 1934 if (TargetID.empty()) 1935 return llvm::StringRef(); 1936 1937 auto ArchFeature = TargetID.split(":"); 1938 auto Arch = ArchFeature.first; 1939 auto Features = ArchFeature.second; 1940 if (Features.empty()) 1941 return Arch; 1942 1943 if (Features.contains("sramecc+")) { 1944 FeatureMap.insert(std::pair<std::string, bool>("sramecc", true)); 1945 } else if (Features.contains("sramecc-")) { 1946 FeatureMap.insert(std::pair<std::string, bool>("sramecc", false)); 1947 } 1948 if (Features.contains("xnack+")) { 1949 FeatureMap.insert(std::pair<std::string, bool>("xnack", true)); 1950 } else if (Features.contains("xnack-")) { 1951 FeatureMap.insert(std::pair<std::string, bool>("xnack", false)); 1952 } 1953 1954 return Arch; 1955 } 1956 1957 /// Checks if an image \p ImgInfo is compatible with current 1958 /// system's environment \p EnvInfo 1959 bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) { 1960 llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo); 1961 1962 // Compatible in case of exact match 1963 if (ImgTID == EnvTID) { 1964 DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n", 1965 ImgTID.data(), EnvTID.data()); 1966 return true; 1967 } 1968 1969 // Incompatible if Archs mismatch. 1970 StringMap<bool> ImgMap, EnvMap; 1971 StringRef ImgArch = parseTargetID(ImgTID, ImgMap); 1972 StringRef EnvArch = parseTargetID(EnvTID, EnvMap); 1973 1974 // Both EnvArch and ImgArch can't be empty here. 1975 if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) { 1976 DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n", 1977 ImgTID.data(), EnvTID.data()); 1978 return false; 1979 } 1980 1981 // Incompatible if image has more features than the environment, irrespective 1982 // of type or sign of features. 1983 if (ImgMap.size() > EnvMap.size()) { 1984 DP("Incompatible: Image has more features than the environment \t[Image: " 1985 "%s]\t:\t[Environment: %s]\n", 1986 ImgTID.data(), EnvTID.data()); 1987 return false; 1988 } 1989 1990 // Compatible if each target feature specified by the environment is 1991 // compatible with target feature of the image. 
The target feature is 1992 // compatible if the iamge does not specify it (meaning Any), or if it 1993 // specifies it with the same value (meaning On or Off). 1994 for (const auto &ImgFeature : ImgMap) { 1995 auto EnvFeature = EnvMap.find(ImgFeature.first()); 1996 if (EnvFeature == EnvMap.end()) { 1997 DP("Incompatible: Value of Image's non-ANY feature is not matching with " 1998 "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: " 1999 "%s]\n", 2000 ImgTID.data(), EnvTID.data()); 2001 return false; 2002 } else if (EnvFeature->first() == ImgFeature.first() && 2003 EnvFeature->second != ImgFeature.second) { 2004 DP("Incompatible: Value of Image's non-ANY feature is not matching with " 2005 "the Environment feature's non-ANY value \t[Image: " 2006 "%s]\t:\t[Environment: %s]\n", 2007 ImgTID.data(), EnvTID.data()); 2008 return false; 2009 } 2010 } 2011 2012 // Image is compatible if all features of Environment are: 2013 // - either, present in the Image's features map with the same sign, 2014 // - or, the feature is missing from Image's features map i.e. it is 2015 // set to ANY 2016 DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: " 2017 "%s]\n", 2018 ImgTID.data(), EnvTID.data()); 2019 return true; 2020 } 2021 2022 extern "C" { 2023 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { 2024 return elfMachineIdIsAmdgcn(Image); 2025 } 2026 2027 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, 2028 __tgt_image_info *info) { 2029 if (!__tgt_rtl_is_valid_binary(image)) 2030 return false; 2031 2032 // A subarchitecture was not specified. Assume it is compatible. 2033 if (!info->Arch) 2034 return true; 2035 2036 int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); 2037 2038 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { 2039 __tgt_rtl_init_device(DeviceId); 2040 hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId]; 2041 hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); 2042 if (err != HSA_STATUS_SUCCESS) { 2043 DP("Error iterating ISAs\n"); 2044 return false; 2045 } 2046 if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId])) 2047 return false; 2048 } 2049 DP("Image has Target ID compatible with the current environment: %s\n", 2050 info->Arch); 2051 return true; 2052 } 2053 2054 int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; } 2055 int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; } 2056 2057 int __tgt_rtl_number_of_devices() { 2058 // If the construction failed, no methods are safe to call 2059 if (DeviceInfo().ConstructionSucceeded) { 2060 return DeviceInfo().NumberOfDevices; 2061 } 2062 DP("AMDGPU plugin construction failed. 
Zero devices available\n");
2063   return 0;
2064 }
2065
2066 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
2067   DP("Init requires flags to %ld\n", RequiresFlags);
2068   DeviceInfo().RequiresFlags = RequiresFlags;
2069   return RequiresFlags;
2070 }
2071
2072 int32_t __tgt_rtl_init_device(int DeviceId) {
2073   hsa_status_t Err = hsa_init();
2074   if (Err != HSA_STATUS_SUCCESS) {
2075     DP("HSA Initialization Failed.\n");
2076     return HSA_STATUS_ERROR;
2077   }
2078   // This is the per-device-id initialization
2079   DP("Initialize the device id: %d\n", DeviceId);
2080
2081   hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2082
2083   // Get the number of compute units
2084   uint32_t ComputeUnits = 0;
2085   Err = hsa_agent_get_info(
2086       Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2087       &ComputeUnits);
2088   if (Err != HSA_STATUS_SUCCESS) {
2089     DeviceInfo().ComputeUnits[DeviceId] = 1;
2090     DP("Error getting compute units: setting to 1\n");
2091   } else {
2092     DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
2093     DP("Using %d compute units per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
2094   }
2095
2096   char GetInfoName[64]; // 64 is the max size returned by get info
2097   Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2098                            (void *)GetInfoName);
2099   if (Err) {
2100     DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
2101   } else {
2102     DeviceInfo().GPUName[DeviceId] = GetInfoName;
2103   }
2104
2105   if (print_kernel_trace & STARTUP_DETAILS)
2106     DP("Device#%-2d CU's: %2d %s\n", DeviceId,
2107        DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str());
2108
2109   // Query attributes to determine number of threads/block and blocks/grid.
2110   uint16_t WorkgroupMaxDim[3];
2111   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
2112                            &WorkgroupMaxDim);
2113   if (Err != HSA_STATUS_SUCCESS) {
2114     DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
2115     DP("Error getting grid dims: num groups: %d\n",
2116        RTLDeviceInfoTy::DefaultNumTeams);
2117   } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
2118     DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
2119     DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]);
2120   } else {
2121     DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
2122     DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
2123        "at the hard limit\n",
2124        WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
2125   }
2126
2127   // Get thread limit
2128   hsa_dim3_t GridMaxDim;
2129   Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
2130   if (Err == HSA_STATUS_SUCCESS) {
2131     DeviceInfo().ThreadsPerGroup[DeviceId] =
2132         reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
2133         DeviceInfo().GroupsPerDevice[DeviceId];
2134
2135     if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
2136       DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2137       DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2138     } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
2139                                  RTLDeviceInfoTy::MaxWgSize)) {
2140       DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2141     } else {
2142       DP("Using ROCm queried thread limit: %d\n",
2143          DeviceInfo().ThreadsPerGroup[DeviceId]);
2144     }
2145   } else {
2146     DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2147     DP("Error getting max block dimension, using default: %d\n",
2148        RTLDeviceInfoTy::MaxWgSize);
2149   }
2150
2151   // Get wavefront
size 2152 uint32_t WavefrontSize = 0; 2153 Err = 2154 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); 2155 if (Err == HSA_STATUS_SUCCESS) { 2156 DP("Queried wavefront size: %d\n", WavefrontSize); 2157 DeviceInfo().WarpSize[DeviceId] = WavefrontSize; 2158 } else { 2159 // TODO: Burn the wavefront size into the code object 2160 DP("Warning: Unknown wavefront size, assuming 64\n"); 2161 DeviceInfo().WarpSize[DeviceId] = 64; 2162 } 2163 2164 // Adjust teams to the env variables 2165 2166 if (DeviceInfo().Env.TeamLimit > 0 && 2167 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], 2168 DeviceInfo().Env.TeamLimit))) { 2169 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", 2170 DeviceInfo().Env.TeamLimit); 2171 } 2172 2173 // Set default number of teams 2174 if (DeviceInfo().Env.NumTeams > 0) { 2175 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; 2176 DP("Default number of teams set according to environment %d\n", 2177 DeviceInfo().Env.NumTeams); 2178 } else { 2179 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); 2180 int TeamsPerCU = DefaultTeamsPerCU; 2181 if (TeamsPerCUEnvStr) { 2182 TeamsPerCU = std::stoi(TeamsPerCUEnvStr); 2183 } 2184 2185 DeviceInfo().NumTeams[DeviceId] = 2186 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; 2187 DP("Default number of teams = %d * number of compute units %d\n", 2188 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]); 2189 } 2190 2191 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], 2192 DeviceInfo().GroupsPerDevice[DeviceId])) { 2193 DP("Default number of teams exceeds device limit, capping at %d\n", 2194 DeviceInfo().GroupsPerDevice[DeviceId]); 2195 } 2196 2197 // Adjust threads to the env variables 2198 if (DeviceInfo().Env.TeamThreadLimit > 0 && 2199 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], 2200 DeviceInfo().Env.TeamThreadLimit))) { 2201 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", 2202 DeviceInfo().Env.TeamThreadLimit); 2203 } 2204 2205 // Set default number of threads 2206 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; 2207 DP("Default number of threads set according to library's default %d\n", 2208 RTLDeviceInfoTy::DefaultWgSize); 2209 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], 2210 DeviceInfo().ThreadsPerGroup[DeviceId])) { 2211 DP("Default number of threads exceeds device limit, capping at %d\n", 2212 DeviceInfo().ThreadsPerGroup[DeviceId]); 2213 } 2214 2215 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", 2216 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId], 2217 DeviceInfo().ThreadsPerGroup[DeviceId]); 2218 2219 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, 2220 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId], 2221 DeviceInfo().GroupsPerDevice[DeviceId], 2222 DeviceInfo().GroupsPerDevice[DeviceId] * 2223 DeviceInfo().ThreadsPerGroup[DeviceId]); 2224 2225 return OFFLOAD_SUCCESS; 2226 } 2227 2228 static __tgt_target_table * 2229 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); 2230 2231 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, 2232 __tgt_device_image *Image) { 2233 DeviceInfo().LoadRunLock.lock(); 2234 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); 2235 DeviceInfo().LoadRunLock.unlock(); 2236 return Res; 2237 } 2238 2239 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, 2240 __tgt_device_image *Image) { 2241 // This function loads the 
device image onto gpu[DeviceId] and does other 2242 // per-image initialization work. Specifically: 2243 // 2244 // - Initialize an DeviceEnvironmentTy instance embedded in the 2245 // image at the symbol "omptarget_device_environment" 2246 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. 2247 // 2248 // - Allocate a large array per-gpu (could be moved to init_device) 2249 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size 2250 // - Allocate at least that many bytes of gpu memory 2251 // - Zero initialize it 2252 // - Write the pointer to the symbol omptarget_nvptx_device_State 2253 // 2254 // - Pulls some per-kernel information together from various sources and 2255 // records it in the KernelsList for quicker access later 2256 // 2257 // The initialization can be done before or after loading the image onto the 2258 // gpu. This function presently does a mixture. Using the hsa api to get/set 2259 // the information is simpler to implement, in exchange for more complicated 2260 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes 2261 // back from the gpu vs a hashtable lookup on the host. 2262 2263 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; 2264 2265 DeviceInfo().clearOffloadEntriesTable(DeviceId); 2266 2267 // We do not need to set the ELF version because the caller of this function 2268 // had to do that to decide the right runtime to use 2269 2270 if (!elfMachineIdIsAmdgcn(Image)) 2271 return NULL; 2272 2273 { 2274 auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, 2275 DeviceInfo().Env.DynamicMemSize, Image, ImgSize); 2276 2277 auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; 2278 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; 2279 hsa_status_t Err = moduleRegisterFromMemoryToPlace( 2280 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, 2281 [&](void *Data, size_t Size) { 2282 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { 2283 __atomic_store_n(&DeviceInfo().HostcallRequired, true, 2284 __ATOMIC_RELEASE); 2285 } 2286 return Env.beforeLoading(Data, Size); 2287 }, 2288 DeviceInfo().HSAExecutables); 2289 2290 check("Module registering", Err); 2291 if (Err != HSA_STATUS_SUCCESS) { 2292 const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str(); 2293 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); 2294 2295 if (strcmp(DeviceName, ElfName) != 0) { 2296 DP("Possible gpu arch mismatch: device:%s, image:%s please check" 2297 " compiler flag: -march=<gpu>\n", 2298 DeviceName, ElfName); 2299 } else { 2300 DP("Error loading image onto GPU: %s\n", get_error_string(Err)); 2301 } 2302 2303 return NULL; 2304 } 2305 2306 Err = Env.afterLoading(); 2307 if (Err != HSA_STATUS_SUCCESS) { 2308 return NULL; 2309 } 2310 } 2311 2312 DP("AMDGPU module successfully loaded!\n"); 2313 2314 { 2315 // the device_State array is either large value in bss or a void* that 2316 // needs to be assigned to a pointer to an array of size device_state_bytes 2317 // If absent, it has been deadstripped and needs no setup. 
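    // For illustration only, the checks that follow amount to this decision
    // table (derived from the code below, not additional logic):
    //
    //   symbol missing            -> nothing to initialize, continue the load
    //   size <  sizeof(void *)    -> malformed image, fail the load
    //   size == sizeof(void *)    -> pointer form: implCalloc() an array of
    //                                getDeviceStateBytes() bytes and write its
    //                                device address through the symbol
    //   size >  sizeof(void *)    -> the array itself lives in .bss, nothing
    //                                further to do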
2318
2319     void *StatePtr;
2320     uint32_t StatePtrSize;
2321     auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2322     hsa_status_t Err = interop_hsa_get_symbol_info(
2323         SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2324         &StatePtrSize);
2325
2326     if (Err != HSA_STATUS_SUCCESS) {
2327       DP("No device_state symbol found, skipping initialization\n");
2328     } else {
2329       if (StatePtrSize < sizeof(void *)) {
2330         DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
2331            sizeof(void *));
2332         return NULL;
2333       }
2334
2335       // If it's larger than a void*, assume it's a bss array and no further
2336       // initialization is required. Only try to set up a pointer for
2337       // sizeof(void*).
2338       if (StatePtrSize == sizeof(void *)) {
2339         uint64_t DeviceStateBytes =
2340             getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2341         if (DeviceStateBytes == 0) {
2342           DP("Can't initialize device_State, missing size information\n");
2343           return NULL;
2344         }
2345
2346         auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2347         if (DSS.first.get() == nullptr) {
2348           assert(DSS.second == 0);
2349           void *Ptr = NULL;
2350           hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2351           if (Err != HSA_STATUS_SUCCESS) {
2352             DP("Failed to allocate device_state array\n");
2353             return NULL;
2354           }
2355           DSS = {
2356               std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2357               DeviceStateBytes,
2358           };
2359         }
2360
2361         void *Ptr = DSS.first.get();
2362         if (DeviceStateBytes != DSS.second) {
2363           DP("Inconsistent sizes of device_State are unsupported\n");
2364           return NULL;
2365         }
2366
2367         // Write the pointer to device memory so it can be used by later kernels
2368         Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *),
2369                                                    DeviceId);
2370         if (Err != HSA_STATUS_SUCCESS) {
2371           DP("memcpy install of state_ptr failed\n");
2372           return NULL;
2373         }
2374       }
2375     }
2376   }
2377
2378   // Here, we take advantage of the data that is appended after img_end to get
2379   // the symbols' names we need to load. This data consists of the host entries'
2380   // begin and end as well as the target name (see the offloading linker script
2381   // creation in the clang compiler).
2382
2383   // Find the symbols in the module by name. The name can be obtained by
2384   // concatenating the host entry name with the target name.
2385
2386   __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2387   __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2388
2389   for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2390
2391     if (!E->addr) {
2392       // The host should always have something in the address to
2393       // uniquely identify the target region.
2394 DP("Analyzing host entry '<null>' (size = %lld)...\n", 2395 (unsigned long long)E->size); 2396 return NULL; 2397 } 2398 2399 if (E->size) { 2400 __tgt_offload_entry Entry = *E; 2401 2402 void *Varptr; 2403 uint32_t Varsize; 2404 2405 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; 2406 hsa_status_t Err = interop_hsa_get_symbol_info( 2407 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); 2408 2409 if (Err != HSA_STATUS_SUCCESS) { 2410 // Inform the user what symbol prevented offloading 2411 DP("Loading global '%s' (Failed)\n", E->name); 2412 return NULL; 2413 } 2414 2415 if (Varsize != E->size) { 2416 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, 2417 Varsize, E->size); 2418 return NULL; 2419 } 2420 2421 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 2422 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); 2423 Entry.addr = (void *)Varptr; 2424 2425 DeviceInfo().addOffloadEntry(DeviceId, Entry); 2426 2427 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 2428 E->flags & OMP_DECLARE_TARGET_LINK) { 2429 // If unified memory is present any target link variables 2430 // can access host addresses directly. There is no longer a 2431 // need for device copies. 2432 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, 2433 sizeof(void *), DeviceId); 2434 if (Err != HSA_STATUS_SUCCESS) 2435 DP("Error when copying USM\n"); 2436 DP("Copy linked variable host address (" DPxMOD ")" 2437 "to device address (" DPxMOD ")\n", 2438 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); 2439 } 2440 2441 continue; 2442 } 2443 2444 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); 2445 2446 // errors in kernarg_segment_size previously treated as = 0 (or as undef) 2447 uint32_t KernargSegmentSize = 0; 2448 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; 2449 hsa_status_t Err = HSA_STATUS_SUCCESS; 2450 if (!E->name) { 2451 Err = HSA_STATUS_ERROR; 2452 } else { 2453 std::string KernelStr = std::string(E->name); 2454 auto It = KernelInfoMap.find(KernelStr); 2455 if (It != KernelInfoMap.end()) { 2456 atl_kernel_info_t Info = It->second; 2457 KernargSegmentSize = Info.kernel_segment_size; 2458 } else { 2459 Err = HSA_STATUS_ERROR; 2460 } 2461 } 2462 2463 // default value GENERIC (in case symbol is missing from cubin file) 2464 llvm::omp::OMPTgtExecModeFlags ExecModeVal = 2465 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; 2466 2467 // get flat group size if present, else Default_WG_Size 2468 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2469 2470 // get Kernel Descriptor if present. 
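    // For illustration, the per-kernel metadata symbols looked up from here on
    // all derive from the entry name; e.g. for a (hypothetical) kernel entry
    // named "__omp_offloading_foo" the image may provide
    //   __omp_offloading_foo_kern_desc   packed descriptor (version, size, WG size)
    //   __omp_offloading_foo_wg_size     fallback flat workgroup size
    //   __omp_offloading_foo_exec_mode   GENERIC vs SPMD execution mode
    // Any symbol that is missing falls back to the defaults set above.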
2471 // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp 2472 struct KernDescValType { 2473 uint16_t Version; 2474 uint16_t TSize; 2475 uint16_t WGSize; 2476 }; 2477 struct KernDescValType KernDescVal; 2478 std::string KernDescNameStr(E->name); 2479 KernDescNameStr += "_kern_desc"; 2480 const char *KernDescName = KernDescNameStr.c_str(); 2481 2482 void *KernDescPtr; 2483 uint32_t KernDescSize; 2484 void *CallStackAddr = nullptr; 2485 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, 2486 &KernDescPtr, &KernDescSize); 2487 2488 if (Err == HSA_STATUS_SUCCESS) { 2489 if ((size_t)KernDescSize != sizeof(KernDescVal)) 2490 DP("Loading global computation properties '%s' - size mismatch (%u != " 2491 "%lu)\n", 2492 KernDescName, KernDescSize, sizeof(KernDescVal)); 2493 2494 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); 2495 2496 // Check structure size against recorded size. 2497 if ((size_t)KernDescSize != KernDescVal.TSize) 2498 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", 2499 sizeof(KernDescVal), KernDescVal.TSize, KernDescName); 2500 2501 DP("After loading global for %s KernDesc \n", KernDescName); 2502 DP("KernDesc: Version: %d\n", KernDescVal.Version); 2503 DP("KernDesc: TSize: %d\n", KernDescVal.TSize); 2504 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); 2505 2506 if (KernDescVal.WGSize == 0) { 2507 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; 2508 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); 2509 } 2510 WGSizeVal = KernDescVal.WGSize; 2511 DP("WGSizeVal %d\n", WGSizeVal); 2512 check("Loading KernDesc computation property", Err); 2513 } else { 2514 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); 2515 2516 // Flat group size 2517 std::string WGSizeNameStr(E->name); 2518 WGSizeNameStr += "_wg_size"; 2519 const char *WGSizeName = WGSizeNameStr.c_str(); 2520 2521 void *WGSizePtr; 2522 uint32_t WGSize; 2523 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, 2524 &WGSizePtr, &WGSize); 2525 2526 if (Err == HSA_STATUS_SUCCESS) { 2527 if ((size_t)WGSize != sizeof(int16_t)) { 2528 DP("Loading global computation properties '%s' - size mismatch (%u " 2529 "!= " 2530 "%lu)\n", 2531 WGSizeName, WGSize, sizeof(int16_t)); 2532 return NULL; 2533 } 2534 2535 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); 2536 2537 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); 2538 2539 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || 2540 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { 2541 DP("Error wrong WGSize value specified in HSA code object file: " 2542 "%d\n", 2543 WGSizeVal); 2544 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2545 } 2546 } else { 2547 DP("Warning: Loading WGSize '%s' - symbol not found, " 2548 "using default value %d\n", 2549 WGSizeName, WGSizeVal); 2550 } 2551 2552 check("Loading WGSize computation property", Err); 2553 } 2554 2555 // Read execution mode from global in binary 2556 std::string ExecModeNameStr(E->name); 2557 ExecModeNameStr += "_exec_mode"; 2558 const char *ExecModeName = ExecModeNameStr.c_str(); 2559 2560 void *ExecModePtr; 2561 uint32_t VarSize; 2562 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, 2563 &ExecModePtr, &VarSize); 2564 2565 if (Err == HSA_STATUS_SUCCESS) { 2566 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 2567 DP("Loading global computation properties '%s' - size mismatch(%u != " 2568 "%lu)\n", 2569 ExecModeName, VarSize, 
sizeof(llvm::omp::OMPTgtExecModeFlags)); 2570 return NULL; 2571 } 2572 2573 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); 2574 2575 DP("After loading global for %s ExecMode = %d\n", ExecModeName, 2576 ExecModeVal); 2577 2578 if (ExecModeVal < 0 || 2579 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { 2580 DP("Error wrong exec_mode value specified in HSA code object file: " 2581 "%d\n", 2582 ExecModeVal); 2583 return NULL; 2584 } 2585 } else { 2586 DP("Loading global exec_mode '%s' - symbol missing, using default " 2587 "value " 2588 "GENERIC (1)\n", 2589 ExecModeName); 2590 } 2591 check("Loading computation property", Err); 2592 2593 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, 2594 CallStackAddr, E->name, KernargSegmentSize, 2595 DeviceInfo().KernArgPool)); 2596 __tgt_offload_entry Entry = *E; 2597 Entry.addr = (void *)&KernelsList.back(); 2598 DeviceInfo().addOffloadEntry(DeviceId, Entry); 2599 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); 2600 } 2601 2602 return DeviceInfo().getOffloadEntriesTable(DeviceId); 2603 } 2604 2605 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { 2606 void *Ptr = NULL; 2607 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2608 2609 if (Kind != TARGET_ALLOC_DEFAULT) { 2610 REPORT("Invalid target data allocation kind or requested allocator not " 2611 "implemented yet\n"); 2612 return NULL; 2613 } 2614 2615 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); 2616 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); 2617 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, 2618 (long long unsigned)(Elf64_Addr)Ptr); 2619 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; 2620 return Ptr; 2621 } 2622 2623 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, 2624 int64_t Size) { 2625 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2626 __tgt_async_info AsyncInfo; 2627 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); 2628 if (Rc != OFFLOAD_SUCCESS) 2629 return OFFLOAD_FAIL; 2630 2631 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2632 } 2633 2634 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, 2635 int64_t Size, __tgt_async_info *AsyncInfo) { 2636 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2637 if (AsyncInfo) { 2638 initAsyncInfo(AsyncInfo); 2639 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); 2640 } 2641 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); 2642 } 2643 2644 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, 2645 int64_t Size) { 2646 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2647 __tgt_async_info AsyncInfo; 2648 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); 2649 if (Rc != OFFLOAD_SUCCESS) 2650 return OFFLOAD_FAIL; 2651 2652 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 2653 } 2654 2655 int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, 2656 int64_t Size, 2657 __tgt_async_info *AsyncInfo) { 2658 assert(AsyncInfo && "AsyncInfo is nullptr"); 2659 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2660 initAsyncInfo(AsyncInfo); 2661 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); 2662 } 2663 2664 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) { 2665 assert(DeviceId < 
         DeviceInfo().NumberOfDevices && "Device ID too large");
2666   hsa_status_t Err;
2667   DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
2668   Err = core::Runtime::Memfree(TgtPtr);
2669   if (Err != HSA_STATUS_SUCCESS) {
2670     DP("Error when freeing device memory\n");
2671     return OFFLOAD_FAIL;
2672   }
2673   return OFFLOAD_SUCCESS;
2674 }
2675
2676 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
2677                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
2678                                          int32_t ArgNum, int32_t NumTeams,
2679                                          int32_t ThreadLimit,
2680                                          uint64_t LoopTripcount) {
2681
2682   DeviceInfo().LoadRunLock.lock_shared();
2683   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2684                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2685
2686   DeviceInfo().LoadRunLock.unlock_shared();
2687   return Res;
2688 }
2689
2690 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
2691                                     void **TgtArgs, ptrdiff_t *TgtOffsets,
2692                                     int32_t ArgNum) {
2693   // Use one team and one thread,
2694   // with the default thread limit.
2695   int32_t TeamNum = 1;
2696   int32_t ThreadLimit = 0; // use default
2697   return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
2698                                           TgtOffsets, ArgNum, TeamNum,
2699                                           ThreadLimit, 0);
2700 }
2701
2702 int32_t __tgt_rtl_run_target_team_region_async(
2703     int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
2704     int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
2705     uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
2706   assert(AsyncInfo && "AsyncInfo is nullptr");
2707   initAsyncInfo(AsyncInfo);
2708
2709   DeviceInfo().LoadRunLock.lock_shared();
2710   int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2711                                 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2712
2713   DeviceInfo().LoadRunLock.unlock_shared();
2714   return Res;
2715 }
2716
2717 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
2718                                           void **TgtArgs, ptrdiff_t *TgtOffsets,
2719                                           int32_t ArgNum,
2720                                           __tgt_async_info *AsyncInfo) {
2721   // Use one team and one thread,
2722   // with the default thread limit.
2723   int32_t TeamNum = 1;
2724   int32_t ThreadLimit = 0; // use default
2725   return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
2726                                                 TgtOffsets, ArgNum, TeamNum,
2727                                                 ThreadLimit, 0, AsyncInfo);
2728 }
2729
2730 int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
2731   assert(AsyncInfo && "AsyncInfo is nullptr");
2732
2733   // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
2734   // invariant is not ensured by devices.cpp for amdgcn.
2735   // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2736   if (AsyncInfo->Queue) {
2737     finiAsyncInfo(AsyncInfo);
2738   }
2739   return OFFLOAD_SUCCESS;
2740 }
2741
2742 void __tgt_rtl_print_device_info(int32_t DeviceId) {
2743   // TODO: Add an assertion to check that DeviceId is valid.
2744   // NOTE: We don't need to set a context to print device info.
2745
2746   DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
2747 }
2748
2749 } // extern "C"
2750
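// Usage sketch (illustration only, not part of the plugin): the entry points
// above are driven by libomptarget rather than called directly by application
// code. A host program such as
//
//   int X[64] = {0};
//   #pragma omp target teams distribute parallel for map(tofrom : X)
//   for (int I = 0; I < 64; ++I)
//     X[I] = I;
//
// reaches this plugin roughly as: __tgt_rtl_load_binary for the device image,
// __tgt_rtl_data_alloc / __tgt_rtl_data_submit for the map clause,
// __tgt_rtl_run_target_team_region for the region itself, and
// __tgt_rtl_data_retrieve / __tgt_rtl_data_delete when the mapping is released.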