1 //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for AMD hsa machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <algorithm> 14 #include <assert.h> 15 #include <cstdio> 16 #include <cstdlib> 17 #include <cstring> 18 #include <functional> 19 #include <libelf.h> 20 #include <list> 21 #include <memory> 22 #include <mutex> 23 #include <shared_mutex> 24 #include <unordered_map> 25 #include <vector> 26 27 #include "impl_runtime.h" 28 #include "interop_hsa.h" 29 30 #include "internal.h" 31 #include "rt.h" 32 33 #include "DeviceEnvironment.h" 34 #include "get_elf_mach_gfx_name.h" 35 #include "omptargetplugin.h" 36 #include "print_tracing.h" 37 38 #include "llvm/ADT/StringMap.h" 39 #include "llvm/ADT/StringRef.h" 40 #include "llvm/Frontend/OpenMP/OMPConstants.h" 41 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 42 43 using namespace llvm; 44 45 // hostrpc interface, FIXME: consider moving to its own include these are 46 // statically linked into amdgpu/plugin if present from hostrpc_services.a, 47 // linked as --whole-archive to override the weak symbols that are used to 48 // implement a fallback for toolchains that do not yet have a hostrpc library. 49 extern "C" { 50 uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, 51 uint32_t DeviceId); 52 hsa_status_t hostrpc_init(); 53 hsa_status_t hostrpc_terminate(); 54 55 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } 56 __attribute__((weak)) hsa_status_t hostrpc_terminate() { 57 return HSA_STATUS_SUCCESS; 58 } 59 __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, 60 uint32_t DeviceId) { 61 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " 62 "missing\n", 63 DeviceId); 64 return 0; 65 } 66 } 67 68 // Heuristic parameters used for kernel launch 69 // Number of teams per CU to allow scheduling flexibility 70 static const unsigned DefaultTeamsPerCU = 4; 71 72 int print_kernel_trace; 73 74 #ifdef OMPTARGET_DEBUG 75 #define check(msg, status) \ 76 if (status != HSA_STATUS_SUCCESS) { \ 77 DP(#msg " failed\n"); \ 78 } else { \ 79 DP(#msg " succeeded\n"); \ 80 } 81 #else 82 #define check(msg, status) \ 83 {} 84 #endif 85 86 #include "elf_common.h" 87 88 namespace hsa { 89 template <typename C> hsa_status_t iterate_agents(C Cb) { 90 auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { 91 C *Unwrapped = static_cast<C *>(Data); 92 return (*Unwrapped)(Agent); 93 }; 94 return hsa_iterate_agents(L, static_cast<void *>(&Cb)); 95 } 96 97 template <typename C> 98 hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { 99 auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { 100 C *Unwrapped = static_cast<C *>(Data); 101 return (*Unwrapped)(MemoryPool); 102 }; 103 104 return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb)); 105 } 106 107 } // namespace hsa 108 109 /// Keep entries table per device 110 struct FuncOrGblEntryTy { 111 __tgt_target_table Table; 112 std::vector<__tgt_offload_entry> Entries; 113 }; 114 115 struct KernelArgPool { 116 private: 117 static pthread_mutex_t Mutex; 118 119 
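  // Layout note (descriptive only, added for clarity; mirrors the code
  // below): the pool is one hsa_amd_memory_pool_allocate'd slab holding
  // MAX_NUM_KERNELS fixed-size kernarg slots. allocate() pops a free slot
  // index I and returns
  //   static_cast<char *>(KernargRegion) + I * kernargSizeIncludingImplicit()
  // while deallocate() maps the pointer back to I and pushes it again.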
public:
  uint32_t KernargSegmentSize;
  void *KernargRegion = nullptr;
  std::queue<int> FreeKernargSegments;

  uint32_t kernargSizeIncludingImplicit() {
    return KernargSegmentSize + sizeof(impl_implicit_args_t);
  }

  ~KernelArgPool() {
    if (KernargRegion) {
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
      : KernargSegmentSize(KernargSegmentSize) {

    // impl uses one pool per kernel for all gpus, with a fixed upper size.
    // That exact scheme is preserved here, including the queue<int>.

    hsa_status_t Err = hsa_amd_memory_pool_allocate(
        MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
        &KernargRegion);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
      KernargRegion = nullptr; // paranoid
      return;
    }

    Err = core::allow_access_to_all_gpu_agents(KernargRegion);
    if (Err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(Err));
      auto R = hsa_amd_memory_pool_free(KernargRegion);
      if (R != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(R));
      }
      KernargRegion = nullptr;
      return;
    }

    for (int I = 0; I < MAX_NUM_KERNELS; I++) {
      FreeKernargSegments.push(I);
    }
  }

  void *allocate(uint64_t ArgNum) {
    assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
    Lock L(&Mutex);
    void *Res = nullptr;
    if (!FreeKernargSegments.empty()) {

      int FreeIdx = FreeKernargSegments.front();
      Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
                                (FreeIdx * kernargSizeIncludingImplicit()));
      assert(FreeIdx == pointerToIndex(Res));
      FreeKernargSegments.pop();
    }
    return Res;
  }

  void deallocate(void *Ptr) {
    Lock L(&Mutex);
    int Idx = pointerToIndex(Ptr);
    FreeKernargSegments.push(Idx);
  }

private:
  int pointerToIndex(void *Ptr) {
    ptrdiff_t Bytes =
        static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
    assert(Bytes >= 0);
    assert(Bytes % kernargSizeIncludingImplicit() == 0);
    return Bytes / kernargSizeIncludingImplicit();
  }
  struct Lock {
    Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
    ~Lock() { pthread_mutex_unlock(M); }
    pthread_mutex_t *M;
  };
};
pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
  int16_t ConstWGSize;
  int32_t DeviceId;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
           int32_t DeviceId, void *CallStackAddr, const char *Name,
           uint32_t KernargSegmentSize,
           hsa_amd_memory_pool_t &KernArgMemoryPool,
           std::unordered_map<std::string, std::unique_ptr<KernelArgPool>>
               &KernelArgPoolMap)
      : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
        DeviceId(DeviceId),
CallStackAddr(CallStackAddr), Name(Name) { 229 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); 230 231 std::string N(Name); 232 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { 233 KernelArgPoolMap.insert( 234 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( 235 KernargSegmentSize, KernArgMemoryPool)))); 236 } 237 } 238 }; 239 240 template <typename Callback> static hsa_status_t findAgents(Callback CB) { 241 242 hsa_status_t Err = 243 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { 244 hsa_device_type_t DeviceType; 245 // get_info fails iff HSA runtime not yet initialized 246 hsa_status_t Err = 247 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); 248 249 if (Err != HSA_STATUS_SUCCESS) { 250 if (print_kernel_trace > 0) 251 DP("rtl.cpp: err %s\n", get_error_string(Err)); 252 253 return Err; 254 } 255 256 CB(DeviceType, Agent); 257 return HSA_STATUS_SUCCESS; 258 }); 259 260 // iterate_agents fails iff HSA runtime not yet initialized 261 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { 262 DP("rtl.cpp: err %s\n", get_error_string(Err)); 263 } 264 265 return Err; 266 } 267 268 static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, 269 void *Data) { 270 if (Status != HSA_STATUS_SUCCESS) { 271 const char *StatusString; 272 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { 273 StatusString = "unavailable"; 274 } 275 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, 276 Status, StatusString); 277 abort(); 278 } 279 } 280 281 namespace core { 282 namespace { 283 284 bool checkResult(hsa_status_t Err, const char *ErrMsg) { 285 if (Err == HSA_STATUS_SUCCESS) 286 return true; 287 288 REPORT("%s", ErrMsg); 289 REPORT("%s", get_error_string(Err)); 290 return false; 291 } 292 293 void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { 294 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); 295 } 296 297 uint16_t createHeader() { 298 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; 299 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; 300 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; 301 return Header; 302 } 303 304 hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { 305 bool AllocAllowed = false; 306 hsa_status_t Err = hsa_amd_memory_pool_get_info( 307 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, 308 &AllocAllowed); 309 if (Err != HSA_STATUS_SUCCESS) { 310 DP("Alloc allowed in memory pool check failed: %s\n", 311 get_error_string(Err)); 312 return Err; 313 } 314 315 size_t Size = 0; 316 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, 317 &Size); 318 if (Err != HSA_STATUS_SUCCESS) { 319 DP("Get memory pool size failed: %s\n", get_error_string(Err)); 320 return Err; 321 } 322 323 return (AllocAllowed && Size > 0) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; 324 } 325 326 hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { 327 std::vector<hsa_amd_memory_pool_t> *Result = 328 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); 329 330 hsa_status_t Err; 331 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { 332 return Err; 333 } 334 335 Result->push_back(MemoryPool); 336 return HSA_STATUS_SUCCESS; 337 } 338 339 } // namespace 340 } // namespace core 341 342 struct EnvironmentVariables { 343 int NumTeams; 344 int TeamLimit; 345 int TeamThreadLimit; 346 int MaxTeamsDefault; 347 int DynamicMemSize; 348 }; 349 350 template <uint32_t wavesize> 351 static constexpr const llvm::omp::GV &getGridValue() { 352 return llvm::omp::getAMDGPUGridValues<wavesize>(); 353 } 354 355 struct HSALifetime { 356 // Wrapper around HSA used to ensure it is constructed before other types 357 // and destructed after, which means said other types can use raii for 358 // cleanup without risking running outside of the lifetime of HSA 359 const hsa_status_t S; 360 361 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } 362 HSALifetime() : S(hsa_init()) {} 363 364 ~HSALifetime() { 365 if (S == HSA_STATUS_SUCCESS) { 366 hsa_status_t Err = hsa_shut_down(); 367 if (Err != HSA_STATUS_SUCCESS) { 368 // Can't call into HSA to get a string from the integer 369 DP("Shutting down HSA failed: %d\n", Err); 370 } 371 } 372 } 373 }; 374 375 // Handle scheduling of multiple hsa_queue's per device to 376 // multiple threads (one scheduler per device) 377 class HSAQueueScheduler { 378 public: 379 HSAQueueScheduler() : Current(0) {} 380 381 HSAQueueScheduler(const HSAQueueScheduler &) = delete; 382 383 HSAQueueScheduler(HSAQueueScheduler &&Q) { 384 Current = Q.Current.load(); 385 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 386 HSAQueues[I] = Q.HSAQueues[I]; 387 Q.HSAQueues[I] = nullptr; 388 } 389 } 390 391 // \return false if any HSA queue creation fails 392 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { 393 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 394 hsa_queue_t *Q = nullptr; 395 hsa_status_t Rc = 396 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, 397 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); 398 if (Rc != HSA_STATUS_SUCCESS) { 399 DP("Failed to create HSA queue %d\n", I); 400 return false; 401 } 402 HSAQueues[I] = Q; 403 } 404 return true; 405 } 406 407 ~HSAQueueScheduler() { 408 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { 409 if (HSAQueues[I]) { 410 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); 411 if (Err != HSA_STATUS_SUCCESS) 412 DP("Error destroying HSA queue"); 413 } 414 } 415 } 416 417 // \return next queue to use for device 418 hsa_queue_t *next() { 419 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % 420 NUM_QUEUES_PER_DEVICE]; 421 } 422 423 private: 424 // Number of queues per device 425 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; 426 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; 427 std::atomic<uint8_t> Current; 428 }; 429 430 /// Class containing all the device information 431 class RTLDeviceInfoTy : HSALifetime { 432 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; 433 434 struct QueueDeleter { 435 void operator()(hsa_queue_t *Q) { 436 if (Q) { 437 hsa_status_t Err = hsa_queue_destroy(Q); 438 if (Err != HSA_STATUS_SUCCESS) { 439 DP("Error destroying hsa queue: %s\n", get_error_string(Err)); 440 } 441 } 442 } 443 }; 444 445 public: 446 bool ConstructionSucceeded = false; 447 448 // 
load binary populates symbol tables and mutates various global state 449 // run uses those symbol tables 450 std::shared_timed_mutex LoadRunLock; 451 452 int NumberOfDevices = 0; 453 454 /// List that contains all the kernels. 455 /// FIXME: we may need this to be per device and per library. 456 std::list<KernelTy> KernelsList; 457 std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>> 458 KernelArgPoolMap; 459 460 // GPU devices 461 std::vector<hsa_agent_t> HSAAgents; 462 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu 463 464 // CPUs 465 std::vector<hsa_agent_t> CPUAgents; 466 467 // Device properties 468 std::vector<int> ComputeUnits; 469 std::vector<int> GroupsPerDevice; 470 std::vector<int> ThreadsPerGroup; 471 std::vector<int> WarpSize; 472 std::vector<std::string> GPUName; 473 std::vector<std::string> TargetID; 474 475 // OpenMP properties 476 std::vector<int> NumTeams; 477 std::vector<int> NumThreads; 478 479 // OpenMP Environment properties 480 EnvironmentVariables Env; 481 482 // OpenMP Requires Flags 483 int64_t RequiresFlags; 484 485 // Resource pools 486 SignalPoolT FreeSignalPool; 487 488 bool HostcallRequired = false; 489 490 std::vector<hsa_executable_t> HSAExecutables; 491 492 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; 493 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; 494 495 hsa_amd_memory_pool_t KernArgPool; 496 497 // fine grained memory pool for host allocations 498 hsa_amd_memory_pool_t HostFineGrainedMemoryPool; 499 500 // fine and coarse-grained memory pools per offloading device 501 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools; 502 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools; 503 504 struct ImplFreePtrDeletor { 505 void operator()(void *P) { 506 core::Runtime::Memfree(P); // ignore failure to free 507 } 508 }; 509 510 // device_State shared across loaded binaries, error if inconsistent size 511 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>> 512 DeviceStateStore; 513 514 static const unsigned HardTeamLimit = 515 (1 << 16) - 1; // 64K needed to fit in uint16 516 static const int DefaultNumTeams = 128; 517 518 // These need to be per-device since different devices can have different 519 // wave sizes, but are currently the same number for each so that refactor 520 // can be postponed. 
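  // An illustrative sketch of the eventual per-device form (hypothetical
  // helper, not used anywhere; assumes the only wave sizes are 32 and 64):
  //   static constexpr int maxTeamsForWave(unsigned WaveSize) {
  //     return WaveSize == 32 ? getGridValue<32>().GV_Max_Teams
  //                           : getGridValue<64>().GV_Max_Teams;
  //   }
  // Until that refactor happens, the two instantiations are asserted to agree
  // and a single constant is used for every device.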
521 static_assert(getGridValue<32>().GV_Max_Teams == 522 getGridValue<64>().GV_Max_Teams, 523 ""); 524 static const int MaxTeams = getGridValue<64>().GV_Max_Teams; 525 526 static_assert(getGridValue<32>().GV_Max_WG_Size == 527 getGridValue<64>().GV_Max_WG_Size, 528 ""); 529 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; 530 531 static_assert(getGridValue<32>().GV_Default_WG_Size == 532 getGridValue<64>().GV_Default_WG_Size, 533 ""); 534 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; 535 536 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, 537 hsa_agent_t, hsa_amd_memory_pool_t); 538 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, 539 MemcpyFunc Func, int32_t DeviceId) { 540 hsa_agent_t Agent = HSAAgents[DeviceId]; 541 hsa_signal_t S = FreeSignalPool.pop(); 542 if (S.handle == 0) { 543 return HSA_STATUS_ERROR; 544 } 545 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); 546 FreeSignalPool.push(S); 547 return R; 548 } 549 550 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, 551 int32_t DeviceId) { 552 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); 553 } 554 555 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, 556 int32_t DeviceId) { 557 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); 558 } 559 560 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { 561 char TmpChar[1000]; 562 uint16_t Major, Minor; 563 uint32_t TmpUInt; 564 uint32_t TmpUInt2; 565 uint32_t CacheSize[4]; 566 bool TmpBool; 567 uint16_t WorkgroupMaxDim[3]; 568 hsa_dim3_t GridMaxDim; 569 570 // Getting basic information about HSA and Device 571 core::checkResult( 572 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), 573 "Error from hsa_system_get_info when obtaining " 574 "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); 575 core::checkResult( 576 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), 577 "Error from hsa_system_get_info when obtaining " 578 "HSA_SYSTEM_INFO_VERSION_MINOR\n"); 579 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); 580 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); 581 core::checkResult( 582 hsa_agent_get_info( 583 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), 584 "Error returned from hsa_agent_get_info when obtaining " 585 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); 586 printf(" Product Name: \t\t\t%s \n", TmpChar); 587 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), 588 "Error returned from hsa_agent_get_info when obtaining " 589 "HSA_AGENT_INFO_NAME\n"); 590 printf(" Device Name: \t\t\t%s \n", TmpChar); 591 core::checkResult( 592 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), 593 "Error returned from hsa_agent_get_info when obtaining " 594 "HSA_AGENT_INFO_NAME\n"); 595 printf(" Vendor Name: \t\t\t%s \n", TmpChar); 596 hsa_device_type_t DevType; 597 core::checkResult( 598 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), 599 "Error returned from hsa_agent_get_info when obtaining " 600 "HSA_AGENT_INFO_DEVICE\n"); 601 printf(" Device Type: \t\t\t%s \n", 602 DevType == HSA_DEVICE_TYPE_CPU 603 ? "CPU" 604 : (DevType == HSA_DEVICE_TYPE_GPU 605 ? "GPU" 606 : (DevType == HSA_DEVICE_TYPE_DSP ? 
"DSP" : "UNKNOWN"))); 607 core::checkResult( 608 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), 609 "Error returned from hsa_agent_get_info when obtaining " 610 "HSA_AGENT_INFO_QUEUES_MAX\n"); 611 printf(" Max Queues: \t\t\t%u \n", TmpUInt); 612 core::checkResult( 613 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), 614 "Error returned from hsa_agent_get_info when obtaining " 615 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); 616 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); 617 core::checkResult( 618 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), 619 "Error returned from hsa_agent_get_info when obtaining " 620 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); 621 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); 622 623 // Getting cache information 624 printf(" Cache:\n"); 625 626 // FIXME: This is deprecated according to HSA documentation. But using 627 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during 628 // runtime. 629 core::checkResult( 630 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), 631 "Error returned from hsa_agent_get_info when obtaining " 632 "HSA_AGENT_INFO_CACHE_SIZE\n"); 633 634 for (int I = 0; I < 4; I++) { 635 if (CacheSize[I]) { 636 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); 637 } 638 } 639 640 core::checkResult( 641 hsa_agent_get_info(Agent, 642 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, 643 &TmpUInt), 644 "Error returned from hsa_agent_get_info when obtaining " 645 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); 646 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); 647 core::checkResult( 648 hsa_agent_get_info( 649 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, 650 &TmpUInt), 651 "Error returned from hsa_agent_get_info when obtaining " 652 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); 653 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); 654 core::checkResult( 655 hsa_agent_get_info( 656 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, 657 &TmpUInt), 658 "Error returned from hsa_agent_get_info when obtaining " 659 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); 660 printf(" Compute Units: \t\t\t%u \n", TmpUInt); 661 core::checkResult(hsa_agent_get_info( 662 Agent, 663 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, 664 &TmpUInt), 665 "Error returned from hsa_agent_get_info when obtaining " 666 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 667 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); 668 core::checkResult( 669 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), 670 "Error returned from hsa_agent_get_info when obtaining " 671 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); 672 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? 
"TRUE" : "FALSE")); 673 core::checkResult( 674 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), 675 "Error returned from hsa_agent_get_info when obtaining " 676 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); 677 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); 678 core::checkResult( 679 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), 680 "Error returned from hsa_agent_get_info when obtaining " 681 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); 682 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); 683 core::checkResult(hsa_agent_get_info(Agent, 684 HSA_AGENT_INFO_WORKGROUP_MAX_DIM, 685 WorkgroupMaxDim), 686 "Error returned from hsa_agent_get_info when obtaining " 687 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); 688 printf(" Workgroup Max Size per Dimension:\n"); 689 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); 690 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); 691 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); 692 core::checkResult(hsa_agent_get_info( 693 Agent, 694 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, 695 &TmpUInt), 696 "Error returned from hsa_agent_get_info when obtaining " 697 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); 698 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); 699 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); 700 core::checkResult( 701 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), 702 "Error returned from hsa_agent_get_info when obtaining " 703 "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); 704 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); 705 core::checkResult( 706 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), 707 "Error returned from hsa_agent_get_info when obtaining " 708 "HSA_AGENT_INFO_GRID_MAX_DIM\n"); 709 printf(" Grid Max Size per Dimension: \t\t\n"); 710 printf(" x: \t\t\t\t%u\n", GridMaxDim.x); 711 printf(" y: \t\t\t\t%u\n", GridMaxDim.y); 712 printf(" z: \t\t\t\t%u\n", GridMaxDim.z); 713 core::checkResult( 714 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), 715 "Error returned from hsa_agent_get_info when obtaining " 716 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); 717 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); 718 719 printf(" Memory Pools:\n"); 720 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { 721 std::string TmpStr; 722 size_t Size; 723 bool Alloc, Access; 724 hsa_amd_segment_t Segment; 725 hsa_amd_memory_pool_global_flag_t GlobalFlags; 726 core::checkResult( 727 hsa_amd_memory_pool_get_info( 728 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), 729 "Error returned from hsa_amd_memory_pool_get_info when obtaining " 730 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); 731 core::checkResult(hsa_amd_memory_pool_get_info( 732 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), 733 "Error returned from hsa_amd_memory_pool_get_info when " 734 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); 735 736 switch (Segment) { 737 case HSA_AMD_SEGMENT_GLOBAL: 738 TmpStr = "GLOBAL; FLAGS: "; 739 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) 740 TmpStr += "KERNARG, "; 741 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) 742 TmpStr += "FINE GRAINED, "; 743 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) 744 TmpStr += "COARSE GRAINED, "; 745 break; 746 case HSA_AMD_SEGMENT_READONLY: 747 TmpStr = "READONLY"; 748 break; 749 case HSA_AMD_SEGMENT_PRIVATE: 750 TmpStr = "PRIVATE"; 751 break; 752 case HSA_AMD_SEGMENT_GROUP: 753 TmpStr = "GROUP"; 754 break; 755 } 756 printf(" 
Pool %s: \n", TmpStr.c_str());

    core::checkResult(hsa_amd_memory_pool_get_info(
                          Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
                      "Error returned from hsa_amd_memory_pool_get_info when "
                      "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
    printf(" Size: \t\t\t\t %zu bytes\n", Size);
    core::checkResult(
        hsa_amd_memory_pool_get_info(
            Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
        "Error returned from hsa_amd_memory_pool_get_info when obtaining "
        "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
    printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
    core::checkResult(
        hsa_amd_memory_pool_get_info(
            Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
        "Error returned from hsa_amd_memory_pool_get_info when obtaining "
        "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
    printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
    core::checkResult(
        hsa_amd_memory_pool_get_info(
            Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
        "Error returned from hsa_amd_memory_pool_get_info when obtaining "
        "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
    printf(" Runtime Alloc Alignment: \t %zu bytes\n", Size);
    core::checkResult(
        hsa_amd_memory_pool_get_info(
            Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
        "Error returned from hsa_amd_memory_pool_get_info when obtaining "
        "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
    printf(" Accessible by all: \t\t %s\n",
           (Access ? "TRUE" : "FALSE"));

    return HSA_STATUS_SUCCESS;
  };
  // Iterate over all the memory pools for this agent. Get the memory pool
  // type and size.
  hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);

  printf(" ISAs:\n");
  auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
    char TmpChar[1000];
    core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
                      "Error returned from hsa_isa_get_info_alt when "
                      "obtaining HSA_ISA_INFO_NAME\n");
    printf(" Name: \t\t\t\t %s\n", TmpChar);

    return HSA_STATUS_SUCCESS;
  };
  // Iterate over all the memory regions for this agent. 
Get the memory region 806 // type and size 807 hsa_agent_iterate_isas(Agent, CBIsas, nullptr); 808 } 809 810 // Record entry point associated with device 811 void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { 812 assert(DeviceId < (int32_t)FuncGblEntries.size() && 813 "Unexpected device id!"); 814 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 815 816 E.Entries.push_back(Entry); 817 } 818 819 // Return true if the entry is associated with device 820 bool findOffloadEntry(int32_t DeviceId, void *Addr) { 821 assert(DeviceId < (int32_t)FuncGblEntries.size() && 822 "Unexpected device id!"); 823 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 824 825 for (auto &It : E.Entries) { 826 if (It.addr == Addr) 827 return true; 828 } 829 830 return false; 831 } 832 833 // Return the pointer to the target entries table 834 __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { 835 assert(DeviceId < (int32_t)FuncGblEntries.size() && 836 "Unexpected device id!"); 837 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 838 839 int32_t Size = E.Entries.size(); 840 841 // Table is empty 842 if (!Size) 843 return 0; 844 845 __tgt_offload_entry *Begin = &E.Entries[0]; 846 __tgt_offload_entry *End = &E.Entries[Size - 1]; 847 848 // Update table info according to the entries and return the pointer 849 E.Table.EntriesBegin = Begin; 850 E.Table.EntriesEnd = ++End; 851 852 return &E.Table; 853 } 854 855 // Clear entries table for a device 856 void clearOffloadEntriesTable(int DeviceId) { 857 assert(DeviceId < (int32_t)FuncGblEntries.size() && 858 "Unexpected device id!"); 859 FuncGblEntries[DeviceId].emplace_back(); 860 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); 861 E.Entries.clear(); 862 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 863 } 864 865 hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, 866 unsigned int DeviceId) { 867 assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here."); 868 uint32_t GlobalFlags = 0; 869 hsa_status_t Err = hsa_amd_memory_pool_get_info( 870 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); 871 872 if (Err != HSA_STATUS_SUCCESS) { 873 return Err; 874 } 875 876 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { 877 DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; 878 } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { 879 DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; 880 } 881 882 return HSA_STATUS_SUCCESS; 883 } 884 885 hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) { 886 for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { 887 hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( 888 Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { 889 hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); 890 if (ValidStatus != HSA_STATUS_SUCCESS) { 891 DP("Alloc allowed in memory pool check failed: %s\n", 892 get_error_string(ValidStatus)); 893 return HSA_STATUS_SUCCESS; 894 } 895 return addDeviceMemoryPool(MemoryPool, DeviceId); 896 }); 897 898 if (Err != HSA_STATUS_SUCCESS) { 899 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 900 "Iterate all memory pools", get_error_string(Err)); 901 return Err; 902 } 903 } 904 return HSA_STATUS_SUCCESS; 905 } 906 907 hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) { 908 std::vector<hsa_amd_memory_pool_t> HostPools; 909 910 // collect all the "valid" pools for all the given agents. 
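    // Here "valid" has the meaning of core::isValidMemoryPool above: the pool
    // must allow runtime allocation and report a non-zero size. Pools failing
    // that check are not collected (addMemoryPool propagates the error).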
911 for (const auto &Agent : Agents) { 912 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( 913 Agent, core::addMemoryPool, static_cast<void *>(&HostPools)); 914 if (Err != HSA_STATUS_SUCCESS) { 915 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); 916 } 917 } 918 919 // We need two fine-grained pools. 920 // 1. One with kernarg flag set for storing kernel arguments 921 // 2. Second for host allocations 922 bool FineGrainedMemoryPoolSet = false; 923 bool KernArgPoolSet = false; 924 for (const auto &MemoryPool : HostPools) { 925 hsa_status_t Err = HSA_STATUS_SUCCESS; 926 uint32_t GlobalFlags = 0; 927 Err = hsa_amd_memory_pool_get_info( 928 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); 929 if (Err != HSA_STATUS_SUCCESS) { 930 DP("Get memory pool info failed: %s\n", get_error_string(Err)); 931 return Err; 932 } 933 934 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { 935 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { 936 KernArgPool = MemoryPool; 937 KernArgPoolSet = true; 938 } 939 HostFineGrainedMemoryPool = MemoryPool; 940 FineGrainedMemoryPoolSet = true; 941 } 942 } 943 944 if (FineGrainedMemoryPoolSet && KernArgPoolSet) 945 return HSA_STATUS_SUCCESS; 946 947 return HSA_STATUS_ERROR; 948 } 949 950 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { 951 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && 952 "Invalid device Id"); 953 return DeviceCoarseGrainedMemoryPools[DeviceId]; 954 } 955 956 hsa_amd_memory_pool_t getHostMemoryPool() { 957 return HostFineGrainedMemoryPool; 958 } 959 960 static int readEnv(const char *Env, int Default = -1) { 961 const char *EnvStr = getenv(Env); 962 int Res = Default; 963 if (EnvStr) { 964 Res = std::stoi(EnvStr); 965 DP("Parsed %s=%d\n", Env, Res); 966 } 967 return Res; 968 } 969 970 RTLDeviceInfoTy() { 971 DP("Start initializing " GETNAME(TARGET_NAME) "\n"); 972 973 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr 974 // anytime. You do not need a debug library build. 
975 // 0 => no tracing 976 // 1 => tracing dispatch only 977 // >1 => verbosity increase 978 979 if (!HSAInitSuccess()) { 980 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); 981 return; 982 } 983 984 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) 985 print_kernel_trace = atoi(EnvStr); 986 else 987 print_kernel_trace = 0; 988 989 hsa_status_t Err = core::atl_init_gpu_context(); 990 if (Err != HSA_STATUS_SUCCESS) { 991 DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); 992 return; 993 } 994 995 // Init hostcall soon after initializing hsa 996 hostrpc_init(); 997 998 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { 999 if (DeviceType == HSA_DEVICE_TYPE_CPU) { 1000 CPUAgents.push_back(Agent); 1001 } else { 1002 HSAAgents.push_back(Agent); 1003 } 1004 }); 1005 if (Err != HSA_STATUS_SUCCESS) 1006 return; 1007 1008 NumberOfDevices = (int)HSAAgents.size(); 1009 1010 if (NumberOfDevices == 0) { 1011 DP("There are no devices supporting HSA.\n"); 1012 return; 1013 } 1014 DP("There are %d devices supporting HSA.\n", NumberOfDevices); 1015 1016 // Init the device info 1017 HSAQueueSchedulers.reserve(NumberOfDevices); 1018 FuncGblEntries.resize(NumberOfDevices); 1019 ThreadsPerGroup.resize(NumberOfDevices); 1020 ComputeUnits.resize(NumberOfDevices); 1021 GPUName.resize(NumberOfDevices); 1022 GroupsPerDevice.resize(NumberOfDevices); 1023 WarpSize.resize(NumberOfDevices); 1024 NumTeams.resize(NumberOfDevices); 1025 NumThreads.resize(NumberOfDevices); 1026 DeviceStateStore.resize(NumberOfDevices); 1027 KernelInfoTable.resize(NumberOfDevices); 1028 SymbolInfoTable.resize(NumberOfDevices); 1029 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); 1030 DeviceFineGrainedMemoryPools.resize(NumberOfDevices); 1031 1032 Err = setupDevicePools(HSAAgents); 1033 if (Err != HSA_STATUS_SUCCESS) { 1034 DP("Setup for Device Memory Pools failed\n"); 1035 return; 1036 } 1037 1038 Err = setupHostMemoryPools(CPUAgents); 1039 if (Err != HSA_STATUS_SUCCESS) { 1040 DP("Setup for Host Memory Pools failed\n"); 1041 return; 1042 } 1043 1044 for (int I = 0; I < NumberOfDevices; I++) { 1045 uint32_t QueueSize = 0; 1046 { 1047 hsa_status_t Err = hsa_agent_get_info( 1048 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); 1049 if (Err != HSA_STATUS_SUCCESS) { 1050 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); 1051 return; 1052 } 1053 enum { MaxQueueSize = 4096 }; 1054 if (QueueSize > MaxQueueSize) { 1055 QueueSize = MaxQueueSize; 1056 } 1057 } 1058 1059 { 1060 HSAQueueScheduler QSched; 1061 if (!QSched.createQueues(HSAAgents[I], QueueSize)) 1062 return; 1063 HSAQueueSchedulers.emplace_back(std::move(QSched)); 1064 } 1065 1066 DeviceStateStore[I] = {nullptr, 0}; 1067 } 1068 1069 for (int I = 0; I < NumberOfDevices; I++) { 1070 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; 1071 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; 1072 ComputeUnits[I] = 1; 1073 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, 1074 GroupsPerDevice[I], ThreadsPerGroup[I]); 1075 } 1076 1077 // Get environment variables regarding teams 1078 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); 1079 Env.NumTeams = readEnv("OMP_NUM_TEAMS"); 1080 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); 1081 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); 1082 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); 1083 1084 // Default state. 
1085 RequiresFlags = OMP_REQ_UNDEFINED; 1086 1087 ConstructionSucceeded = true; 1088 } 1089 1090 ~RTLDeviceInfoTy() { 1091 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); 1092 if (!HSAInitSuccess()) { 1093 // Then none of these can have been set up and they can't be torn down 1094 return; 1095 } 1096 // Run destructors on types that use HSA before 1097 // impl_finalize removes access to it 1098 DeviceStateStore.clear(); 1099 KernelArgPoolMap.clear(); 1100 // Terminate hostrpc before finalizing hsa 1101 hostrpc_terminate(); 1102 1103 hsa_status_t Err; 1104 for (uint32_t I = 0; I < HSAExecutables.size(); I++) { 1105 Err = hsa_executable_destroy(HSAExecutables[I]); 1106 if (Err != HSA_STATUS_SUCCESS) { 1107 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, 1108 "Destroying executable", get_error_string(Err)); 1109 } 1110 } 1111 } 1112 }; 1113 1114 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; 1115 1116 static RTLDeviceInfoTy *DeviceInfoState = nullptr; 1117 static RTLDeviceInfoTy &DeviceInfo() { return *DeviceInfoState; } 1118 1119 namespace { 1120 1121 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, 1122 __tgt_async_info *AsyncInfo) { 1123 assert(AsyncInfo && "AsyncInfo is nullptr"); 1124 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 1125 // Return success if we are not copying back to host from target. 1126 if (!HstPtr) 1127 return OFFLOAD_SUCCESS; 1128 hsa_status_t Err; 1129 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 1130 (long long unsigned)(Elf64_Addr)TgtPtr, 1131 (long long unsigned)(Elf64_Addr)HstPtr); 1132 1133 Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, 1134 DeviceId); 1135 1136 if (Err != HSA_STATUS_SUCCESS) { 1137 DP("Error when copying data from device to host. Pointers: " 1138 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 1139 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 1140 return OFFLOAD_FAIL; 1141 } 1142 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, 1143 (long long unsigned)(Elf64_Addr)TgtPtr, 1144 (long long unsigned)(Elf64_Addr)HstPtr); 1145 return OFFLOAD_SUCCESS; 1146 } 1147 1148 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, 1149 __tgt_async_info *AsyncInfo) { 1150 assert(AsyncInfo && "AsyncInfo is nullptr"); 1151 hsa_status_t Err; 1152 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 1153 // Return success if we are not doing host to target. 1154 if (!HstPtr) 1155 return OFFLOAD_SUCCESS; 1156 1157 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, 1158 (long long unsigned)(Elf64_Addr)HstPtr, 1159 (long long unsigned)(Elf64_Addr)TgtPtr); 1160 Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, 1161 DeviceId); 1162 if (Err != HSA_STATUS_SUCCESS) { 1163 DP("Error when copying data from host to device. Pointers: " 1164 "host = 0x%016lx, device = 0x%016lx, size = %lld\n", 1165 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); 1166 return OFFLOAD_FAIL; 1167 } 1168 return OFFLOAD_SUCCESS; 1169 } 1170 1171 // Async. 1172 // The implementation was written with cuda streams in mind. The semantics of 1173 // that are to execute kernels on a queue in order of insertion. A synchronise 1174 // call then makes writes visible between host and device. This means a series 1175 // of N data_submit_async calls are expected to execute serially. 
HSA offers 1176 // various options to run the data copies concurrently. This may require changes 1177 // to libomptarget. 1178 1179 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that 1180 // there are no outstanding kernels that need to be synchronized. Any async call 1181 // may be passed a Queue==0, at which point the cuda implementation will set it 1182 // to non-null (see getStream). The cuda streams are per-device. Upstream may 1183 // change this interface to explicitly initialize the AsyncInfo_pointer, but 1184 // until then hsa lazily initializes it as well. 1185 1186 void initAsyncInfo(__tgt_async_info *AsyncInfo) { 1187 // set non-null while using async calls, return to null to indicate completion 1188 assert(AsyncInfo); 1189 if (!AsyncInfo->Queue) { 1190 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX); 1191 } 1192 } 1193 void finiAsyncInfo(__tgt_async_info *AsyncInfo) { 1194 assert(AsyncInfo); 1195 assert(AsyncInfo->Queue); 1196 AsyncInfo->Queue = 0; 1197 } 1198 1199 // Determine launch values for kernel. 1200 struct LaunchVals { 1201 int WorkgroupSize; 1202 int GridSize; 1203 }; 1204 LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, 1205 int ConstWGSize, 1206 llvm::omp::OMPTgtExecModeFlags ExecutionMode, 1207 int NumTeams, int ThreadLimit, uint64_t LoopTripcount, 1208 int DeviceNumTeams) { 1209 1210 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; 1211 int NumGroups = 0; 1212 1213 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; 1214 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit)) 1215 MaxTeams = RTLDeviceInfoTy::HardTeamLimit; 1216 1217 if (print_kernel_trace & STARTUP_DETAILS) { 1218 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); 1219 DP("Max_Teams: %d\n", MaxTeams); 1220 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); 1221 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); 1222 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", 1223 RTLDeviceInfoTy::DefaultWgSize); 1224 DP("thread_limit: %d\n", ThreadLimit); 1225 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1226 DP("ConstWGSize: %d\n", ConstWGSize); 1227 } 1228 // check for thread_limit() clause 1229 if (ThreadLimit > 0) { 1230 ThreadsPerGroup = ThreadLimit; 1231 DP("Setting threads per block to requested %d\n", ThreadLimit); 1232 // Add master warp for GENERIC 1233 if (ExecutionMode == 1234 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { 1235 ThreadsPerGroup += WarpSize; 1236 DP("Adding master wavefront: +%d threads\n", WarpSize); 1237 } 1238 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max 1239 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; 1240 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); 1241 } 1242 } 1243 // check flat_max_work_group_size attr here 1244 if (ThreadsPerGroup > ConstWGSize) { 1245 ThreadsPerGroup = ConstWGSize; 1246 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", 1247 ThreadsPerGroup); 1248 } 1249 if (print_kernel_trace & STARTUP_DETAILS) 1250 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1251 DP("Preparing %d threads\n", ThreadsPerGroup); 1252 1253 // Set default num_groups (teams) 1254 if (Env.TeamLimit > 0) 1255 NumGroups = (MaxTeams < Env.TeamLimit) ? 
MaxTeams : Env.TeamLimit; 1256 else 1257 NumGroups = MaxTeams; 1258 DP("Set default num of groups %d\n", NumGroups); 1259 1260 if (print_kernel_trace & STARTUP_DETAILS) { 1261 DP("num_groups: %d\n", NumGroups); 1262 DP("num_teams: %d\n", NumTeams); 1263 } 1264 1265 // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size 1266 // This reduction is typical for default case (no thread_limit clause). 1267 // or when user goes crazy with num_teams clause. 1268 // FIXME: We cant distinguish between a constant or variable thread limit. 1269 // So we only handle constant thread_limits. 1270 if (ThreadsPerGroup > 1271 RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 1272 // Should we round threadsPerGroup up to nearest WarpSize 1273 // here? 1274 NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; 1275 1276 // check for num_teams() clause 1277 if (NumTeams > 0) { 1278 NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; 1279 } 1280 if (print_kernel_trace & STARTUP_DETAILS) { 1281 DP("num_groups: %d\n", NumGroups); 1282 DP("Env.NumTeams %d\n", Env.NumTeams); 1283 DP("Env.TeamLimit %d\n", Env.TeamLimit); 1284 } 1285 1286 if (Env.NumTeams > 0) { 1287 NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; 1288 DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); 1289 } else if (Env.TeamLimit > 0) { 1290 NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; 1291 DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); 1292 } else { 1293 if (NumTeams <= 0) { 1294 if (LoopTripcount > 0) { 1295 if (ExecutionMode == 1296 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { 1297 // round up to the nearest integer 1298 NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; 1299 } else if (ExecutionMode == 1300 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { 1301 NumGroups = LoopTripcount; 1302 } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { 1303 // This is a generic kernel that was transformed to use SPMD-mode 1304 // execution but uses Generic-mode semantics for scheduling. 1305 NumGroups = LoopTripcount; 1306 } 1307 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 1308 "threads per block %d\n", 1309 NumGroups, LoopTripcount, ThreadsPerGroup); 1310 } 1311 } else { 1312 NumGroups = NumTeams; 1313 } 1314 if (NumGroups > MaxTeams) { 1315 NumGroups = MaxTeams; 1316 if (print_kernel_trace & STARTUP_DETAILS) 1317 DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams); 1318 } 1319 if (NumGroups > NumTeams && NumTeams > 0) { 1320 NumGroups = NumTeams; 1321 if (print_kernel_trace & STARTUP_DETAILS) 1322 DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups, 1323 NumTeams); 1324 } 1325 } 1326 1327 // num_teams clause always honored, no matter what, unless DEFAULT is active. 1328 if (NumTeams > 0) { 1329 NumGroups = NumTeams; 1330 // Cap num_groups to EnvMaxTeamsDefault if set. 
1331 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) 1332 NumGroups = Env.MaxTeamsDefault; 1333 } 1334 if (print_kernel_trace & STARTUP_DETAILS) { 1335 DP("threadsPerGroup: %d\n", ThreadsPerGroup); 1336 DP("num_groups: %d\n", NumGroups); 1337 DP("loop_tripcount: %ld\n", LoopTripcount); 1338 } 1339 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, 1340 ThreadsPerGroup); 1341 1342 LaunchVals Res; 1343 Res.WorkgroupSize = ThreadsPerGroup; 1344 Res.GridSize = ThreadsPerGroup * NumGroups; 1345 return Res; 1346 } 1347 1348 static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { 1349 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); 1350 bool Full = true; 1351 while (Full) { 1352 Full = 1353 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); 1354 } 1355 return PacketId; 1356 } 1357 1358 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, 1359 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, 1360 int32_t ThreadLimit, uint64_t LoopTripcount) { 1361 // Set the context we are using 1362 // update thread limit content in gpu memory if un-initialized or specified 1363 // from host 1364 1365 DP("Run target team region thread_limit %d\n", ThreadLimit); 1366 1367 // All args are references. 1368 std::vector<void *> Args(ArgNum); 1369 std::vector<void *> Ptrs(ArgNum); 1370 1371 DP("Arg_num: %d\n", ArgNum); 1372 for (int32_t I = 0; I < ArgNum; ++I) { 1373 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1374 Args[I] = &Ptrs[I]; 1375 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); 1376 } 1377 1378 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; 1379 1380 std::string KernelName = std::string(KernelInfo->Name); 1381 auto &KernelInfoTable = DeviceInfo().KernelInfoTable; 1382 if (KernelInfoTable[DeviceId].find(KernelName) == 1383 KernelInfoTable[DeviceId].end()) { 1384 DP("Kernel %s not found\n", KernelName.c_str()); 1385 return OFFLOAD_FAIL; 1386 } 1387 1388 const atl_kernel_info_t KernelInfoEntry = 1389 KernelInfoTable[DeviceId][KernelName]; 1390 const uint32_t GroupSegmentSize = 1391 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; 1392 const uint32_t SgprCount = KernelInfoEntry.sgpr_count; 1393 const uint32_t VgprCount = KernelInfoEntry.vgpr_count; 1394 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; 1395 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; 1396 1397 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); 1398 1399 /* 1400 * Set limit based on ThreadsPerGroup and GroupsPerDevice 1401 */ 1402 LaunchVals LV = 1403 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, 1404 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, 1405 NumTeams, // From run_region arg 1406 ThreadLimit, // From run_region arg 1407 LoopTripcount, // From run_region arg 1408 DeviceInfo().NumTeams[KernelInfo->DeviceId]); 1409 const int GridSize = LV.GridSize; 1410 const int WorkgroupSize = LV.WorkgroupSize; 1411 1412 if (print_kernel_trace >= LAUNCH) { 1413 int NumGroups = GridSize / WorkgroupSize; 1414 // enum modes are SPMD, GENERIC, NONE 0,1,2 1415 // if doing rtl timing, print to stderr, unless stdout requested. 1416 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); 1417 fprintf(TraceToStdout ? 
stdout : stderr, 1418 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " 1419 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " 1420 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", 1421 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, 1422 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, 1423 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, 1424 VgprSpillCount, LoopTripcount, KernelInfo->Name); 1425 } 1426 1427 // Run on the device. 1428 { 1429 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); 1430 if (!Queue) { 1431 return OFFLOAD_FAIL; 1432 } 1433 uint64_t PacketId = acquireAvailablePacketId(Queue); 1434 1435 const uint32_t Mask = Queue->size - 1; // size is a power of 2 1436 hsa_kernel_dispatch_packet_t *Packet = 1437 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); 1438 1439 // packet->header is written last 1440 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; 1441 Packet->workgroup_size_x = WorkgroupSize; 1442 Packet->workgroup_size_y = 1; 1443 Packet->workgroup_size_z = 1; 1444 Packet->reserved0 = 0; 1445 Packet->grid_size_x = GridSize; 1446 Packet->grid_size_y = 1; 1447 Packet->grid_size_z = 1; 1448 Packet->private_segment_size = KernelInfoEntry.private_segment_size; 1449 Packet->group_segment_size = GroupSegmentSize; 1450 Packet->kernel_object = KernelInfoEntry.kernel_object; 1451 Packet->kernarg_address = 0; // use the block allocator 1452 Packet->reserved2 = 0; // impl writes id_ here 1453 Packet->completion_signal = {0}; // may want a pool of signals 1454 1455 KernelArgPool *ArgPool = nullptr; 1456 void *KernArg = nullptr; 1457 { 1458 auto It = 1459 DeviceInfo().KernelArgPoolMap.find(std::string(KernelInfo->Name)); 1460 if (It != DeviceInfo().KernelArgPoolMap.end()) { 1461 ArgPool = (It->second).get(); 1462 } 1463 } 1464 if (!ArgPool) { 1465 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, 1466 DeviceId); 1467 } 1468 { 1469 if (ArgPool) { 1470 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); 1471 KernArg = ArgPool->allocate(ArgNum); 1472 } 1473 if (!KernArg) { 1474 DP("Allocate kernarg failed\n"); 1475 return OFFLOAD_FAIL; 1476 } 1477 1478 // Copy explicit arguments 1479 for (int I = 0; I < ArgNum; I++) { 1480 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); 1481 } 1482 1483 // Initialize implicit arguments. TODO: Which of these can be dropped 1484 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>( 1485 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize); 1486 memset(ImplArgs, 0, 1487 sizeof(impl_implicit_args_t)); // may not be necessary 1488 ImplArgs->offset_x = 0; 1489 ImplArgs->offset_y = 0; 1490 ImplArgs->offset_z = 0; 1491 1492 // assign a hostcall buffer for the selected Q 1493 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) { 1494 // hostrpc_assign_buffer is not thread safe, and this function is 1495 // under a multiple reader lock, not a writer lock. 
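      // A function-local static mutex serialises calls into
      // hostrpc_assign_buffer across concurrent launches, which only hold the
      // reader side of the load/run lock.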
1496 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; 1497 pthread_mutex_lock(&HostcallInitLock); 1498 uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId], 1499 Queue, DeviceId); 1500 pthread_mutex_unlock(&HostcallInitLock); 1501 if (!Buffer) { 1502 DP("hostrpc_assign_buffer failed, gpu would dereference null and " 1503 "error\n"); 1504 return OFFLOAD_FAIL; 1505 } 1506 1507 DP("Implicit argument count: %d\n", 1508 KernelInfoEntry.implicit_argument_count); 1509 if (KernelInfoEntry.implicit_argument_count >= 4) { 1510 // Initialise pointer for implicit_argument_count != 0 ABI 1511 // Guess that the right implicit argument is at offset 24 after 1512 // the explicit arguments. In the future, should be able to read 1513 // the offset from msgpack. Clang is not annotating it at present. 1514 uint64_t Offset = 1515 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); 1516 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { 1517 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " 1518 "args: %d\n", 1519 Offset + 8, ArgPool->kernargSizeIncludingImplicit()); 1520 } else { 1521 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); 1522 } 1523 } 1524 1525 // initialise pointer for implicit_argument_count == 0 ABI 1526 ImplArgs->hostcall_ptr = Buffer; 1527 } 1528 1529 Packet->kernarg_address = KernArg; 1530 } 1531 1532 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); 1533 if (S.handle == 0) { 1534 DP("Failed to get signal instance\n"); 1535 return OFFLOAD_FAIL; 1536 } 1537 Packet->completion_signal = S; 1538 hsa_signal_store_relaxed(Packet->completion_signal, 1); 1539 1540 // Publish the packet indicating it is ready to be processed 1541 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet), 1542 core::createHeader(), Packet->setup); 1543 1544 // Since the packet is already published, its contents must not be 1545 // accessed any more 1546 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); 1547 1548 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, 1549 HSA_WAIT_STATE_BLOCKED) != 0) 1550 ; 1551 1552 assert(ArgPool); 1553 ArgPool->deallocate(KernArg); 1554 DeviceInfo().FreeSignalPool.push(S); 1555 } 1556 1557 DP("Kernel completed\n"); 1558 return OFFLOAD_SUCCESS; 1559 } 1560 1561 bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { 1562 const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h 1563 int32_t R = elf_check_machine(Image, AmdgcnMachineID); 1564 if (!R) { 1565 DP("Supported machine ID not found\n"); 1566 } 1567 return R; 1568 } 1569 1570 uint32_t elfEFlags(__tgt_device_image *Image) { 1571 char *ImgBegin = (char *)Image->ImageStart; 1572 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; 1573 1574 Elf *E = elf_memory(ImgBegin, ImgSize); 1575 if (!E) { 1576 DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); 1577 return 0; 1578 } 1579 1580 Elf64_Ehdr *Eh64 = elf64_getehdr(E); 1581 1582 if (!Eh64) { 1583 DP("Unable to get machine ID from ELF file!\n"); 1584 elf_end(E); 1585 return 0; 1586 } 1587 1588 uint32_t Flags = Eh64->e_flags; 1589 1590 elf_end(E); 1591 DP("ELF Flags: 0x%x\n", Flags); 1592 return Flags; 1593 } 1594 1595 template <typename T> bool enforceUpperBound(T *Value, T Upper) { 1596 bool Changed = *Value > Upper; 1597 if (Changed) { 1598 *Value = Upper; 1599 } 1600 return Changed; 1601 } 1602 1603 Elf64_Shdr *findOnlyShtHash(Elf *Elf) { 1604 size_t N; 1605 int Rc = elf_getshdrnum(Elf, &N); 1606 if (Rc != 0) { 1607 return 
nullptr; 1608 } 1609 1610 Elf64_Shdr *Result = nullptr; 1611 for (size_t I = 0; I < N; I++) { 1612 Elf_Scn *Scn = elf_getscn(Elf, I); 1613 if (Scn) { 1614 Elf64_Shdr *Shdr = elf64_getshdr(Scn); 1615 if (Shdr) { 1616 if (Shdr->sh_type == SHT_HASH) { 1617 if (Result == nullptr) { 1618 Result = Shdr; 1619 } else { 1620 // multiple SHT_HASH sections not handled 1621 return nullptr; 1622 } 1623 } 1624 } 1625 } 1626 } 1627 return Result; 1628 } 1629 1630 const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash, 1631 const char *Symname) { 1632 1633 assert(SectionHash); 1634 size_t SectionSymtabIndex = SectionHash->sh_link; 1635 Elf64_Shdr *SectionSymtab = 1636 elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex)); 1637 size_t SectionStrtabIndex = SectionSymtab->sh_link; 1638 1639 const Elf64_Sym *Symtab = 1640 reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset); 1641 1642 const uint32_t *Hashtab = 1643 reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset); 1644 1645 // Layout: 1646 // nbucket 1647 // nchain 1648 // bucket[nbucket] 1649 // chain[nchain] 1650 uint32_t Nbucket = Hashtab[0]; 1651 const uint32_t *Bucket = &Hashtab[2]; 1652 const uint32_t *Chain = &Hashtab[Nbucket + 2]; 1653 1654 const size_t Max = strlen(Symname) + 1; 1655 const uint32_t Hash = elf_hash(Symname); 1656 for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) { 1657 char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name); 1658 if (strncmp(Symname, N, Max) == 0) { 1659 return &Symtab[I]; 1660 } 1661 } 1662 1663 return nullptr; 1664 } 1665 1666 struct SymbolInfo { 1667 void *Addr = nullptr; 1668 uint32_t Size = UINT32_MAX; 1669 uint32_t ShType = SHT_NULL; 1670 }; 1671 1672 int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname, 1673 SymbolInfo *Res) { 1674 if (elf_kind(Elf) != ELF_K_ELF) { 1675 return 1; 1676 } 1677 1678 Elf64_Shdr *SectionHash = findOnlyShtHash(Elf); 1679 if (!SectionHash) { 1680 return 1; 1681 } 1682 1683 const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname); 1684 if (!Sym) { 1685 return 1; 1686 } 1687 1688 if (Sym->st_size > UINT32_MAX) { 1689 return 1; 1690 } 1691 1692 if (Sym->st_shndx == SHN_UNDEF) { 1693 return 1; 1694 } 1695 1696 Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx); 1697 if (!Section) { 1698 return 1; 1699 } 1700 1701 Elf64_Shdr *Header = elf64_getshdr(Section); 1702 if (!Header) { 1703 return 1; 1704 } 1705 1706 Res->Addr = Sym->st_value + Base; 1707 Res->Size = static_cast<uint32_t>(Sym->st_size); 1708 Res->ShType = Header->sh_type; 1709 return 0; 1710 } 1711 1712 int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname, 1713 SymbolInfo *Res) { 1714 Elf *Elf = elf_memory(Base, ImgSize); 1715 if (Elf) { 1716 int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res); 1717 elf_end(Elf); 1718 return Rc; 1719 } 1720 return 1; 1721 } 1722 1723 hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, 1724 const char *SymName, void **VarAddr, 1725 uint32_t *VarSize) { 1726 SymbolInfo SI; 1727 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); 1728 if (Rc == 0) { 1729 *VarAddr = SI.Addr; 1730 *VarSize = SI.Size; 1731 return HSA_STATUS_SUCCESS; 1732 } 1733 return HSA_STATUS_ERROR; 1734 } 1735 1736 template <typename C> 1737 hsa_status_t moduleRegisterFromMemoryToPlace( 1738 std::map<std::string, atl_kernel_info_t> &KernelInfoTable, 1739 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, 1740 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, 
    std::vector<hsa_executable_t> &HSAExecutables) {
  auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
    C *Unwrapped = static_cast<C *>(CbState);
    return (*Unwrapped)(Data, Size);
  };
  return core::RegisterModuleFromMemory(
      KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
      DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
      HSAExecutables);
}

uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
  uint64_t DeviceStateBytes = 0;
  {
    // If this is the deviceRTL, get the state variable size
    SymbolInfo SizeSi;
    int Rc = getSymbolInfoWithoutLoading(
        ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);

    if (Rc == 0) {
      if (SizeSi.Size != sizeof(uint64_t)) {
        DP("Found device_State_size variable with wrong size\n");
        return 0;
      }

      // Read the number of bytes directly from the elf
      memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
    }
  }
  return DeviceStateBytes;
}

struct DeviceEnvironment {
  // Initializes a DeviceEnvironmentTy in the deviceRTL.
  // Patches around differences in the deviceRTL between trunk, aomp and
  // rocmcc. Over time these differences will tend to zero and this class can
  // be simplified.
  // The symbol may be in .data or .bss, and may be missing fields. TODO:
  // review aomp/trunk/rocm and simplify the following.

  // The symbol may also have been deadstripped because the device side
  // accessors were unused.

  // If the symbol is in .data (aomp, rocm) it can be written directly.
  // If it is in .bss, we must wait for it to be allocated space on the
  // gpu (trunk) and initialize it after loading.
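  //
  // Summary of the two paths implemented below: beforeLoading() patches the
  // environment directly into the image bytes when the symbol occupies space
  // in the file (.data), while afterLoading() looks the symbol up in the
  // loaded executable and copies HostDeviceEnv to it with an H2D memcpy when
  // the symbol is zero-initialized (.bss, i.e. SHT_NOBITS).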
1787 const char *sym() { return "omptarget_device_environment"; } 1788 1789 DeviceEnvironmentTy HostDeviceEnv; 1790 SymbolInfo SI; 1791 bool Valid = false; 1792 1793 __tgt_device_image *Image; 1794 const size_t ImgSize; 1795 1796 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, 1797 __tgt_device_image *Image, const size_t ImgSize) 1798 : Image(Image), ImgSize(ImgSize) { 1799 1800 HostDeviceEnv.NumDevices = NumberDevices; 1801 HostDeviceEnv.DeviceNum = DeviceId; 1802 HostDeviceEnv.DebugKind = 0; 1803 HostDeviceEnv.DynamicMemSize = DynamicMemSize; 1804 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 1805 HostDeviceEnv.DebugKind = std::stoi(EnvStr); 1806 1807 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, 1808 sym(), &SI); 1809 if (Rc != 0) { 1810 DP("Finding global device environment '%s' - symbol missing.\n", sym()); 1811 return; 1812 } 1813 1814 if (SI.Size > sizeof(HostDeviceEnv)) { 1815 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, 1816 sizeof(HostDeviceEnv)); 1817 return; 1818 } 1819 1820 Valid = true; 1821 } 1822 1823 bool inImage() { return SI.ShType != SHT_NOBITS; } 1824 1825 hsa_status_t beforeLoading(void *Data, size_t Size) { 1826 if (Valid) { 1827 if (inImage()) { 1828 DP("Setting global device environment before load (%u bytes)\n", 1829 SI.Size); 1830 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; 1831 void *Pos = (char *)Data + Offset; 1832 memcpy(Pos, &HostDeviceEnv, SI.Size); 1833 } 1834 } 1835 return HSA_STATUS_SUCCESS; 1836 } 1837 1838 hsa_status_t afterLoading() { 1839 if (Valid) { 1840 if (!inImage()) { 1841 DP("Setting global device environment after load (%u bytes)\n", 1842 SI.Size); 1843 int DeviceId = HostDeviceEnv.DeviceNum; 1844 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; 1845 void *StatePtr; 1846 uint32_t StatePtrSize; 1847 hsa_status_t Err = interop_hsa_get_symbol_info( 1848 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); 1849 if (Err != HSA_STATUS_SUCCESS) { 1850 DP("failed to find %s in loaded image\n", sym()); 1851 return Err; 1852 } 1853 1854 if (StatePtrSize != SI.Size) { 1855 DP("Symbol had size %u before loading, %u after\n", StatePtrSize, 1856 SI.Size); 1857 return HSA_STATUS_ERROR; 1858 } 1859 1860 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, 1861 StatePtrSize, DeviceId); 1862 } 1863 } 1864 return HSA_STATUS_SUCCESS; 1865 } 1866 }; 1867 1868 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { 1869 uint64_t Rounded = 4 * ((Size + 3) / 4); 1870 void *Ptr; 1871 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); 1872 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); 1873 if (Err != HSA_STATUS_SUCCESS) { 1874 return Err; 1875 } 1876 1877 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); 1878 if (Rc != HSA_STATUS_SUCCESS) { 1879 DP("zero fill device_state failed with %u\n", Rc); 1880 core::Runtime::Memfree(Ptr); 1881 return HSA_STATUS_ERROR; 1882 } 1883 1884 *RetPtr = Ptr; 1885 return HSA_STATUS_SUCCESS; 1886 } 1887 1888 bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { 1889 SymbolInfo SI; 1890 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); 1891 return (Rc == 0) && (SI.Addr != nullptr); 1892 } 1893 1894 } // namespace 1895 1896 namespace core { 1897 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { 1898 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), 1899 
&DeviceInfo().HSAAgents[0], NULL, Ptr); 1900 } 1901 } // namespace core 1902 1903 static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { 1904 hsa_status_t err; 1905 uint32_t name_len; 1906 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); 1907 if (err != HSA_STATUS_SUCCESS) { 1908 DP("Error getting ISA info length\n"); 1909 return err; 1910 } 1911 1912 char TargetID[name_len]; 1913 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); 1914 if (err != HSA_STATUS_SUCCESS) { 1915 DP("Error getting ISA info name\n"); 1916 return err; 1917 } 1918 1919 auto TripleTargetID = llvm::StringRef(TargetID); 1920 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { 1921 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); 1922 } 1923 return HSA_STATUS_SUCCESS; 1924 } 1925 1926 /// Parse a TargetID to get processor arch and feature map. 1927 /// Returns processor subarch. 1928 /// Returns TargetID features in \p FeatureMap argument. 1929 /// If the \p TargetID contains feature+, FeatureMap it to true. 1930 /// If the \p TargetID contains feature-, FeatureMap it to false. 1931 /// If the \p TargetID does not contain a feature (default), do not map it. 1932 StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) { 1933 if (TargetID.empty()) 1934 return llvm::StringRef(); 1935 1936 auto ArchFeature = TargetID.split(":"); 1937 auto Arch = ArchFeature.first; 1938 auto Features = ArchFeature.second; 1939 if (Features.empty()) 1940 return Arch; 1941 1942 if (Features.contains("sramecc+")) { 1943 FeatureMap.insert(std::pair<std::string, bool>("sramecc", true)); 1944 } else if (Features.contains("sramecc-")) { 1945 FeatureMap.insert(std::pair<std::string, bool>("sramecc", false)); 1946 } 1947 if (Features.contains("xnack+")) { 1948 FeatureMap.insert(std::pair<std::string, bool>("xnack", true)); 1949 } else if (Features.contains("xnack-")) { 1950 FeatureMap.insert(std::pair<std::string, bool>("xnack", false)); 1951 } 1952 1953 return Arch; 1954 } 1955 1956 /// Checks if an image \p ImgInfo is compatible with current 1957 /// system's environment \p EnvInfo 1958 bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) { 1959 llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo); 1960 1961 // Compatible in case of exact match 1962 if (ImgTID == EnvTID) { 1963 DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n", 1964 ImgTID.data(), EnvTID.data()); 1965 return true; 1966 } 1967 1968 // Incompatible if Archs mismatch. 1969 StringMap<bool> ImgMap, EnvMap; 1970 StringRef ImgArch = parseTargetID(ImgTID, ImgMap); 1971 StringRef EnvArch = parseTargetID(EnvTID, EnvMap); 1972 1973 // Both EnvArch and ImgArch can't be empty here. 1974 if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) { 1975 DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n", 1976 ImgTID.data(), EnvTID.data()); 1977 return false; 1978 } 1979 1980 // Incompatible if image has more features than the environment, irrespective 1981 // of type or sign of features. 1982 if (ImgMap.size() > EnvMap.size()) { 1983 DP("Incompatible: Image has more features than the environment \t[Image: " 1984 "%s]\t:\t[Environment: %s]\n", 1985 ImgTID.data(), EnvTID.data()); 1986 return false; 1987 } 1988 1989 // Compatible if each target feature specified by the environment is 1990 // compatible with target feature of the image. 
  // The target feature is compatible if the image does not specify it
  // (meaning Any), or if it specifies it with the same value (meaning On or
  // Off).
  for (const auto &ImgFeature : ImgMap) {
    auto EnvFeature = EnvMap.find(ImgFeature.first());
    if (EnvFeature == EnvMap.end()) {
      DP("Incompatible: Value of Image's non-ANY feature does not match "
         "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "
         "%s]\n",
         ImgTID.data(), EnvTID.data());
      return false;
    } else if (EnvFeature->first() == ImgFeature.first() &&
               EnvFeature->second != ImgFeature.second) {
      DP("Incompatible: Value of Image's non-ANY feature does not match "
         "the Environment feature's non-ANY value \t[Image: "
         "%s]\t:\t[Environment: %s]\n",
         ImgTID.data(), EnvTID.data());
      return false;
    }
  }

  // The image is compatible if every feature of the Environment is:
  // - either present in the Image's feature map with the same sign,
  // - or missing from the Image's feature map, i.e. set to ANY.
  DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "
     "%s]\n",
     ImgTID.data(), EnvTID.data());
  return true;
}

extern "C" {

int32_t __tgt_rtl_init_plugin() {
  DeviceInfoState = new RTLDeviceInfoTy;
  return (DeviceInfoState && DeviceInfoState->ConstructionSucceeded)
             ? OFFLOAD_SUCCESS
             : OFFLOAD_FAIL;
}

int32_t __tgt_rtl_deinit_plugin() {
  if (DeviceInfoState)
    delete DeviceInfoState;
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
  return elfMachineIdIsAmdgcn(Image);
}

int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
                                       __tgt_image_info *info) {
  if (!__tgt_rtl_is_valid_binary(image))
    return false;

  // A subarchitecture was not specified. Assume it is compatible.
  if (!info->Arch)
    return true;

  int32_t NumberOfDevices = __tgt_rtl_number_of_devices();

  for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
    __tgt_rtl_init_device(DeviceId);
    hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
    hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
    if (err != HSA_STATUS_SUCCESS) {
      DP("Error iterating ISAs\n");
      return false;
    }
    if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
      return false;
  }
  DP("Image has Target ID compatible with the current environment: %s\n",
     info->Arch);
  return true;
}

int __tgt_rtl_number_of_devices() {
  // If the construction failed, no methods are safe to call
  if (DeviceInfo().ConstructionSucceeded) {
    return DeviceInfo().NumberOfDevices;
  }
  DP("AMDGPU plugin construction failed. Zero devices available\n");
  return 0;
}

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo().RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int DeviceId) {
  hsa_status_t Err = hsa_init();
  if (Err != HSA_STATUS_SUCCESS) {
    DP("HSA Initialization Failed.\n");
    return HSA_STATUS_ERROR;
  }
  // This is the per-device-id initialization
  DP("Initialize the device id: %d\n", DeviceId);

  hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];

  // Get the number of compute units
  uint32_t ComputeUnits = 0;
  Err = hsa_agent_get_info(
      Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
      &ComputeUnits);
  if (Err != HSA_STATUS_SUCCESS) {
    DeviceInfo().ComputeUnits[DeviceId] = 1;
    DP("Error getting compute units: setting to 1\n");
  } else {
    DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
    DP("Using %d compute units per grid\n",
       DeviceInfo().ComputeUnits[DeviceId]);
  }

  char GetInfoName[64]; // 64 is the max size returned by get info
  Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
                           (void *)GetInfoName);
  if (Err)
    DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
  else {
    DeviceInfo().GPUName[DeviceId] = GetInfoName;
  }

  if (print_kernel_trace & STARTUP_DETAILS)
    DP("Device#%-2d CU's: %2d %s\n", DeviceId,
       DeviceInfo().ComputeUnits[DeviceId],
       DeviceInfo().GPUName[DeviceId].c_str());

  // Query attributes to determine number of threads/block and blocks/grid.
  uint16_t WorkgroupMaxDim[3];
  Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                           &WorkgroupMaxDim);
  if (Err != HSA_STATUS_SUCCESS) {
    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Error getting grid dims: num groups : %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
    DP("Using %d ROCm blocks per grid\n",
       DeviceInfo().GroupsPerDevice[DeviceId]);
  } else {
    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
  }

  // Get thread limit
  hsa_dim3_t GridMaxDim;
  Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
  if (Err == HSA_STATUS_SUCCESS) {
    DeviceInfo().ThreadsPerGroup[DeviceId] =
        reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
        DeviceInfo().GroupsPerDevice[DeviceId];

    if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
      DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
      DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
    } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
                                 RTLDeviceInfoTy::MaxWgSize)) {
      DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
    } else {
      DP("Using ROCm Queried thread limit: %d\n",
         DeviceInfo().ThreadsPerGroup[DeviceId]);
    }
  } else {
    DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
    DP("Error getting max block dimension, using default: %d\n",
       RTLDeviceInfoTy::MaxWgSize);
  }

  // Get wavefront
size 2162 uint32_t WavefrontSize = 0; 2163 Err = 2164 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); 2165 if (Err == HSA_STATUS_SUCCESS) { 2166 DP("Queried wavefront size: %d\n", WavefrontSize); 2167 DeviceInfo().WarpSize[DeviceId] = WavefrontSize; 2168 } else { 2169 // TODO: Burn the wavefront size into the code object 2170 DP("Warning: Unknown wavefront size, assuming 64\n"); 2171 DeviceInfo().WarpSize[DeviceId] = 64; 2172 } 2173 2174 // Adjust teams to the env variables 2175 2176 if (DeviceInfo().Env.TeamLimit > 0 && 2177 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], 2178 DeviceInfo().Env.TeamLimit))) { 2179 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", 2180 DeviceInfo().Env.TeamLimit); 2181 } 2182 2183 // Set default number of teams 2184 if (DeviceInfo().Env.NumTeams > 0) { 2185 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; 2186 DP("Default number of teams set according to environment %d\n", 2187 DeviceInfo().Env.NumTeams); 2188 } else { 2189 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); 2190 int TeamsPerCU = DefaultTeamsPerCU; 2191 if (TeamsPerCUEnvStr) { 2192 TeamsPerCU = std::stoi(TeamsPerCUEnvStr); 2193 } 2194 2195 DeviceInfo().NumTeams[DeviceId] = 2196 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; 2197 DP("Default number of teams = %d * number of compute units %d\n", 2198 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]); 2199 } 2200 2201 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], 2202 DeviceInfo().GroupsPerDevice[DeviceId])) { 2203 DP("Default number of teams exceeds device limit, capping at %d\n", 2204 DeviceInfo().GroupsPerDevice[DeviceId]); 2205 } 2206 2207 // Adjust threads to the env variables 2208 if (DeviceInfo().Env.TeamThreadLimit > 0 && 2209 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], 2210 DeviceInfo().Env.TeamThreadLimit))) { 2211 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", 2212 DeviceInfo().Env.TeamThreadLimit); 2213 } 2214 2215 // Set default number of threads 2216 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; 2217 DP("Default number of threads set according to library's default %d\n", 2218 RTLDeviceInfoTy::DefaultWgSize); 2219 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], 2220 DeviceInfo().ThreadsPerGroup[DeviceId])) { 2221 DP("Default number of threads exceeds device limit, capping at %d\n", 2222 DeviceInfo().ThreadsPerGroup[DeviceId]); 2223 } 2224 2225 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", 2226 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId], 2227 DeviceInfo().ThreadsPerGroup[DeviceId]); 2228 2229 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, 2230 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId], 2231 DeviceInfo().GroupsPerDevice[DeviceId], 2232 DeviceInfo().GroupsPerDevice[DeviceId] * 2233 DeviceInfo().ThreadsPerGroup[DeviceId]); 2234 2235 return OFFLOAD_SUCCESS; 2236 } 2237 2238 static __tgt_target_table * 2239 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); 2240 2241 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, 2242 __tgt_device_image *Image) { 2243 DeviceInfo().LoadRunLock.lock(); 2244 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); 2245 DeviceInfo().LoadRunLock.unlock(); 2246 return Res; 2247 } 2248 2249 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, 2250 __tgt_device_image *Image) { 2251 // This function loads the 
// device image onto gpu[DeviceId] and does other per-image initialization
// work. Specifically:
//
// - Initialize a DeviceEnvironmentTy instance embedded in the
//   image at the symbol "omptarget_device_environment"
//   Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
//
// - Allocate a large array per-gpu (could be moved to init_device)
//   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
//   - Allocate at least that many bytes of gpu memory
//   - Zero initialize it
//   - Write the pointer to the symbol omptarget_nvptx_device_State
//
// - Pull some per-kernel information together from various sources and
//   record it in the KernelsList for quicker access later
//
// The initialization can be done before or after loading the image onto the
// gpu. This function presently does a mixture. Using the hsa api to get/set
// the information is simpler to implement, in exchange for more complicated
// runtime behaviour, e.g. launching a kernel or using DMA to read eight
// bytes back from the gpu instead of a hashtable lookup on the host.

  const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;

  DeviceInfo().clearOffloadEntriesTable(DeviceId);

  // We do not need to set the ELF version because the caller of this function
  // had to do that to decide the right runtime to use.

  if (!elfMachineIdIsAmdgcn(Image))
    return NULL;

  {
    auto Env =
        DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
                          DeviceInfo().Env.DynamicMemSize, Image, ImgSize);

    auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
    auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
    hsa_status_t Err = moduleRegisterFromMemoryToPlace(
        KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
        [&](void *Data, size_t Size) {
          if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
            __atomic_store_n(&DeviceInfo().HostcallRequired, true,
                             __ATOMIC_RELEASE);
          }
          return Env.beforeLoading(Data, Size);
        },
        DeviceInfo().HSAExecutables);

    check("Module registering", Err);
    if (Err != HSA_STATUS_SUCCESS) {
      const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
      const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));

      if (strcmp(DeviceName, ElfName) != 0) {
        DP("Possible gpu arch mismatch: device:%s, image:%s, please check"
           " compiler flag: -march=<gpu>\n",
           DeviceName, ElfName);
      } else {
        DP("Error loading image onto GPU: %s\n", get_error_string(Err));
      }

      return NULL;
    }

    Err = Env.afterLoading();
    if (Err != HSA_STATUS_SUCCESS) {
      return NULL;
    }
  }

  DP("AMDGPU module successfully loaded!\n");

  {
    // The device_State array is either a large array in .bss or a void* that
    // needs to be assigned to point to an array of device_state_bytes bytes.
    // If absent, it has been deadstripped and needs no setup.
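    //
    // The setup below follows the steps listed at the top of this function:
    // read the required size from omptarget_nvptx_device_State_size in the
    // ELF, allocate and zero that many bytes with implCalloc, then copy the
    // resulting device pointer into omptarget_nvptx_device_State.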

    void *StatePtr;
    uint32_t StatePtrSize;
    auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
    hsa_status_t Err = interop_hsa_get_symbol_info(
        SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
        &StatePtrSize);

    if (Err != HSA_STATUS_SUCCESS) {
      DP("No device_state symbol found, skipping initialization\n");
    } else {
      if (StatePtrSize < sizeof(void *)) {
        DP("Unexpected size of state_ptr %u, expected at least %zu\n",
           StatePtrSize, sizeof(void *));
        return NULL;
      }

      // If it is larger than a void*, assume it is a bss array and no further
      // initialization is required. Only try to set up a pointer when the
      // symbol is exactly sizeof(void*).
      if (StatePtrSize == sizeof(void *)) {
        uint64_t DeviceStateBytes =
            getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
        if (DeviceStateBytes == 0) {
          DP("Can't initialize device_State, missing size information\n");
          return NULL;
        }

        auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
        if (DSS.first.get() == nullptr) {
          assert(DSS.second == 0);
          void *Ptr = NULL;
          hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
          if (Err != HSA_STATUS_SUCCESS) {
            DP("Failed to allocate device_state array\n");
            return NULL;
          }
          DSS = {
              std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
              DeviceStateBytes,
          };
        }

        void *Ptr = DSS.first.get();
        if (DeviceStateBytes != DSS.second) {
          DP("Inconsistent sizes of device_State unsupported\n");
          return NULL;
        }

        // Write the pointer to device memory so it can be used by later
        // kernels
        Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
                                                   sizeof(void *), DeviceId);
        if (Err != HSA_STATUS_SUCCESS) {
          DP("memcpy install of state_ptr failed\n");
          return NULL;
        }
      }
    }
  }

  // Here, we take advantage of the data that is appended after img_end to get
  // the symbols' names we need to load. This data consists of the host entries
  // begin and end as well as the target name (see the offloading linker script
  // creation in the clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name.

  __tgt_offload_entry *HostBegin = Image->EntriesBegin;
  __tgt_offload_entry *HostEnd = Image->EntriesEnd;

  for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {

    if (!E->addr) {
      // The host should always have something in the address to
      // uniquely identify the target region.
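      // Without it the entry cannot be matched to a device symbol, so the
      // load is abandoned below.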
2404 DP("Analyzing host entry '<null>' (size = %lld)...\n", 2405 (unsigned long long)E->size); 2406 return NULL; 2407 } 2408 2409 if (E->size) { 2410 __tgt_offload_entry Entry = *E; 2411 2412 void *Varptr; 2413 uint32_t Varsize; 2414 2415 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; 2416 hsa_status_t Err = interop_hsa_get_symbol_info( 2417 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); 2418 2419 if (Err != HSA_STATUS_SUCCESS) { 2420 // Inform the user what symbol prevented offloading 2421 DP("Loading global '%s' (Failed)\n", E->name); 2422 return NULL; 2423 } 2424 2425 if (Varsize != E->size) { 2426 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, 2427 Varsize, E->size); 2428 return NULL; 2429 } 2430 2431 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 2432 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); 2433 Entry.addr = (void *)Varptr; 2434 2435 DeviceInfo().addOffloadEntry(DeviceId, Entry); 2436 2437 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 2438 E->flags & OMP_DECLARE_TARGET_LINK) { 2439 // If unified memory is present any target link variables 2440 // can access host addresses directly. There is no longer a 2441 // need for device copies. 2442 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *), 2443 DeviceId); 2444 if (Err != HSA_STATUS_SUCCESS) 2445 DP("Error when copying USM\n"); 2446 DP("Copy linked variable host address (" DPxMOD ")" 2447 "to device address (" DPxMOD ")\n", 2448 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); 2449 } 2450 2451 continue; 2452 } 2453 2454 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); 2455 2456 // errors in kernarg_segment_size previously treated as = 0 (or as undef) 2457 uint32_t KernargSegmentSize = 0; 2458 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; 2459 hsa_status_t Err = HSA_STATUS_SUCCESS; 2460 if (!E->name) { 2461 Err = HSA_STATUS_ERROR; 2462 } else { 2463 std::string KernelStr = std::string(E->name); 2464 auto It = KernelInfoMap.find(KernelStr); 2465 if (It != KernelInfoMap.end()) { 2466 atl_kernel_info_t Info = It->second; 2467 KernargSegmentSize = Info.kernel_segment_size; 2468 } else { 2469 Err = HSA_STATUS_ERROR; 2470 } 2471 } 2472 2473 // default value GENERIC (in case symbol is missing from cubin file) 2474 llvm::omp::OMPTgtExecModeFlags ExecModeVal = 2475 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; 2476 2477 // get flat group size if present, else Default_WG_Size 2478 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; 2479 2480 // get Kernel Descriptor if present. 
    // Keep the struct in sync with getTgtAttributeStructQTy in
    // CGOpenMPRuntime.cpp
    struct KernDescValType {
      uint16_t Version;
      uint16_t TSize;
      uint16_t WGSize;
    };
    struct KernDescValType KernDescVal;
    std::string KernDescNameStr(E->name);
    KernDescNameStr += "_kern_desc";
    const char *KernDescName = KernDescNameStr.c_str();

    void *KernDescPtr;
    uint32_t KernDescSize;
    void *CallStackAddr = nullptr;
    Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize,
                               KernDescName, &KernDescPtr, &KernDescSize);

    if (Err == HSA_STATUS_SUCCESS) {
      if ((size_t)KernDescSize != sizeof(KernDescVal))
        DP("Loading global computation properties '%s' - size mismatch (%u != "
           "%lu)\n",
           KernDescName, KernDescSize, sizeof(KernDescVal));

      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);

      // Check structure size against recorded size.
      if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);

      DP("After loading global for %s KernDesc\n", KernDescName);
      DP("KernDesc: Version: %d\n", KernDescVal.Version);
      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
      DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);

      if (KernDescVal.WGSize == 0) {
        KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize);
      }
      WGSizeVal = KernDescVal.WGSize;
      DP("WGSizeVal %d\n", WGSizeVal);
      check("Loading KernDesc computation property", Err);
    } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);

      // Flat group size
      std::string WGSizeNameStr(E->name);
      WGSizeNameStr += "_wg_size";
      const char *WGSizeName = WGSizeNameStr.c_str();

      void *WGSizePtr;
      uint32_t WGSize;
      Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize,
                                 WGSizeName, &WGSizePtr, &WGSize);

      if (Err == HSA_STATUS_SUCCESS) {
        if ((size_t)WGSize != sizeof(int16_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
             "!= %lu)\n",
             WGSizeName, WGSize, sizeof(int16_t));
          return NULL;
        }

        memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);

        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);

        if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
            WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
          DP("Error: wrong WGSize value specified in HSA code object file: "
             "%d\n",
             WGSizeVal);
          WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
        }
      } else {
        DP("Warning: Loading WGSize '%s' - symbol not found, "
           "using default value %d\n",
           WGSizeName, WGSizeVal);
      }

      check("Loading WGSize computation property", Err);
    }

    // Read the execution mode from a global in the binary
    std::string ExecModeNameStr(E->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    void *ExecModePtr;
    uint32_t VarSize;
    Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize,
                               ExecModeName, &ExecModePtr, &VarSize);

    if (Err == HSA_STATUS_SUCCESS) {
      if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
        DP("Loading global computation properties '%s' - size mismatch (%u != "
           "%lu)\n",
           ExecModeName, VarSize,
sizeof(llvm::omp::OMPTgtExecModeFlags)); 2580 return NULL; 2581 } 2582 2583 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); 2584 2585 DP("After loading global for %s ExecMode = %d\n", ExecModeName, 2586 ExecModeVal); 2587 2588 if (ExecModeVal < 0 || 2589 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { 2590 DP("Error wrong exec_mode value specified in HSA code object file: " 2591 "%d\n", 2592 ExecModeVal); 2593 return NULL; 2594 } 2595 } else { 2596 DP("Loading global exec_mode '%s' - symbol missing, using default " 2597 "value " 2598 "GENERIC (1)\n", 2599 ExecModeName); 2600 } 2601 check("Loading computation property", Err); 2602 2603 DeviceInfo().KernelsList.push_back( 2604 KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name, 2605 KernargSegmentSize, DeviceInfo().KernArgPool, 2606 DeviceInfo().KernelArgPoolMap)); 2607 __tgt_offload_entry Entry = *E; 2608 Entry.addr = (void *)&DeviceInfo().KernelsList.back(); 2609 DeviceInfo().addOffloadEntry(DeviceId, Entry); 2610 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); 2611 } 2612 2613 return DeviceInfo().getOffloadEntriesTable(DeviceId); 2614 } 2615 2616 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { 2617 void *Ptr = NULL; 2618 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); 2619 2620 if (Kind != TARGET_ALLOC_DEFAULT) { 2621 REPORT("Invalid target data allocation kind or requested allocator not " 2622 "implemented yet\n"); 2623 return NULL; 2624 } 2625 2626 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); 2627 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); 2628 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, 2629 (long long unsigned)(Elf64_Addr)Ptr); 2630 Ptr = (Err == HSA_STATUS_SUCCESS) ? 
Ptr : NULL;
  return Ptr;
}

int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
                              int64_t Size) {
  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
                                    int64_t Size,
                                    __tgt_async_info *AsyncInfo) {
  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  if (AsyncInfo) {
    initAsyncInfo(AsyncInfo);
    return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
  }
  return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
}

int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
                                int64_t Size) {
  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
                                      int64_t Size,
                                      __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  initAsyncInfo(AsyncInfo);
  return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
  hsa_status_t Err;
  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
  Err = core::Runtime::Memfree(TgtPtr);
  if (Err != HSA_STATUS_SUCCESS) {
    DP("Error when freeing device memory\n");
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
                                         int32_t ArgNum, int32_t NumTeams,
                                         int32_t ThreadLimit,
                                         uint64_t LoopTripcount) {

  DeviceInfo().LoadRunLock.lock_shared();
  int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
                                ArgNum, NumTeams, ThreadLimit, LoopTripcount);

  DeviceInfo().LoadRunLock.unlock_shared();
  return Res;
}

int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
                                    int32_t ArgNum) {
  // Use a single team and let the runtime choose the thread count
  int32_t TeamNum = 1;
  int32_t ThreadLimit = 0; // use default
  return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
                                          TgtOffsets, ArgNum, TeamNum,
                                          ThreadLimit, 0);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
    int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
    uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  initAsyncInfo(AsyncInfo);

  DeviceInfo().LoadRunLock.lock_shared();
  int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
                                ArgNum, NumTeams, ThreadLimit, LoopTripcount);

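  // Kernel launches hold LoadRunLock in shared mode so they can run
  // concurrently with one another; __tgt_rtl_load_binary takes the same lock
  // exclusively while loading an image.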
2724 DeviceInfo().LoadRunLock.unlock_shared(); 2725 return Res; 2726 } 2727 2728 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, 2729 void **TgtArgs, ptrdiff_t *TgtOffsets, 2730 int32_t ArgNum, 2731 __tgt_async_info *AsyncInfo) { 2732 // use one team and one thread 2733 // fix thread num 2734 int32_t TeamNum = 1; 2735 int32_t ThreadLimit = 0; // use default 2736 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs, 2737 TgtOffsets, ArgNum, TeamNum, 2738 ThreadLimit, 0, AsyncInfo); 2739 } 2740 2741 int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { 2742 assert(AsyncInfo && "AsyncInfo is nullptr"); 2743 2744 // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant 2745 // is not ensured by devices.cpp for amdgcn 2746 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); 2747 if (AsyncInfo->Queue) { 2748 finiAsyncInfo(AsyncInfo); 2749 } 2750 return OFFLOAD_SUCCESS; 2751 } 2752 2753 void __tgt_rtl_print_device_info(int32_t DeviceId) { 2754 // TODO: Assertion to see if DeviceId is correct 2755 // NOTE: We don't need to set context for print device info. 2756 2757 DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); 2758 } 2759 2760 } // extern "C" 2761