//===----RTLs/hsa/src/rtl.cpp - Target RTLs Implementation -------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for hsa machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <elf.h>
#include <fstream>
#include <functional>
#include <iostream>
#include <libelf.h>
#include <list>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>
#include <vector>

// Headers from the ATMI interface
#include "atmi_interop_hsa.h"
#include "atmi_runtime.h"

#include "internal.h"

#include "Debug.h"
#include "get_elf_mach_gfx_name.h"
#include "machine.h"
#include "omptargetplugin.h"
#include "print_tracing.h"

#include "llvm/Frontend/OpenMP/OMPGridValues.h"

#ifndef TARGET_NAME
#define TARGET_NAME AMDHSA
#endif
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

// hostrpc interface. FIXME: consider moving to its own include. These symbols
// are statically linked into the amdgpu plugin from hostrpc_services.a, if
// present, linked as --whole-archive to override the weak symbols that
// implement a fallback for toolchains that do not yet have a hostrpc library.
extern "C" {
unsigned long hostrpc_assign_buffer(hsa_agent_t agent, hsa_queue_t *this_Q,
                                    uint32_t device_id);
hsa_status_t hostrpc_init();
hsa_status_t hostrpc_terminate();

__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
__attribute__((weak)) hsa_status_t hostrpc_terminate() {
  return HSA_STATUS_SUCCESS;
}
__attribute__((weak)) unsigned long
hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t device_id) {
  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
     "missing\n",
     device_id);
  return 0;
}
}

// Heuristic parameters used for kernel launch
// Number of teams per CU to allow scheduling flexibility
static const unsigned DefaultTeamsPerCU = 4;

int print_kernel_trace;

#ifdef OMPTARGET_DEBUG
#define check(msg, status)                                                     \
  if (status != HSA_STATUS_SUCCESS) {                                          \
    DP(#msg " failed\n");                                                      \
  } else {                                                                     \
    DP(#msg " succeeded\n");                                                   \
  }
#else
#define check(msg, status)                                                     \
  {}
#endif

#include "elf_common.h"

namespace core {
hsa_status_t RegisterModuleFromMemory(
    std::map<std::string, atl_kernel_info_t> &KernelInfo,
    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, void *, size_t,
    hsa_agent_t agent,
    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
                                         void *cb_state),
    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
}

namespace hsa {
template <typename C> hsa_status_t iterate_agents(C cb) {
  auto L = [](hsa_agent_t agent, void *data) -> hsa_status_t {
    C *unwrapped = static_cast<C *>(data);
    return (*unwrapped)(agent);
  };
  return hsa_iterate_agents(L, static_cast<void *>(&cb));
}

template <typename C>
hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
  auto L = [](hsa_amd_memory_pool_t MemoryPool,
              void *data) -> hsa_status_t {
    C *unwrapped = static_cast<C *>(data);
    return (*unwrapped)(MemoryPool);
  };

  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&cb));
}

} // namespace hsa
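// Illustrative use of the wrappers above (a sketch, not an additional API):
// they adapt the C-style HSA iteration callbacks to arbitrary C++ callables by
// passing the callable through the void *user-data parameter, e.g.
//   hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
//     /* inspect Agent */
//     return HSA_STATUS_SUCCESS;
//   });
// FindAgents further below uses exactly this pattern.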
/// Keep entries table per device
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

struct KernelArgPool {
private:
  static pthread_mutex_t mutex;

public:
  uint32_t kernarg_segment_size;
  void *kernarg_region = nullptr;
  std::queue<int> free_kernarg_segments;

  uint32_t kernarg_size_including_implicit() {
    return kernarg_segment_size + sizeof(atmi_implicit_args_t);
  }

  ~KernelArgPool() {
    if (kernarg_region) {
      auto r = hsa_amd_memory_pool_free(kernarg_region);
      if (r != HSA_STATUS_SUCCESS) {
        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(r));
      }
    }
  }

  // Can't really copy or move a mutex
  KernelArgPool() = default;
  KernelArgPool(const KernelArgPool &) = delete;
  KernelArgPool(KernelArgPool &&) = delete;

  KernelArgPool(uint32_t kernarg_segment_size,
                hsa_amd_memory_pool_t &memory_pool)
      : kernarg_segment_size(kernarg_segment_size) {

    // atmi uses one pool per kernel for all gpus, with a fixed upper size
    // preserving that exact scheme here, including the queue<int>

    hsa_status_t err = hsa_amd_memory_pool_allocate(
        memory_pool, kernarg_size_including_implicit() * MAX_NUM_KERNELS, 0,
        &kernarg_region);

    if (err != HSA_STATUS_SUCCESS) {
      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(err));
      kernarg_region = nullptr; // paranoid
      return;
    }

    err = core::allow_access_to_all_gpu_agents(kernarg_region);
    if (err != HSA_STATUS_SUCCESS) {
      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
         get_error_string(err));
      auto r = hsa_amd_memory_pool_free(kernarg_region);
      if (r != HSA_STATUS_SUCCESS) {
        // if free failed, can't do anything more to resolve it
        DP("hsa memory pool free failed: %s\n", get_error_string(r));
      }
      kernarg_region = nullptr;
      return;
    }

    for (int i = 0; i < MAX_NUM_KERNELS; i++) {
      free_kernarg_segments.push(i);
    }
  }

  void *allocate(uint64_t arg_num) {
    assert((arg_num * sizeof(void *)) == kernarg_segment_size);
    lock l(&mutex);
    void *res = nullptr;
    if (!free_kernarg_segments.empty()) {

      int free_idx = free_kernarg_segments.front();
      res = static_cast<void *>(static_cast<char *>(kernarg_region) +
                                (free_idx * kernarg_size_including_implicit()));
      assert(free_idx == pointer_to_index(res));
      free_kernarg_segments.pop();
    }
    return res;
  }

  void deallocate(void *ptr) {
    lock l(&mutex);
    int idx = pointer_to_index(ptr);
    free_kernarg_segments.push(idx);
  }

private:
  int pointer_to_index(void *ptr) {
    ptrdiff_t bytes =
        static_cast<char *>(ptr) - static_cast<char *>(kernarg_region);
    assert(bytes >= 0);
    assert(bytes % kernarg_size_including_implicit() == 0);
    return bytes / kernarg_size_including_implicit();
  }
  struct lock {
    lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
    ~lock() { pthread_mutex_unlock(m); }
    pthread_mutex_t *m;
  };
};
pthread_mutex_t KernelArgPool::mutex = PTHREAD_MUTEX_INITIALIZER;

std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
    KernelArgPoolMap;
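// Rough usage of KernelArgPool (a sketch of the launch-side contract; the
// dispatch path itself is outside this section): look the pool up by kernel
// name, borrow one kernarg segment for the dispatch, and return it once the
// dispatch has completed, e.g.
//   KernelArgPool *Pool = KernelArgPoolMap[KernelName].get();
//   void *KernArg = Pool->allocate(ArgNum); // nullptr if all segments in use
//   ... fill arguments, dispatch, wait ...
//   Pool->deallocate(KernArg);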
/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;
  int16_t ConstWGSize;
  int32_t device_id;
  void *CallStackAddr = nullptr;
  const char *Name;

  KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id,
           void *_CallStackAddr, const char *_Name,
           uint32_t _kernarg_segment_size,
           hsa_amd_memory_pool_t &KernArgMemoryPool)
      : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize),
        device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) {
    DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);

    std::string N(_Name);
    if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
      KernelArgPoolMap.insert(
          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
                                _kernarg_segment_size, KernArgMemoryPool))));
    }
  }
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

template <typename Callback> static hsa_status_t FindAgents(Callback CB) {

  hsa_status_t err =
      hsa::iterate_agents([&](hsa_agent_t agent) -> hsa_status_t {
        hsa_device_type_t device_type;
        // get_info fails iff HSA runtime not yet initialized
        hsa_status_t err =
            hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
        if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS)
          printf("rtl.cpp: err %d\n", err);
        assert(err == HSA_STATUS_SUCCESS);

        CB(device_type, agent);
        return HSA_STATUS_SUCCESS;
      });

  // iterate_agents fails iff HSA runtime not yet initialized
  if (print_kernel_trace > 0 && err != HSA_STATUS_SUCCESS) {
    printf("rtl.cpp: err %d\n", err);
  }

  return err;
}

static void callbackQueue(hsa_status_t status, hsa_queue_t *source,
                          void *data) {
  if (status != HSA_STATUS_SUCCESS) {
    const char *status_string;
    if (hsa_status_string(status, &status_string) != HSA_STATUS_SUCCESS) {
      status_string = "unavailable";
    }
    fprintf(stderr, "[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__,
            __LINE__, source, status, status_string);
    abort();
  }
}

namespace core {
namespace {
void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest) {
  __atomic_store_n(packet, header | (rest << 16), __ATOMIC_RELEASE);
}

uint16_t create_header() {
  uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
  return header;
}
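// Note on the two helpers above: an AQL dispatch packet only becomes visible
// to the GPU's packet processor once its header word is rewritten from
// HSA_PACKET_TYPE_INVALID, so the launch code is expected to fill in the
// packet body first and then publish header and setup together, roughly
//   packet_store_release((uint32_t *)Packet, create_header(), Packet->setup);
// (illustrative sketch; the dispatch code itself lives outside this section).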
hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
  std::vector<hsa_amd_memory_pool_t> *Result =
      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
  bool AllocAllowed = false;
  hsa_status_t err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &AllocAllowed);
  if (err != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
            get_error_string(err));
    return err;
  }

  if (!AllocAllowed) {
    // nothing needs to be done here.
    return HSA_STATUS_SUCCESS;
  }

  uint32_t GlobalFlags = 0;
  err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
  if (err != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "Get memory pool info failed: %s\n", get_error_string(err));
    return err;
  }

  size_t size = 0;
  err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                     &size);
  if (err != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err));
    return err;
  }

  if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) &&
      size > 0) {
    Result->push_back(MemoryPool);
  }

  return HSA_STATUS_SUCCESS;
}

std::pair<hsa_status_t, bool>
isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
  bool AllocAllowed = false;
  hsa_status_t Err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
      &AllocAllowed);
  if (Err != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
            get_error_string(Err));
    return {Err, false};
  }

  return {HSA_STATUS_SUCCESS, AllocAllowed};
}

template <typename AccumulatorFunc>
hsa_status_t collectMemoryPools(const std::vector<hsa_agent_t> &Agents,
                                AccumulatorFunc Func) {
  for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
    hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
        Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
          hsa_status_t Err;
          bool Valid = false;
          std::tie(Err, Valid) = isValidMemoryPool(MemoryPool);
          if (Err != HSA_STATUS_SUCCESS) {
            return Err;
          }
          if (Valid)
            Func(MemoryPool, DeviceId);
          return HSA_STATUS_SUCCESS;
        });

    if (Err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(Err));
      return Err;
    }
  }

  return HSA_STATUS_SUCCESS;
}

std::pair<hsa_status_t, hsa_amd_memory_pool_t>
FindKernargPool(const std::vector<hsa_agent_t> &HSAAgents) {
  std::vector<hsa_amd_memory_pool_t> KernArgPools;
  for (const auto &Agent : HSAAgents) {
    hsa_status_t err = HSA_STATUS_SUCCESS;
    err = hsa_amd_agent_iterate_memory_pools(
        Agent, addKernArgPool, static_cast<void *>(&KernArgPools));
    if (err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
             "Iterate all memory pools", get_error_string(err));
      return {err, hsa_amd_memory_pool_t{}};
    }
  }

  if (KernArgPools.empty()) {
    fprintf(stderr, "Unable to find any valid kernarg pool\n");
    return {HSA_STATUS_ERROR, hsa_amd_memory_pool_t{}};
  }

  return {HSA_STATUS_SUCCESS, KernArgPools[0]};
}

} // namespace
} // namespace core

struct EnvironmentVariables {
  int NumTeams;
  int TeamLimit;
  int TeamThreadLimit;
  int MaxTeamsDefault;
};
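// Each field holds the value parsed from the corresponding environment
// variable, or -1 when that variable is unset (see readEnvElseMinusOne in
// RTLDeviceInfoTy below), so later code can guard with e.g.
// `if (Env.TeamLimit > 0)` before applying a limit.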
/// Class containing all the device information
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  // load binary populates symbol tables and mutates various global state;
  // run uses those symbol tables
  std::shared_timed_mutex load_run_lock;

  int NumberOfDevices;

  // GPU devices
  std::vector<hsa_agent_t> HSAAgents;
  std::vector<hsa_queue_t *> HSAQueues; // one per gpu

  // CPUs
  std::vector<hsa_agent_t> CPUAgents;

  // Device properties
  std::vector<int> ComputeUnits;
  std::vector<int> GroupsPerDevice;
  std::vector<int> ThreadsPerGroup;
  std::vector<int> WarpSize;
  std::vector<std::string> GPUName;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  EnvironmentVariables Env;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  // Resource pools
  SignalPoolT FreeSignalPool;

  bool hostcall_required = false;

  std::vector<hsa_executable_t> HSAExecutables;

  std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
  std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;

  hsa_amd_memory_pool_t KernArgPool;

  // fine grained memory pool for host allocations
  hsa_amd_memory_pool_t HostFineGrainedMemoryPool;

  // fine and coarse-grained memory pools per offloading device
  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;

  struct atmiFreePtrDeletor {
    void operator()(void *p) {
      core::Runtime::Memfree(p); // ignore failure to free
    }
  };

  // device_State shared across loaded binaries, error if inconsistent size
  std::vector<std::pair<std::unique_ptr<void, atmiFreePtrDeletor>, uint64_t>>
      deviceStateStore;

  static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K - 1 so it fits in a uint16
  static const int DefaultNumTeams = 128;
  static const int Max_Teams =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
  static const int Warp_Size =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
  static const int Max_WG_Size =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size];
  static const int Default_WG_Size =
      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];

  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
                                      size_t size, hsa_agent_t);
  hsa_status_t freesignalpool_memcpy(void *dest, const void *src, size_t size,
                                     MemcpyFunc Func, int32_t deviceId) {
    hsa_agent_t agent = HSAAgents[deviceId];
    hsa_signal_t s = FreeSignalPool.pop();
    if (s.handle == 0) {
      return HSA_STATUS_ERROR;
    }
    hsa_status_t r = Func(s, dest, src, size, agent);
    FreeSignalPool.push(s);
    return r;
  }

  hsa_status_t freesignalpool_memcpy_d2h(void *dest, const void *src,
                                         size_t size, int32_t deviceId) {
    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_d2h, deviceId);
  }

  hsa_status_t freesignalpool_memcpy_h2d(void *dest, const void *src,
                                         size_t size, int32_t deviceId) {
    return freesignalpool_memcpy(dest, src, size, atmi_memcpy_h2d, deviceId);
  }
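  // Sketch of how the three helpers above are used by the data-transfer entry
  // points later in this file: a completion signal is borrowed from
  // FreeSignalPool, handed to the ATMI copy routine, then returned, e.g.
  //   hsa_status_t Err =
  //       DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, Size, DeviceId);
  // Callers here treat the helper as a synchronous copy: it returns once the
  // copy has finished or failed.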
  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    // KernelArgPoolMap.clear();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                                   int DeviceId) {
    assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
    uint32_t GlobalFlags = 0;
    hsa_status_t Err = hsa_amd_memory_pool_get_info(
        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

    if (Err != HSA_STATUS_SUCCESS) {
      return Err;
    }

    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
    }

    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t addHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                                 int DeviceId) {
    uint32_t GlobalFlags = 0;
    hsa_status_t Err = hsa_amd_memory_pool_get_info(
        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

    if (Err != HSA_STATUS_SUCCESS) {
      return Err;
    }

    uint32_t Size;
    Err = hsa_amd_memory_pool_get_info(MemoryPool,
                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size);
    if (Err != HSA_STATUS_SUCCESS) {
      return Err;
    }

    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED &&
        Size > 0) {
      HostFineGrainedMemoryPool = MemoryPool;
    }

    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t setupMemoryPools() {
    using namespace std::placeholders;
    hsa_status_t Err;
    Err = core::collectMemoryPools(
        CPUAgents,
        std::bind(&RTLDeviceInfoTy::addHostMemoryPool, this, _1, _2));
    if (Err != HSA_STATUS_SUCCESS) {
      fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n",
              get_error_string(Err));
      return Err;
    }
    Err = core::collectMemoryPools(
        HSAAgents,
        std::bind(&RTLDeviceInfoTy::addDeviceMemoryPool, this, _1, _2));
    if (Err != HSA_STATUS_SUCCESS) {
      fprintf(stderr,
              "HSA error in collecting memory pools for offload devices: %s\n",
              get_error_string(Err));
      return Err;
    }
    return HSA_STATUS_SUCCESS;
  }

  hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
    assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
           "Invalid device Id");
    return DeviceCoarseGrainedMemoryPools[DeviceId];
  }

  hsa_amd_memory_pool_t getHostMemoryPool() {
    return HostFineGrainedMemoryPool;
  }
  static int readEnvElseMinusOne(const char *Env) {
    const char *envStr = getenv(Env);
    int res = -1;
    if (envStr) {
      res = std::stoi(envStr);
      DP("Parsed %s=%d\n", Env, res);
    }
    return res;
  }

  RTLDeviceInfoTy() {
    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at
    // any time; a debug library build is not required.
    // 0 => no tracing
    // 1 => tracing dispatch only
    // >1 => verbosity increase
    if (char *envStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
      print_kernel_trace = atoi(envStr);
    else
      print_kernel_trace = 0;

    DP("Start initializing HSA-ATMI\n");
    hsa_status_t err = core::atl_init_gpu_context();
    if (err != HSA_STATUS_SUCCESS) {
      DP("Error when initializing HSA-ATMI\n");
      return;
    }

    // Init hostcall soon after initializing ATMI
    hostrpc_init();

    err = FindAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
      if (DeviceType == HSA_DEVICE_TYPE_CPU) {
        CPUAgents.push_back(Agent);
      } else {
        HSAAgents.push_back(Agent);
      }
    });
    if (err != HSA_STATUS_SUCCESS)
      return;

    NumberOfDevices = (int)HSAAgents.size();

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting HSA.\n");
      return;
    } else {
      DP("There are %d devices supporting HSA.\n", NumberOfDevices);
    }

    std::tie(err, KernArgPool) = core::FindKernargPool(CPUAgents);
    if (err != HSA_STATUS_SUCCESS) {
      DP("Error when reading memory pools\n");
      return;
    }

    // Init the device info
    HSAQueues.resize(NumberOfDevices);
    FuncGblEntries.resize(NumberOfDevices);
    ThreadsPerGroup.resize(NumberOfDevices);
    ComputeUnits.resize(NumberOfDevices);
    GPUName.resize(NumberOfDevices);
    GroupsPerDevice.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);
    deviceStateStore.resize(NumberOfDevices);
    KernelInfoTable.resize(NumberOfDevices);
    SymbolInfoTable.resize(NumberOfDevices);
    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);

    err = setupMemoryPools();
    if (err != HSA_STATUS_SUCCESS) {
      DP("Error when setting up memory pools\n");
      return;
    }

    for (int i = 0; i < NumberOfDevices; i++) {
      HSAQueues[i] = nullptr;
    }

    for (int i = 0; i < NumberOfDevices; i++) {
      uint32_t queue_size = 0;
      {
        hsa_status_t err = hsa_agent_get_info(
            HSAAgents[i], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
        if (err != HSA_STATUS_SUCCESS) {
          DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", i);
          return;
        }
        if (queue_size > core::Runtime::getInstance().getMaxQueueSize()) {
          queue_size = core::Runtime::getInstance().getMaxQueueSize();
        }
      }

      hsa_status_t rc = hsa_queue_create(
          HSAAgents[i], queue_size, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL,
          UINT32_MAX, UINT32_MAX, &HSAQueues[i]);
      if (rc != HSA_STATUS_SUCCESS) {
        DP("Failed to create HSA queue %d\n", i);
        return;
      }

      deviceStateStore[i] = {nullptr, 0};
    }

    for (int i = 0; i < NumberOfDevices; i++) {
      ThreadsPerGroup[i] = RTLDeviceInfoTy::Default_WG_Size;
      GroupsPerDevice[i] = RTLDeviceInfoTy::DefaultNumTeams;
      ComputeUnits[i] = 1;
      DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", i,
         GroupsPerDevice[i], ThreadsPerGroup[i]);
    }

    // Get environment variables regarding teams
    Env.TeamLimit = readEnvElseMinusOne("OMP_TEAM_LIMIT");
    Env.NumTeams = readEnvElseMinusOne("OMP_NUM_TEAMS");
    Env.MaxTeamsDefault = readEnvElseMinusOne("OMP_MAX_TEAMS_DEFAULT");
    Env.TeamThreadLimit = readEnvElseMinusOne("OMP_TEAMS_THREAD_LIMIT");

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;
  }
  ~RTLDeviceInfoTy() {
    DP("Finalizing the HSA-ATMI DeviceInfo.\n");
    // Run destructors on types that use HSA before
    // atmi_finalize removes access to it
    deviceStateStore.clear();
    KernelArgPoolMap.clear();
    // Terminate hostrpc before finalizing ATMI
    hostrpc_terminate();

    hsa_status_t Err;
    for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
      Err = hsa_executable_destroy(HSAExecutables[I]);
      if (Err != HSA_STATUS_SUCCESS) {
        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
           "Destroying executable", get_error_string(Err));
      }
    }

    Err = hsa_shut_down();
    if (Err != HSA_STATUS_SUCCESS) {
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Shutting down HSA",
             get_error_string(Err));
    }
  }
};

pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;

// TODO: May need to drop the trailing two fields until deviceRTL is updated
struct omptarget_device_environmentTy {
  int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
                       // only useful for Debug build of deviceRTLs
  int32_t num_devices; // gets number of active offload devices
  int32_t device_num;  // gets a value 0 to num_devices-1
};

static RTLDeviceInfoTy DeviceInfo;

namespace {

int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
                     __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not copying back to host from target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;
  hsa_status_t err;
  DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);

  err = DeviceInfo.freesignalpool_memcpy_d2h(HstPtr, TgtPtr, (size_t)Size,
                                             DeviceId);

  if (err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)TgtPtr,
     (long long unsigned)(Elf64_Addr)HstPtr);
  return OFFLOAD_SUCCESS;
}

int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
                   __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  hsa_status_t err;
  assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large");
  // Return success if we are not doing host to target.
  if (!HstPtr)
    return OFFLOAD_SUCCESS;

  DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
     (long long unsigned)(Elf64_Addr)HstPtr,
     (long long unsigned)(Elf64_Addr)TgtPtr);
  err = DeviceInfo.freesignalpool_memcpy_h2d(TgtPtr, HstPtr, (size_t)Size,
                                             DeviceId);
  if (err != HSA_STATUS_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: "
       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Async.
// The implementation was written with cuda streams in mind. The semantics of
// that are to execute kernels on a queue in order of insertion. A synchronise
// call then makes writes visible between host and device. This means a series
// of N data_submit_async calls are expected to execute serially. HSA offers
// various options to run the data copies concurrently. This may require
// changes to libomptarget.

// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate
// that there are no outstanding kernels that need to be synchronized. Any
// async call may be passed a Queue==0, at which point the cuda implementation
// will set it to non-null (see getStream). The cuda streams are per-device.
// Upstream may change this interface to explicitly initialize the AsyncInfo
// pointer, but until then hsa lazily initializes it as well.
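// A sketch of the expected sequence (illustrative, not an additional API):
// libomptarget issues __tgt_rtl_data_submit_async(...), possibly a kernel
// launch, and then __tgt_rtl_synchronize(device_id, AsyncInfo). Because the
// copy helpers above behave synchronously, the async entry points below mostly
// just tag AsyncInfo->Queue via initAsyncInfo/finiAsyncInfo rather than
// tracking real outstanding work.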
void initAsyncInfo(__tgt_async_info *AsyncInfo) {
  // set non-null while using async calls, return to null to indicate
  // completion
  assert(AsyncInfo);
  if (!AsyncInfo->Queue) {
    AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
  }
}
void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
  assert(AsyncInfo);
  assert(AsyncInfo->Queue);
  AsyncInfo->Queue = 0;
}

bool elf_machine_id_is_amdgcn(__tgt_device_image *image) {
  const uint16_t amdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
  int32_t r = elf_check_machine(image, amdgcnMachineID);
  if (!r) {
    DP("Supported machine ID not found\n");
  }
  return r;
}

uint32_t elf_e_flags(__tgt_device_image *image) {
  char *img_begin = (char *)image->ImageStart;
  size_t img_size = (char *)image->ImageEnd - img_begin;

  Elf *e = elf_memory(img_begin, img_size);
  if (!e) {
    DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
    return 0;
  }

  Elf64_Ehdr *eh64 = elf64_getehdr(e);

  if (!eh64) {
    DP("Unable to get machine ID from ELF file!\n");
    elf_end(e);
    return 0;
  }

  uint32_t Flags = eh64->e_flags;

  elf_end(e);
  DP("ELF Flags: 0x%x\n", Flags);
  return Flags;
}
} // namespace

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_machine_id_is_amdgcn(image);
}

int __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

namespace {
template <typename T> bool enforce_upper_bound(T *value, T upper) {
  bool changed = *value > upper;
  if (changed) {
    *value = upper;
  }
  return changed;
}
} // namespace
int32_t __tgt_rtl_init_device(int device_id) {
  hsa_status_t err;

  // this is per device id init
  DP("Initialize the device id: %d\n", device_id);

  hsa_agent_t agent = DeviceInfo.HSAAgents[device_id];

  // Get number of Compute Units
  uint32_t compute_units = 0;
  err = hsa_agent_get_info(
      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
      &compute_units);
  if (err != HSA_STATUS_SUCCESS) {
    DeviceInfo.ComputeUnits[device_id] = 1;
    DP("Error getting compute units: setting to 1\n");
  } else {
    DeviceInfo.ComputeUnits[device_id] = compute_units;
    DP("Using %d compute units per grid\n", DeviceInfo.ComputeUnits[device_id]);
  }

  char GetInfoName[64]; // 64 max size returned by get info
  err = hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
                           (void *)GetInfoName);
  if (err)
    DeviceInfo.GPUName[device_id] = "--unknown gpu--";
  else {
    DeviceInfo.GPUName[device_id] = GetInfoName;
  }

  if (print_kernel_trace & STARTUP_DETAILS)
    fprintf(stderr, "Device#%-2d CU's: %2d %s\n", device_id,
            DeviceInfo.ComputeUnits[device_id],
            DeviceInfo.GPUName[device_id].c_str());

  // Query attributes to determine number of threads/block and blocks/grid.
  uint16_t workgroup_max_dim[3];
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                           &workgroup_max_dim);
  if (err != HSA_STATUS_SUCCESS) {
    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Error getting grid dims: num groups : %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  } else if (workgroup_max_dim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.GroupsPerDevice[device_id] = workgroup_max_dim[0];
    DP("Using %d ROCm blocks per grid\n",
       DeviceInfo.GroupsPerDevice[device_id]);
  } else {
    DeviceInfo.GroupsPerDevice[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       workgroup_max_dim[0], RTLDeviceInfoTy::HardTeamLimit);
  }

  // Get thread limit
  hsa_dim3_t grid_max_dim;
  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM, &grid_max_dim);
  if (err == HSA_STATUS_SUCCESS) {
    DeviceInfo.ThreadsPerGroup[device_id] =
        reinterpret_cast<uint32_t *>(&grid_max_dim)[0] /
        DeviceInfo.GroupsPerDevice[device_id];

    if (DeviceInfo.ThreadsPerGroup[device_id] == 0) {
      DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
      DP("Default thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
    } else if (enforce_upper_bound(&DeviceInfo.ThreadsPerGroup[device_id],
                                   RTLDeviceInfoTy::Max_WG_Size)) {
      DP("Capped thread limit: %d\n", RTLDeviceInfoTy::Max_WG_Size);
    } else {
      DP("Using ROCm Queried thread limit: %d\n",
         DeviceInfo.ThreadsPerGroup[device_id]);
    }
  } else {
    DeviceInfo.ThreadsPerGroup[device_id] = RTLDeviceInfoTy::Max_WG_Size;
    DP("Error getting max block dimension, use default: %d\n",
       RTLDeviceInfoTy::Max_WG_Size);
  }

  // Get wavefront size
  uint32_t wavefront_size = 0;
  err =
      hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size);
  if (err == HSA_STATUS_SUCCESS) {
    DP("Queried wavefront size: %d\n", wavefront_size);
    DeviceInfo.WarpSize[device_id] = wavefront_size;
  } else {
    DP("Default wavefront size: %d\n",
       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size]);
    DeviceInfo.WarpSize[device_id] =
        llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
  }
  // Adjust teams to the env variables

  if (DeviceInfo.Env.TeamLimit > 0 &&
      (enforce_upper_bound(&DeviceInfo.GroupsPerDevice[device_id],
                           DeviceInfo.Env.TeamLimit))) {
    DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.Env.TeamLimit);
  }

  // Set default number of teams
  if (DeviceInfo.Env.NumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.Env.NumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.Env.NumTeams);
  } else {
    char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
    int TeamsPerCU = DefaultTeamsPerCU;
    if (TeamsPerCUEnvStr) {
      TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
    }

    DeviceInfo.NumTeams[device_id] =
        TeamsPerCU * DeviceInfo.ComputeUnits[device_id];
    DP("Default number of teams = %d * number of compute units %d\n",
       TeamsPerCU, DeviceInfo.ComputeUnits[device_id]);
  }

  if (enforce_upper_bound(&DeviceInfo.NumTeams[device_id],
                          DeviceInfo.GroupsPerDevice[device_id])) {
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.GroupsPerDevice[device_id]);
  }

  // Adjust threads to the env variables
  if (DeviceInfo.Env.TeamThreadLimit > 0 &&
      (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
                           DeviceInfo.Env.TeamThreadLimit))) {
    DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
       DeviceInfo.Env.TeamThreadLimit);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::Default_WG_Size);
  if (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
                          DeviceInfo.ThreadsPerGroup[device_id])) {
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerGroup[device_id]);
  }

  DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
     device_id, DeviceInfo.GroupsPerDevice[device_id],
     DeviceInfo.ThreadsPerGroup[device_id]);

  DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", device_id,
     DeviceInfo.WarpSize[device_id], DeviceInfo.ThreadsPerGroup[device_id],
     DeviceInfo.GroupsPerDevice[device_id],
     DeviceInfo.GroupsPerDevice[device_id] *
         DeviceInfo.ThreadsPerGroup[device_id]);

  return OFFLOAD_SUCCESS;
}

namespace {
Elf64_Shdr *find_only_SHT_HASH(Elf *elf) {
  size_t N;
  int rc = elf_getshdrnum(elf, &N);
  if (rc != 0) {
    return nullptr;
  }

  Elf64_Shdr *result = nullptr;
  for (size_t i = 0; i < N; i++) {
    Elf_Scn *scn = elf_getscn(elf, i);
    if (scn) {
      Elf64_Shdr *shdr = elf64_getshdr(scn);
      if (shdr) {
        if (shdr->sh_type == SHT_HASH) {
          if (result == nullptr) {
            result = shdr;
          } else {
            // multiple SHT_HASH sections not handled
            return nullptr;
          }
        }
      }
    }
  }
  return result;
}

const Elf64_Sym *elf_lookup(Elf *elf, char *base, Elf64_Shdr *section_hash,
                            const char *symname) {

  assert(section_hash);
  size_t section_symtab_index = section_hash->sh_link;
  Elf64_Shdr *section_symtab =
      elf64_getshdr(elf_getscn(elf, section_symtab_index));
  size_t section_strtab_index = section_symtab->sh_link;
  const Elf64_Sym *symtab =
      reinterpret_cast<const Elf64_Sym *>(base + section_symtab->sh_offset);

  const uint32_t *hashtab =
      reinterpret_cast<const uint32_t *>(base + section_hash->sh_offset);

  // Layout:
  //   nbucket
  //   nchain
  //   bucket[nbucket]
  //   chain[nchain]
  uint32_t nbucket = hashtab[0];
  const uint32_t *bucket = &hashtab[2];
  const uint32_t *chain = &hashtab[nbucket + 2];

  const size_t max = strlen(symname) + 1;
  const uint32_t hash = elf_hash(symname);
  for (uint32_t i = bucket[hash % nbucket]; i != 0; i = chain[i]) {
    char *n = elf_strptr(elf, section_strtab_index, symtab[i].st_name);
    if (strncmp(symname, n, max) == 0) {
      return &symtab[i];
    }
  }

  return nullptr;
}

struct symbol_info {
  void *addr = nullptr;
  uint32_t size = UINT32_MAX;
  uint32_t sh_type = SHT_NULL;
};

int get_symbol_info_without_loading(Elf *elf, char *base, const char *symname,
                                    symbol_info *res) {
  if (elf_kind(elf) != ELF_K_ELF) {
    return 1;
  }

  Elf64_Shdr *section_hash = find_only_SHT_HASH(elf);
  if (!section_hash) {
    return 1;
  }

  const Elf64_Sym *sym = elf_lookup(elf, base, section_hash, symname);
  if (!sym) {
    return 1;
  }

  if (sym->st_size > UINT32_MAX) {
    return 1;
  }

  if (sym->st_shndx == SHN_UNDEF) {
    return 1;
  }

  Elf_Scn *section = elf_getscn(elf, sym->st_shndx);
  if (!section) {
    return 1;
  }

  Elf64_Shdr *header = elf64_getshdr(section);
  if (!header) {
    return 1;
  }

  res->addr = sym->st_value + base;
  res->size = static_cast<uint32_t>(sym->st_size);
  res->sh_type = header->sh_type;
  return 0;
}

int get_symbol_info_without_loading(char *base, size_t img_size,
                                    const char *symname, symbol_info *res) {
  Elf *elf = elf_memory(base, img_size);
  if (elf) {
    int rc = get_symbol_info_without_loading(elf, base, symname, res);
    elf_end(elf);
    return rc;
  }
  return 1;
}

hsa_status_t interop_get_symbol_info(char *base, size_t img_size,
                                     const char *symname, void **var_addr,
                                     uint32_t *var_size) {
  symbol_info si;
  int rc = get_symbol_info_without_loading(base, img_size, symname, &si);
  if (rc == 0) {
    *var_addr = si.addr;
    *var_size = si.size;
    return HSA_STATUS_SUCCESS;
  } else {
    return HSA_STATUS_ERROR;
  }
}

template <typename C>
hsa_status_t module_register_from_memory_to_place(
    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
    void *module_bytes, size_t module_size, int DeviceId, C cb,
    std::vector<hsa_executable_t> &HSAExecutables) {
  auto L = [](void *data, size_t size, void *cb_state) -> hsa_status_t {
    C *unwrapped = static_cast<C *>(cb_state);
    return (*unwrapped)(data, size);
  };
  return core::RegisterModuleFromMemory(
      KernelInfoTable, SymbolInfoTable, module_bytes, module_size,
      DeviceInfo.HSAAgents[DeviceId], L, static_cast<void *>(&cb),
      HSAExecutables);
}
} // namespace
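// Illustrative use of the ELF helpers above (a sketch): reading a symbol from
// the host copy of a device image before it has been loaded, e.g.
//   symbol_info SI;
//   if (get_symbol_info_without_loading((char *)Image->ImageStart, ImgSize,
//                                       "some_symbol", &SI) == 0) {
//     // SI.addr points into the host buffer; SI.sh_type distinguishes
//     // .data (SHT_PROGBITS) from .bss (SHT_NOBITS).
//   }
// "some_symbol" is a placeholder; real callers below query names such as
// "omptarget_nvptx_device_State_size".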
static uint64_t get_device_State_bytes(char *ImageStart, size_t img_size) {
  uint64_t device_State_bytes = 0;
  {
    // If this is the deviceRTL, get the state variable size
    symbol_info size_si;
    int rc = get_symbol_info_without_loading(
        ImageStart, img_size, "omptarget_nvptx_device_State_size", &size_si);

    if (rc == 0) {
      if (size_si.size != sizeof(uint64_t)) {
        DP("Found device_State_size variable with wrong size\n");
        return 0;
      }

      // Read number of bytes directly from the elf
      memcpy(&device_State_bytes, size_si.addr, sizeof(uint64_t));
    }
  }
  return device_State_bytes;
}

static __tgt_target_table *
__tgt_rtl_load_binary_locked(int32_t device_id, __tgt_device_image *image);

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  DeviceInfo.load_run_lock.lock();
  __tgt_target_table *res = __tgt_rtl_load_binary_locked(device_id, image);
  DeviceInfo.load_run_lock.unlock();
  return res;
}

struct device_environment {
  // initialise an omptarget_device_environmentTy in the deviceRTL
  // patches around differences in the deviceRTL between trunk, aomp,
  // rocmcc. Over time these differences will tend to zero and this class
  // can be simplified.
  // Symbol may be in .data or .bss, and may be missing fields:
  //  - aomp has debug_level, num_devices, device_num
  //  - trunk has debug_level
  //  - under review in trunk is debug_level, device_num
  //  - rocmcc matches aomp, patch to swap num_devices and device_num

  // The symbol may also have been deadstripped because the device side
  // accessors were unused.

  // If the symbol is in .data (aomp, rocm) it can be written directly.
  // If it is in .bss, we must wait for it to be allocated space on the
  // gpu (trunk) and initialize after loading.
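  // Roughly how this is driven from __tgt_rtl_load_binary_locked below (a
  // sketch): before_loading() runs as the module-registration callback and
  // patches the .data copy of the symbol inside the host image buffer, while
  // after_loading() handles the .bss case by looking the symbol up in the
  // loaded executable and copying host_device_env over with
  // freesignalpool_memcpy_h2d.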
  const char *sym() { return "omptarget_device_environment"; }

  omptarget_device_environmentTy host_device_env;
  symbol_info si;
  bool valid = false;

  __tgt_device_image *image;
  const size_t img_size;

  device_environment(int device_id, int number_devices,
                     __tgt_device_image *image, const size_t img_size)
      : image(image), img_size(img_size) {

    host_device_env.num_devices = number_devices;
    host_device_env.device_num = device_id;
    host_device_env.debug_level = 0;
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      host_device_env.debug_level = std::stoi(envStr);
    }
#endif

    int rc = get_symbol_info_without_loading((char *)image->ImageStart,
                                             img_size, sym(), &si);
    if (rc != 0) {
      DP("Finding global device environment '%s' - symbol missing.\n", sym());
      return;
    }

    if (si.size > sizeof(host_device_env)) {
      DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), si.size,
         sizeof(host_device_env));
      return;
    }

    valid = true;
  }

  bool in_image() { return si.sh_type != SHT_NOBITS; }

  hsa_status_t before_loading(void *data, size_t size) {
    if (valid) {
      if (in_image()) {
        DP("Setting global device environment before load (%u bytes)\n",
           si.size);
        uint64_t offset = (char *)si.addr - (char *)image->ImageStart;
        void *pos = (char *)data + offset;
        memcpy(pos, &host_device_env, si.size);
      }
    }
    return HSA_STATUS_SUCCESS;
  }

  hsa_status_t after_loading() {
    if (valid) {
      if (!in_image()) {
        DP("Setting global device environment after load (%u bytes)\n",
           si.size);
        int device_id = host_device_env.device_num;
        auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
        void *state_ptr;
        uint32_t state_ptr_size;
        hsa_status_t err = atmi_interop_hsa_get_symbol_info(
            SymbolInfo, device_id, sym(), &state_ptr, &state_ptr_size);
        if (err != HSA_STATUS_SUCCESS) {
          DP("failed to find %s in loaded image\n", sym());
          return err;
        }

        if (state_ptr_size != si.size) {
          DP("Symbol had size %u before loading, %u after\n", state_ptr_size,
             si.size);
          return HSA_STATUS_ERROR;
        }

        return DeviceInfo.freesignalpool_memcpy_h2d(
            state_ptr, &host_device_env, state_ptr_size, device_id);
      }
    }
    return HSA_STATUS_SUCCESS;
  }
};

static hsa_status_t atmi_calloc(void **ret_ptr, size_t size, int DeviceId) {
  uint64_t rounded = 4 * ((size + 3) / 4);
  void *ptr;
  hsa_status_t err = core::Runtime::DeviceMalloc(&ptr, rounded, DeviceId);
  if (err != HSA_STATUS_SUCCESS) {
    return err;
  }

  hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
  if (rc != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "zero fill device_state failed with %u\n", rc);
    core::Runtime::Memfree(ptr);
    return HSA_STATUS_ERROR;
  }

  *ret_ptr = ptr;
  return HSA_STATUS_SUCCESS;
}

static bool image_contains_symbol(void *data, size_t size, const char *sym) {
  symbol_info si;
  int rc = get_symbol_info_without_loading((char *)data, size, sym, &si);
  return (rc == 0) && (si.addr != nullptr);
}

__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
                                                 __tgt_device_image *image) {
  // This function loads the device image onto gpu[device_id] and does other
  // per-image initialization work. Specifically:
  //
  // - Initialize an omptarget_device_environmentTy instance embedded in the
  //   image at the symbol "omptarget_device_environment".
  //   Fields debug_level, device_num, num_devices. Used by the deviceRTL.
  //
  // - Allocate a large array per-gpu (could be moved to init_device)
  //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
  //   - Allocate at least that many bytes of gpu memory
  //   - Zero initialize it
  //   - Write the pointer to the symbol omptarget_nvptx_device_State
  //
  // - Pulls some per-kernel information together from various sources and
  //   records it in the KernelsList for quicker access later
  //
  // The initialization can be done before or after loading the image onto the
  // gpu. This function presently does a mixture. Using the hsa api to get/set
  // the information is simpler to implement, in exchange for more complicated
  // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
  // back from the gpu vs a hashtable lookup on the host.

  const size_t img_size = (char *)image->ImageEnd - (char *)image->ImageStart;

  DeviceInfo.clearOffloadEntriesTable(device_id);

  // We do not need to set the ELF version because the caller of this function
  // had to do that to decide the right runtime to use

  if (!elf_machine_id_is_amdgcn(image)) {
    return NULL;
  }

  {
    auto env = device_environment(device_id, DeviceInfo.NumberOfDevices, image,
                                  img_size);

    auto &KernelInfo = DeviceInfo.KernelInfoTable[device_id];
    auto &SymbolInfo = DeviceInfo.SymbolInfoTable[device_id];
    hsa_status_t err = module_register_from_memory_to_place(
        KernelInfo, SymbolInfo, (void *)image->ImageStart, img_size, device_id,
        [&](void *data, size_t size) {
          if (image_contains_symbol(data, size, "needs_hostcall_buffer")) {
            __atomic_store_n(&DeviceInfo.hostcall_required, true,
                             __ATOMIC_RELEASE);
          }
          return env.before_loading(data, size);
        },
        DeviceInfo.HSAExecutables);

    check("Module registering", err);
    if (err != HSA_STATUS_SUCCESS) {
      fprintf(stderr,
              "Possible gpu arch mismatch: device:%s, image:%s please check"
              " compiler flag: -march=<gpu>\n",
              DeviceInfo.GPUName[device_id].c_str(),
              get_elf_mach_gfx_name(elf_e_flags(image)));
      return NULL;
    }

    err = env.after_loading();
    if (err != HSA_STATUS_SUCCESS) {
      return NULL;
    }
  }

  DP("ATMI module successfully loaded!\n");

  {
    // the device_State array is either a large value in bss or a void* that
    // needs to be assigned to a pointer to an array of size device_state_bytes.
    // If absent, it has been deadstripped and needs no setup.

    void *state_ptr;
    uint32_t state_ptr_size;
    auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
    hsa_status_t err = atmi_interop_hsa_get_symbol_info(
        SymbolInfoMap, device_id, "omptarget_nvptx_device_State", &state_ptr,
        &state_ptr_size);

    if (err != HSA_STATUS_SUCCESS) {
      DP("No device_state symbol found, skipping initialization\n");
    } else {
      if (state_ptr_size < sizeof(void *)) {
        DP("unexpected size of state_ptr %u != %zu\n", state_ptr_size,
           sizeof(void *));
        return NULL;
      }

      // if it's larger than a void*, assume it's a bss array and no further
      // initialization is required. Only try to set up a pointer for
      // sizeof(void*)
      if (state_ptr_size == sizeof(void *)) {
        uint64_t device_State_bytes =
            get_device_State_bytes((char *)image->ImageStart, img_size);
        if (device_State_bytes == 0) {
          DP("Can't initialize device_State, missing size information\n");
          return NULL;
        }

        auto &dss = DeviceInfo.deviceStateStore[device_id];
        if (dss.first.get() == nullptr) {
          assert(dss.second == 0);
          void *ptr = NULL;
          hsa_status_t err = atmi_calloc(&ptr, device_State_bytes, device_id);
          if (err != HSA_STATUS_SUCCESS) {
            DP("Failed to allocate device_state array\n");
            return NULL;
          }
          dss = {
              std::unique_ptr<void, RTLDeviceInfoTy::atmiFreePtrDeletor>{ptr},
              device_State_bytes,
          };
        }

        void *ptr = dss.first.get();
        if (device_State_bytes != dss.second) {
          DP("Inconsistent sizes of device_State unsupported\n");
          return NULL;
        }

        // write ptr to device memory so it can be used by later kernels
        err = DeviceInfo.freesignalpool_memcpy_h2d(state_ptr, &ptr,
                                                   sizeof(void *), device_id);
        if (err != HSA_STATUS_SUCCESS) {
          DP("memcpy install of state_ptr failed\n");
          return NULL;
        }
      }
    }
  }

  // Here, we take advantage of the data that is appended after img_end to get
  // the symbols' name we need to load. This data consists of the host entries
  // begin and end as well as the target name (see the offloading linker script
  // creation in the clang compiler).

  // Find the symbols in the module by name. The name can be obtained by
  // concatenating the host entry name with the target name.

  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // The host should always have something in the address to
      // uniquely identify the target region.
      fprintf(stderr, "Analyzing host entry '<null>' (size = %lld)...\n",
              (unsigned long long)e->size);
      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      void *varptr;
      uint32_t varsize;

      auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[device_id];
      hsa_status_t err = atmi_interop_hsa_get_symbol_info(
          SymbolInfoMap, device_id, e->name, &varptr, &varsize);

      if (err != HSA_STATUS_SUCCESS) {
        // Inform the user what symbol prevented offloading
        DP("Loading global '%s' (Failed)\n", e->name);
        return NULL;
      }

      if (varsize != e->size) {
        DP("Loading global '%s' - size mismatch (%u != %lu)\n", e->name,
           varsize, e->size);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(varptr));
      entry.addr = (void *)varptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
          e->flags & OMP_DECLARE_TARGET_LINK) {
        // If unified memory is present any target link variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        err = DeviceInfo.freesignalpool_memcpy_h2d(varptr, e->addr,
                                                   sizeof(void *), device_id);
        if (err != HSA_STATUS_SUCCESS)
          DP("Error when copying USM\n");
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(varptr));
      }

      continue;
    }

    DP("to find the kernel name: %s size: %lu\n", e->name, strlen(e->name));

    uint32_t kernarg_segment_size;
    auto &KernelInfoMap = DeviceInfo.KernelInfoTable[device_id];
    hsa_status_t err = atmi_interop_hsa_get_kernel_info(
        KernelInfoMap, device_id, e->name,
        HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
        &kernarg_segment_size);

    // each arg is a void * in this openmp implementation
    uint32_t arg_num = kernarg_segment_size / sizeof(void *);
    std::vector<size_t> arg_sizes(arg_num);
    for (std::vector<size_t>::iterator it = arg_sizes.begin();
         it != arg_sizes.end(); it++) {
      *it = sizeof(void *);
    }

    // default value GENERIC (in case the symbol is missing from the code
    // object)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;

    // get flat group size if present, else Default_WG_Size
    int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;

    // get Kernel Descriptor if present.
    // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
    struct KernDescValType {
      uint16_t Version;
      uint16_t TSize;
      uint16_t WG_Size;
      uint8_t Mode;
    };
    struct KernDescValType KernDescVal;
    std::string KernDescNameStr(e->name);
    KernDescNameStr += "_kern_desc";
    const char *KernDescName = KernDescNameStr.c_str();

    void *KernDescPtr;
    uint32_t KernDescSize;
    void *CallStackAddr = nullptr;
    err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                  KernDescName, &KernDescPtr, &KernDescSize);

    if (err == HSA_STATUS_SUCCESS) {
      if ((size_t)KernDescSize != sizeof(KernDescVal))
        DP("Loading global computation properties '%s' - size mismatch (%u != "
           "%lu)\n",
           KernDescName, KernDescSize, sizeof(KernDescVal));

      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);

      // Check structure size against recorded size.
      if ((size_t)KernDescSize != KernDescVal.TSize)
        DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",
           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);

      DP("After loading global for %s KernDesc \n", KernDescName);
      DP("KernDesc: Version: %d\n", KernDescVal.Version);
      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
      DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size);
      DP("KernDesc: Mode: %d\n", KernDescVal.Mode);

      // Get ExecMode
      ExecModeVal = KernDescVal.Mode;
      DP("ExecModeVal %d\n", ExecModeVal);
      if (KernDescVal.WG_Size == 0) {
        KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size;
        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size);
      }
      WGSizeVal = KernDescVal.WG_Size;
      DP("WGSizeVal %d\n", WGSizeVal);
      check("Loading KernDesc computation property", err);
    } else {
      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);

      // Generic
      std::string ExecModeNameStr(e->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      void *ExecModePtr;
      uint32_t varsize;
      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                    ExecModeName, &ExecModePtr, &varsize);

      if (err == HSA_STATUS_SUCCESS) {
        if ((size_t)varsize != sizeof(int8_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
             "!= %lu)\n",
             ExecModeName, varsize, sizeof(int8_t));
          return NULL;
        }

        memcpy(&ExecModeVal, ExecModePtr, (size_t)varsize);

        DP("After loading global for %s ExecMode = %d\n", ExecModeName,
           ExecModeVal);

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error wrong exec_mode value specified in HSA code object file: "
             "%d\n",
             ExecModeVal);
          return NULL;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }
      check("Loading computation property", err);

      // Flat group size
      std::string WGSizeNameStr(e->name);
      WGSizeNameStr += "_wg_size";
      const char *WGSizeName = WGSizeNameStr.c_str();

      void *WGSizePtr;
      uint32_t WGSize;
      err = interop_get_symbol_info((char *)image->ImageStart, img_size,
                                    WGSizeName, &WGSizePtr, &WGSize);

      if (err == HSA_STATUS_SUCCESS) {
        if ((size_t)WGSize != sizeof(int16_t)) {
          DP("Loading global computation properties '%s' - size mismatch (%u "
             "!= %lu)\n",
             WGSizeName, WGSize, sizeof(int16_t));
          return NULL;
        }

        memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);

        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);

        if (WGSizeVal < RTLDeviceInfoTy::Default_WG_Size ||
            WGSizeVal > RTLDeviceInfoTy::Max_WG_Size) {
          DP("Error wrong WGSize value specified in HSA code object file: "
             "%d\n",
             WGSizeVal);
          WGSizeVal = RTLDeviceInfoTy::Default_WG_Size;
        }
      } else {
        DP("Warning: Loading WGSize '%s' - symbol not found, "
           "using default value %d\n",
           WGSizeName, WGSizeVal);
      }

      check("Loading WGSize computation property", err);
    }

    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id,
                                   CallStackAddr, e->name,
                                   kernarg_segment_size,
                                   DeviceInfo.KernArgPool));
    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
    DP("Entry point %ld maps to %s\n", e - HostBegin, e->name);
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int device_id, int64_t size, void *, int32_t kind) {
  void *ptr = NULL;
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");

  if (kind != TARGET_ALLOC_DEFAULT) {
    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");
    return NULL;
  }

  hsa_status_t err = core::Runtime::DeviceMalloc(&ptr, size, device_id);
  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", size,
     (long long unsigned)(Elf64_Addr)ptr);
  ptr = (err == HSA_STATUS_SUCCESS) ? ptr : NULL;
  return ptr;
}

int32_t __tgt_rtl_data_submit(int device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int device_id, void *tgt_ptr, void *hst_ptr,
                                    int64_t size, __tgt_async_info *AsyncInfo) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  if (AsyncInfo) {
    initAsyncInfo(AsyncInfo);
    return dataSubmit(device_id, tgt_ptr, hst_ptr, size, AsyncInfo);
  } else {
    return __tgt_rtl_data_submit(device_id, tgt_ptr, hst_ptr, size);
  }
}

int32_t __tgt_rtl_data_retrieve(int device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  __tgt_async_info AsyncInfo;
  int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  initAsyncInfo(AsyncInfo);
  return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int device_id, void *tgt_ptr) {
  assert(device_id < DeviceInfo.NumberOfDevices && "Device ID too large");
  hsa_status_t err;
  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)tgt_ptr);
  err = core::Runtime::Memfree(tgt_ptr);
  if (err != HSA_STATUS_SUCCESS) {
    DP("Error when freeing HSA memory\n");
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Determine launch values for the kernel.
struct launchVals {
  int WorkgroupSize;
  int GridSize;
};
launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
                         int ExecutionMode, int num_teams, int thread_limit,
                         uint64_t loop_tripcount, int DeviceNumTeams) {

  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
  int num_groups = 0;

  int Max_Teams =
      Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
  if (Max_Teams > RTLDeviceInfoTy::HardTeamLimit)
    Max_Teams = RTLDeviceInfoTy::HardTeamLimit;

  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
            RTLDeviceInfoTy::Max_Teams);
    fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
    fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
            RTLDeviceInfoTy::Warp_Size);
    fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
            RTLDeviceInfoTy::Max_WG_Size);
    fprintf(stderr, "RTLDeviceInfoTy::Default_WG_Size: %d\n",
            RTLDeviceInfoTy::Default_WG_Size);
    fprintf(stderr, "thread_limit: %d\n", thread_limit);
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
    fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
  }

  // Check for the thread_limit() clause.
  if (thread_limit > 0) {
    threadsPerGroup = thread_limit;
    DP("Setting threads per block to requested %d\n", thread_limit);
    if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
      threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
      DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
    }
    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
      DP("Setting threads per block to maximum %d\n", threadsPerGroup);
    }
  }

  // Respect the flat_max_work_group_size attribute.
  if (threadsPerGroup > ConstWGSize) {
    threadsPerGroup = ConstWGSize;
    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
       threadsPerGroup);
  }
  if (print_kernel_trace & STARTUP_DETAILS)
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
  DP("Preparing %d threads\n", threadsPerGroup);

  // Set default num_groups (teams).
  if (Env.TeamLimit > 0)
    num_groups = (Max_Teams < Env.TeamLimit) ? Max_Teams : Env.TeamLimit;
  else
    num_groups = Max_Teams;
  DP("Set default num of groups %d\n", num_groups);

  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "num_teams: %d\n", num_teams);
  }

  // Reduce num_groups if threadsPerGroup exceeds
  // RTLDeviceInfoTy::Default_WG_Size. This reduction is typical for the
  // default case (no thread_limit clause) or when a very large num_teams is
  // requested.
  // FIXME: We can't distinguish between a constant and a variable thread
  // limit, so we only handle constant thread_limits.
  if (threadsPerGroup >
      RTLDeviceInfoTy::Default_WG_Size) { // 256 < threadsPerGroup <= 1024
    // Should we round threadsPerGroup up to the nearest
    // RTLDeviceInfoTy::Warp_Size here?
    num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;
  }

  // Check for the num_teams() clause.
  if (num_teams > 0) {
    num_groups = (num_teams < num_groups) ? num_teams : num_groups;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "Env.NumTeams %d\n", Env.NumTeams);
    fprintf(stderr, "Env.TeamLimit %d\n", Env.TeamLimit);
  }

  if (Env.NumTeams > 0) {
    num_groups = (Env.NumTeams < num_groups) ? Env.NumTeams : num_groups;
    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
  } else if (Env.TeamLimit > 0) {
    num_groups = (Env.TeamLimit < num_groups) ? Env.TeamLimit : num_groups;
    DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
  } else {
    if (num_teams <= 0) {
      if (loop_tripcount > 0) {
        if (ExecutionMode == SPMD) {
          // Round up to the nearest integer.
          num_groups = ((loop_tripcount - 1) / threadsPerGroup) + 1;
        } else {
          num_groups = loop_tripcount;
        }
        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
           "threads per block %d\n",
           num_groups, loop_tripcount, threadsPerGroup);
      }
    } else {
      num_groups = num_teams;
    }
    if (num_groups > Max_Teams) {
      num_groups = Max_Teams;
      if (print_kernel_trace & STARTUP_DETAILS)
        fprintf(stderr, "Limiting num_groups %d to Max_Teams %d\n", num_groups,
                Max_Teams);
    }
    if (num_groups > num_teams && num_teams > 0) {
      num_groups = num_teams;
      if (print_kernel_trace & STARTUP_DETAILS)
        fprintf(stderr, "Limiting num_groups %d to clause num_teams %d\n",
                num_groups, num_teams);
    }
  }

  // The num_teams clause is always honored, no matter what, unless DEFAULT is
  // active.
  if (num_teams > 0) {
    num_groups = num_teams;
    // Cap num_groups to Env.MaxTeamsDefault if set.
    if (Env.MaxTeamsDefault > 0 && num_groups > Env.MaxTeamsDefault)
      num_groups = Env.MaxTeamsDefault;
  }
  if (print_kernel_trace & STARTUP_DETAILS) {
    fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
    fprintf(stderr, "num_groups: %d\n", num_groups);
    fprintf(stderr, "loop_tripcount: %" PRIu64 "\n", loop_tripcount);
  }
  DP("Final %d num_groups and %d threadsPerGroup\n", num_groups,
     threadsPerGroup);

  launchVals res;
  res.WorkgroupSize = threadsPerGroup;
  // The HSA grid size is expressed in work-items, so the total grid is
  // threads-per-group times the number of groups.
  res.GridSize = threadsPerGroup * num_groups;
  return res;
}

static uint64_t acquire_available_packet_id(hsa_queue_t *queue) {
  uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
  bool full = true;
  // Spin until the reserved slot is available, i.e. until the write index is
  // no longer a full queue size ahead of the read index.
  while (full) {
    full =
        packet_id >= (queue->size + hsa_queue_load_read_index_scacquire(queue));
  }
  return packet_id;
}

static int32_t __tgt_rtl_run_target_team_region_locked(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
    int32_t thread_limit, uint64_t loop_tripcount);

int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
                                         void *tgt_entry_ptr, void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t num_teams,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {

  DeviceInfo.load_run_lock.lock_shared();
  int32_t res = __tgt_rtl_run_target_team_region_locked(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, num_teams,
      thread_limit, loop_tripcount);

  DeviceInfo.load_run_lock.unlock_shared();
  return res;
}

int32_t __tgt_rtl_run_target_team_region_locked(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t num_teams,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Update the thread limit in GPU memory if uninitialized or specified from
  // the host.

  DP("Run target team region thread_limit %d\n", thread_limit);

  // All args are references.
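  // Each explicit kernel argument occupies one pointer-sized slot in the
  // kernarg segment: ptrs[i] holds the offset-adjusted device pointer and
  // args[i] points at that slot so its value can be memcpy'd into the
  // kernarg buffer during dispatch below.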
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  DP("Arg_num: %d\n", arg_num);
  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
    DP("Offset-adjusted base: arg[%d]:" DPxMOD "\n", i, DPxPTR(ptrs[i]));
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  std::string kernel_name = std::string(KernelInfo->Name);
  auto &KernelInfoTable = DeviceInfo.KernelInfoTable;
  if (KernelInfoTable[device_id].find(kernel_name) ==
      KernelInfoTable[device_id].end()) {
    DP("Kernel %s not found\n", kernel_name.c_str());
    return OFFLOAD_FAIL;
  }

  const atl_kernel_info_t KernelInfoEntry =
      KernelInfoTable[device_id][kernel_name];
  const uint32_t group_segment_size = KernelInfoEntry.group_segment_size;
  const uint32_t sgpr_count = KernelInfoEntry.sgpr_count;
  const uint32_t vgpr_count = KernelInfoEntry.vgpr_count;
  const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
  const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;

  assert(arg_num == (int)KernelInfoEntry.num_args);

  /*
   * Set limit based on ThreadsPerGroup and GroupsPerDevice
   */
  launchVals LV = getLaunchVals(DeviceInfo.Env, KernelInfo->ConstWGSize,
                                KernelInfo->ExecutionMode,
                                num_teams,      // From run_region arg
                                thread_limit,   // From run_region arg
                                loop_tripcount, // From run_region arg
                                DeviceInfo.NumTeams[KernelInfo->device_id]);
  const int GridSize = LV.GridSize;
  const int WorkgroupSize = LV.WorkgroupSize;

  if (print_kernel_trace >= LAUNCH) {
    int num_groups = GridSize / WorkgroupSize;
    // Enum modes are SPMD, GENERIC, NONE (0, 1, 2).
    // If doing RTL timing, print to stderr unless stdout was requested.
    bool traceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
    fprintf(traceToStdout ? stdout : stderr,
            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
            "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
            "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
            device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
            arg_num, num_groups, WorkgroupSize, num_teams, thread_limit,
            group_segment_size, sgpr_count, vgpr_count, sgpr_spill_count,
            vgpr_spill_count, loop_tripcount, KernelInfo->Name);
  }

  // Run on the device.
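  // The dispatch below follows the HSA user-mode queue protocol: reserve a
  // packet slot by bumping the queue write index, fill in the kernel dispatch
  // packet (sizes, segments, kernarg pointer, completion signal), publish the
  // packet header last with release semantics, ring the doorbell, and then
  // block until the completion signal drops to zero.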
  {
    hsa_queue_t *queue = DeviceInfo.HSAQueues[device_id];
    if (!queue) {
      return OFFLOAD_FAIL;
    }
    uint64_t packet_id = acquire_available_packet_id(queue);

    const uint32_t mask = queue->size - 1; // size is a power of 2
    hsa_kernel_dispatch_packet_t *packet =
        (hsa_kernel_dispatch_packet_t *)queue->base_address +
        (packet_id & mask);

    // packet->header is written last
    packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
    packet->workgroup_size_x = WorkgroupSize;
    packet->workgroup_size_y = 1;
    packet->workgroup_size_z = 1;
    packet->reserved0 = 0;
    packet->grid_size_x = GridSize;
    packet->grid_size_y = 1;
    packet->grid_size_z = 1;
    packet->private_segment_size = KernelInfoEntry.private_segment_size;
    packet->group_segment_size = KernelInfoEntry.group_segment_size;
    packet->kernel_object = KernelInfoEntry.kernel_object;
    packet->kernarg_address = 0;     // use the block allocator
    packet->reserved2 = 0;           // atmi writes id_ here
    packet->completion_signal = {0}; // may want a pool of signals

    KernelArgPool *ArgPool = nullptr;
    {
      auto it = KernelArgPoolMap.find(std::string(KernelInfo->Name));
      if (it != KernelArgPoolMap.end()) {
        ArgPool = (it->second).get();
      }
    }
    if (!ArgPool) {
      DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
         device_id);
    }
    {
      void *kernarg = nullptr;
      if (ArgPool) {
        assert(ArgPool->kernarg_segment_size == (arg_num * sizeof(void *)));
        kernarg = ArgPool->allocate(arg_num);
      }
      if (!kernarg) {
        DP("Allocate kernarg failed\n");
        return OFFLOAD_FAIL;
      }

      // Copy explicit arguments
      for (int i = 0; i < arg_num; i++) {
        memcpy((char *)kernarg + sizeof(void *) * i, args[i], sizeof(void *));
      }

      // Initialize implicit arguments. ATMI seems to leave most fields
      // uninitialized
      atmi_implicit_args_t *impl_args =
          reinterpret_cast<atmi_implicit_args_t *>(
              static_cast<char *>(kernarg) + ArgPool->kernarg_segment_size);
      memset(impl_args, 0,
             sizeof(atmi_implicit_args_t)); // may not be necessary
      impl_args->offset_x = 0;
      impl_args->offset_y = 0;
      impl_args->offset_z = 0;

      // assign a hostcall buffer for the selected Q
      if (__atomic_load_n(&DeviceInfo.hostcall_required, __ATOMIC_ACQUIRE)) {
        // hostrpc_assign_buffer is not thread safe, and this function is
        // under a multiple reader lock, not a writer lock.
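        // A function-local static mutex therefore serializes the call so that
        // concurrent launches, each holding the shared load_run_lock, do not
        // race while setting up the hostcall buffer for this queue.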
        static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_lock(&hostcall_init_lock);
        impl_args->hostcall_ptr = hostrpc_assign_buffer(
            DeviceInfo.HSAAgents[device_id], queue, device_id);
        pthread_mutex_unlock(&hostcall_init_lock);
        if (!impl_args->hostcall_ptr) {
          DP("hostrpc_assign_buffer failed; the GPU would dereference a null "
             "hostcall pointer and fault\n");
          return OFFLOAD_FAIL;
        }
      }

      packet->kernarg_address = kernarg;
    }

    {
      hsa_signal_t s = DeviceInfo.FreeSignalPool.pop();
      if (s.handle == 0) {
        DP("Failed to get signal instance\n");
        return OFFLOAD_FAIL;
      }
      packet->completion_signal = s;
      hsa_signal_store_relaxed(packet->completion_signal, 1);
    }

    core::packet_store_release(reinterpret_cast<uint32_t *>(packet),
                               core::create_header(), packet->setup);

    hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

    while (hsa_signal_wait_scacquire(packet->completion_signal,
                                     HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
                                     HSA_WAIT_STATE_BLOCKED) != 0)
      ;

    assert(ArgPool);
    ArgPool->deallocate(packet->kernarg_address);
    DeviceInfo.FreeSignalPool.push(packet->completion_signal);
  }

  DP("Kernel completed\n");
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  // Use one team and the default number of threads.
  int32_t team_num = 1;
  int32_t thread_limit = 0; // use default
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
                                          tgt_offsets, arg_num, team_num,
                                          thread_limit, 0);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  initAsyncInfo(AsyncInfo);

  // Use one team and the default number of threads.
  int32_t team_num = 1;
  int32_t thread_limit = 0; // use default
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
                                          tgt_offsets, arg_num, team_num,
                                          thread_limit, 0);
}

int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");

  // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
  // invariant is not ensured by devices.cpp for amdgcn.
  // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
  if (AsyncInfo->Queue) {
    finiAsyncInfo(AsyncInfo);
  }
  return OFFLOAD_SUCCESS;
}

namespace core {
hsa_status_t allow_access_to_all_gpu_agents(void *ptr) {
  return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(),
                                     &DeviceInfo.HSAAgents[0], NULL, ptr);
}

} // namespace core