1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <cassert> 14 #include <cstddef> 15 #include <cuda.h> 16 #include <list> 17 #include <memory> 18 #include <mutex> 19 #include <string> 20 #include <unordered_map> 21 #include <vector> 22 23 #include "Debug.h" 24 #include "DeviceEnvironment.h" 25 #include "omptargetplugin.h" 26 27 #define TARGET_NAME CUDA 28 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 29 30 #include "MemoryManager.h" 31 32 #include "llvm/Frontend/OpenMP/OMPConstants.h" 33 34 // Utility for retrieving and printing CUDA error string. 35 #ifdef OMPTARGET_DEBUG 36 #define CUDA_ERR_STRING(err) \ 37 do { \ 38 if (getDebugLevel() > 0) { \ 39 const char *errStr = nullptr; \ 40 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 41 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 42 REPORT("Unrecognized CUDA error code: %d\n", err); \ 43 else if (errStr_status == CUDA_SUCCESS) \ 44 REPORT("CUDA error is: %s\n", errStr); \ 45 else { \ 46 REPORT("Unresolved CUDA error code: %d\n", err); \ 47 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 48 errStr_status); \ 49 } \ 50 } else { \ 51 const char *errStr = nullptr; \ 52 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 53 if (errStr_status == CUDA_SUCCESS) \ 54 REPORT("%s \n", errStr); \ 55 } \ 56 } while (false) 57 #else // OMPTARGET_DEBUG 58 #define CUDA_ERR_STRING(err) \ 59 do { \ 60 const char *errStr = nullptr; \ 61 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 62 if (errStr_status == CUDA_SUCCESS) \ 63 REPORT("%s \n", errStr); \ 64 } while (false) 65 #endif // OMPTARGET_DEBUG 66 67 #define BOOL2TEXT(b) ((b) ? "Yes" : "No") 68 69 #include "elf_common.h" 70 71 /// Keep entries table per device. 72 struct FuncOrGblEntryTy { 73 __tgt_target_table Table; 74 std::vector<__tgt_offload_entry> Entries; 75 }; 76 77 /// Use a single entity to encode a kernel and a set of flags. 78 struct KernelTy { 79 CUfunction Func; 80 81 // execution mode of kernel 82 llvm::omp::OMPTgtExecModeFlags ExecutionMode; 83 84 /// Maximal number of threads per block for this kernel. 85 int MaxThreadsPerBlock = 0; 86 87 KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode) 88 : Func(_Func), ExecutionMode(_ExecutionMode) {} 89 }; 90 91 namespace { 92 bool checkResult(CUresult Err, const char *ErrMsg) { 93 if (Err == CUDA_SUCCESS) 94 return true; 95 96 REPORT("%s", ErrMsg); 97 CUDA_ERR_STRING(Err); 98 return false; 99 } 100 101 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 102 CUstream Stream) { 103 CUresult Err = 104 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 105 106 if (Err != CUDA_SUCCESS) { 107 DP("Error when copying data from device to device. 
Pointers: src " 108 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 109 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 110 CUDA_ERR_STRING(Err); 111 return OFFLOAD_FAIL; 112 } 113 114 return OFFLOAD_SUCCESS; 115 } 116 117 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { 118 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 119 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 120 121 CUresult Err = cuEventRecord(Event, Stream); 122 if (Err != CUDA_SUCCESS) { 123 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", 124 DPxPTR(Stream), DPxPTR(Event)); 125 CUDA_ERR_STRING(Err); 126 return OFFLOAD_FAIL; 127 } 128 129 return OFFLOAD_SUCCESS; 130 } 131 132 int syncEvent(void *EventPtr) { 133 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 134 135 CUresult Err = cuEventSynchronize(Event); 136 if (Err != CUDA_SUCCESS) { 137 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); 138 CUDA_ERR_STRING(Err); 139 return OFFLOAD_FAIL; 140 } 141 142 return OFFLOAD_SUCCESS; 143 } 144 145 // Structure contains per-device data 146 struct DeviceDataTy { 147 /// List that contains all the kernels. 148 std::list<KernelTy> KernelsList; 149 150 std::list<FuncOrGblEntryTy> FuncGblEntries; 151 152 CUcontext Context = nullptr; 153 // Device properties 154 int ThreadsPerBlock = 0; 155 int BlocksPerGrid = 0; 156 int WarpSize = 0; 157 // OpenMP properties 158 int NumTeams = 0; 159 int NumThreads = 0; 160 }; 161 162 /// Resource allocator where \p T is the resource type. 163 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL 164 /// accordingly. The implementation should not raise any exception. 165 template <typename T> class AllocatorTy { 166 public: 167 /// Create a resource and assign to R. 168 int create(T &R) noexcept; 169 /// Destroy the resource. 170 int destroy(T) noexcept; 171 }; 172 173 /// Allocator for CUstream. 174 template <> class AllocatorTy<CUstream> { 175 CUcontext Context; 176 177 public: 178 AllocatorTy(CUcontext C) noexcept : Context(C) {} 179 180 /// See AllocatorTy<T>::create. 181 int create(CUstream &Stream) noexcept { 182 if (!checkResult(cuCtxSetCurrent(Context), 183 "Error returned from cuCtxSetCurrent\n")) 184 return OFFLOAD_FAIL; 185 186 if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING), 187 "Error returned from cuStreamCreate\n")) 188 return OFFLOAD_FAIL; 189 190 return OFFLOAD_SUCCESS; 191 } 192 193 /// See AllocatorTy<T>::destroy. 194 int destroy(CUstream Stream) noexcept { 195 if (!checkResult(cuCtxSetCurrent(Context), 196 "Error returned from cuCtxSetCurrent\n")) 197 return OFFLOAD_FAIL; 198 if (!checkResult(cuStreamDestroy(Stream), 199 "Error returned from cuStreamDestroy\n")) 200 return OFFLOAD_FAIL; 201 202 return OFFLOAD_SUCCESS; 203 } 204 }; 205 206 /// Allocator for CUevent. 207 template <> class AllocatorTy<CUevent> { 208 public: 209 /// See AllocatorTy<T>::create. 210 int create(CUevent &Event) noexcept { 211 if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), 212 "Error returned from cuEventCreate\n")) 213 return OFFLOAD_FAIL; 214 215 return OFFLOAD_SUCCESS; 216 } 217 218 /// See AllocatorTy<T>::destroy. 219 int destroy(CUevent Event) noexcept { 220 if (!checkResult(cuEventDestroy(Event), 221 "Error returned from cuEventDestroy\n")) 222 return OFFLOAD_FAIL; 223 224 return OFFLOAD_SUCCESS; 225 } 226 }; 227 228 /// A generic pool of resources where \p T is the resource type. 229 /// \p T should be copyable as the object is stored in \p std::vector . 
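/// For illustration only, a rough sketch of how this pool is used later in
/// this file for streams (`Ctx` stands for any valid CUcontext):
/// \code
///   ResourcePoolTy<CUstream> Pool(AllocatorTy<CUstream>(Ctx), /*Size=*/32);
///   CUstream Stream;
///   if (Pool.acquire(Stream) == OFFLOAD_SUCCESS) {
///     // ... enqueue work on Stream ...
///     Pool.release(Stream);
///   }
/// \endcode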
template <typename T> class ResourcePoolTy {
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources.
  std::vector<T> Resources;
  /// A reference to the corresponding allocator.
  AllocatorTy<T> Allocator;

  /// If `Resources` is used up, grow the pool by creating more resources. The
  /// new size `Size` is assumed to be strictly larger than the current size.
  bool resize(size_t Size) {
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      T NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means `[0, Next-1]` have been assigned and `[Next, ...)`
  /// are still available. If there is no resource left, we ask for more. Each
  /// time a resource is assigned, `Next` is increased by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After the assignment, s has been handed out and the pool becomes:
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(T &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return a resource to the pool. We first decrease `Next` and then copy the
  /// resource back. Note that resources may be returned in a different order
  /// than they were assigned, so at some point the vector may contain two
  /// identical entries:
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// This does not matter because the duplicates always sit on opposite sides
  /// of `Next`; the one on the left is eventually overwritten by another
  /// returned resource. Therefore, after several acquire/release cycles the
  /// order of the pool may differ from its initial state.
  void release(T R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Resources)
      (void)Allocator.destroy(R);
    Resources.clear();
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;
  // Number of initial streams for each device.
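  // Can be overridden with the LIBOMPTARGET_NUM_INITIAL_STREAMS environment
  // variable (parsed in the constructor below).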
328 int NumInitialStreams = 32; 329 330 static constexpr const int HardTeamLimit = 1U << 16U; // 64k 331 static constexpr const int HardThreadLimit = 1024; 332 static constexpr const int DefaultNumTeams = 128; 333 static constexpr const int DefaultNumThreads = 128; 334 335 using StreamPoolTy = ResourcePoolTy<CUstream>; 336 std::vector<std::unique_ptr<StreamPoolTy>> StreamPool; 337 338 ResourcePoolTy<CUevent> EventPool; 339 340 std::vector<DeviceDataTy> DeviceData; 341 std::vector<CUmodule> Modules; 342 343 /// A class responsible for interacting with device native runtime library to 344 /// allocate and free memory. 345 class CUDADeviceAllocatorTy : public DeviceAllocatorTy { 346 const int DeviceId; 347 const std::vector<DeviceDataTy> &DeviceData; 348 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; 349 350 public: 351 CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData) 352 : DeviceId(DeviceId), DeviceData(DeviceData) {} 353 354 void *allocate(size_t Size, void *, TargetAllocTy Kind) override { 355 if (Size == 0) 356 return nullptr; 357 358 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 359 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 360 return nullptr; 361 362 void *MemAlloc = nullptr; 363 switch (Kind) { 364 case TARGET_ALLOC_DEFAULT: 365 case TARGET_ALLOC_DEVICE: 366 CUdeviceptr DevicePtr; 367 Err = cuMemAlloc(&DevicePtr, Size); 368 MemAlloc = (void *)DevicePtr; 369 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 370 return nullptr; 371 break; 372 case TARGET_ALLOC_HOST: 373 void *HostPtr; 374 Err = cuMemAllocHost(&HostPtr, Size); 375 MemAlloc = HostPtr; 376 if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) 377 return nullptr; 378 HostPinnedAllocs[MemAlloc] = Kind; 379 break; 380 case TARGET_ALLOC_SHARED: 381 CUdeviceptr SharedPtr; 382 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); 383 MemAlloc = (void *)SharedPtr; 384 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) 385 return nullptr; 386 break; 387 } 388 389 return MemAlloc; 390 } 391 392 int free(void *TgtPtr) override { 393 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 394 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 395 return OFFLOAD_FAIL; 396 397 // Host pinned memory must be freed differently. 398 TargetAllocTy Kind = 399 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) 400 ? TARGET_ALLOC_DEFAULT 401 : TARGET_ALLOC_HOST; 402 switch (Kind) { 403 case TARGET_ALLOC_DEFAULT: 404 case TARGET_ALLOC_DEVICE: 405 case TARGET_ALLOC_SHARED: 406 Err = cuMemFree((CUdeviceptr)TgtPtr); 407 if (!checkResult(Err, "Error returned from cuMemFree\n")) 408 return OFFLOAD_FAIL; 409 break; 410 case TARGET_ALLOC_HOST: 411 Err = cuMemFreeHost(TgtPtr); 412 if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) 413 return OFFLOAD_FAIL; 414 break; 415 } 416 417 return OFFLOAD_SUCCESS; 418 } 419 }; 420 421 /// A vector of device allocators 422 std::vector<CUDADeviceAllocatorTy> DeviceAllocators; 423 424 /// A vector of memory managers. Since the memory manager is non-copyable and 425 // non-removable, we wrap them into std::unique_ptr. 
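  /// One manager is created per device in the constructor, but only when the
  /// memory manager is enabled (see UseMemoryManager below).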
426 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; 427 428 /// Whether use memory manager 429 bool UseMemoryManager = true; 430 431 // Record entry point associated with device 432 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { 433 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 434 E.Entries.push_back(entry); 435 } 436 437 // Return a pointer to the entry associated with the pointer 438 const __tgt_offload_entry *getOffloadEntry(const int DeviceId, 439 const void *Addr) const { 440 for (const __tgt_offload_entry &Itr : 441 DeviceData[DeviceId].FuncGblEntries.back().Entries) 442 if (Itr.addr == Addr) 443 return &Itr; 444 445 return nullptr; 446 } 447 448 // Return the pointer to the target entries table 449 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { 450 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 451 452 if (E.Entries.empty()) 453 return nullptr; 454 455 // Update table info according to the entries and return the pointer 456 E.Table.EntriesBegin = E.Entries.data(); 457 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); 458 459 return &E.Table; 460 } 461 462 // Clear entries table for a device 463 void clearOffloadEntriesTable(const int DeviceId) { 464 DeviceData[DeviceId].FuncGblEntries.emplace_back(); 465 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 466 E.Entries.clear(); 467 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; 468 } 469 470 public: 471 472 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { 473 assert(AsyncInfo && "AsyncInfo is nullptr"); 474 475 if (!AsyncInfo->Queue) { 476 CUstream S; 477 if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS) 478 return nullptr; 479 480 AsyncInfo->Queue = S; 481 } 482 483 return reinterpret_cast<CUstream>(AsyncInfo->Queue); 484 } 485 486 // This class should not be copied 487 DeviceRTLTy(const DeviceRTLTy &) = delete; 488 DeviceRTLTy(DeviceRTLTy &&) = delete; 489 490 DeviceRTLTy() 491 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), 492 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), 493 DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) { 494 495 DP("Start initializing CUDA\n"); 496 497 CUresult Err = cuInit(0); 498 if (Err == CUDA_ERROR_INVALID_HANDLE) { 499 // Can't call cuGetErrorString if dlsym failed 500 DP("Failed to load CUDA shared library\n"); 501 return; 502 } 503 if (!checkResult(Err, "Error returned from cuInit\n")) { 504 return; 505 } 506 507 Err = cuDeviceGetCount(&NumberOfDevices); 508 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) 509 return; 510 511 if (NumberOfDevices == 0) { 512 DP("There are no devices supporting CUDA.\n"); 513 return; 514 } 515 516 DeviceData.resize(NumberOfDevices); 517 StreamPool.resize(NumberOfDevices); 518 519 // Get environment variables regarding teams 520 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { 521 // OMP_TEAM_LIMIT has been set 522 EnvTeamLimit = std::stoi(EnvStr); 523 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 524 } 525 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { 526 // OMP_TEAMS_THREAD_LIMIT has been set 527 EnvTeamThreadLimit = std::stoi(EnvStr); 528 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); 529 } 530 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { 531 // OMP_NUM_TEAMS has been set 532 EnvNumTeams = std::stoi(EnvStr); 533 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 534 } 535 if (const char *EnvStr = 
getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { 536 // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set 537 DynamicMemorySize = std::stoi(EnvStr); 538 DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", 539 DynamicMemorySize); 540 } 541 if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { 542 // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set 543 NumInitialStreams = std::stoi(EnvStr); 544 DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); 545 } 546 547 for (int I = 0; I < NumberOfDevices; ++I) 548 DeviceAllocators.emplace_back(I, DeviceData); 549 550 // Get the size threshold from environment variable 551 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 552 UseMemoryManager = Res.second; 553 size_t MemoryManagerThreshold = Res.first; 554 555 if (UseMemoryManager) 556 for (int I = 0; I < NumberOfDevices; ++I) 557 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 558 DeviceAllocators[I], MemoryManagerThreshold)); 559 } 560 561 ~DeviceRTLTy() { 562 // We first destruct memory managers in case that its dependent data are 563 // destroyed before it. 564 for (auto &M : MemoryManagers) 565 M.release(); 566 567 for (CUmodule &M : Modules) 568 // Close module 569 if (M) 570 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); 571 572 for (auto &S : StreamPool) 573 S.reset(); 574 575 EventPool.clear(); 576 577 for (DeviceDataTy &D : DeviceData) { 578 // Destroy context 579 if (D.Context) { 580 checkResult(cuCtxSetCurrent(D.Context), 581 "Error returned from cuCtxSetCurrent\n"); 582 CUdevice Device; 583 checkResult(cuCtxGetDevice(&Device), 584 "Error returned from cuCtxGetDevice\n"); 585 checkResult(cuDevicePrimaryCtxRelease(Device), 586 "Error returned from cuDevicePrimaryCtxRelease\n"); 587 } 588 } 589 } 590 591 // Check whether a given DeviceId is valid 592 bool isValidDeviceId(const int DeviceId) const { 593 return DeviceId >= 0 && DeviceId < NumberOfDevices; 594 } 595 596 int getNumOfDevices() const { return NumberOfDevices; } 597 598 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 599 600 int initDevice(const int DeviceId) { 601 CUdevice Device; 602 603 DP("Getting device %d\n", DeviceId); 604 CUresult Err = cuDeviceGet(&Device, DeviceId); 605 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 606 return OFFLOAD_FAIL; 607 608 // Query the current flags of the primary context and set its flags if 609 // it is inactive 610 unsigned int FormerPrimaryCtxFlags = 0; 611 int FormerPrimaryCtxIsActive = 0; 612 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 613 &FormerPrimaryCtxIsActive); 614 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 615 return OFFLOAD_FAIL; 616 617 if (FormerPrimaryCtxIsActive) { 618 DP("The primary context is active, no change to its flags\n"); 619 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 620 CU_CTX_SCHED_BLOCKING_SYNC) 621 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 622 } else { 623 DP("The primary context is inactive, set its flags to " 624 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 625 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 626 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 627 return OFFLOAD_FAIL; 628 } 629 630 // Retain the per device primary context and save it to use whenever this 631 // device is selected. 
632 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 633 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 634 return OFFLOAD_FAIL; 635 636 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 637 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 638 return OFFLOAD_FAIL; 639 640 // Initialize stream pool 641 if (!StreamPool[DeviceId]) 642 StreamPool[DeviceId] = std::make_unique<StreamPoolTy>( 643 AllocatorTy<CUstream>(DeviceData[DeviceId].Context), 644 NumInitialStreams); 645 646 // Query attributes to determine number of threads/block and blocks/grid. 647 int MaxGridDimX; 648 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 649 Device); 650 if (Err != CUDA_SUCCESS) { 651 DP("Error getting max grid dimension, use default value %d\n", 652 DeviceRTLTy::DefaultNumTeams); 653 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 654 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 655 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 656 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 657 } else { 658 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 659 "at the hard limit\n", 660 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 661 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 662 } 663 664 // We are only exploiting threads along the x axis. 665 int MaxBlockDimX; 666 Err = cuDeviceGetAttribute(&MaxBlockDimX, 667 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 668 if (Err != CUDA_SUCCESS) { 669 DP("Error getting max block dimension, use default value %d\n", 670 DeviceRTLTy::DefaultNumThreads); 671 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 672 } else { 673 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 674 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 675 676 if (EnvTeamThreadLimit > 0 && 677 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 678 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 679 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 680 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 681 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 682 } 683 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 684 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 685 "capping at the hard limit\n", 686 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 687 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 688 } 689 } 690 691 // Get and set warp size 692 int WarpSize; 693 Err = 694 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 695 if (Err != CUDA_SUCCESS) { 696 DP("Error getting warp size, assume default value 32\n"); 697 DeviceData[DeviceId].WarpSize = 32; 698 } else { 699 DP("Using warp size %d\n", WarpSize); 700 DeviceData[DeviceId].WarpSize = WarpSize; 701 } 702 703 // Adjust teams to the env variables 704 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 705 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 706 EnvTeamLimit); 707 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 708 } 709 710 size_t StackLimit; 711 size_t HeapLimit; 712 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 713 StackLimit = std::stol(EnvStr); 714 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 715 return OFFLOAD_FAIL; 716 } else { 717 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != 
CUDA_SUCCESS) 718 return OFFLOAD_FAIL; 719 } 720 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { 721 HeapLimit = std::stol(EnvStr); 722 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 723 return OFFLOAD_FAIL; 724 } else { 725 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 726 return OFFLOAD_FAIL; 727 } 728 729 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 730 "Device supports up to %d CUDA blocks and %d threads with a " 731 "warp size of %d\n", 732 DeviceData[DeviceId].BlocksPerGrid, 733 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 734 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 735 "Device heap size is %d Bytes, device stack size is %d Bytes per " 736 "thread\n", 737 (int)HeapLimit, (int)StackLimit); 738 739 // Set default number of teams 740 if (EnvNumTeams > 0) { 741 DP("Default number of teams set according to environment %d\n", 742 EnvNumTeams); 743 DeviceData[DeviceId].NumTeams = EnvNumTeams; 744 } else { 745 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 746 DP("Default number of teams set according to library's default %d\n", 747 DeviceRTLTy::DefaultNumTeams); 748 } 749 750 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 751 DP("Default number of teams exceeds device limit, capping at %d\n", 752 DeviceData[DeviceId].BlocksPerGrid); 753 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 754 } 755 756 // Set default number of threads 757 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 758 DP("Default number of threads set according to library's default %d\n", 759 DeviceRTLTy::DefaultNumThreads); 760 if (DeviceData[DeviceId].NumThreads > 761 DeviceData[DeviceId].ThreadsPerBlock) { 762 DP("Default number of threads exceeds device limit, capping at %d\n", 763 DeviceData[DeviceId].ThreadsPerBlock); 764 DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; 765 } 766 767 return OFFLOAD_SUCCESS; 768 } 769 770 __tgt_target_table *loadBinary(const int DeviceId, 771 const __tgt_device_image *Image) { 772 // Set the context we are using 773 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 774 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 775 return nullptr; 776 777 // Clear the offload table as we are going to create a new one. 778 clearOffloadEntriesTable(DeviceId); 779 780 // Create the module and extract the function pointers. 781 CUmodule Module; 782 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 783 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 784 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 785 return nullptr; 786 787 DP("CUDA module successfully loaded!\n"); 788 789 Modules.push_back(Module); 790 791 // Find the symbols in the module by name. 792 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 793 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 794 795 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 796 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 797 if (!E->addr) { 798 // We return nullptr when something like this happens, the host should 799 // have always something in the address to uniquely identify the target 800 // region. 
801 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 802 return nullptr; 803 } 804 805 if (E->size) { 806 __tgt_offload_entry Entry = *E; 807 CUdeviceptr CUPtr; 808 size_t CUSize; 809 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 810 // We keep this style here because we need the name 811 if (Err != CUDA_SUCCESS) { 812 REPORT("Loading global '%s' Failed\n", E->name); 813 CUDA_ERR_STRING(Err); 814 return nullptr; 815 } 816 817 if (CUSize != E->size) { 818 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 819 CUSize, E->size); 820 return nullptr; 821 } 822 823 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 824 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 825 826 Entry.addr = (void *)(CUPtr); 827 828 // Note: In the current implementation declare target variables 829 // can either be link or to. This means that once unified 830 // memory is activated via the requires directive, the variable 831 // can be used directly from the host in both cases. 832 // TODO: when variables types other than to or link are added, 833 // the below condition should be changed to explicitly 834 // check for to and link variables types: 835 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 836 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 837 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 838 // If unified memory is present any target link or to variables 839 // can access host addresses directly. There is no longer a 840 // need for device copies. 841 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 842 DP("Copy linked variable host address (" DPxMOD 843 ") to device address (" DPxMOD ")\n", 844 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 845 } 846 847 addOffloadEntry(DeviceId, Entry); 848 849 continue; 850 } 851 852 CUfunction Func; 853 Err = cuModuleGetFunction(&Func, Module, E->name); 854 // We keep this style here because we need the name 855 if (Err != CUDA_SUCCESS) { 856 REPORT("Loading '%s' Failed\n", E->name); 857 CUDA_ERR_STRING(Err); 858 return nullptr; 859 } 860 861 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 862 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 863 864 // default value GENERIC (in case symbol is missing from cubin file) 865 llvm::omp::OMPTgtExecModeFlags ExecModeVal; 866 std::string ExecModeNameStr(E->name); 867 ExecModeNameStr += "_exec_mode"; 868 const char *ExecModeName = ExecModeNameStr.c_str(); 869 870 CUdeviceptr ExecModePtr; 871 size_t CUSize; 872 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 873 if (Err == CUDA_SUCCESS) { 874 if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 875 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 876 ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); 877 return nullptr; 878 } 879 880 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 881 if (Err != CUDA_SUCCESS) { 882 REPORT("Error when copying data from device to host. 
Pointers: " 883 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 884 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 885 CUDA_ERR_STRING(Err); 886 return nullptr; 887 } 888 } else { 889 DP("Loading global exec_mode '%s' - symbol missing, using default " 890 "value GENERIC (1)\n", 891 ExecModeName); 892 } 893 894 KernelsList.emplace_back(Func, ExecModeVal); 895 896 __tgt_offload_entry Entry = *E; 897 Entry.addr = &KernelsList.back(); 898 addOffloadEntry(DeviceId, Entry); 899 } 900 901 // send device environment data to the device 902 { 903 // TODO: The device ID used here is not the real device ID used by OpenMP. 904 DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices), 905 static_cast<uint32_t>(DeviceId), 906 static_cast<uint32_t>(DynamicMemorySize)}; 907 908 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 909 DeviceEnv.DebugKind = std::stoi(EnvStr); 910 911 const char *DeviceEnvName = "omptarget_device_environment"; 912 CUdeviceptr DeviceEnvPtr; 913 size_t CUSize; 914 915 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 916 if (Err == CUDA_SUCCESS) { 917 if (CUSize != sizeof(DeviceEnv)) { 918 REPORT( 919 "Global device_environment '%s' - size mismatch (%zu != %zu)\n", 920 DeviceEnvName, CUSize, sizeof(int32_t)); 921 CUDA_ERR_STRING(Err); 922 return nullptr; 923 } 924 925 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 926 if (Err != CUDA_SUCCESS) { 927 REPORT("Error when copying data from host to device. Pointers: " 928 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 929 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 930 CUDA_ERR_STRING(Err); 931 return nullptr; 932 } 933 934 DP("Sending global device environment data %zu bytes\n", CUSize); 935 } else { 936 DP("Finding global device environment '%s' - symbol missing.\n", 937 DeviceEnvName); 938 DP("Continue, considering this is a device RTL which does not accept " 939 "environment setting.\n"); 940 } 941 } 942 943 return getOffloadEntriesTable(DeviceId); 944 } 945 946 void *dataAlloc(const int DeviceId, const int64_t Size, 947 const TargetAllocTy Kind) { 948 switch (Kind) { 949 case TARGET_ALLOC_DEFAULT: 950 case TARGET_ALLOC_DEVICE: 951 if (UseMemoryManager) 952 return MemoryManagers[DeviceId]->allocate(Size, nullptr); 953 else 954 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 955 case TARGET_ALLOC_HOST: 956 case TARGET_ALLOC_SHARED: 957 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 958 } 959 960 REPORT("Invalid target data allocation kind or requested allocator not " 961 "implemented yet\n"); 962 963 return nullptr; 964 } 965 966 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 967 const int64_t Size, __tgt_async_info *AsyncInfo) const { 968 assert(AsyncInfo && "AsyncInfo is nullptr"); 969 970 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 971 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 972 return OFFLOAD_FAIL; 973 974 CUstream Stream = getStream(DeviceId, AsyncInfo); 975 976 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 977 if (Err != CUDA_SUCCESS) { 978 DP("Error when copying data from host to device. 
Pointers: host " 979 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 980 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 981 CUDA_ERR_STRING(Err); 982 return OFFLOAD_FAIL; 983 } 984 985 return OFFLOAD_SUCCESS; 986 } 987 988 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 989 const int64_t Size, __tgt_async_info *AsyncInfo) const { 990 assert(AsyncInfo && "AsyncInfo is nullptr"); 991 992 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 993 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 994 return OFFLOAD_FAIL; 995 996 CUstream Stream = getStream(DeviceId, AsyncInfo); 997 998 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 999 if (Err != CUDA_SUCCESS) { 1000 DP("Error when copying data from device to host. Pointers: host " 1001 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 1002 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 1003 CUDA_ERR_STRING(Err); 1004 return OFFLOAD_FAIL; 1005 } 1006 1007 return OFFLOAD_SUCCESS; 1008 } 1009 1010 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 1011 int64_t Size, __tgt_async_info *AsyncInfo) const { 1012 assert(AsyncInfo && "AsyncInfo is nullptr"); 1013 1014 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); 1015 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 1016 return OFFLOAD_FAIL; 1017 1018 CUstream Stream = getStream(SrcDevId, AsyncInfo); 1019 1020 // If they are two devices, we try peer to peer copy first 1021 if (SrcDevId != DstDevId) { 1022 int CanAccessPeer = 0; 1023 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 1024 if (Err != CUDA_SUCCESS) { 1025 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 1026 ", dst = %" PRId32 "\n", 1027 SrcDevId, DstDevId); 1028 CUDA_ERR_STRING(Err); 1029 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1030 } 1031 1032 if (!CanAccessPeer) { 1033 DP("P2P memcpy not supported so fall back to D2D memcpy"); 1034 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1035 } 1036 1037 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 1038 if (Err != CUDA_SUCCESS) { 1039 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 1040 ", dst = %" PRId32 "\n", 1041 SrcDevId, DstDevId); 1042 CUDA_ERR_STRING(Err); 1043 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1044 } 1045 1046 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 1047 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 1048 Size, Stream); 1049 if (Err == CUDA_SUCCESS) 1050 return OFFLOAD_SUCCESS; 1051 1052 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 1053 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 1054 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 1055 CUDA_ERR_STRING(Err); 1056 } 1057 1058 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1059 } 1060 1061 int dataDelete(const int DeviceId, void *TgtPtr) { 1062 if (UseMemoryManager) 1063 return MemoryManagers[DeviceId]->free(TgtPtr); 1064 1065 return DeviceAllocators[DeviceId].free(TgtPtr); 1066 } 1067 1068 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 1069 ptrdiff_t *TgtOffsets, const int ArgNum, 1070 const int TeamNum, const int ThreadLimit, 1071 const unsigned int LoopTripCount, 1072 __tgt_async_info *AsyncInfo) const { 1073 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1074 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 1075 return OFFLOAD_FAIL; 1076 1077 // All args are references. 
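    // cuLaunchKernel expects an array of pointers to the actual kernel
    // arguments, so we store each adjusted device pointer in Ptrs and pass
    // the address of that element via Args.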
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add the master warp if necessary.
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // executes one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for`, and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
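          // Hence one CUDA block (team) is launched per iteration of the
          // `distribute` loop.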
1160 CudaBlocksPerGrid = LoopTripCount; 1161 } else { 1162 REPORT("Unknown execution mode: %d\n", 1163 static_cast<int8_t>(KernelInfo->ExecutionMode)); 1164 return OFFLOAD_FAIL; 1165 } 1166 DP("Using %d teams due to loop trip count %" PRIu32 1167 " and number of threads per block %d\n", 1168 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1169 } else { 1170 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1171 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1172 } 1173 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 1174 DP("Capping number of teams to team limit %d\n", 1175 DeviceData[DeviceId].BlocksPerGrid); 1176 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1177 } else { 1178 DP("Using requested number of teams %d\n", TeamNum); 1179 CudaBlocksPerGrid = TeamNum; 1180 } 1181 1182 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1183 "Launching kernel %s with %d blocks and %d threads in %s mode\n", 1184 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1185 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1186 : "(null)", 1187 CudaBlocksPerGrid, CudaThreadsPerBlock, 1188 (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); 1189 1190 CUstream Stream = getStream(DeviceId, AsyncInfo); 1191 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1192 /* gridDimZ */ 1, CudaThreadsPerBlock, 1193 /* blockDimY */ 1, /* blockDimZ */ 1, 1194 DynamicMemorySize, Stream, &Args[0], nullptr); 1195 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1196 return OFFLOAD_FAIL; 1197 1198 DP("Launch of entry point at " DPxMOD " successful!\n", 1199 DPxPTR(TgtEntryPtr)); 1200 1201 return OFFLOAD_SUCCESS; 1202 } 1203 1204 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1205 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1206 CUresult Err = cuStreamSynchronize(Stream); 1207 1208 // Once the stream is synchronized, return it to stream pool and reset 1209 // AsyncInfo. This is to make sure the synchronization only works for its 1210 // own tasks. 1211 StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1212 AsyncInfo->Queue = nullptr; 1213 1214 if (Err != CUDA_SUCCESS) { 1215 DP("Error when synchronizing stream. stream = " DPxMOD 1216 ", async info ptr = " DPxMOD "\n", 1217 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1218 CUDA_ERR_STRING(Err); 1219 } 1220 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1221 } 1222 1223 void printDeviceInfo(int32_t device_id) { 1224 char TmpChar[1000]; 1225 std::string TmpStr; 1226 size_t TmpSt; 1227 int TmpInt, TmpInt2, TmpInt3; 1228 1229 CUdevice Device; 1230 checkResult(cuDeviceGet(&Device, device_id), 1231 "Error returned from cuCtxGetDevice\n"); 1232 1233 cuDriverGetVersion(&TmpInt); 1234 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1235 printf(" CUDA Device Number: \t\t%d \n", device_id); 1236 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1237 "Error returned from cuDeviceGetName\n"); 1238 printf(" Device Name: \t\t\t%s \n", TmpChar); 1239 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1240 "Error returned from cuDeviceTotalMem\n"); 1241 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1242 checkResult(cuDeviceGetAttribute( 1243 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1244 "Error returned from cuDeviceGetAttribute\n"); 1245 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1246 checkResult( 1247 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1248 "Error returned from cuDeviceGetAttribute\n"); 1249 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1250 checkResult(cuDeviceGetAttribute( 1251 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1252 "Error returned from cuDeviceGetAttribute\n"); 1253 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1254 checkResult( 1255 cuDeviceGetAttribute( 1256 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1257 "Error returned from cuDeviceGetAttribute\n"); 1258 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1259 checkResult( 1260 cuDeviceGetAttribute( 1261 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1262 "Error returned from cuDeviceGetAttribute\n"); 1263 printf(" Registers per Block: \t\t%d \n", TmpInt); 1264 checkResult( 1265 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 1266 "Error returned from cuDeviceGetAttribute\n"); 1267 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1268 checkResult(cuDeviceGetAttribute( 1269 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1270 "Error returned from cuDeviceGetAttribute\n"); 1271 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1272 checkResult(cuDeviceGetAttribute( 1273 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1274 "Error returned from cuDeviceGetAttribute\n"); 1275 checkResult(cuDeviceGetAttribute( 1276 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1277 "Error returned from cuDeviceGetAttribute\n"); 1278 checkResult(cuDeviceGetAttribute( 1279 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1280 "Error returned from cuDeviceGetAttribute\n"); 1281 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1282 TmpInt3); 1283 checkResult(cuDeviceGetAttribute( 1284 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1285 "Error returned from cuDeviceGetAttribute\n"); 1286 checkResult(cuDeviceGetAttribute( 1287 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1288 "Error returned from cuDeviceGetAttribute\n"); 1289 checkResult(cuDeviceGetAttribute( 1290 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1291 "Error returned from cuDeviceGetAttribute\n"); 1292 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1293 TmpInt3); 1294 checkResult( 1295 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1296 "Error returned from 
cuDeviceGetAttribute\n"); 1297 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1298 checkResult(cuDeviceGetAttribute( 1299 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1300 "Error returned from cuDeviceGetAttribute\n"); 1301 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1302 checkResult( 1303 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1304 "Error returned from cuDeviceGetAttribute\n"); 1305 printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1306 checkResult(cuDeviceGetAttribute( 1307 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1308 "Error returned from cuDeviceGetAttribute\n"); 1309 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1310 checkResult( 1311 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1312 "Error returned from cuDeviceGetAttribute\n"); 1313 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1314 checkResult(cuDeviceGetAttribute( 1315 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1316 "Error returned from cuDeviceGetAttribute\n"); 1317 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1318 checkResult( 1319 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1320 "Error returned from cuDeviceGetAttribute\n"); 1321 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1322 TmpStr = "DEFAULT"; 1323 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1324 TmpStr = "PROHIBITED"; 1325 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1326 TmpStr = "EXCLUSIVE PROCESS"; 1327 else 1328 TmpStr = "unknown"; 1329 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1330 checkResult(cuDeviceGetAttribute( 1331 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1332 "Error returned from cuDeviceGetAttribute\n"); 1333 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1334 checkResult( 1335 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1336 "Error returned from cuDeviceGetAttribute\n"); 1337 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1338 checkResult(cuDeviceGetAttribute( 1339 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1340 "Error returned from cuDeviceGetAttribute\n"); 1341 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1342 checkResult( 1343 cuDeviceGetAttribute( 1344 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1345 "Error returned from cuDeviceGetAttribute\n"); 1346 printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1347 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1348 Device), 1349 "Error returned from cuDeviceGetAttribute\n"); 1350 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1351 checkResult(cuDeviceGetAttribute( 1352 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1353 Device), 1354 "Error returned from cuDeviceGetAttribute\n"); 1355 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1356 checkResult(cuDeviceGetAttribute( 1357 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1358 "Error returned from cuDeviceGetAttribute\n"); 1359 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1360 checkResult(cuDeviceGetAttribute( 1361 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1362 "Error returned from cuDeviceGetAttribute\n"); 1363 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1364 checkResult(cuDeviceGetAttribute( 1365 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1366 "Error returned from cuDeviceGetAttribute\n"); 1367 printf(" 
Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

  int createEvent(void **P) {
    CUevent Event = nullptr;
    if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

  int destroyEvent(void *EventPtr) {
    EventPool.release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available
    // from a specific CUDA version onwards, and it is defined as 0x0. In
    // earlier versions, per the CUDA API documentation, this argument has to
    // be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. 
stream = " DPxMOD ", event = " DPxMOD "\n", 1421 DPxPTR(Stream), DPxPTR(Event)); 1422 CUDA_ERR_STRING(Err); 1423 return OFFLOAD_FAIL; 1424 } 1425 1426 return OFFLOAD_SUCCESS; 1427 } 1428 1429 int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const { 1430 if (AsyncInfo->Queue) { 1431 StreamPool[DeviceId]->release( 1432 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1433 AsyncInfo->Queue = nullptr; 1434 } 1435 1436 return OFFLOAD_SUCCESS; 1437 } 1438 1439 int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const { 1440 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1441 if (!checkResult(Err, "error returned from cuCtxSetCurrent")) 1442 return OFFLOAD_FAIL; 1443 1444 *AsyncInfo = new __tgt_async_info; 1445 getStream(DeviceId, *AsyncInfo); 1446 return OFFLOAD_SUCCESS; 1447 } 1448 1449 int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo, 1450 const char **ErrStr) const { 1451 assert(DeviceInfo && "DeviceInfo is nullptr"); 1452 1453 if (!DeviceInfo->Context) 1454 DeviceInfo->Context = DeviceData[DeviceId].Context; 1455 if (!DeviceInfo->Device) { 1456 CUdevice Dev; 1457 CUresult Err = cuDeviceGet(&Dev, DeviceId); 1458 if (Err == CUDA_SUCCESS) { 1459 DeviceInfo->Device = reinterpret_cast<void *>(Dev); 1460 } else { 1461 cuGetErrorString(Err, ErrStr); 1462 return OFFLOAD_FAIL; 1463 } 1464 } 1465 return OFFLOAD_SUCCESS; 1466 } 1467 }; 1468 1469 DeviceRTLTy DeviceRTL; 1470 } // namespace 1471 1472 // Exposed library API function 1473 #ifdef __cplusplus 1474 extern "C" { 1475 #endif 1476 1477 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1478 return elf_check_machine(image, /* EM_CUDA */ 190); 1479 } 1480 1481 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1482 1483 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1484 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1485 DeviceRTL.setRequiresFlag(RequiresFlags); 1486 return RequiresFlags; 1487 } 1488 1489 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1490 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1491 DeviceRTL.isValidDeviceId(dst_dev_id)) 1492 return 1; 1493 1494 return 0; 1495 } 1496 1497 int32_t __tgt_rtl_init_device(int32_t device_id) { 1498 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1499 1500 return DeviceRTL.initDevice(device_id); 1501 } 1502 1503 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1504 __tgt_device_image *image) { 1505 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1506 1507 return DeviceRTL.loadBinary(device_id, image); 1508 } 1509 1510 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1511 int32_t kind) { 1512 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1513 1514 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1515 } 1516 1517 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1518 int64_t size) { 1519 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1520 1521 __tgt_async_info AsyncInfo; 1522 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1523 size, &AsyncInfo); 1524 if (rc != OFFLOAD_SUCCESS) 1525 return OFFLOAD_FAIL; 1526 1527 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1528 } 1529 1530 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1531 void *hst_ptr, int64_t size, 1532 __tgt_async_info *async_info_ptr) { 1533 
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1534 assert(async_info_ptr && "async_info_ptr is nullptr"); 1535 1536 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1537 async_info_ptr); 1538 } 1539 1540 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1541 int64_t size) { 1542 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1543 1544 __tgt_async_info AsyncInfo; 1545 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1546 size, &AsyncInfo); 1547 if (rc != OFFLOAD_SUCCESS) 1548 return OFFLOAD_FAIL; 1549 1550 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1551 } 1552 1553 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1554 void *tgt_ptr, int64_t size, 1555 __tgt_async_info *async_info_ptr) { 1556 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1557 assert(async_info_ptr && "async_info_ptr is nullptr"); 1558 1559 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1560 async_info_ptr); 1561 } 1562 1563 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1564 int dst_dev_id, void *dst_ptr, 1565 int64_t size, 1566 __tgt_async_info *AsyncInfo) { 1567 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1568 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1569 assert(AsyncInfo && "AsyncInfo is nullptr"); 1570 1571 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1572 AsyncInfo); 1573 } 1574 1575 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1576 int32_t dst_dev_id, void *dst_ptr, 1577 int64_t size) { 1578 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1579 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1580 1581 __tgt_async_info AsyncInfo; 1582 const int32_t rc = __tgt_rtl_data_exchange_async( 1583 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1584 if (rc != OFFLOAD_SUCCESS) 1585 return OFFLOAD_FAIL; 1586 1587 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1588 } 1589 1590 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1591 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1592 1593 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1594 } 1595 1596 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1597 void **tgt_args, 1598 ptrdiff_t *tgt_offsets, 1599 int32_t arg_num, int32_t team_num, 1600 int32_t thread_limit, 1601 uint64_t loop_tripcount) { 1602 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1603 1604 __tgt_async_info AsyncInfo; 1605 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1606 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1607 thread_limit, loop_tripcount, &AsyncInfo); 1608 if (rc != OFFLOAD_SUCCESS) 1609 return OFFLOAD_FAIL; 1610 1611 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1612 } 1613 1614 int32_t __tgt_rtl_run_target_team_region_async( 1615 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1616 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1617 int32_t thread_limit, uint64_t loop_tripcount, 1618 __tgt_async_info *async_info_ptr) { 1619 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1620 1621 return DeviceRTL.runTargetTeamRegion( 1622 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1623 thread_limit, 
loop_tripcount, async_info_ptr); 1624 } 1625 1626 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1627 void **tgt_args, ptrdiff_t *tgt_offsets, 1628 int32_t arg_num) { 1629 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1630 1631 __tgt_async_info AsyncInfo; 1632 const int32_t rc = __tgt_rtl_run_target_region_async( 1633 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1634 if (rc != OFFLOAD_SUCCESS) 1635 return OFFLOAD_FAIL; 1636 1637 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1638 } 1639 1640 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1641 void *tgt_entry_ptr, void **tgt_args, 1642 ptrdiff_t *tgt_offsets, 1643 int32_t arg_num, 1644 __tgt_async_info *async_info_ptr) { 1645 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1646 1647 return __tgt_rtl_run_target_team_region_async( 1648 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1649 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1650 async_info_ptr); 1651 } 1652 1653 int32_t __tgt_rtl_synchronize(int32_t device_id, 1654 __tgt_async_info *async_info_ptr) { 1655 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1656 assert(async_info_ptr && "async_info_ptr is nullptr"); 1657 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1658 1659 return DeviceRTL.synchronize(device_id, async_info_ptr); 1660 } 1661 1662 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1663 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1664 InfoLevel.store(NewInfoLevel); 1665 } 1666 1667 void __tgt_rtl_print_device_info(int32_t device_id) { 1668 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1669 DeviceRTL.printDeviceInfo(device_id); 1670 } 1671 1672 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { 1673 assert(event && "event is nullptr"); 1674 return DeviceRTL.createEvent(event); 1675 } 1676 1677 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, 1678 __tgt_async_info *async_info_ptr) { 1679 assert(async_info_ptr && "async_info_ptr is nullptr"); 1680 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1681 assert(event_ptr && "event_ptr is nullptr"); 1682 1683 return recordEvent(event_ptr, async_info_ptr); 1684 } 1685 1686 int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr, 1687 __tgt_async_info *async_info_ptr) { 1688 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1689 assert(async_info_ptr && "async_info_ptr is nullptr"); 1690 assert(event_ptr && "event is nullptr"); 1691 1692 return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr); 1693 } 1694 1695 int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) { 1696 assert(event_ptr && "event is nullptr"); 1697 1698 return syncEvent(event_ptr); 1699 } 1700 1701 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { 1702 assert(event_ptr && "event is nullptr"); 1703 1704 return DeviceRTL.destroyEvent(event_ptr); 1705 } 1706 1707 int32_t __tgt_rtl_release_async_info(int32_t device_id, 1708 __tgt_async_info *async_info) { 1709 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1710 assert(async_info && "async_info is nullptr"); 1711 1712 return DeviceRTL.releaseAsyncInfo(device_id, async_info); 1713 } 1714 1715 int32_t __tgt_rtl_init_async_info(int32_t device_id, 1716 __tgt_async_info **async_info) { 1717 assert(DeviceRTL.isValidDeviceId(device_id) && 
"device_id is invalid"); 1718 assert(async_info && "async_info is nullptr"); 1719 1720 return DeviceRTL.initAsyncInfo(device_id, async_info); 1721 } 1722 1723 int32_t __tgt_rtl_init_device_info(int32_t device_id, 1724 __tgt_device_info *device_info_ptr, 1725 const char **err_str) { 1726 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1727 assert(device_info_ptr && "device_info_ptr is nullptr"); 1728 1729 return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str); 1730 } 1731 1732 #ifdef __cplusplus 1733 } 1734 #endif 1735