1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <algorithm> 14 #include <cassert> 15 #include <cstddef> 16 #include <cuda.h> 17 #include <list> 18 #include <memory> 19 #include <mutex> 20 #include <string> 21 #include <unordered_map> 22 #include <vector> 23 24 #include "Debug.h" 25 #include "DeviceEnvironment.h" 26 #include "omptarget.h" 27 #include "omptargetplugin.h" 28 29 #define TARGET_NAME CUDA 30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 31 32 #include "MemoryManager.h" 33 34 #include "llvm/Frontend/OpenMP/OMPConstants.h" 35 36 // Utility for retrieving and printing CUDA error string. 37 #ifdef OMPTARGET_DEBUG 38 #define CUDA_ERR_STRING(err) \ 39 do { \ 40 if (getDebugLevel() > 0) { \ 41 const char *errStr = nullptr; \ 42 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 43 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 44 REPORT("Unrecognized CUDA error code: %d\n", err); \ 45 else if (errStr_status == CUDA_SUCCESS) \ 46 REPORT("CUDA error is: %s\n", errStr); \ 47 else { \ 48 REPORT("Unresolved CUDA error code: %d\n", err); \ 49 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 50 errStr_status); \ 51 } \ 52 } else { \ 53 const char *errStr = nullptr; \ 54 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 55 if (errStr_status == CUDA_SUCCESS) \ 56 REPORT("%s \n", errStr); \ 57 } \ 58 } while (false) 59 #else // OMPTARGET_DEBUG 60 #define CUDA_ERR_STRING(err) \ 61 do { \ 62 const char *errStr = nullptr; \ 63 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 64 if (errStr_status == CUDA_SUCCESS) \ 65 REPORT("%s \n", errStr); \ 66 } while (false) 67 #endif // OMPTARGET_DEBUG 68 69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No") 70 71 #include "elf_common.h" 72 73 /// Keep entries table per device. 74 struct FuncOrGblEntryTy { 75 __tgt_target_table Table; 76 std::vector<__tgt_offload_entry> Entries; 77 }; 78 79 /// Use a single entity to encode a kernel and a set of flags. 80 struct KernelTy { 81 CUfunction Func; 82 83 // execution mode of kernel 84 llvm::omp::OMPTgtExecModeFlags ExecutionMode; 85 86 /// Maximal number of threads per block for this kernel. 87 int MaxThreadsPerBlock = 0; 88 89 KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode) 90 : Func(_Func), ExecutionMode(_ExecutionMode) {} 91 }; 92 93 namespace { 94 bool checkResult(CUresult Err, const char *ErrMsg) { 95 if (Err == CUDA_SUCCESS) 96 return true; 97 98 REPORT("%s", ErrMsg); 99 CUDA_ERR_STRING(Err); 100 return false; 101 } 102 103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 104 CUstream Stream) { 105 CUresult Err = 106 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 107 108 if (Err != CUDA_SUCCESS) { 109 DP("Error when copying data from device to device. 
Pointers: src " 110 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 111 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 112 CUDA_ERR_STRING(Err); 113 return OFFLOAD_FAIL; 114 } 115 116 return OFFLOAD_SUCCESS; 117 } 118 119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { 120 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 121 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 122 123 CUresult Err = cuEventRecord(Event, Stream); 124 if (Err != CUDA_SUCCESS) { 125 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", 126 DPxPTR(Stream), DPxPTR(Event)); 127 CUDA_ERR_STRING(Err); 128 return OFFLOAD_FAIL; 129 } 130 131 return OFFLOAD_SUCCESS; 132 } 133 134 int syncEvent(void *EventPtr) { 135 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 136 137 CUresult Err = cuEventSynchronize(Event); 138 if (Err != CUDA_SUCCESS) { 139 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); 140 CUDA_ERR_STRING(Err); 141 return OFFLOAD_FAIL; 142 } 143 144 return OFFLOAD_SUCCESS; 145 } 146 147 namespace { 148 149 // Structure contains per-device data 150 struct DeviceDataTy { 151 /// List that contains all the kernels. 152 std::list<KernelTy> KernelsList; 153 154 std::list<FuncOrGblEntryTy> FuncGblEntries; 155 156 CUcontext Context = nullptr; 157 // Device properties 158 int ThreadsPerBlock = 0; 159 int BlocksPerGrid = 0; 160 int WarpSize = 0; 161 // OpenMP properties 162 int NumTeams = 0; 163 int NumThreads = 0; 164 }; 165 166 /// Resource allocator where \p T is the resource type. 167 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL 168 /// accordingly. The implementation should not raise any exception. 169 template <typename T> struct AllocatorTy { 170 using ElementTy = T; 171 /// Create a resource and assign to R. 172 virtual int create(T &R) noexcept = 0; 173 /// Destroy the resource. 174 virtual int destroy(T) noexcept = 0; 175 }; 176 177 /// Allocator for CUstream. 178 struct StreamAllocatorTy final : public AllocatorTy<CUstream> { 179 /// See AllocatorTy<T>::create. 180 int create(CUstream &Stream) noexcept override { 181 if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING), 182 "Error returned from cuStreamCreate\n")) 183 return OFFLOAD_FAIL; 184 185 return OFFLOAD_SUCCESS; 186 } 187 188 /// See AllocatorTy<T>::destroy. 189 int destroy(CUstream Stream) noexcept override { 190 if (!checkResult(cuStreamDestroy(Stream), 191 "Error returned from cuStreamDestroy\n")) 192 return OFFLOAD_FAIL; 193 194 return OFFLOAD_SUCCESS; 195 } 196 }; 197 198 /// Allocator for CUevent. 199 struct EventAllocatorTy final : public AllocatorTy<CUevent> { 200 /// See AllocatorTy<T>::create. 201 int create(CUevent &Event) noexcept override { 202 if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), 203 "Error returned from cuEventCreate\n")) 204 return OFFLOAD_FAIL; 205 206 return OFFLOAD_SUCCESS; 207 } 208 209 /// See AllocatorTy<T>::destroy. 210 int destroy(CUevent Event) noexcept override { 211 if (!checkResult(cuEventDestroy(Event), 212 "Error returned from cuEventDestroy\n")) 213 return OFFLOAD_FAIL; 214 215 return OFFLOAD_SUCCESS; 216 } 217 }; 218 219 /// A generic pool of resources where \p T is the resource type. 220 /// \p T should be copyable as the object is stored in \p std::vector . 221 template <typename AllocTy> class ResourcePoolTy { 222 using ElementTy = typename AllocTy::ElementTy; 223 /// Index of the next available resource. 224 size_t Next = 0; 225 /// Mutex to guard the pool. 
226 std::mutex Mutex; 227 /// Pool of resources. 228 std::vector<ElementTy> Resources; 229 /// A reference to the corresponding allocator. 230 AllocTy Allocator; 231 232 /// If `Resources` is used up, we will fill in more resources. It assumes that 233 /// the new size `Size` is always larger than the current size. 234 bool resize(size_t Size) { 235 auto CurSize = Resources.size(); 236 assert(Size > CurSize && "Unexpected smaller size"); 237 Resources.reserve(Size); 238 for (auto I = CurSize; I < Size; ++I) { 239 ElementTy NewItem; 240 int Ret = Allocator.create(NewItem); 241 if (Ret != OFFLOAD_SUCCESS) 242 return false; 243 Resources.push_back(NewItem); 244 } 245 return true; 246 } 247 248 public: 249 ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept 250 : Allocator(std::move(A)) { 251 if (Size) 252 (void)resize(Size); 253 } 254 255 ~ResourcePoolTy() noexcept { clear(); } 256 257 /// Get a resource from the pool. `Next` always points to the next available 258 /// resource. That means, `[0, Next-1]` have been assigned, and `[Next, ...)` are 259 /// still available. If there is no resource left, we will ask for more. Each 260 /// time a resource is assigned, `Next` increases by one. 261 /// xxxxxs+++++++++ 262 /// ^ 263 /// Next 264 /// After assignment, the pool becomes the following and s is assigned. 265 /// xxxxxs+++++++++ 266 /// ^ 267 /// Next 268 int acquire(ElementTy &R) noexcept { 269 std::lock_guard<std::mutex> LG(Mutex); 270 if (Next == Resources.size()) { 271 auto NewSize = Resources.size() ? Resources.size() * 2 : 1; 272 if (!resize(NewSize)) 273 return OFFLOAD_FAIL; 274 } 275 276 assert(Next < Resources.size()); 277 278 R = Resources[Next++]; 279 280 return OFFLOAD_SUCCESS; 281 } 282 283 /// Return the resource back to the pool. When we return a resource, we need 284 /// to first decrease `Next`, and then copy the resource back. It is worth 285 /// noting that the order in which resources are returned might be different 286 /// from the order in which they were assigned; that is to say, at some point, 287 /// there might be two identical resources. 288 /// xxax+a+++++ 289 /// ^ 290 /// Next 291 /// However, it doesn't matter, because they're always on the two sides of 292 /// `Next`. The left one will in the end be overwritten by another resource. 293 /// Therefore, after several executions, the order of the pool might differ 294 /// from its initial state. 295 void release(ElementTy R) noexcept { 296 std::lock_guard<std::mutex> LG(Mutex); 297 Resources[--Next] = R; 298 } 299 300 /// Release all stored resources and clear the pool. 301 /// Note: This function is not thread safe. Be sure to guard it if necessary. 302 void clear() noexcept { 303 for (auto &R : Resources) 304 (void)Allocator.destroy(R); 305 Resources.clear(); 306 } 307 }; 308 309 } // namespace 310 311 class DeviceRTLTy { 312 int NumberOfDevices; 313 // OpenMP environment properties 314 int EnvNumTeams; 315 int EnvTeamLimit; 316 int EnvTeamThreadLimit; 317 // OpenMP requires flags 318 int64_t RequiresFlags; 319 // Amount of dynamic shared memory to use at launch. 320 uint64_t DynamicMemorySize; 321 322 /// Number of initial streams for each device. 323 int NumInitialStreams = 32; 324 325 /// Number of initial events for each device.
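// Note: These counts are only the initial capacity of the stream and event
// pools; if a pool runs dry at run time, ResourcePoolTy::acquire doubles its
// size on demand.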
326 int NumInitialEvents = 8; 327 328 static constexpr const int32_t HardThreadLimit = 1024; 329 static constexpr const int32_t DefaultNumTeams = 128; 330 static constexpr const int32_t DefaultNumThreads = 128; 331 332 using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>; 333 std::vector<std::unique_ptr<StreamPoolTy>> StreamPool; 334 335 using EventPoolTy = ResourcePoolTy<EventAllocatorTy>; 336 std::vector<std::unique_ptr<EventPoolTy>> EventPool; 337 338 std::vector<DeviceDataTy> DeviceData; 339 std::vector<std::vector<CUmodule>> Modules; 340 341 /// Vector of flags indicating the initalization status of all associated 342 /// devices. 343 std::vector<bool> InitializedFlags; 344 345 /// A class responsible for interacting with device native runtime library to 346 /// allocate and free memory. 347 class CUDADeviceAllocatorTy : public DeviceAllocatorTy { 348 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; 349 350 public: 351 void *allocate(size_t Size, void *, TargetAllocTy Kind) override { 352 if (Size == 0) 353 return nullptr; 354 355 void *MemAlloc = nullptr; 356 CUresult Err; 357 switch (Kind) { 358 case TARGET_ALLOC_DEFAULT: 359 case TARGET_ALLOC_DEVICE: 360 CUdeviceptr DevicePtr; 361 Err = cuMemAlloc(&DevicePtr, Size); 362 MemAlloc = (void *)DevicePtr; 363 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 364 return nullptr; 365 break; 366 case TARGET_ALLOC_HOST: 367 void *HostPtr; 368 Err = cuMemAllocHost(&HostPtr, Size); 369 MemAlloc = HostPtr; 370 if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) 371 return nullptr; 372 HostPinnedAllocs[MemAlloc] = Kind; 373 break; 374 case TARGET_ALLOC_SHARED: 375 CUdeviceptr SharedPtr; 376 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); 377 MemAlloc = (void *)SharedPtr; 378 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) 379 return nullptr; 380 break; 381 } 382 383 return MemAlloc; 384 } 385 386 int free(void *TgtPtr) override { 387 CUresult Err; 388 // Host pinned memory must be freed differently. 389 TargetAllocTy Kind = 390 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) 391 ? TARGET_ALLOC_DEFAULT 392 : TARGET_ALLOC_HOST; 393 switch (Kind) { 394 case TARGET_ALLOC_DEFAULT: 395 case TARGET_ALLOC_DEVICE: 396 case TARGET_ALLOC_SHARED: 397 Err = cuMemFree((CUdeviceptr)TgtPtr); 398 if (!checkResult(Err, "Error returned from cuMemFree\n")) 399 return OFFLOAD_FAIL; 400 break; 401 case TARGET_ALLOC_HOST: 402 Err = cuMemFreeHost(TgtPtr); 403 if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) 404 return OFFLOAD_FAIL; 405 break; 406 } 407 408 return OFFLOAD_SUCCESS; 409 } 410 }; 411 412 /// A vector of device allocators 413 std::vector<CUDADeviceAllocatorTy> DeviceAllocators; 414 415 /// A vector of memory managers. Since the memory manager is non-copyable and 416 // non-removable, we wrap them into std::unique_ptr. 
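// Whether the managers are used at all, and the size threshold they honor,
// are taken from MemoryManagerTy::getSizeThresholdFromEnv() in the
// constructor below.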
417 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; 418 419 /// Whether use memory manager 420 bool UseMemoryManager = true; 421 422 // Record entry point associated with device 423 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { 424 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 425 E.Entries.push_back(entry); 426 } 427 428 // Return a pointer to the entry associated with the pointer 429 const __tgt_offload_entry *getOffloadEntry(const int DeviceId, 430 const void *Addr) const { 431 for (const __tgt_offload_entry &Itr : 432 DeviceData[DeviceId].FuncGblEntries.back().Entries) 433 if (Itr.addr == Addr) 434 return &Itr; 435 436 return nullptr; 437 } 438 439 // Return the pointer to the target entries table 440 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { 441 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 442 443 if (E.Entries.empty()) 444 return nullptr; 445 446 // Update table info according to the entries and return the pointer 447 E.Table.EntriesBegin = E.Entries.data(); 448 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); 449 450 return &E.Table; 451 } 452 453 // Clear entries table for a device 454 void clearOffloadEntriesTable(const int DeviceId) { 455 DeviceData[DeviceId].FuncGblEntries.emplace_back(); 456 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 457 E.Entries.clear(); 458 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; 459 } 460 461 public: 462 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { 463 assert(AsyncInfo && "AsyncInfo is nullptr"); 464 465 if (!AsyncInfo->Queue) { 466 CUstream S; 467 if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS) 468 return nullptr; 469 470 AsyncInfo->Queue = S; 471 } 472 473 return reinterpret_cast<CUstream>(AsyncInfo->Queue); 474 } 475 476 // This class should not be copied 477 DeviceRTLTy(const DeviceRTLTy &) = delete; 478 DeviceRTLTy(DeviceRTLTy &&) = delete; 479 480 DeviceRTLTy() 481 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), 482 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), 483 DynamicMemorySize(0) { 484 485 DP("Start initializing CUDA\n"); 486 487 CUresult Err = cuInit(0); 488 if (Err == CUDA_ERROR_INVALID_HANDLE) { 489 // Can't call cuGetErrorString if dlsym failed 490 DP("Failed to load CUDA shared library\n"); 491 return; 492 } 493 if (!checkResult(Err, "Error returned from cuInit\n")) { 494 return; 495 } 496 497 Err = cuDeviceGetCount(&NumberOfDevices); 498 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) 499 return; 500 501 if (NumberOfDevices == 0) { 502 DP("There are no devices supporting CUDA.\n"); 503 return; 504 } 505 506 DeviceData.resize(NumberOfDevices); 507 Modules.resize(NumberOfDevices); 508 StreamPool.resize(NumberOfDevices); 509 EventPool.resize(NumberOfDevices); 510 511 // Get environment variables regarding teams 512 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { 513 // OMP_TEAM_LIMIT has been set 514 EnvTeamLimit = std::stoi(EnvStr); 515 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 516 } 517 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { 518 // OMP_TEAMS_THREAD_LIMIT has been set 519 EnvTeamThreadLimit = std::stoi(EnvStr); 520 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); 521 } 522 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { 523 // OMP_NUM_TEAMS has been set 524 EnvNumTeams = std::stoi(EnvStr); 525 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 526 } 527 if (const char 
*EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { 528 // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set 529 DynamicMemorySize = std::stoi(EnvStr); 530 DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", 531 DynamicMemorySize); 532 } 533 if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { 534 // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set 535 NumInitialStreams = std::stoi(EnvStr); 536 DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); 537 } 538 539 for (int I = 0; I < NumberOfDevices; ++I) 540 DeviceAllocators.emplace_back(); 541 542 // Get the size threshold from environment variable 543 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 544 UseMemoryManager = Res.second; 545 size_t MemoryManagerThreshold = Res.first; 546 547 if (UseMemoryManager) 548 for (int I = 0; I < NumberOfDevices; ++I) 549 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 550 DeviceAllocators[I], MemoryManagerThreshold)); 551 552 // We lazily initialize all devices later. 553 InitializedFlags.assign(NumberOfDevices, false); 554 } 555 556 ~DeviceRTLTy() { 557 for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) 558 deinitDevice(DeviceId); 559 } 560 561 // Check whether a given DeviceId is valid 562 bool isValidDeviceId(const int DeviceId) const { 563 return DeviceId >= 0 && DeviceId < NumberOfDevices; 564 } 565 566 int getNumOfDevices() const { return NumberOfDevices; } 567 568 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 569 570 int initDevice(const int DeviceId) { 571 CUdevice Device; 572 573 DP("Getting device %d\n", DeviceId); 574 CUresult Err = cuDeviceGet(&Device, DeviceId); 575 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 576 return OFFLOAD_FAIL; 577 578 assert(InitializedFlags[DeviceId] == false && "Reinitializing device!"); 579 InitializedFlags[DeviceId] = true; 580 581 // Query the current flags of the primary context and set its flags if 582 // it is inactive 583 unsigned int FormerPrimaryCtxFlags = 0; 584 int FormerPrimaryCtxIsActive = 0; 585 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 586 &FormerPrimaryCtxIsActive); 587 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 588 return OFFLOAD_FAIL; 589 590 if (FormerPrimaryCtxIsActive) { 591 DP("The primary context is active, no change to its flags\n"); 592 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 593 CU_CTX_SCHED_BLOCKING_SYNC) 594 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 595 } else { 596 DP("The primary context is inactive, set its flags to " 597 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 598 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 599 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 600 return OFFLOAD_FAIL; 601 } 602 603 // Retain the per device primary context and save it to use whenever this 604 // device is selected. 605 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 606 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 607 return OFFLOAD_FAIL; 608 609 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 610 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 611 return OFFLOAD_FAIL; 612 613 // Initialize the stream pool. 614 if (!StreamPool[DeviceId]) 615 StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(), 616 NumInitialStreams); 617 618 // Initialize the event pool. 
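// Both pools are consumed lazily: getStream() hands out a CUstream from
// StreamPool[DeviceId] the first time an __tgt_async_info is used, and
// synchronize() / releaseAsyncInfo() return it. A minimal usage sketch,
// assuming an already-initialized DeviceId:
//   __tgt_async_info AsyncInfo;
//   CUstream S = getStream(DeviceId, &AsyncInfo); // acquired from the pool
//   /* ...enqueue asynchronous work on S... */
//   synchronize(DeviceId, &AsyncInfo);            // stream returned to pool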
619 if (!EventPool[DeviceId]) 620 EventPool[DeviceId] = 621 std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents); 622 623 // Query attributes to determine number of threads/block and blocks/grid. 624 int MaxGridDimX; 625 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 626 Device); 627 if (Err != CUDA_SUCCESS) { 628 DP("Error getting max grid dimension, use default value %d\n", 629 DeviceRTLTy::DefaultNumTeams); 630 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 631 } else { 632 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 633 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 634 } 635 636 // We are only exploiting threads along the x axis. 637 int MaxBlockDimX; 638 Err = cuDeviceGetAttribute(&MaxBlockDimX, 639 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 640 if (Err != CUDA_SUCCESS) { 641 DP("Error getting max block dimension, use default value %d\n", 642 DeviceRTLTy::DefaultNumThreads); 643 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 644 } else { 645 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 646 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 647 648 if (EnvTeamThreadLimit > 0 && 649 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 650 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 651 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 652 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 653 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 654 } 655 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 656 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 657 "capping at the hard limit\n", 658 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 659 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 660 } 661 } 662 663 // Get and set warp size 664 int WarpSize; 665 Err = 666 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 667 if (Err != CUDA_SUCCESS) { 668 DP("Error getting warp size, assume default value 32\n"); 669 DeviceData[DeviceId].WarpSize = 32; 670 } else { 671 DP("Using warp size %d\n", WarpSize); 672 DeviceData[DeviceId].WarpSize = WarpSize; 673 } 674 675 // Adjust teams to the env variables 676 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 677 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 678 EnvTeamLimit); 679 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 680 } 681 682 size_t StackLimit; 683 size_t HeapLimit; 684 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 685 StackLimit = std::stol(EnvStr); 686 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 687 return OFFLOAD_FAIL; 688 } else { 689 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) 690 return OFFLOAD_FAIL; 691 } 692 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { 693 HeapLimit = std::stol(EnvStr); 694 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 695 return OFFLOAD_FAIL; 696 } else { 697 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 698 return OFFLOAD_FAIL; 699 } 700 701 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 702 "Device supports up to %d CUDA blocks and %d threads with a " 703 "warp size of %d\n", 704 DeviceData[DeviceId].BlocksPerGrid, 705 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 706 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 707 "Device heap size is 
%d Bytes, device stack size is %d Bytes per " 708 "thread\n", 709 (int)HeapLimit, (int)StackLimit); 710 711 // Set default number of teams 712 if (EnvNumTeams > 0) { 713 DP("Default number of teams set according to environment %d\n", 714 EnvNumTeams); 715 DeviceData[DeviceId].NumTeams = EnvNumTeams; 716 } else { 717 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 718 DP("Default number of teams set according to library's default %d\n", 719 DeviceRTLTy::DefaultNumTeams); 720 } 721 722 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 723 DP("Default number of teams exceeds device limit, capping at %d\n", 724 DeviceData[DeviceId].BlocksPerGrid); 725 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 726 } 727 728 // Set default number of threads 729 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 730 DP("Default number of threads set according to library's default %d\n", 731 DeviceRTLTy::DefaultNumThreads); 732 if (DeviceData[DeviceId].NumThreads > 733 DeviceData[DeviceId].ThreadsPerBlock) { 734 DP("Default number of threads exceeds device limit, capping at %d\n", 735 DeviceData[DeviceId].ThreadsPerBlock); 736 DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; 737 } 738 739 return OFFLOAD_SUCCESS; 740 } 741 742 int deinitDevice(const int DeviceId) { 743 auto IsInitialized = InitializedFlags[DeviceId]; 744 if (!IsInitialized) 745 return OFFLOAD_SUCCESS; 746 InitializedFlags[DeviceId] = false; 747 748 if (UseMemoryManager) 749 MemoryManagers[DeviceId].release(); 750 751 StreamPool[DeviceId].reset(); 752 EventPool[DeviceId].reset(); 753 754 DeviceDataTy &D = DeviceData[DeviceId]; 755 if (!checkResult(cuCtxSetCurrent(D.Context), 756 "Error returned from cuCtxSetCurrent\n")) 757 return OFFLOAD_FAIL; 758 759 // Unload all modules. 760 for (auto &M : Modules[DeviceId]) 761 if (!checkResult(cuModuleUnload(M), 762 "Error returned from cuModuleUnload\n")) 763 return OFFLOAD_FAIL; 764 765 // Destroy context. 766 CUdevice Device; 767 if (!checkResult(cuCtxGetDevice(&Device), 768 "Error returned from cuCtxGetDevice\n")) 769 return OFFLOAD_FAIL; 770 771 if (!checkResult(cuDevicePrimaryCtxRelease(Device), 772 "Error returned from cuDevicePrimaryCtxRelease\n")) 773 return OFFLOAD_FAIL; 774 775 return OFFLOAD_SUCCESS; 776 } 777 778 __tgt_target_table *loadBinary(const int DeviceId, 779 const __tgt_device_image *Image) { 780 // Clear the offload table as we are going to create a new one. 781 clearOffloadEntriesTable(DeviceId); 782 783 // Create the module and extract the function pointers. 784 CUmodule Module; 785 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 786 CUresult Err = 787 cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 788 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 789 return nullptr; 790 791 DP("CUDA module successfully loaded!\n"); 792 793 Modules[DeviceId].push_back(Module); 794 795 // Find the symbols in the module by name. 796 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 797 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 798 799 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 800 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 801 if (!E->addr) { 802 // We return nullptr when something like this happens, the host should 803 // have always something in the address to uniquely identify the target 804 // region. 
805 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 806 return nullptr; 807 } 808 809 if (E->size) { 810 __tgt_offload_entry Entry = *E; 811 CUdeviceptr CUPtr; 812 size_t CUSize; 813 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 814 // We keep this style here because we need the name 815 if (Err != CUDA_SUCCESS) { 816 REPORT("Loading global '%s' Failed\n", E->name); 817 CUDA_ERR_STRING(Err); 818 return nullptr; 819 } 820 821 if (CUSize != E->size) { 822 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 823 CUSize, E->size); 824 return nullptr; 825 } 826 827 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 828 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 829 830 Entry.addr = (void *)(CUPtr); 831 832 // Note: In the current implementation declare target variables 833 // can either be link or to. This means that once unified 834 // memory is activated via the requires directive, the variable 835 // can be used directly from the host in both cases. 836 // TODO: when variables types other than to or link are added, 837 // the below condition should be changed to explicitly 838 // check for to and link variables types: 839 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 840 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 841 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 842 // If unified memory is present any target link or to variables 843 // can access host addresses directly. There is no longer a 844 // need for device copies. 845 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 846 DP("Copy linked variable host address (" DPxMOD 847 ") to device address (" DPxMOD ")\n", 848 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 849 } 850 851 addOffloadEntry(DeviceId, Entry); 852 853 continue; 854 } 855 856 CUfunction Func; 857 Err = cuModuleGetFunction(&Func, Module, E->name); 858 // We keep this style here because we need the name 859 if (Err != CUDA_SUCCESS) { 860 REPORT("Loading '%s' Failed\n", E->name); 861 CUDA_ERR_STRING(Err); 862 return nullptr; 863 } 864 865 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 866 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 867 868 // default value GENERIC (in case symbol is missing from cubin file) 869 llvm::omp::OMPTgtExecModeFlags ExecModeVal; 870 std::string ExecModeNameStr(E->name); 871 ExecModeNameStr += "_exec_mode"; 872 const char *ExecModeName = ExecModeNameStr.c_str(); 873 874 CUdeviceptr ExecModePtr; 875 size_t CUSize; 876 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 877 if (Err == CUDA_SUCCESS) { 878 if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 879 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 880 ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); 881 return nullptr; 882 } 883 884 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 885 if (Err != CUDA_SUCCESS) { 886 REPORT("Error when copying data from device to host. 
Pointers: " 887 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 888 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 889 CUDA_ERR_STRING(Err); 890 return nullptr; 891 } 892 } else { 893 DP("Loading global exec_mode '%s' - symbol missing, using default " 894 "value GENERIC (1)\n", 895 ExecModeName); 896 } 897 898 KernelsList.emplace_back(Func, ExecModeVal); 899 900 __tgt_offload_entry Entry = *E; 901 Entry.addr = &KernelsList.back(); 902 addOffloadEntry(DeviceId, Entry); 903 } 904 905 // send device environment data to the device 906 { 907 // TODO: The device ID used here is not the real device ID used by OpenMP. 908 DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices), 909 static_cast<uint32_t>(DeviceId), 910 static_cast<uint32_t>(DynamicMemorySize)}; 911 912 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 913 DeviceEnv.DebugKind = std::stoi(EnvStr); 914 915 const char *DeviceEnvName = "omptarget_device_environment"; 916 CUdeviceptr DeviceEnvPtr; 917 size_t CUSize; 918 919 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 920 if (Err == CUDA_SUCCESS) { 921 if (CUSize != sizeof(DeviceEnv)) { 922 REPORT( 923 "Global device_environment '%s' - size mismatch (%zu != %zu)\n", 924 DeviceEnvName, CUSize, sizeof(int32_t)); 925 CUDA_ERR_STRING(Err); 926 return nullptr; 927 } 928 929 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 930 if (Err != CUDA_SUCCESS) { 931 REPORT("Error when copying data from host to device. Pointers: " 932 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 933 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 934 CUDA_ERR_STRING(Err); 935 return nullptr; 936 } 937 938 DP("Sending global device environment data %zu bytes\n", CUSize); 939 } else { 940 DP("Finding global device environment '%s' - symbol missing.\n", 941 DeviceEnvName); 942 DP("Continue, considering this is a device RTL which does not accept " 943 "environment setting.\n"); 944 } 945 } 946 947 return getOffloadEntriesTable(DeviceId); 948 } 949 950 void *dataAlloc(const int DeviceId, const int64_t Size, 951 const TargetAllocTy Kind) { 952 switch (Kind) { 953 case TARGET_ALLOC_DEFAULT: 954 case TARGET_ALLOC_DEVICE: 955 if (UseMemoryManager) 956 return MemoryManagers[DeviceId]->allocate(Size, nullptr); 957 else 958 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 959 case TARGET_ALLOC_HOST: 960 case TARGET_ALLOC_SHARED: 961 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 962 } 963 964 REPORT("Invalid target data allocation kind or requested allocator not " 965 "implemented yet\n"); 966 967 return nullptr; 968 } 969 970 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 971 const int64_t Size, __tgt_async_info *AsyncInfo) const { 972 assert(AsyncInfo && "AsyncInfo is nullptr"); 973 974 CUstream Stream = getStream(DeviceId, AsyncInfo); 975 CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 976 if (Err != CUDA_SUCCESS) { 977 DP("Error when copying data from host to device. 
Pointers: host " 978 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 979 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 980 CUDA_ERR_STRING(Err); 981 return OFFLOAD_FAIL; 982 } 983 984 return OFFLOAD_SUCCESS; 985 } 986 987 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 988 const int64_t Size, __tgt_async_info *AsyncInfo) const { 989 assert(AsyncInfo && "AsyncInfo is nullptr"); 990 991 CUstream Stream = getStream(DeviceId, AsyncInfo); 992 CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 993 if (Err != CUDA_SUCCESS) { 994 DP("Error when copying data from device to host. Pointers: host " 995 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 996 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 997 CUDA_ERR_STRING(Err); 998 return OFFLOAD_FAIL; 999 } 1000 1001 return OFFLOAD_SUCCESS; 1002 } 1003 1004 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 1005 int64_t Size, __tgt_async_info *AsyncInfo) const { 1006 assert(AsyncInfo && "AsyncInfo is nullptr"); 1007 1008 CUresult Err; 1009 CUstream Stream = getStream(SrcDevId, AsyncInfo); 1010 1011 // If they are two devices, we try peer to peer copy first 1012 if (SrcDevId != DstDevId) { 1013 int CanAccessPeer = 0; 1014 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 1015 if (Err != CUDA_SUCCESS) { 1016 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 1017 ", dst = %" PRId32 "\n", 1018 SrcDevId, DstDevId); 1019 CUDA_ERR_STRING(Err); 1020 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1021 } 1022 1023 if (!CanAccessPeer) { 1024 DP("P2P memcpy not supported so fall back to D2D memcpy"); 1025 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1026 } 1027 1028 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 1029 if (Err != CUDA_SUCCESS) { 1030 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 1031 ", dst = %" PRId32 "\n", 1032 SrcDevId, DstDevId); 1033 CUDA_ERR_STRING(Err); 1034 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1035 } 1036 1037 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 1038 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 1039 Size, Stream); 1040 if (Err == CUDA_SUCCESS) 1041 return OFFLOAD_SUCCESS; 1042 1043 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 1044 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 1045 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 1046 CUDA_ERR_STRING(Err); 1047 } 1048 1049 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1050 } 1051 1052 int dataDelete(const int DeviceId, void *TgtPtr) { 1053 if (UseMemoryManager) 1054 return MemoryManagers[DeviceId]->free(TgtPtr); 1055 1056 return DeviceAllocators[DeviceId].free(TgtPtr); 1057 } 1058 1059 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 1060 ptrdiff_t *TgtOffsets, const int ArgNum, 1061 const int TeamNum, const int ThreadLimit, 1062 const unsigned int LoopTripCount, 1063 __tgt_async_info *AsyncInfo) const { 1064 // All args are references. 
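// cuLaunchKernel expects kernelParams to be an array of pointers to the
// actual argument values, so each offset-adjusted target pointer is stored
// in Ptrs[I] and Args[I] carries &Ptrs[I].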
1065 std::vector<void *> Args(ArgNum); 1066 std::vector<void *> Ptrs(ArgNum); 1067 1068 for (int I = 0; I < ArgNum; ++I) { 1069 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1070 Args[I] = &Ptrs[I]; 1071 } 1072 1073 KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr); 1074 1075 const bool IsSPMDGenericMode = 1076 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; 1077 const bool IsSPMDMode = 1078 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD; 1079 const bool IsGenericMode = 1080 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; 1081 1082 int CudaThreadsPerBlock; 1083 if (ThreadLimit > 0) { 1084 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); 1085 CudaThreadsPerBlock = ThreadLimit; 1086 // Add master warp if necessary 1087 if (IsGenericMode) { 1088 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); 1089 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; 1090 } 1091 } else { 1092 DP("Setting CUDA threads per block to default %d\n", 1093 DeviceData[DeviceId].NumThreads); 1094 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; 1095 } 1096 1097 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { 1098 DP("Threads per block capped at device limit %d\n", 1099 DeviceData[DeviceId].ThreadsPerBlock); 1100 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; 1101 } 1102 1103 CUresult Err; 1104 if (!KernelInfo->MaxThreadsPerBlock) { 1105 Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, 1106 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 1107 KernelInfo->Func); 1108 if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) 1109 return OFFLOAD_FAIL; 1110 } 1111 1112 if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { 1113 DP("Threads per block capped at kernel limit %d\n", 1114 KernelInfo->MaxThreadsPerBlock); 1115 CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; 1116 } 1117 1118 unsigned int CudaBlocksPerGrid; 1119 if (TeamNum <= 0) { 1120 if (LoopTripCount > 0 && EnvNumTeams < 0) { 1121 if (IsSPMDGenericMode) { 1122 // If we reach this point, then we are executing a kernel that was 1123 // transformed from Generic-mode to SPMD-mode. This kernel has 1124 // SPMD-mode execution, but needs its blocks to be scheduled 1125 // differently because the current loop trip count only applies to the 1126 // `teams distribute` region and will create far too few blocks using 1127 // the regular SPMD-mode method. 1128 CudaBlocksPerGrid = LoopTripCount; 1129 } else if (IsSPMDMode) { 1130 // We have a combined construct, i.e. `target teams distribute 1131 // parallel for [simd]`. We launch enough teams so that each thread 1132 // will execute one iteration of the loop. Round up to the nearest 1133 // integer. 1134 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; 1135 } else if (IsGenericMode) { 1136 // If we reach this point, then we have a non-combined construct, i.e. 1137 // `teams distribute` with a nested `parallel for` and each team is 1138 // assigned one iteration of the `distribute` loop. E.g.: 1139 // 1140 // #pragma omp target teams distribute 1141 // for(...loop_tripcount...) { 1142 // #pragma omp parallel for 1143 // for(...) {} 1144 // } 1145 // 1146 // Threads within a team will execute the iterations of the `parallel` 1147 // loop.
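// For example, a loop trip count of 4096 requests 4096 blocks here; the
// value is still subject to the BlocksPerGrid cap applied below.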
1148 CudaBlocksPerGrid = LoopTripCount; 1149 } else { 1150 REPORT("Unknown execution mode: %d\n", 1151 static_cast<int8_t>(KernelInfo->ExecutionMode)); 1152 return OFFLOAD_FAIL; 1153 } 1154 DP("Using %d teams due to loop trip count %" PRIu32 1155 " and number of threads per block %d\n", 1156 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1157 } else { 1158 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1159 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1160 } 1161 } else { 1162 DP("Using requested number of teams %d\n", TeamNum); 1163 CudaBlocksPerGrid = TeamNum; 1164 } 1165 1166 if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) { 1167 DP("Capping number of teams to team limit %d\n", 1168 DeviceData[DeviceId].BlocksPerGrid); 1169 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1170 } 1171 1172 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1173 "Launching kernel %s with %d blocks and %d threads in %s mode\n", 1174 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1175 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1176 : "(null)", 1177 CudaBlocksPerGrid, CudaThreadsPerBlock, 1178 (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); 1179 1180 CUstream Stream = getStream(DeviceId, AsyncInfo); 1181 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1182 /* gridDimZ */ 1, CudaThreadsPerBlock, 1183 /* blockDimY */ 1, /* blockDimZ */ 1, 1184 DynamicMemorySize, Stream, &Args[0], nullptr); 1185 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1186 return OFFLOAD_FAIL; 1187 1188 DP("Launch of entry point at " DPxMOD " successful!\n", 1189 DPxPTR(TgtEntryPtr)); 1190 1191 return OFFLOAD_SUCCESS; 1192 } 1193 1194 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1195 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1196 CUresult Err = cuStreamSynchronize(Stream); 1197 1198 // Once the stream is synchronized, return it to stream pool and reset 1199 // AsyncInfo. This is to make sure the synchronization only works for its 1200 // own tasks. 1201 StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1202 AsyncInfo->Queue = nullptr; 1203 1204 if (Err != CUDA_SUCCESS) { 1205 DP("Error when synchronizing stream. stream = " DPxMOD 1206 ", async info ptr = " DPxMOD "\n", 1207 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1208 CUDA_ERR_STRING(Err); 1209 } 1210 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1211 } 1212 1213 void printDeviceInfo(int32_t device_id) { 1214 char TmpChar[1000]; 1215 std::string TmpStr; 1216 size_t TmpSt; 1217 int TmpInt, TmpInt2, TmpInt3; 1218 1219 CUdevice Device; 1220 checkResult(cuDeviceGet(&Device, device_id), 1221 "Error returned from cuCtxGetDevice\n"); 1222 1223 cuDriverGetVersion(&TmpInt); 1224 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1225 printf(" CUDA Device Number: \t\t%d \n", device_id); 1226 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1227 "Error returned from cuDeviceGetName\n"); 1228 printf(" Device Name: \t\t\t%s \n", TmpChar); 1229 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1230 "Error returned from cuDeviceTotalMem\n"); 1231 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1232 checkResult(cuDeviceGetAttribute( 1233 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1234 "Error returned from cuDeviceGetAttribute\n"); 1235 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1236 checkResult( 1237 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1238 "Error returned from cuDeviceGetAttribute\n"); 1239 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1240 checkResult(cuDeviceGetAttribute( 1241 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1242 "Error returned from cuDeviceGetAttribute\n"); 1243 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1244 checkResult( 1245 cuDeviceGetAttribute( 1246 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1247 "Error returned from cuDeviceGetAttribute\n"); 1248 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1249 checkResult( 1250 cuDeviceGetAttribute( 1251 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1252 "Error returned from cuDeviceGetAttribute\n"); 1253 printf(" Registers per Block: \t\t%d \n", TmpInt); 1254 checkResult( 1255 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 1256 "Error returned from cuDeviceGetAttribute\n"); 1257 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1258 checkResult(cuDeviceGetAttribute( 1259 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1260 "Error returned from cuDeviceGetAttribute\n"); 1261 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1262 checkResult(cuDeviceGetAttribute( 1263 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1264 "Error returned from cuDeviceGetAttribute\n"); 1265 checkResult(cuDeviceGetAttribute( 1266 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1267 "Error returned from cuDeviceGetAttribute\n"); 1268 checkResult(cuDeviceGetAttribute( 1269 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1270 "Error returned from cuDeviceGetAttribute\n"); 1271 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1272 TmpInt3); 1273 checkResult(cuDeviceGetAttribute( 1274 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1275 "Error returned from cuDeviceGetAttribute\n"); 1276 checkResult(cuDeviceGetAttribute( 1277 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1278 "Error returned from cuDeviceGetAttribute\n"); 1279 checkResult(cuDeviceGetAttribute( 1280 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1281 "Error returned from cuDeviceGetAttribute\n"); 1282 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1283 TmpInt3); 1284 checkResult( 1285 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1286 "Error returned from 
cuDeviceGetAttribute\n"); 1287 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1288 checkResult(cuDeviceGetAttribute( 1289 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1290 "Error returned from cuDeviceGetAttribute\n"); 1291 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1292 checkResult( 1293 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1294 "Error returned from cuDeviceGetAttribute\n"); 1295 printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1296 checkResult(cuDeviceGetAttribute( 1297 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1298 "Error returned from cuDeviceGetAttribute\n"); 1299 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1300 checkResult( 1301 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1302 "Error returned from cuDeviceGetAttribute\n"); 1303 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1304 checkResult(cuDeviceGetAttribute( 1305 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1306 "Error returned from cuDeviceGetAttribute\n"); 1307 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1308 checkResult( 1309 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1310 "Error returned from cuDeviceGetAttribute\n"); 1311 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1312 TmpStr = "DEFAULT"; 1313 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1314 TmpStr = "PROHIBITED"; 1315 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1316 TmpStr = "EXCLUSIVE PROCESS"; 1317 else 1318 TmpStr = "unknown"; 1319 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1320 checkResult(cuDeviceGetAttribute( 1321 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1322 "Error returned from cuDeviceGetAttribute\n"); 1323 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1324 checkResult( 1325 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1326 "Error returned from cuDeviceGetAttribute\n"); 1327 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1328 checkResult(cuDeviceGetAttribute( 1329 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1330 "Error returned from cuDeviceGetAttribute\n"); 1331 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1332 checkResult( 1333 cuDeviceGetAttribute( 1334 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1335 "Error returned from cuDeviceGetAttribute\n"); 1336 printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1337 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1338 Device), 1339 "Error returned from cuDeviceGetAttribute\n"); 1340 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1341 checkResult(cuDeviceGetAttribute( 1342 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1343 Device), 1344 "Error returned from cuDeviceGetAttribute\n"); 1345 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1346 checkResult(cuDeviceGetAttribute( 1347 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1348 "Error returned from cuDeviceGetAttribute\n"); 1349 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1350 checkResult(cuDeviceGetAttribute( 1351 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1352 "Error returned from cuDeviceGetAttribute\n"); 1353 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1354 checkResult(cuDeviceGetAttribute( 1355 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1356 "Error returned from cuDeviceGetAttribute\n"); 1357 printf(" 
Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1358 checkResult( 1359 cuDeviceGetAttribute( 1360 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), 1361 "Error returned from cuDeviceGetAttribute\n"); 1362 printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1363 checkResult( 1364 cuDeviceGetAttribute( 1365 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), 1366 "Error returned from cuDeviceGetAttribute\n"); 1367 printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); 1368 checkResult(cuDeviceGetAttribute( 1369 &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), 1370 "Error returned from cuDeviceGetAttribute\n"); 1371 printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); 1372 checkResult(cuDeviceGetAttribute( 1373 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), 1374 "Error returned from cuDeviceGetAttribute\n"); 1375 printf(" Multi-Device Board: \t\t%s \n", BOOL2TEXT(TmpInt)); 1376 checkResult( 1377 cuDeviceGetAttribute( 1378 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), 1379 "Error returned from cuDeviceGetAttribute\n"); 1380 checkResult( 1381 cuDeviceGetAttribute( 1382 &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), 1383 "Error returned from cuDeviceGetAttribute\n"); 1384 printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); 1385 } 1386 1387 int createEvent(int DeviceId, void **P) { 1388 CUevent Event = nullptr; 1389 if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS) 1390 return OFFLOAD_FAIL; 1391 *P = Event; 1392 return OFFLOAD_SUCCESS; 1393 } 1394 1395 int destroyEvent(int DeviceId, void *EventPtr) { 1396 EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr)); 1397 return OFFLOAD_SUCCESS; 1398 } 1399 1400 int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, 1401 void *EventPtr) const { 1402 CUstream Stream = getStream(DeviceId, AsyncInfo); 1403 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 1404 1405 // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a 1406 // specific CUDA version onwards, and is defined as 0x0. In previous versions, 1407 // per the CUDA API documentation, that argument has to be 0x0. 1408 CUresult Err = cuStreamWaitEvent(Stream, Event, 0); 1409 if (Err != CUDA_SUCCESS) { 1410 DP("Error when waiting event. 
stream = " DPxMOD ", event = " DPxMOD "\n", 1411 DPxPTR(Stream), DPxPTR(Event)); 1412 CUDA_ERR_STRING(Err); 1413 return OFFLOAD_FAIL; 1414 } 1415 1416 return OFFLOAD_SUCCESS; 1417 } 1418 1419 int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const { 1420 if (AsyncInfo->Queue) { 1421 StreamPool[DeviceId]->release( 1422 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1423 AsyncInfo->Queue = nullptr; 1424 } 1425 1426 return OFFLOAD_SUCCESS; 1427 } 1428 1429 int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const { 1430 *AsyncInfo = new __tgt_async_info; 1431 getStream(DeviceId, *AsyncInfo); 1432 return OFFLOAD_SUCCESS; 1433 } 1434 1435 int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo, 1436 const char **ErrStr) const { 1437 assert(DeviceInfo && "DeviceInfo is nullptr"); 1438 1439 if (!DeviceInfo->Context) 1440 DeviceInfo->Context = DeviceData[DeviceId].Context; 1441 if (!DeviceInfo->Device) { 1442 CUdevice Dev; 1443 CUresult Err = cuDeviceGet(&Dev, DeviceId); 1444 if (Err == CUDA_SUCCESS) { 1445 DeviceInfo->Device = reinterpret_cast<void *>(Dev); 1446 } else { 1447 cuGetErrorString(Err, ErrStr); 1448 return OFFLOAD_FAIL; 1449 } 1450 } 1451 return OFFLOAD_SUCCESS; 1452 } 1453 1454 int setContext(int DeviceId) { 1455 assert(InitializedFlags[DeviceId] && "Device is not initialized"); 1456 1457 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1458 if (!checkResult(Err, "error returned from cuCtxSetCurrent")) 1459 return OFFLOAD_FAIL; 1460 1461 return OFFLOAD_SUCCESS; 1462 } 1463 }; 1464 1465 DeviceRTLTy DeviceRTL; 1466 } // namespace 1467 1468 // Exposed library API function 1469 #ifdef __cplusplus 1470 extern "C" { 1471 #endif 1472 1473 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1474 return elf_check_machine(image, /* EM_CUDA */ 190); 1475 } 1476 1477 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1478 1479 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1480 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1481 DeviceRTL.setRequiresFlag(RequiresFlags); 1482 return RequiresFlags; 1483 } 1484 1485 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1486 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1487 DeviceRTL.isValidDeviceId(dst_dev_id)) 1488 return 1; 1489 1490 return 0; 1491 } 1492 1493 int32_t __tgt_rtl_init_device(int32_t device_id) { 1494 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1495 // Context is set when init the device. 1496 1497 return DeviceRTL.initDevice(device_id); 1498 } 1499 1500 int32_t __tgt_rtl_deinit_device(int32_t device_id) { 1501 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1502 // Context is set when deinit the device. 
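// deinitDevice() releases the memory manager, drops the stream and event
// pools, unloads all modules loaded for this device, and releases the
// primary context retained in initDevice().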
1503 1504 return DeviceRTL.deinitDevice(device_id); 1505 } 1506 1507 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1508 __tgt_device_image *image) { 1509 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1510 1511 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1512 return nullptr; 1513 1514 return DeviceRTL.loadBinary(device_id, image); 1515 } 1516 1517 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1518 int32_t kind) { 1519 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1520 1521 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1522 return nullptr; 1523 1524 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1525 } 1526 1527 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1528 int64_t size) { 1529 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1530 // Context is set in __tgt_rtl_data_submit_async. 1531 1532 __tgt_async_info AsyncInfo; 1533 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1534 size, &AsyncInfo); 1535 if (rc != OFFLOAD_SUCCESS) 1536 return OFFLOAD_FAIL; 1537 1538 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1539 } 1540 1541 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1542 void *hst_ptr, int64_t size, 1543 __tgt_async_info *async_info_ptr) { 1544 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1545 assert(async_info_ptr && "async_info_ptr is nullptr"); 1546 1547 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1548 return OFFLOAD_FAIL; 1549 1550 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1551 async_info_ptr); 1552 } 1553 1554 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1555 int64_t size) { 1556 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1557 // Context is set in __tgt_rtl_data_retrieve_async. 1558 1559 __tgt_async_info AsyncInfo; 1560 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1561 size, &AsyncInfo); 1562 if (rc != OFFLOAD_SUCCESS) 1563 return OFFLOAD_FAIL; 1564 1565 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1566 } 1567 1568 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1569 void *tgt_ptr, int64_t size, 1570 __tgt_async_info *async_info_ptr) { 1571 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1572 assert(async_info_ptr && "async_info_ptr is nullptr"); 1573 1574 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1575 return OFFLOAD_FAIL; 1576 1577 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1578 async_info_ptr); 1579 } 1580 1581 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1582 int dst_dev_id, void *dst_ptr, 1583 int64_t size, 1584 __tgt_async_info *AsyncInfo) { 1585 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1586 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1587 assert(AsyncInfo && "AsyncInfo is nullptr"); 1588 // NOTE: We don't need to set context for data exchange as the device contexts 1589 // are passed to CUDA function directly. 
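// dataExchange() tries a peer-to-peer path first (cuDeviceCanAccessPeer,
// cuCtxEnablePeerAccess, cuMemcpyPeerAsync) and falls back to a plain
// device-to-device copy on the source stream if any step fails.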
  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  // Context is set in __tgt_rtl_data_exchange_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
                                         void *tgt_entry_ptr, void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.
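  // A non-teams target region is forwarded to the teams entry point below
  // with a single team (see the inline argument comments).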
  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  // NOTE: We don't need to set context for stream sync.
  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // NOTE: We don't need to set context for print device info.
  DeviceRTL.printDeviceInfo(device_id);
}

int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.createEvent(device_id, event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");
  // NOTE: We might not need to set context for event record.
  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");
  // NOTE: We might not need to set context for event sync.
  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");
  // NOTE: We might not need to set context for event sync.
  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.destroyEvent(device_id, event_ptr);
}

int32_t __tgt_rtl_release_async_info(int32_t device_id,
                                     __tgt_async_info *async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.releaseAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_async_info(int32_t device_id,
                                  __tgt_async_info **async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_device_info(int32_t device_id,
                                   __tgt_device_info *device_info_ptr,
                                   const char **err_str) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(device_info_ptr && "device_info_ptr is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
}

#ifdef __cplusplus
}
#endif
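
// Illustrative only: a minimal sketch of how a host runtime such as
// libomptarget might drive the entry points above. The identifiers `Image`,
// `HostBuf`, and `Size` are placeholders, not symbols defined in this file,
// and error handling is omitted.
//
//   __tgt_device_image *Image = ...;   // provided by the host runtime
//   if (__tgt_rtl_is_valid_binary(Image) &&
//       __tgt_rtl_number_of_devices() > 0) {
//     const int32_t DeviceId = 0;
//     __tgt_rtl_init_device(DeviceId);
//     __tgt_target_table *Table = __tgt_rtl_load_binary(DeviceId, Image);
//     void *TgtPtr =
//         __tgt_rtl_data_alloc(DeviceId, Size, HostBuf, TARGET_ALLOC_DEFAULT);
//     __tgt_rtl_data_submit(DeviceId, TgtPtr, HostBuf, Size);
//     // ... launch kernels from Table via __tgt_rtl_run_target_region ...
//     __tgt_rtl_data_retrieve(DeviceId, HostBuf, TgtPtr, Size);
//     __tgt_rtl_data_delete(DeviceId, TgtPtr);
//     __tgt_rtl_deinit_device(DeviceId);
//   }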