1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <algorithm> 14 #include <cassert> 15 #include <cstddef> 16 #include <cuda.h> 17 #include <list> 18 #include <memory> 19 #include <mutex> 20 #include <string> 21 #include <unordered_map> 22 #include <vector> 23 24 #include "Debug.h" 25 #include "DeviceEnvironment.h" 26 #include "omptarget.h" 27 #include "omptargetplugin.h" 28 29 #define TARGET_NAME CUDA 30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 31 32 #include "MemoryManager.h" 33 34 #include "llvm/Frontend/OpenMP/OMPConstants.h" 35 36 // Utility for retrieving and printing CUDA error string. 37 #ifdef OMPTARGET_DEBUG 38 #define CUDA_ERR_STRING(err) \ 39 do { \ 40 if (getDebugLevel() > 0) { \ 41 const char *errStr = nullptr; \ 42 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 43 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 44 REPORT("Unrecognized CUDA error code: %d\n", err); \ 45 else if (errStr_status == CUDA_SUCCESS) \ 46 REPORT("CUDA error is: %s\n", errStr); \ 47 else { \ 48 REPORT("Unresolved CUDA error code: %d\n", err); \ 49 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 50 errStr_status); \ 51 } \ 52 } else { \ 53 const char *errStr = nullptr; \ 54 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 55 if (errStr_status == CUDA_SUCCESS) \ 56 REPORT("%s \n", errStr); \ 57 } \ 58 } while (false) 59 #else // OMPTARGET_DEBUG 60 #define CUDA_ERR_STRING(err) \ 61 do { \ 62 const char *errStr = nullptr; \ 63 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 64 if (errStr_status == CUDA_SUCCESS) \ 65 REPORT("%s \n", errStr); \ 66 } while (false) 67 #endif // OMPTARGET_DEBUG 68 69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No") 70 71 #include "elf_common.h" 72 73 /// Keep entries table per device. 74 struct FuncOrGblEntryTy { 75 __tgt_target_table Table; 76 std::vector<__tgt_offload_entry> Entries; 77 }; 78 79 /// Use a single entity to encode a kernel and a set of flags. 80 struct KernelTy { 81 CUfunction Func; 82 83 // execution mode of kernel 84 llvm::omp::OMPTgtExecModeFlags ExecutionMode; 85 86 /// Maximal number of threads per block for this kernel. 87 int MaxThreadsPerBlock = 0; 88 89 KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode) 90 : Func(_Func), ExecutionMode(_ExecutionMode) {} 91 }; 92 93 namespace { 94 bool checkResult(CUresult Err, const char *ErrMsg) { 95 if (Err == CUDA_SUCCESS) 96 return true; 97 98 REPORT("%s", ErrMsg); 99 CUDA_ERR_STRING(Err); 100 return false; 101 } 102 103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 104 CUstream Stream) { 105 CUresult Err = 106 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 107 108 if (Err != CUDA_SUCCESS) { 109 DP("Error when copying data from device to device. 
Pointers: src " 110 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 111 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 112 CUDA_ERR_STRING(Err); 113 return OFFLOAD_FAIL; 114 } 115 116 return OFFLOAD_SUCCESS; 117 } 118 119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { 120 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 121 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 122 123 CUresult Err = cuEventRecord(Event, Stream); 124 if (Err != CUDA_SUCCESS) { 125 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", 126 DPxPTR(Stream), DPxPTR(Event)); 127 CUDA_ERR_STRING(Err); 128 return OFFLOAD_FAIL; 129 } 130 131 return OFFLOAD_SUCCESS; 132 } 133 134 int syncEvent(void *EventPtr) { 135 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 136 137 CUresult Err = cuEventSynchronize(Event); 138 if (Err != CUDA_SUCCESS) { 139 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); 140 CUDA_ERR_STRING(Err); 141 return OFFLOAD_FAIL; 142 } 143 144 return OFFLOAD_SUCCESS; 145 } 146 147 namespace { 148 149 // Structure contains per-device data 150 struct DeviceDataTy { 151 /// List that contains all the kernels. 152 std::list<KernelTy> KernelsList; 153 154 std::list<FuncOrGblEntryTy> FuncGblEntries; 155 156 CUcontext Context = nullptr; 157 // Device properties 158 int ThreadsPerBlock = 0; 159 int BlocksPerGrid = 0; 160 int WarpSize = 0; 161 // OpenMP properties 162 int NumTeams = 0; 163 int NumThreads = 0; 164 }; 165 166 /// Resource allocator where \p T is the resource type. 167 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL 168 /// accordingly. The implementation should not raise any exception. 169 template <typename T> struct AllocatorTy { 170 using ElementTy = T; 171 virtual ~AllocatorTy() {} 172 173 /// Create a resource and assign to R. 174 virtual int create(T &R) noexcept = 0; 175 /// Destroy the resource. 176 virtual int destroy(T) noexcept = 0; 177 }; 178 179 /// Allocator for CUstream. 180 struct StreamAllocatorTy final : public AllocatorTy<CUstream> { 181 /// See AllocatorTy<T>::create. 182 int create(CUstream &Stream) noexcept override { 183 if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING), 184 "Error returned from cuStreamCreate\n")) 185 return OFFLOAD_FAIL; 186 187 return OFFLOAD_SUCCESS; 188 } 189 190 /// See AllocatorTy<T>::destroy. 191 int destroy(CUstream Stream) noexcept override { 192 if (!checkResult(cuStreamDestroy(Stream), 193 "Error returned from cuStreamDestroy\n")) 194 return OFFLOAD_FAIL; 195 196 return OFFLOAD_SUCCESS; 197 } 198 }; 199 200 /// Allocator for CUevent. 201 struct EventAllocatorTy final : public AllocatorTy<CUevent> { 202 /// See AllocatorTy<T>::create. 203 int create(CUevent &Event) noexcept override { 204 if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), 205 "Error returned from cuEventCreate\n")) 206 return OFFLOAD_FAIL; 207 208 return OFFLOAD_SUCCESS; 209 } 210 211 /// See AllocatorTy<T>::destroy. 212 int destroy(CUevent Event) noexcept override { 213 if (!checkResult(cuEventDestroy(Event), 214 "Error returned from cuEventDestroy\n")) 215 return OFFLOAD_FAIL; 216 217 return OFFLOAD_SUCCESS; 218 } 219 }; 220 221 /// A generic pool of resources where \p T is the resource type. 222 /// \p T should be copyable as the object is stored in \p std::vector . 223 template <typename AllocTy> class ResourcePoolTy { 224 using ElementTy = typename AllocTy::ElementTy; 225 /// Index of the next available resource. 
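  /// Elements of \p Resources in [0, Next) are currently acquired by users of
  /// the pool; elements in [Next, Resources.size()) are free.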
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources. The difference between \p Resources and \p Pool is
  /// that acquiring and releasing a resource only touches \p Resources, while
  /// a newly created batch of resources is added to both. Because resources
  /// may be released in a different order than they were acquired,
  /// \p Resources can end up containing duplicate elements, which could lead
  /// to a double free if it were used for cleanup. \p Pool therefore always
  /// holds exactly one entry for every resource allocated from the device.
  std::vector<ElementTy> Resources;
  std::vector<ElementTy> Pool;
  /// A reference to the corresponding allocator.
  AllocTy Allocator;

  /// If `Resources` is used up, we will fill in more resources. It assumes
  /// that the new size `Size` is always larger than the current size.
  bool resize(size_t Size) {
    assert(Resources.size() == Pool.size() && "size mismatch");
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Pool.reserve(Size);
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      ElementTy NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Pool.push_back(NewItem);
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means `[0, Next)` have been assigned and
  /// `[Next, Resources.size())` are still available. If there is no resource
  /// left, we will ask for more. Each time a resource is assigned, `Next`
  /// increases by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(ElementTy &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. When we return a resource, we first
  /// decrease `Next` and then copy the resource back. Note that resources may
  /// be returned in a different order than they were assigned, so at some
  /// point the vector may contain two identical elements.
  /// xxax+a+++++
  ///    ^
  ///    Next
  /// However, this does not matter because they are always on opposite sides
  /// of `Next`. The one on the left will eventually be overwritten by another
  /// returned resource. Therefore, after several acquisitions and releases,
  /// the order of the pool may differ from its initial state.
  void release(ElementTy R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
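  /// A minimal usage sketch (illustrative only; the pool object, the stream
  /// variable, and the initial size below are placeholders, not names used
  /// elsewhere in this file):
  /// \code
  ///   ResourcePoolTy<StreamAllocatorTy> Pool(StreamAllocatorTy(), /*Size=*/32);
  ///   CUstream S;
  ///   if (Pool.acquire(S) == OFFLOAD_SUCCESS) {
  ///     // ... enqueue work on S ...
  ///     Pool.release(S);
  ///   }
  ///   Pool.clear(); // Single-threaded, e.g. during device deinitialization.
  /// \endcode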
314 void clear() noexcept { 315 for (auto &R : Pool) 316 (void)Allocator.destroy(R); 317 Pool.clear(); 318 Resources.clear(); 319 } 320 }; 321 322 } // namespace 323 324 class DeviceRTLTy { 325 int NumberOfDevices; 326 // OpenMP environment properties 327 int EnvNumTeams; 328 int EnvTeamLimit; 329 int EnvTeamThreadLimit; 330 // OpenMP requires flags 331 int64_t RequiresFlags; 332 // Amount of dynamic shared memory to use at launch. 333 uint64_t DynamicMemorySize; 334 335 /// Number of initial streams for each device. 336 int NumInitialStreams = 32; 337 338 /// Number of initial events for each device. 339 int NumInitialEvents = 8; 340 341 static constexpr const int32_t HardThreadLimit = 1024; 342 static constexpr const int32_t DefaultNumTeams = 128; 343 static constexpr const int32_t DefaultNumThreads = 128; 344 345 using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>; 346 std::vector<std::unique_ptr<StreamPoolTy>> StreamPool; 347 348 using EventPoolTy = ResourcePoolTy<EventAllocatorTy>; 349 std::vector<std::unique_ptr<EventPoolTy>> EventPool; 350 351 std::vector<DeviceDataTy> DeviceData; 352 std::vector<std::vector<CUmodule>> Modules; 353 354 /// Vector of flags indicating the initalization status of all associated 355 /// devices. 356 std::vector<bool> InitializedFlags; 357 358 /// A class responsible for interacting with device native runtime library to 359 /// allocate and free memory. 360 class CUDADeviceAllocatorTy : public DeviceAllocatorTy { 361 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; 362 363 public: 364 void *allocate(size_t Size, void *, TargetAllocTy Kind) override { 365 if (Size == 0) 366 return nullptr; 367 368 void *MemAlloc = nullptr; 369 CUresult Err; 370 switch (Kind) { 371 case TARGET_ALLOC_DEFAULT: 372 case TARGET_ALLOC_DEVICE: 373 CUdeviceptr DevicePtr; 374 Err = cuMemAlloc(&DevicePtr, Size); 375 MemAlloc = (void *)DevicePtr; 376 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 377 return nullptr; 378 break; 379 case TARGET_ALLOC_HOST: 380 void *HostPtr; 381 Err = cuMemAllocHost(&HostPtr, Size); 382 MemAlloc = HostPtr; 383 if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) 384 return nullptr; 385 HostPinnedAllocs[MemAlloc] = Kind; 386 break; 387 case TARGET_ALLOC_SHARED: 388 CUdeviceptr SharedPtr; 389 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); 390 MemAlloc = (void *)SharedPtr; 391 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) 392 return nullptr; 393 break; 394 } 395 396 return MemAlloc; 397 } 398 399 int free(void *TgtPtr) override { 400 CUresult Err; 401 // Host pinned memory must be freed differently. 402 TargetAllocTy Kind = 403 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) 404 ? TARGET_ALLOC_DEFAULT 405 : TARGET_ALLOC_HOST; 406 switch (Kind) { 407 case TARGET_ALLOC_DEFAULT: 408 case TARGET_ALLOC_DEVICE: 409 case TARGET_ALLOC_SHARED: 410 Err = cuMemFree((CUdeviceptr)TgtPtr); 411 if (!checkResult(Err, "Error returned from cuMemFree\n")) 412 return OFFLOAD_FAIL; 413 break; 414 case TARGET_ALLOC_HOST: 415 Err = cuMemFreeHost(TgtPtr); 416 if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) 417 return OFFLOAD_FAIL; 418 break; 419 } 420 421 return OFFLOAD_SUCCESS; 422 } 423 }; 424 425 /// A vector of device allocators 426 std::vector<CUDADeviceAllocatorTy> DeviceAllocators; 427 428 /// A vector of memory managers. Since the memory manager is non-copyable and 429 // non-removable, we wrap them into std::unique_ptr. 
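  /// When \p UseMemoryManager is true, default/device allocations in dataAlloc
  /// and all frees in dataDelete are routed through MemoryManagers[DeviceId];
  /// host-pinned and managed allocations go directly to
  /// DeviceAllocators[DeviceId].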
430 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; 431 432 /// Whether use memory manager 433 bool UseMemoryManager = true; 434 435 // Record entry point associated with device 436 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { 437 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 438 E.Entries.push_back(entry); 439 } 440 441 // Return a pointer to the entry associated with the pointer 442 const __tgt_offload_entry *getOffloadEntry(const int DeviceId, 443 const void *Addr) const { 444 for (const __tgt_offload_entry &Itr : 445 DeviceData[DeviceId].FuncGblEntries.back().Entries) 446 if (Itr.addr == Addr) 447 return &Itr; 448 449 return nullptr; 450 } 451 452 // Return the pointer to the target entries table 453 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { 454 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 455 456 if (E.Entries.empty()) 457 return nullptr; 458 459 // Update table info according to the entries and return the pointer 460 E.Table.EntriesBegin = E.Entries.data(); 461 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); 462 463 return &E.Table; 464 } 465 466 // Clear entries table for a device 467 void clearOffloadEntriesTable(const int DeviceId) { 468 DeviceData[DeviceId].FuncGblEntries.emplace_back(); 469 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 470 E.Entries.clear(); 471 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; 472 } 473 474 public: 475 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { 476 assert(AsyncInfo && "AsyncInfo is nullptr"); 477 478 if (!AsyncInfo->Queue) { 479 CUstream S; 480 if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS) 481 return nullptr; 482 483 AsyncInfo->Queue = S; 484 } 485 486 return reinterpret_cast<CUstream>(AsyncInfo->Queue); 487 } 488 489 // This class should not be copied 490 DeviceRTLTy(const DeviceRTLTy &) = delete; 491 DeviceRTLTy(DeviceRTLTy &&) = delete; 492 493 DeviceRTLTy() 494 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), 495 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), 496 DynamicMemorySize(0) { 497 498 DP("Start initializing CUDA\n"); 499 500 CUresult Err = cuInit(0); 501 if (Err == CUDA_ERROR_INVALID_HANDLE) { 502 // Can't call cuGetErrorString if dlsym failed 503 DP("Failed to load CUDA shared library\n"); 504 return; 505 } 506 if (!checkResult(Err, "Error returned from cuInit\n")) { 507 return; 508 } 509 510 Err = cuDeviceGetCount(&NumberOfDevices); 511 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) 512 return; 513 514 if (NumberOfDevices == 0) { 515 DP("There are no devices supporting CUDA.\n"); 516 return; 517 } 518 519 DeviceData.resize(NumberOfDevices); 520 Modules.resize(NumberOfDevices); 521 StreamPool.resize(NumberOfDevices); 522 EventPool.resize(NumberOfDevices); 523 524 // Get environment variables regarding teams 525 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { 526 // OMP_TEAM_LIMIT has been set 527 EnvTeamLimit = std::stoi(EnvStr); 528 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 529 } 530 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { 531 // OMP_TEAMS_THREAD_LIMIT has been set 532 EnvTeamThreadLimit = std::stoi(EnvStr); 533 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); 534 } 535 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { 536 // OMP_NUM_TEAMS has been set 537 EnvNumTeams = std::stoi(EnvStr); 538 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 539 } 540 if (const char 
*EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { 541 // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set 542 DynamicMemorySize = std::stoi(EnvStr); 543 DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", 544 DynamicMemorySize); 545 } 546 if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { 547 // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set 548 NumInitialStreams = std::stoi(EnvStr); 549 DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); 550 } 551 552 for (int I = 0; I < NumberOfDevices; ++I) 553 DeviceAllocators.emplace_back(); 554 555 // Get the size threshold from environment variable 556 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 557 UseMemoryManager = Res.second; 558 size_t MemoryManagerThreshold = Res.first; 559 560 if (UseMemoryManager) 561 for (int I = 0; I < NumberOfDevices; ++I) 562 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 563 DeviceAllocators[I], MemoryManagerThreshold)); 564 565 // We lazily initialize all devices later. 566 InitializedFlags.assign(NumberOfDevices, false); 567 } 568 569 ~DeviceRTLTy() { 570 for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) 571 deinitDevice(DeviceId); 572 } 573 574 // Check whether a given DeviceId is valid 575 bool isValidDeviceId(const int DeviceId) const { 576 return DeviceId >= 0 && DeviceId < NumberOfDevices; 577 } 578 579 int getNumOfDevices() const { return NumberOfDevices; } 580 581 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 582 583 int initDevice(const int DeviceId) { 584 CUdevice Device; 585 586 DP("Getting device %d\n", DeviceId); 587 CUresult Err = cuDeviceGet(&Device, DeviceId); 588 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 589 return OFFLOAD_FAIL; 590 591 assert(InitializedFlags[DeviceId] == false && "Reinitializing device!"); 592 InitializedFlags[DeviceId] = true; 593 594 // Query the current flags of the primary context and set its flags if 595 // it is inactive 596 unsigned int FormerPrimaryCtxFlags = 0; 597 int FormerPrimaryCtxIsActive = 0; 598 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 599 &FormerPrimaryCtxIsActive); 600 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 601 return OFFLOAD_FAIL; 602 603 if (FormerPrimaryCtxIsActive) { 604 DP("The primary context is active, no change to its flags\n"); 605 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 606 CU_CTX_SCHED_BLOCKING_SYNC) 607 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 608 } else { 609 DP("The primary context is inactive, set its flags to " 610 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 611 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 612 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 613 return OFFLOAD_FAIL; 614 } 615 616 // Retain the per device primary context and save it to use whenever this 617 // device is selected. 618 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 619 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 620 return OFFLOAD_FAIL; 621 622 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 623 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 624 return OFFLOAD_FAIL; 625 626 // Initialize the stream pool. 627 if (!StreamPool[DeviceId]) 628 StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(), 629 NumInitialStreams); 630 631 // Initialize the event pool. 
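    // Like the stream pool above, the event pool starts with NumInitialEvents
    // entries and doubles in size whenever it runs out (see
    // ResourcePoolTy::acquire).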
632 if (!EventPool[DeviceId]) 633 EventPool[DeviceId] = 634 std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents); 635 636 // Query attributes to determine number of threads/block and blocks/grid. 637 int MaxGridDimX; 638 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 639 Device); 640 if (Err != CUDA_SUCCESS) { 641 DP("Error getting max grid dimension, use default value %d\n", 642 DeviceRTLTy::DefaultNumTeams); 643 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 644 } else { 645 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 646 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 647 } 648 649 // We are only exploiting threads along the x axis. 650 int MaxBlockDimX; 651 Err = cuDeviceGetAttribute(&MaxBlockDimX, 652 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 653 if (Err != CUDA_SUCCESS) { 654 DP("Error getting max block dimension, use default value %d\n", 655 DeviceRTLTy::DefaultNumThreads); 656 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 657 } else { 658 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 659 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 660 661 if (EnvTeamThreadLimit > 0 && 662 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 663 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 664 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 665 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 666 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 667 } 668 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 669 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 670 "capping at the hard limit\n", 671 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 672 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 673 } 674 } 675 676 // Get and set warp size 677 int WarpSize; 678 Err = 679 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 680 if (Err != CUDA_SUCCESS) { 681 DP("Error getting warp size, assume default value 32\n"); 682 DeviceData[DeviceId].WarpSize = 32; 683 } else { 684 DP("Using warp size %d\n", WarpSize); 685 DeviceData[DeviceId].WarpSize = WarpSize; 686 } 687 688 // Adjust teams to the env variables 689 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 690 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 691 EnvTeamLimit); 692 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 693 } 694 695 size_t StackLimit; 696 size_t HeapLimit; 697 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 698 StackLimit = std::stol(EnvStr); 699 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 700 return OFFLOAD_FAIL; 701 } else { 702 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) 703 return OFFLOAD_FAIL; 704 } 705 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { 706 HeapLimit = std::stol(EnvStr); 707 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 708 return OFFLOAD_FAIL; 709 } else { 710 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 711 return OFFLOAD_FAIL; 712 } 713 714 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 715 "Device supports up to %d CUDA blocks and %d threads with a " 716 "warp size of %d\n", 717 DeviceData[DeviceId].BlocksPerGrid, 718 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 719 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 720 "Device heap size is 
%d Bytes, device stack size is %d Bytes per " 721 "thread\n", 722 (int)HeapLimit, (int)StackLimit); 723 724 // Set default number of teams 725 if (EnvNumTeams > 0) { 726 DP("Default number of teams set according to environment %d\n", 727 EnvNumTeams); 728 DeviceData[DeviceId].NumTeams = EnvNumTeams; 729 } else { 730 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 731 DP("Default number of teams set according to library's default %d\n", 732 DeviceRTLTy::DefaultNumTeams); 733 } 734 735 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 736 DP("Default number of teams exceeds device limit, capping at %d\n", 737 DeviceData[DeviceId].BlocksPerGrid); 738 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 739 } 740 741 // Set default number of threads 742 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 743 DP("Default number of threads set according to library's default %d\n", 744 DeviceRTLTy::DefaultNumThreads); 745 if (DeviceData[DeviceId].NumThreads > 746 DeviceData[DeviceId].ThreadsPerBlock) { 747 DP("Default number of threads exceeds device limit, capping at %d\n", 748 DeviceData[DeviceId].ThreadsPerBlock); 749 DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; 750 } 751 752 return OFFLOAD_SUCCESS; 753 } 754 755 int deinitDevice(const int DeviceId) { 756 auto IsInitialized = InitializedFlags[DeviceId]; 757 if (!IsInitialized) 758 return OFFLOAD_SUCCESS; 759 InitializedFlags[DeviceId] = false; 760 761 if (UseMemoryManager) 762 MemoryManagers[DeviceId].release(); 763 764 StreamPool[DeviceId].reset(); 765 EventPool[DeviceId].reset(); 766 767 DeviceDataTy &D = DeviceData[DeviceId]; 768 if (!checkResult(cuCtxSetCurrent(D.Context), 769 "Error returned from cuCtxSetCurrent\n")) 770 return OFFLOAD_FAIL; 771 772 // Unload all modules. 773 for (auto &M : Modules[DeviceId]) 774 if (!checkResult(cuModuleUnload(M), 775 "Error returned from cuModuleUnload\n")) 776 return OFFLOAD_FAIL; 777 778 // Destroy context. 779 CUdevice Device; 780 if (!checkResult(cuCtxGetDevice(&Device), 781 "Error returned from cuCtxGetDevice\n")) 782 return OFFLOAD_FAIL; 783 784 if (!checkResult(cuDevicePrimaryCtxRelease(Device), 785 "Error returned from cuDevicePrimaryCtxRelease\n")) 786 return OFFLOAD_FAIL; 787 788 return OFFLOAD_SUCCESS; 789 } 790 791 __tgt_target_table *loadBinary(const int DeviceId, 792 const __tgt_device_image *Image) { 793 // Clear the offload table as we are going to create a new one. 794 clearOffloadEntriesTable(DeviceId); 795 796 // Create the module and extract the function pointers. 797 CUmodule Module; 798 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 799 CUresult Err = 800 cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 801 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 802 return nullptr; 803 804 DP("CUDA module successfully loaded!\n"); 805 806 Modules[DeviceId].push_back(Module); 807 808 // Find the symbols in the module by name. 809 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 810 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 811 812 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 813 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 814 if (!E->addr) { 815 // We return nullptr when something like this happens, the host should 816 // have always something in the address to uniquely identify the target 817 // region. 
818 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 819 return nullptr; 820 } 821 822 if (E->size) { 823 __tgt_offload_entry Entry = *E; 824 CUdeviceptr CUPtr; 825 size_t CUSize; 826 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 827 // We keep this style here because we need the name 828 if (Err != CUDA_SUCCESS) { 829 REPORT("Loading global '%s' Failed\n", E->name); 830 CUDA_ERR_STRING(Err); 831 return nullptr; 832 } 833 834 if (CUSize != E->size) { 835 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 836 CUSize, E->size); 837 return nullptr; 838 } 839 840 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 841 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 842 843 Entry.addr = (void *)(CUPtr); 844 845 // Note: In the current implementation declare target variables 846 // can either be link or to. This means that once unified 847 // memory is activated via the requires directive, the variable 848 // can be used directly from the host in both cases. 849 // TODO: when variables types other than to or link are added, 850 // the below condition should be changed to explicitly 851 // check for to and link variables types: 852 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 853 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 854 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 855 // If unified memory is present any target link or to variables 856 // can access host addresses directly. There is no longer a 857 // need for device copies. 858 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 859 DP("Copy linked variable host address (" DPxMOD 860 ") to device address (" DPxMOD ")\n", 861 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 862 } 863 864 addOffloadEntry(DeviceId, Entry); 865 866 continue; 867 } 868 869 CUfunction Func; 870 Err = cuModuleGetFunction(&Func, Module, E->name); 871 // We keep this style here because we need the name 872 if (Err != CUDA_SUCCESS) { 873 REPORT("Loading '%s' Failed\n", E->name); 874 CUDA_ERR_STRING(Err); 875 return nullptr; 876 } 877 878 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 879 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 880 881 // default value GENERIC (in case symbol is missing from cubin file) 882 llvm::omp::OMPTgtExecModeFlags ExecModeVal; 883 std::string ExecModeNameStr(E->name); 884 ExecModeNameStr += "_exec_mode"; 885 const char *ExecModeName = ExecModeNameStr.c_str(); 886 887 CUdeviceptr ExecModePtr; 888 size_t CUSize; 889 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 890 if (Err == CUDA_SUCCESS) { 891 if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 892 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 893 ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); 894 return nullptr; 895 } 896 897 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 898 if (Err != CUDA_SUCCESS) { 899 REPORT("Error when copying data from device to host. 
Pointers: " 900 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 901 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 902 CUDA_ERR_STRING(Err); 903 return nullptr; 904 } 905 } else { 906 DP("Loading global exec_mode '%s' - symbol missing, using default " 907 "value GENERIC (1)\n", 908 ExecModeName); 909 } 910 911 KernelsList.emplace_back(Func, ExecModeVal); 912 913 __tgt_offload_entry Entry = *E; 914 Entry.addr = &KernelsList.back(); 915 addOffloadEntry(DeviceId, Entry); 916 } 917 918 // send device environment data to the device 919 { 920 // TODO: The device ID used here is not the real device ID used by OpenMP. 921 DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices), 922 static_cast<uint32_t>(DeviceId), 923 static_cast<uint32_t>(DynamicMemorySize)}; 924 925 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 926 DeviceEnv.DebugKind = std::stoi(EnvStr); 927 928 const char *DeviceEnvName = "omptarget_device_environment"; 929 CUdeviceptr DeviceEnvPtr; 930 size_t CUSize; 931 932 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 933 if (Err == CUDA_SUCCESS) { 934 if (CUSize != sizeof(DeviceEnv)) { 935 REPORT( 936 "Global device_environment '%s' - size mismatch (%zu != %zu)\n", 937 DeviceEnvName, CUSize, sizeof(int32_t)); 938 CUDA_ERR_STRING(Err); 939 return nullptr; 940 } 941 942 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 943 if (Err != CUDA_SUCCESS) { 944 REPORT("Error when copying data from host to device. Pointers: " 945 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 946 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 947 CUDA_ERR_STRING(Err); 948 return nullptr; 949 } 950 951 DP("Sending global device environment data %zu bytes\n", CUSize); 952 } else { 953 DP("Finding global device environment '%s' - symbol missing.\n", 954 DeviceEnvName); 955 DP("Continue, considering this is a device RTL which does not accept " 956 "environment setting.\n"); 957 } 958 } 959 960 return getOffloadEntriesTable(DeviceId); 961 } 962 963 void *dataAlloc(const int DeviceId, const int64_t Size, 964 const TargetAllocTy Kind) { 965 switch (Kind) { 966 case TARGET_ALLOC_DEFAULT: 967 case TARGET_ALLOC_DEVICE: 968 if (UseMemoryManager) 969 return MemoryManagers[DeviceId]->allocate(Size, nullptr); 970 else 971 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 972 case TARGET_ALLOC_HOST: 973 case TARGET_ALLOC_SHARED: 974 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 975 } 976 977 REPORT("Invalid target data allocation kind or requested allocator not " 978 "implemented yet\n"); 979 980 return nullptr; 981 } 982 983 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 984 const int64_t Size, __tgt_async_info *AsyncInfo) const { 985 assert(AsyncInfo && "AsyncInfo is nullptr"); 986 987 CUstream Stream = getStream(DeviceId, AsyncInfo); 988 CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 989 if (Err != CUDA_SUCCESS) { 990 DP("Error when copying data from host to device. 
Pointers: host " 991 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 992 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 993 CUDA_ERR_STRING(Err); 994 return OFFLOAD_FAIL; 995 } 996 997 return OFFLOAD_SUCCESS; 998 } 999 1000 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 1001 const int64_t Size, __tgt_async_info *AsyncInfo) const { 1002 assert(AsyncInfo && "AsyncInfo is nullptr"); 1003 1004 CUstream Stream = getStream(DeviceId, AsyncInfo); 1005 CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 1006 if (Err != CUDA_SUCCESS) { 1007 DP("Error when copying data from device to host. Pointers: host " 1008 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 1009 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 1010 CUDA_ERR_STRING(Err); 1011 return OFFLOAD_FAIL; 1012 } 1013 1014 return OFFLOAD_SUCCESS; 1015 } 1016 1017 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 1018 int64_t Size, __tgt_async_info *AsyncInfo) const { 1019 assert(AsyncInfo && "AsyncInfo is nullptr"); 1020 1021 CUresult Err; 1022 CUstream Stream = getStream(SrcDevId, AsyncInfo); 1023 1024 // If they are two devices, we try peer to peer copy first 1025 if (SrcDevId != DstDevId) { 1026 int CanAccessPeer = 0; 1027 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 1028 if (Err != CUDA_SUCCESS) { 1029 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 1030 ", dst = %" PRId32 "\n", 1031 SrcDevId, DstDevId); 1032 CUDA_ERR_STRING(Err); 1033 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1034 } 1035 1036 if (!CanAccessPeer) { 1037 DP("P2P memcpy not supported so fall back to D2D memcpy"); 1038 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1039 } 1040 1041 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 1042 if (Err != CUDA_SUCCESS) { 1043 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 1044 ", dst = %" PRId32 "\n", 1045 SrcDevId, DstDevId); 1046 CUDA_ERR_STRING(Err); 1047 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1048 } 1049 1050 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 1051 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 1052 Size, Stream); 1053 if (Err == CUDA_SUCCESS) 1054 return OFFLOAD_SUCCESS; 1055 1056 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 1057 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 1058 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 1059 CUDA_ERR_STRING(Err); 1060 } 1061 1062 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1063 } 1064 1065 int dataDelete(const int DeviceId, void *TgtPtr) { 1066 if (UseMemoryManager) 1067 return MemoryManagers[DeviceId]->free(TgtPtr); 1068 1069 return DeviceAllocators[DeviceId].free(TgtPtr); 1070 } 1071 1072 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 1073 ptrdiff_t *TgtOffsets, const int ArgNum, 1074 const int TeamNum, const int ThreadLimit, 1075 const unsigned int LoopTripCount, 1076 __tgt_async_info *AsyncInfo) const { 1077 // All args are references. 
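    // cuLaunchKernel expects an array of pointers to the actual kernel
    // arguments: Ptrs[I] holds the device pointer adjusted by its offset, and
    // Args[I] holds the address of Ptrs[I].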
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    CUresult Err;
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // executes one iteration of the loop, rounded up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for`, and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
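          // Hence we launch one CUDA block (team) per iteration of the
          // `distribute` loop.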
1161 CudaBlocksPerGrid = LoopTripCount; 1162 } else { 1163 REPORT("Unknown execution mode: %d\n", 1164 static_cast<int8_t>(KernelInfo->ExecutionMode)); 1165 return OFFLOAD_FAIL; 1166 } 1167 DP("Using %d teams due to loop trip count %" PRIu32 1168 " and number of threads per block %d\n", 1169 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1170 } else { 1171 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1172 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1173 } 1174 } else { 1175 DP("Using requested number of teams %d\n", TeamNum); 1176 CudaBlocksPerGrid = TeamNum; 1177 } 1178 1179 if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) { 1180 DP("Capping number of teams to team limit %d\n", 1181 DeviceData[DeviceId].BlocksPerGrid); 1182 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1183 } 1184 1185 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1186 "Launching kernel %s with %d blocks and %d threads in %s mode\n", 1187 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1188 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1189 : "(null)", 1190 CudaBlocksPerGrid, CudaThreadsPerBlock, 1191 (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); 1192 1193 CUstream Stream = getStream(DeviceId, AsyncInfo); 1194 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1195 /* gridDimZ */ 1, CudaThreadsPerBlock, 1196 /* blockDimY */ 1, /* blockDimZ */ 1, 1197 DynamicMemorySize, Stream, &Args[0], nullptr); 1198 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1199 return OFFLOAD_FAIL; 1200 1201 DP("Launch of entry point at " DPxMOD " successful!\n", 1202 DPxPTR(TgtEntryPtr)); 1203 1204 return OFFLOAD_SUCCESS; 1205 } 1206 1207 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1208 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1209 CUresult Err = cuStreamSynchronize(Stream); 1210 1211 // Once the stream is synchronized, return it to stream pool and reset 1212 // AsyncInfo. This is to make sure the synchronization only works for its 1213 // own tasks. 1214 StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1215 AsyncInfo->Queue = nullptr; 1216 1217 if (Err != CUDA_SUCCESS) { 1218 DP("Error when synchronizing stream. stream = " DPxMOD 1219 ", async info ptr = " DPxMOD "\n", 1220 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1221 CUDA_ERR_STRING(Err); 1222 } 1223 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1224 } 1225 1226 void printDeviceInfo(int32_t device_id) { 1227 char TmpChar[1000]; 1228 std::string TmpStr; 1229 size_t TmpSt; 1230 int TmpInt, TmpInt2, TmpInt3; 1231 1232 CUdevice Device; 1233 checkResult(cuDeviceGet(&Device, device_id), 1234 "Error returned from cuCtxGetDevice\n"); 1235 1236 cuDriverGetVersion(&TmpInt); 1237 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1238 printf(" CUDA Device Number: \t\t%d \n", device_id); 1239 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1240 "Error returned from cuDeviceGetName\n"); 1241 printf(" Device Name: \t\t\t%s \n", TmpChar); 1242 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1243 "Error returned from cuDeviceTotalMem\n"); 1244 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1245 checkResult(cuDeviceGetAttribute( 1246 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1247 "Error returned from cuDeviceGetAttribute\n"); 1248 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1249 checkResult( 1250 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1251 "Error returned from cuDeviceGetAttribute\n"); 1252 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1253 checkResult(cuDeviceGetAttribute( 1254 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1255 "Error returned from cuDeviceGetAttribute\n"); 1256 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1257 checkResult( 1258 cuDeviceGetAttribute( 1259 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1260 "Error returned from cuDeviceGetAttribute\n"); 1261 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1262 checkResult( 1263 cuDeviceGetAttribute( 1264 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1265 "Error returned from cuDeviceGetAttribute\n"); 1266 printf(" Registers per Block: \t\t%d \n", TmpInt); 1267 checkResult( 1268 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 1269 "Error returned from cuDeviceGetAttribute\n"); 1270 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1271 checkResult(cuDeviceGetAttribute( 1272 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1273 "Error returned from cuDeviceGetAttribute\n"); 1274 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1275 checkResult(cuDeviceGetAttribute( 1276 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1277 "Error returned from cuDeviceGetAttribute\n"); 1278 checkResult(cuDeviceGetAttribute( 1279 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1280 "Error returned from cuDeviceGetAttribute\n"); 1281 checkResult(cuDeviceGetAttribute( 1282 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1283 "Error returned from cuDeviceGetAttribute\n"); 1284 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1285 TmpInt3); 1286 checkResult(cuDeviceGetAttribute( 1287 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1288 "Error returned from cuDeviceGetAttribute\n"); 1289 checkResult(cuDeviceGetAttribute( 1290 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1291 "Error returned from cuDeviceGetAttribute\n"); 1292 checkResult(cuDeviceGetAttribute( 1293 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1294 "Error returned from cuDeviceGetAttribute\n"); 1295 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1296 TmpInt3); 1297 checkResult( 1298 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1299 "Error returned from 
cuDeviceGetAttribute\n"); 1300 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1301 checkResult(cuDeviceGetAttribute( 1302 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1303 "Error returned from cuDeviceGetAttribute\n"); 1304 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1305 checkResult( 1306 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1307 "Error returned from cuDeviceGetAttribute\n"); 1308 printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1309 checkResult(cuDeviceGetAttribute( 1310 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1311 "Error returned from cuDeviceGetAttribute\n"); 1312 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1313 checkResult( 1314 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1315 "Error returned from cuDeviceGetAttribute\n"); 1316 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1317 checkResult(cuDeviceGetAttribute( 1318 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1319 "Error returned from cuDeviceGetAttribute\n"); 1320 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1321 checkResult( 1322 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1323 "Error returned from cuDeviceGetAttribute\n"); 1324 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1325 TmpStr = "DEFAULT"; 1326 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1327 TmpStr = "PROHIBITED"; 1328 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1329 TmpStr = "EXCLUSIVE PROCESS"; 1330 else 1331 TmpStr = "unknown"; 1332 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1333 checkResult(cuDeviceGetAttribute( 1334 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1335 "Error returned from cuDeviceGetAttribute\n"); 1336 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1337 checkResult( 1338 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1339 "Error returned from cuDeviceGetAttribute\n"); 1340 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1341 checkResult(cuDeviceGetAttribute( 1342 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1343 "Error returned from cuDeviceGetAttribute\n"); 1344 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1345 checkResult( 1346 cuDeviceGetAttribute( 1347 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1348 "Error returned from cuDeviceGetAttribute\n"); 1349 printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1350 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1351 Device), 1352 "Error returned from cuDeviceGetAttribute\n"); 1353 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1354 checkResult(cuDeviceGetAttribute( 1355 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1356 Device), 1357 "Error returned from cuDeviceGetAttribute\n"); 1358 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1359 checkResult(cuDeviceGetAttribute( 1360 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1361 "Error returned from cuDeviceGetAttribute\n"); 1362 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1363 checkResult(cuDeviceGetAttribute( 1364 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1365 "Error returned from cuDeviceGetAttribute\n"); 1366 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1367 checkResult(cuDeviceGetAttribute( 1368 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1369 "Error returned from cuDeviceGetAttribute\n"); 1370 printf(" 
Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

  int createEvent(int DeviceId, void **P) {
    CUevent Event = nullptr;
    if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

  int destroyEvent(int DeviceId, void *EventPtr) {
    EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available
    // from a specific CUDA version onwards, where it is defined as 0x0. In
    // earlier versions, per the CUDA API documentation, that argument has to
    // be 0x0, so passing 0 is correct either way.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. 
stream = " DPxMOD ", event = " DPxMOD "\n", 1424 DPxPTR(Stream), DPxPTR(Event)); 1425 CUDA_ERR_STRING(Err); 1426 return OFFLOAD_FAIL; 1427 } 1428 1429 return OFFLOAD_SUCCESS; 1430 } 1431 1432 int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const { 1433 if (AsyncInfo->Queue) { 1434 StreamPool[DeviceId]->release( 1435 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1436 AsyncInfo->Queue = nullptr; 1437 } 1438 1439 return OFFLOAD_SUCCESS; 1440 } 1441 1442 int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const { 1443 *AsyncInfo = new __tgt_async_info; 1444 getStream(DeviceId, *AsyncInfo); 1445 return OFFLOAD_SUCCESS; 1446 } 1447 1448 int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo, 1449 const char **ErrStr) const { 1450 assert(DeviceInfo && "DeviceInfo is nullptr"); 1451 1452 if (!DeviceInfo->Context) 1453 DeviceInfo->Context = DeviceData[DeviceId].Context; 1454 if (!DeviceInfo->Device) { 1455 CUdevice Dev; 1456 CUresult Err = cuDeviceGet(&Dev, DeviceId); 1457 if (Err == CUDA_SUCCESS) { 1458 DeviceInfo->Device = reinterpret_cast<void *>(Dev); 1459 } else { 1460 cuGetErrorString(Err, ErrStr); 1461 return OFFLOAD_FAIL; 1462 } 1463 } 1464 return OFFLOAD_SUCCESS; 1465 } 1466 1467 int setContext(int DeviceId) { 1468 assert(InitializedFlags[DeviceId] && "Device is not initialized"); 1469 1470 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1471 if (!checkResult(Err, "error returned from cuCtxSetCurrent")) 1472 return OFFLOAD_FAIL; 1473 1474 return OFFLOAD_SUCCESS; 1475 } 1476 }; 1477 1478 DeviceRTLTy DeviceRTL; 1479 } // namespace 1480 1481 // Exposed library API function 1482 #ifdef __cplusplus 1483 extern "C" { 1484 #endif 1485 1486 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1487 return elf_check_machine(image, /* EM_CUDA */ 190); 1488 } 1489 1490 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1491 1492 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1493 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1494 DeviceRTL.setRequiresFlag(RequiresFlags); 1495 return RequiresFlags; 1496 } 1497 1498 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1499 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1500 DeviceRTL.isValidDeviceId(dst_dev_id)) 1501 return 1; 1502 1503 return 0; 1504 } 1505 1506 int32_t __tgt_rtl_init_device(int32_t device_id) { 1507 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1508 // Context is set when init the device. 1509 1510 return DeviceRTL.initDevice(device_id); 1511 } 1512 1513 int32_t __tgt_rtl_deinit_device(int32_t device_id) { 1514 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1515 // Context is set when deinit the device. 
1516 1517 return DeviceRTL.deinitDevice(device_id); 1518 } 1519 1520 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1521 __tgt_device_image *image) { 1522 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1523 1524 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1525 return nullptr; 1526 1527 return DeviceRTL.loadBinary(device_id, image); 1528 } 1529 1530 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1531 int32_t kind) { 1532 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1533 1534 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1535 return nullptr; 1536 1537 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1538 } 1539 1540 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1541 int64_t size) { 1542 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1543 // Context is set in __tgt_rtl_data_submit_async. 1544 1545 __tgt_async_info AsyncInfo; 1546 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1547 size, &AsyncInfo); 1548 if (rc != OFFLOAD_SUCCESS) 1549 return OFFLOAD_FAIL; 1550 1551 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1552 } 1553 1554 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1555 void *hst_ptr, int64_t size, 1556 __tgt_async_info *async_info_ptr) { 1557 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1558 assert(async_info_ptr && "async_info_ptr is nullptr"); 1559 1560 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1561 return OFFLOAD_FAIL; 1562 1563 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1564 async_info_ptr); 1565 } 1566 1567 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1568 int64_t size) { 1569 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1570 // Context is set in __tgt_rtl_data_retrieve_async. 1571 1572 __tgt_async_info AsyncInfo; 1573 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1574 size, &AsyncInfo); 1575 if (rc != OFFLOAD_SUCCESS) 1576 return OFFLOAD_FAIL; 1577 1578 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1579 } 1580 1581 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1582 void *tgt_ptr, int64_t size, 1583 __tgt_async_info *async_info_ptr) { 1584 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1585 assert(async_info_ptr && "async_info_ptr is nullptr"); 1586 1587 if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS) 1588 return OFFLOAD_FAIL; 1589 1590 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1591 async_info_ptr); 1592 } 1593 1594 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1595 int dst_dev_id, void *dst_ptr, 1596 int64_t size, 1597 __tgt_async_info *AsyncInfo) { 1598 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1599 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1600 assert(AsyncInfo && "AsyncInfo is nullptr"); 1601 // NOTE: We don't need to set context for data exchange as the device contexts 1602 // are passed to CUDA function directly. 
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");
  // NOTE: We don't need to set the context for data exchange as the device
  // contexts are passed to the CUDA functions directly.
  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  // Context is set in __tgt_rtl_data_exchange_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id,
                                         void *tgt_entry_ptr, void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}
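// Non-teams target regions reuse the teams launch path: the async entry point
// below simply forwards to __tgt_rtl_run_target_team_region_async with a
// single team, so all kernel launches funnel through one code path.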
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.
  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  // NOTE: We don't need to set the context for stream synchronization.
  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // NOTE: We don't need to set the context to print the device info.
  DeviceRTL.printDeviceInfo(device_id);
}
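// Event entry points. The expected lifecycle is create -> record on a stream
// -> wait/sync -> destroy. A minimal sketch (illustrative only; DeviceId,
// AsyncInfo and OtherAsyncInfo are assumed to come from active async regions):
//
//   void *Event = nullptr;
//   __tgt_rtl_create_event(DeviceId, &Event);
//   __tgt_rtl_record_event(DeviceId, Event, &AsyncInfo);
//   __tgt_rtl_wait_event(DeviceId, Event, &OtherAsyncInfo);
//   __tgt_rtl_destroy_event(DeviceId, Event);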
int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.createEvent(device_id, event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");
  // NOTE: We might not need to set context for event record.
  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");
  // If we don't have a queue we need to set the context.
  if (!async_info_ptr->Queue &&
      DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");
  // NOTE: We might not need to set context for event sync.
  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.destroyEvent(device_id, event_ptr);
}

int32_t __tgt_rtl_release_async_info(int32_t device_id,
                                     __tgt_async_info *async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.releaseAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_async_info(int32_t device_id,
                                  __tgt_async_info **async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_device_info(int32_t device_id,
                                   __tgt_device_info *device_info_ptr,
                                   const char **err_str) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(device_info_ptr && "device_info_ptr is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
}

#ifdef __cplusplus
}
#endif